深入研究一下线程调度,由于篇幅较多,分章节分析

SwapContext

首先研究一下 SwapContext 函数的实现。

伪代码分析

这里我们不去分析汇编代码,而是直接用 IDA + F5,把定义还原回去即可清晰地看出逻辑

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
bool __usercall SwapContext@<al>(
_KPCR *PCR@<ebx>,
_KTHREAD *CurrentThread@<edi>,
_KTHREAD *NextThread@<esi>,
int irql@<ecx>)
{
unsigned __int64 v4; // rax
unsigned __int32 v5; // ebp
char *InitialStack; // ecx
_EPROCESS *Process; // ecx
_KPROCESS *v8; // ebp
_KPROCESS *v9; // eax
void *ArbitraryUserPointer; // ecx
void *NextThreadStackPointer; // ecx
char *v12; // eax
unsigned __int32 v13; // ebp
int v14; // edx
unsigned int v15; // edx
_NT_TIB *Teb; // eax
_KGDTENTRY *_GDT; // ecx
int v18; // ecx
bool v20; // zf
_KGDTENTRY *GDT; // ecx
_KPRCB *p_PrcbData; // [esp-10h] [ebp-10h]
char v24; // [esp-Ch] [ebp-Ch]
_EXCEPTION_REGISTRATION_RECORD *ExceptionList; // [esp-8h] [ebp-8h] BYREF
int v26; // [esp-4h] [ebp-4h]
_UNKNOWN *retaddr; // [esp+0h] [ebp+0h] BYREF

while ( NextThread->Running )
_mm_pause();
NextThread->Running = 1;
v26 = irql;
_disable();
--PCR->PrcbData.NestingLevel;
v4 = __rdtsc() - PCR->PrcbData.StartCycles;
PCR->PrcbData.HighCycleTime = (v4 + __PAIR64__(PCR->PrcbData.HighCycleTime, PCR->PrcbData.CycleTime)) >> 0x20;
PCR->PrcbData.CycleTime += v4;
PCR->PrcbData.StartCycles += v4;
if ( (NextThread->Header.ThreadControlFlags & 5) == 0 )
goto LABEL_4;
if ( (NextThread->Header.ThreadControlFlags & 4) != 0 )
KiBeginCounterAccumulation(NextThread, 1);
if ( (NextThread->Header.ThreadControlFlags & 1) != 0 )
{
ExceptionList = (_EXCEPTION_REGISTRATION_RECORD *)NextThread;
v24 = 1;
p_PrcbData = &PCR->PrcbData;
_enable();
PsCheckThreadCpuQuota((int)p_PrcbData, v24, (int)ExceptionList);
}
else
{
LABEL_4:
_enable();
}
++PCR->NtTib.Version;
ExceptionList = PCR->NtTib.ExceptionList;
v5 = __readcr0();
if ( CurrentThread->NpxState )
{
InitialStack = (char *)CurrentThread->InitialStack;
if ( (v5 & 0xE) != 0 )
__writecr0(v5 & 0xFFFFFFF1);
_fxsave(InitialStack - 528);
CurrentThread->NpxState &= 0xF8u;
PCR->PrcbData.NpxThread = 0;
}
CurrentThread->KernelStack = &ExceptionList;
Process = (_EPROCESS *)NextThread->Process;
if ( Process != (_EPROCESS *)CurrentThread->Process )
KiUpdateSpeculationControl(Process);
v8 = NextThread->ApcState.Process;
v9 = CurrentThread->ApcState.Process;
if ( v8 != v9 )
{
ArbitraryUserPointer = PCR->NtTib.ArbitraryUserPointer;
_InterlockedXor((volatile signed __int32 *)v8->ActiveProcessors.Bitmap, (unsigned int)ArbitraryUserPointer);
_InterlockedXor((volatile signed __int32 *)v9->ActiveProcessors.Bitmap, (unsigned int)ArbitraryUserPointer);
if ( *(_DWORD *)&v9->LdtDescriptor.LimitLow | *(_DWORD *)&v8->LdtDescriptor.LimitLow )
{
_EAX = *(_DWORD *)&v8->LdtDescriptor.LimitLow;
if ( _EAX )
{
GDT = PCR->GDT;
*(_DWORD *)&GDT[9].LimitLow = _EAX;
GDT[9].HighWord.Bits = v8->LdtDescriptor.HighWord.Bits;
PCR->IDT[0x21] = v8->Int21Descriptor;
LOWORD(_EAX) = 72;
}
__asm { lldt ax }
}
__writecr3(v8->DirectoryTableBase);
}
NextThreadStackPointer = NextThread->InitialStack;
v12 = (char *)NextThreadStackPointer - 528;
if ( (*((char *)NextThreadStackPointer - 554) & 2) == 0 )
v12 = (char *)NextThreadStackPointer - 544;
*((_DWORD *)PCR->NtTib.SubSystemTib + 1) = v12;
*((_WORD *)PCR->NtTib.SubSystemTib + 51) = v8->IopmOffset;
if ( (dword_532FC4 & 4) != 0 )
{
EtwTraceContextSwap((_ETHREAD *)CurrentThread, (_ETHREAD *)NextThread);
NextThreadStackPointer = NextThread->InitialStack;
}
CurrentThread->Running = 0;
v13 = __readcr0();
if ( (v13 & 0xE) != 0 )
{
v13 &= 0xFFFFFFF1;
__writecr0(v13);
}
_fxrstor((char *)NextThreadStackPointer - 528);
NextThread->NpxState |= 7u;
__writefsdword(0x5C0u, (unsigned int)NextThread);// NpxThread
v14 = *((_DWORD *)NextThreadStackPointer - 5);
if ( (NextThread->NpxState & 7) == 0 )
v14 |= NpxStateNotLoaded;
v15 = v13 & 0xFFFFFFF1 | v14;
if ( v15 != v13 )
__writecr0(v15);
Teb = (_NT_TIB *)NextThread->Teb;
PCR->NtTib.Self = Teb;
_GDT = PCR->GDT;
_GDT[7].BaseLow = (unsigned __int16)Teb;
Teb = (_NT_TIB *)((unsigned int)Teb >> 16);
_GDT[7].HighWord.Bytes.BaseMid = (unsigned __int8)Teb;
_GDT[7].HighWord.Bytes.BaseHi = BYTE1(Teb);
++NextThread->ContextSwitches;
PCR->NtTib.ExceptionList = ExceptionList;
v18 = v26;
if ( PCR->PrcbData.DpcRoutineActive )
KeBugCheckEx(0xB8u, (ULONG_PTR)CurrentThread, (ULONG_PTR)NextThread, (ULONG_PTR)CurrentThread->InitialStack, 0);
if ( !NextThread->ApcState.KernelApcPending )
return 0;
v20 = NextThread->SpecialApcDisable == 0;
if ( !NextThread->SpecialApcDisable )
{
v20 = (_BYTE)v26 == 0;
if ( (_BYTE)v26 )
{
LOBYTE(v18) = 1;
return ((unsigned int)&retaddr | HalRequestSoftwareInterrupt(v18)) == 0;
}
}
return v20;
}

对于 _mm_pause(); 其实是汇编指令 pause 的实现,查阅 x86 手册可以看到说明:

Gives hint to processor that improves performance of spin-wait loops. 主要是提高自旋锁循环等待的性能。

其中我们是可以看到一些切换线程必备的操作的:

如果切换的线程所属进程与当前进程不一致,则切换 cr3 为下一个线程所属进程的 cr3 值。

1
2
3
4
5
6
7
v8 = NextThread->ApcState.Process;
v9 = CurrentThread->ApcState.Process;
if ( v8 != v9 )
{
//...
__writecr3(v8->DirectoryTableBase);
}

将当前线程的 TEB 保存到了段选择子为 0x3bgdt[7]) 的段上,这个值是 R3fs 所指示的段的基址。

1
2
3
4
5
6
7
8
Teb = (_NT_TIB *)NextThread->Teb;
PCR->NtTib.Self = Teb;
_GDT = PCR->GDT;
_GDT[7].BaseLow = (unsigned __int16)Teb;
Teb = (_NT_TIB *)((unsigned int)Teb >> 16);
_GDT[7].HighWord.Bytes.BaseMid = (unsigned __int8)Teb;
_GDT[7].HighWord.Bytes.BaseHi = BYTE1(Teb);
++NextThread->ContextSwitches;

汇编分析

从伪代码似乎只能分析得到这么多了,由于 IDA 很难处理类似堆栈切换的函数,所以剩余的逻辑需要分析汇编语句,从伪代码 CurrentThread->KernelStack = &ExceptionList; 处开始分析,其中,edi 指向了 CurrentThreadesi 指向了 NextThread

1
2
3
4
5
6
7
.text:00458FDE                         loc_458FDE:                             ; CODE XREF: SwapContext()+63↑j
.text:00458FDE 89 67 30 mov [edi+30h], esp
.text:00458FE1 8B 46 28 mov eax, [esi+28h]
.text:00458FE4 8B 66 30 mov esp, [esi+30h]
.text:00458FE7 8B 8E 50 01 00 00 mov ecx, [esi+150h]
.text:00458FED 3B 8F 50 01 00 00 cmp ecx, [edi+150h]
.text:00458FF3 74 05 jz short loc_458FFA

这里给出 KTHREAD 参考

1
2
3
4
5
kd> dt _KTHREAD
ntdll!_KTHREAD
+0x028 InitialStack : Ptr32 Void
+0x030 KernelStack : Ptr32 Void
+0x150 Process : Ptr32 _KPROCESS

所以这段代码很明显就是,将当前 esp 的值保存到 CurrentThread->KernelStack,从 NextThread->KernelStack 取出值赋给 esp


中断门提权的时候,要取得保存在 TSS 中的 esp0,既然一个核只有一个 TSS,那么在切换线程的时候必然要把 NextThreadesp 给到 TSS,不然这个线程从中断门提权过来就会进入别的线程的栈。相关代码如下

1
2
3
4
5
6
7
8
9
.text:00459033                         loc_459033:                             ; CODE XREF: SwapContext()+DE↑j
.text:00459033 8B 53 0C mov edx, [ebx+0Ch]
.text:00459036 89 42 04 mov [edx+4], eax
.text:00459039 8B 53 0C mov edx, [ebx+0Ch]
.text:0045903C 66 8B 45 6E mov ax, [ebp+6Eh]
.text:00459040 66 89 42 66 mov [edx+66h], ax
.text:00459044 F7 05 C4 2F 53 00 04 00 test ds:dword_532FC4, 4
.text:00459044 00 00
.text:0045904E 0F 85 0F 01 00 00 jnz loc_459163

其中 ebx+0xC 刚好是 KPCR.TssCopy 其值与 KPCR.TSS 一致。在此地下一个断点,看看 esp 是否会在这里改变。

可以看出,在这个地方的确是将 esp 的值存入了 Tss.esp0 中,至于为什么没有存 ss0,大概是因为 Tss 只会在线程切换的时候修改,而线程切换都是在 r0 去做的,因此 ss0==0x10 恒成立。且仔细观察可以得知,该 Tss 似乎只有 esp0ss0 两个字段上有值,其余值全为 0。

总结

线程切换的时候,会做如下操作:

  1. 保存 ExceptionList
  2. 切换 esp
  3. 判断目标线程与当前线程是否属于同一进程,如果不属于则切换 CR3
  4. 将当前线程的 esp0 写入 Tss 寄存器
  5. 根据当前线程的 PEB 写入段选择子为 0x3b 的描述符表。

空闲线程分析

空闲线程即IdleThread,当CPU空闲的时候会执行这个线程,可以直接通过 KPCR 结构找到该线程,并观察它所执行的代码。

去 IDA 里面查看这个函数,可以发现就是做一些无意义的指令,让 CPU 别闲下来。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
text:004594B0                         @KiIdleLoop@0   proc near               ; CODE XREF: KiSystemStartup(x)+20B↓j
.text:004594B0 ; DATA XREF: KiInitializeThread(x,x,x,x)+14↑o ...
.text:004594B0 EB 0B jmp short loc_4594BD
.text:004594B2 ; ---------------------------------------------------------------------------
.text:004594B2
.text:004594B2 loc_4594B2: ; CODE XREF: KiIdleLoop()+ED↓j
.text:004594B2 8D 8B 20 01 00 00 lea ecx, [ebx+120h]
.text:004594B8 E8 5D 24 00 00 call @PoIdle@4 ; PoIdle(x)
.text:004594BD
.text:004594BD loc_4594BD: ; CODE XREF: KiIdleLoop()↑j
.text:004594BD ; KiIdleLoop()+CE↓j ...
.text:004594BD 80 3D 29 5A 56 00 00 cmp ds:_HvlEnableIdleYield, 0
.text:004594C4 74 02 jz short loc_4594C8
.text:004594C6 F3 90 pause
.text:004594C8
.text:004594C8 loc_4594C8: ; CODE XREF: KiIdleLoop()+14↑j
.text:004594C8 FB sti
.text:004594C9 90 nop
.text:004594CA 90 nop
.text:004594CB FA cli
.text:004594CC F6 83 54 1A 00 00 3F test byte ptr [ebx+1A54h], 3Fh
.text:004594D3 74 13 jz short loc_4594E8
.text:004594D5 B1 02 mov cl, 2
.text:004594D7 FF 15 A8 10 40 00 call ds:__imp_@HalClearSoftwareInterrupt@4 ; HalClearSoftwareInterrupt(x)
.text:004594DD 8D 8B 20 01 00 00 lea ecx, [ebx+120h]
.text:004594E3 E8 0B 01 00 00 call @KiRetireDpcList@4 ; KiRetireDpcList(x)
.text:004594E8
.text:004594E8 loc_4594E8: ; CODE XREF: KiIdleLoop()+23↑j
.text:004594E8 83 BB 28 01 00 00 00 cmp dword ptr [ebx+128h], 0
.text:004594EF 0F 84 A1 00 00 00 jz loc_459596
.text:004594F5 FB sti
.text:004594F6 8B BB 24 01 00 00 mov edi, [ebx+124h]
.text:004594FC F0 0F BA AB 3C 1A 00 00 lock bts dword ptr [ebx+1A3Ch], 0
.text:004594FC 00
.text:00459505 73 0D jnb short loc_459514
.text:00459507 8D 8B 3C 1A 00 00 lea ecx, [ebx+1A3Ch] ; SpinLock
.text:0045950D E8 CE 03 00 00 call @KefAcquireSpinLockAtDpcLevel@4 ; KefAcquireSpinLockAtDpcLevel(x)
.text:00459512 EB 07 jmp short loc_45951B
.text:00459514 ; ---------------------------------------------------------------------------
.text:00459514
.text:00459514 loc_459514: ; CODE XREF: KiIdleLoop()+55↑j
.text:00459514 64 FF 05 80 36 00 00 inc large dword ptr fs:3680h
.text:0045951B
.text:0045951B loc_45951B: ; CODE XREF: KiIdleLoop()+62↑j
.text:0045951B 8B B3 28 01 00 00 mov esi, [ebx+128h]
.text:00459521 3B F7 cmp esi, edi
.text:00459523 74 5E jz short loc_459583
.text:00459525 83 A3 28 01 00 00 00 and dword ptr [ebx+128h], 0
.text:0045952C FA cli
.text:0045952D 0F 31 rdtsc
.text:0045952F 2B 83 18 33 00 00 sub eax, [ebx+3318h]
.text:00459535 1B 93 1C 33 00 00 sbb edx, [ebx+331Ch]
.text:0045953B 8B 4F 10 mov ecx, [edi+10h]
.text:0045953E 03 C8 add ecx, eax
.text:00459540 11 57 18 adc [edi+18h], edx
.text:00459543 01 47 10 add [edi+10h], eax
.text:00459546 11 57 14 adc [edi+14h], edx
.text:00459549 01 83 18 33 00 00 add [ebx+3318h], eax
.text:0045954F 11 93 1C 33 00 00 adc [ebx+331Ch], edx
.text:00459555 FB sti
.text:00459556 FE 83 31 01 00 00 inc byte ptr [ebx+131h]
.text:0045955C 89 B3 24 01 00 00 mov [ebx+124h], esi
.text:00459562 C6 46 68 02 mov byte ptr [esi+68h], 2
.text:00459566 80 A3 53 1A 00 00 00 and byte ptr [ebx+1A53h], 0
.text:0045956D 83 A3 3C 1A 00 00 00 and dword ptr [ebx+1A3Ch], 0
.text:00459574
.text:00459574 loc_459574: ; CODE XREF: KiIdleLoop()+109↓j
.text:00459574 B9 01 00 00 00 mov ecx, 1
.text:00459579 E8 D2 F9 FF FF call _SwapContext@0 ; SwapContext()
.text:0045957E E9 3A FF FF FF jmp loc_4594BD

KiFindReadyThread

该函数用于寻找一个就绪态的线程。

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
_KTHREAD *__userpurge KiFindReadyThread@<eax>(unsigned int a1@<eax>, int a2, _KPRCB *PrcbData)
{
unsigned int v4; // eax
_LIST_ENTRY *v5; // edi
_LIST_ENTRY *Flink; // ecx
_KTHREAD *thread; // eax
_LIST_ENTRY *v8; // edx
_LIST_ENTRY *Blink; // ecx
unsigned int v10; // [esp+Ch] [ebp-4h]

while ( 2 )
{
_BitScanReverse(&v4, a1);
a1 ^= KiMask32Array[v4];
v5 = &PrcbData->DispatcherReadyListHead[v4];
Flink = v5->Flink;
v10 = v4;
do
{
thread = (_KTHREAD *)&Flink[-15].Blink;
if ( *(_WORD *)(a2 + 0x3C6) == LOWORD(Flink[28].Blink) && (*(_DWORD *)(a2 + 0x3C8) & (int)Flink[28].Flink) != 0 )
{
v8 = Flink->Flink;
Blink = Flink->Blink;
Blink->Flink = v8;
v8->Blink = Blink;
if ( v8 == Blink )
PrcbData->ReadySummary ^= KiMask32Array[v10];
thread->NextProcessor = *(_DWORD *)(a2 + 0x3CC);
return thread;
}
Flink = Flink->Flink;
}
while ( Flink != v5 );
if ( a1 )
continue;
break;
}
return 0;
}

直接用一个 x86 指令 bsr 找到,比我想得还简单(不过据说早期的确是使用二分算法找到的),关于该指令描述如下:

Searches the source operand (second operand) for the most significant set bit (1 bit). If a most significant 1 bit is found, its bit index is stored in the destination operand (first operand). The source operand can be a register or a memory location; the destination operand is a register. The bit index is an unsigned offset from bit 0 of the source operand. If the content source operand is 0, the content of the destination operand is undefined.
在源操作数(第二个操作数)中搜索最高有效设置位 (1 位)。如果找到最高有效 1 位,则其位索引将存储在目标操作数(第一个操作数)中。源操作数可以是 register 或内存位置;目标操作数是一个 register。位索引是源操作数位 0 的无符号偏移量。如果内容源操作数为 0,则目标操作数的内容未定义。

剩余的内容,其中 Flink[-15].Blink 等同于 Flink-0x74 也就是得到 _KTHREAD/_ETHREAD 指针,而 Flink[28].BlinkFlink[28].Flink 又分别表示了 EHTREAD.AffinityETHREAD.Process

这里无非就是判断一下当前线程能否由当前的处理器去跑,判断成功后断链该线程返回,返回之前判断一下链表是否为空,如果为空那么给对应的 ReadySummary 位置 0

分析一遍线程切换感觉收获挺多的,突然理解了以前各种不理解的东西,后续还会继续分析线程切换相关的函数,尽量保证分析透彻,个人浅薄的见解难免有误,进请见谅。

参考文献