Why are there three unexpected worker threads when a Win32 console application starts up? [duplicate]

Windows 10 implemented a new way of loading DLLs – several worker threads do it in parallel (LdrpWorkCallback). All Windows 10 processes now have several such threads.

Before Win10, the system (ntdll.dll) always loaded DLLs in a single thread, but starting with Win10 this behaviour changed. Now a “Parallel loader” exists in ntdll. Now the loading task (NTSTATUS LdrpSnapModule(LDRP_LOAD_CONTEXT* LoadContext)) can be executed in worker threads. Almost every DLL has imports (dependent DLLs), so when a DLL is loaded – its dependent DLLs are also loaded and this process is recursive (dependent DLLs have own dependencies).

The function void LdrpMapAndSnapDependency(LDRP_LOAD_CONTEXT* LoadContext) walks the current loaded DLL import table and loads its direct (1st level) dependent DLLs by calling LdrpLoadDependentModule() (which internally calls LdrpMapAndSnapDependency() for the newly loaded DLL – so this process is recursive). Finally, LdrpMapAndSnapDependency() needs to call NTSTATUS LdrpSnapModule(LDRP_LOAD_CONTEXT* LoadContext) to bind imports to the already loaded DLLs. LdrpSnapModule() is executed for many DLLs in the top level DLL load process, and this process is independent for every DLL – so this is a good place to parallelize. LdrpSnapModule() in most cases does not load new DLLs, but only binds import to export from already loaded ones. But if an import is resolved to a forwarded export (which rarely happens) – the new, forwarded DLL, is loaded.


Some current implementation details:

  1. First of all, let us look at the new field in struct _RTL_USER_PROCESS_PARAMETERS – ULONG LoaderThreads. LoaderThreads (if set to nonzero) enables or disables the “Parallel loader” in the new process. When we create a new process by ZwCreateUserProcess() – the 9th argument is
    PRTL_USER_PROCESS_PARAMETERS ProcessParameters. But if we use CreateProcess[Internal]W() – we cannot pass PRTL_USER_PROCESS_PARAMETERS directly – only STARTUPINFO. RTL_USER_PROCESS_PARAMETERS is partially initialized from STARTUPINFO, but we do not control ULONG LoaderThreads, and it will always be zero (if we do not call ZwCreateUserProcess() or set a hook on this routine).

  2. In the new process initialization phase, LdrpInitializeExecutionOptions() is called (from LdrpInitializeProcess()). This routine checks HKEY_LOCAL_MACHINE\SOFTWARE\Microsoft\Windows NT\CurrentVersion\Image File Execution Options\<app name> for several values (if the <app name> subkey exists – usually it doesn’t), including MaxLoaderThreads (REG_DWORD) – if MaxLoaderThreads exists – its value overrides RTL_USER_PROCESS_PARAMETERS.LoaderThreads.

  3. LdrpCreateLoaderEvents() is called. This routine must create 2 global events: HANDLE LdrpWorkCompleteEvent, LdrpLoadCompleteEvent;, which are used for synchronization.

    // Creates the two global loader events, LdrpWorkCompleteEvent first and
    // LdrpLoadCompleteEvent second, with identical ZwCreateEvent arguments.
    // Returns the failing status of the first call, otherwise the status of
    // the second call.
    NTSTATUS LdrpCreateLoaderEvents()
    {
        const NTSTATUS workEventStatus =
            ZwCreateEvent(&LdrpWorkCompleteEvent, EVENT_ALL_ACCESS, 0, SynchronizationEvent, TRUE);

        // Do not attempt the second event when the first one could not be created.
        if (workEventStatus < 0)
        {
            return workEventStatus;
        }

        return ZwCreateEvent(&LdrpLoadCompleteEvent, EVENT_ALL_ACCESS, 0, SynchronizationEvent, TRUE);
    }
    
  4. LdrpInitializeProcess() calls void LdrpDetectDetour(). The name speaks for itself. It does not return a value but initializes the global variable BOOLEAN LdrpDetourExist. This routine first checks whether any loader-critical routines are hooked – currently these are 5 routines:

    • NtOpenFile
    • NtCreateSection
    • NtQueryAttributesFile
    • NtOpenSection
    • NtMapViewOfSection

    If yes – LdrpDetourExist = TRUE;

    If not hooked – ThreadDynamicCodePolicyInfo is queried – full code:

    void LdrpDetectDetour()
    {
        if (LdrpDetourExist) return ;
    
        static PVOID LdrpCriticalLoaderFunctions[] = {
            NtOpenFile,
            NtCreateSection,
            ZwQueryAttributesFile,
            ZwOpenSection,
            ZwMapViewOfSection,
        };
    
        static M128A LdrpThunkSignature[5] = {
            //***
        };
    
        ULONG n = RTL_NUMBER_OF(LdrpCriticalLoaderFunctions);
        M128A* ppv = (M128A*)LdrpCriticalLoaderFunctions;
        M128A* pps = LdrpThunkSignature; 
        do
        {
            if (ppv->Low != pps->Low || ppv->High != pps->High)
            {
                if (LdrpDebugFlags & 5)
                {
                    DbgPrint("!!! Detour detected, disable parallel loading\n");
                    LdrpDetourExist = TRUE;
                    return;
                }
            }
    
        } while (pps++, ppv++, --n);
    
        BOOL DynamicCodePolicy;
    
        if (0 <= ZwQueryInformationThread(NtCurrentThread(), ThreadDynamicCodePolicyInfo, &DynamicCodePolicy, sizeof(DynamicCodePolicy), 0))
        {
            if (LdrpDetourExist = (DynamicCodePolicy == 1))
            {
                if (LdrpMapAndSnapWork)
                {
                    WaitForThreadpoolWorkCallbacks(LdrpMapAndSnapWork, TRUE);//TpWaitForWork
                    TpReleaseWork(LdrpMapAndSnapWork);//CloseThreadpoolWork
                    LdrpMapAndSnapWork = 0;
                    TpReleasePool(LdrpThreadPool);//CloseThreadpool
                    LdrpThreadPool = 0;
                }
            }
        }
    }
    
  5. LdrpInitializeProcess() calls NTSTATUS LdrpEnableParallelLoading (ULONG LoaderThreads) – as LdrpEnableParallelLoading(ProcessParameters->LoaderThreads):

    // Creates the loader thread pool (LdrpThreadPool) and its work item
    // (LdrpMapAndSnapWork) unless parallel loading is to stay disabled.
    //
    // LoaderThreads: requested thread count. A nonzero value is capped at 16 and
    //                a resulting count of 1 leaves the parallel loader off; zero
    //                selects the default of 4 unless suite-mask bit 0x10000 is
    //                set (meaning of that bit is not shown here).
    // Returns STATUS_SUCCESS when parallel loading is left disabled, otherwise
    // the status of TpAllocPool / TpAllocWork.
    NTSTATUS LdrpEnableParallelLoading (ULONG LoaderThreads)
    {
        LdrpDetectDetour();

        if (LoaderThreads)
        {
            LoaderThreads = min(LoaderThreads, 16);// not more than 16 threads allowed
            if (LoaderThreads <= 1) return STATUS_SUCCESS;
        }
        else
        {
            if (RtlGetSuiteMask() & 0x10000) return STATUS_SUCCESS;
            LoaderThreads = 4;// default for 4 threads
        }

        // Hooked loader routines (or the dynamic-code policy) rule out worker threads.
        if (LdrpDetourExist) return STATUS_SUCCESS;

        NTSTATUS status = TpAllocPool(&LdrpThreadPool, 1);//CreateThreadpool

        if (0 <= status)
        {
            TpSetPoolWorkerThreadIdleTimeout(LdrpThreadPool, -300000000);// 30 second idle timeout
            // The calling thread takes part in loading too, hence "- 1".
            TpSetPoolMaxThreads(LdrpThreadPool, LoaderThreads - 1);//SetThreadpoolThreadMaximum

            // CallbackEnviron is a struct object, not a pointer: members are
            // accessed with '.'.
            TP_CALLBACK_ENVIRON CallbackEnviron = { };
            CallbackEnviron.CallbackPriority = TP_CALLBACK_PRIORITY_NORMAL;
            CallbackEnviron.Size = sizeof(TP_CALLBACK_ENVIRON);
            CallbackEnviron.Pool = LdrpThreadPool;
            CallbackEnviron.Version = 3;

            status = TpAllocWork(&LdrpMapAndSnapWork, LdrpWorkCallback, 0, &CallbackEnviron);//CreateThreadpoolWork
        }

        return status;
    }
    

    A special loader thread pool – LdrpThreadPool – is created with at most LoaderThreads - 1 threads. The idle timeout is set to 30 seconds (after which an idle thread exits), and a PTP_WORK LdrpMapAndSnapWork is allocated, which is then used in void LdrpQueueWork(LDRP_LOAD_CONTEXT* LoadContext).

  6. Global variables used by the parallel loader:

    // Events created by LdrpCreateLoaderEvents().
    HANDLE LdrpWorkCompleteEvent;   // set when a non-owner finishes the last queued work
    HANDLE LdrpLoadCompleteEvent;   // set by LdrpDropLastInProgressCount()

    // Pending LDRP_LOAD_CONTEXT items and the lock taken around queue/counter updates.
    CRITICAL_SECTION LdrpWorkQueueLock;
    LIST_ENTRY LdrpWorkQueue = { &LdrpWorkQueue, &LdrpWorkQueue }; // points at itself: empty list

    // Number of work items / owners currently active.
    ULONG LdrpWorkInProgress;

    // TRUE once LdrpDetectDetour() has decided that parallel loading is off.
    BOOLEAN LdrpDetourExist;

    // Loader thread pool and its work item; both stay 0 while the pool does not exist.
    PTP_POOL LdrpThreadPool;
    PTP_WORK LdrpMapAndSnapWork;
    
    // Argument of LdrpDrainWorkQueue(): selects the event it waits on.
    enum DRAIN_TASK
    {
        WaitLoadComplete = 0, // wait on LdrpLoadCompleteEvent
        WaitWorkComplete = 1  // wait on LdrpWorkCompleteEvent
    };
    
    // Per-DLL load state that LdrpQueueWork() links into LdrpWorkQueue and that
    // LdrpProcessWork() receives.
    // NOTE(review): reverse-engineered layout - names are partly guessed; keep the
    // field order and types exactly as they are.
    struct LDRP_LOAD_CONTEXT
    {
        UNICODE_STRING BaseDllName;
        PVOID somestruct; // purpose unknown
        ULONG Flags;// some unknown flags
        NTSTATUS* pstatus; // final status of load
        _LDR_DATA_TABLE_ENTRY* ParentEntry; // of 'parent' loading dll
        _LDR_DATA_TABLE_ENTRY* Entry; // this == Entry->LoadContext
        LIST_ENTRY WorkQueueListEntry; // link used for LdrpWorkQueue
        _LDR_DATA_TABLE_ENTRY* ReplacedEntry;
        _LDR_DATA_TABLE_ENTRY** pvImports;// in the same order as in IMAGE_IMPORT_DESCRIPTOR piid
        ULONG ImportDllCount;// count of pvImports
        LONG TaskCount;
        PVOID pvIAT; // import address table; presumably 0 when there is nothing to snap - confirm
        ULONG SizeOfIAT;
        ULONG CurrentDll; // 0 <= CurrentDll < ImportDllCount
        PIMAGE_IMPORT_DESCRIPTOR piid;
        ULONG OriginalIATProtect;
        PVOID GuardCFCheckFunctionPointer;
        PVOID* pGuardCFCheckFunctionPointer;
    };
    

    Unfortunately LDRP_LOAD_CONTEXT is not contained in published .pdb files, so my definitions include only partial names.

    // Loader statistics updated by LdrpUpdateStatistics(). The trailing values
    // are samples read from explorer.exe at some moment (presumably hex).
    struct {
        ULONG MaxWorkInProgress;// 4 - highest LdrpWorkInProgress observed
        ULONG InLoaderWorker;// 7a - LdrpSnapModule called from a worker thread
        ULONG InLoadOwner;// 87 - LdrpSnapModule called directly, in the same thread as `LdrpMapAndSnapDependency`
    } LdrpStatistics;
    
    // Statistics only: remembers the highest LdrpWorkInProgress value seen so
    // far and counts the call as "worker" or "owner" depending on the
    // LoaderWorker bit of the current TEB.
    void LdrpUpdateStatistics()
    {
      LdrpStatistics.MaxWorkInProgress = max(LdrpStatistics.MaxWorkInProgress, LdrpWorkInProgress);

      if (NtCurrentTeb()->LoaderWorker)
      {
        LdrpStatistics.InLoaderWorker++;
      }
      else
      {
        LdrpStatistics.InLoadOwner++;
      }
    }
    

    In TEB.SameTebFlags there are now 2 new flags:

    USHORT LoadOwner : 01; // 0x1000;
    USHORT LoaderWorker : 01; // 0x2000;
    

    The last 2 bits are spare (USHORT SpareSameTebBits : 02; // 0xc000)

  7. LdrpMapAndSnapDependency(LDRP_LOAD_CONTEXT* LoadContext) includes the following code:

    // Excerpt from LdrpMapAndSnapDependency(): decides how the imports of the
    // DLL described by LoadContext get snapped.
    // NOTE(review): CurEntry / PrevEntry presumably correspond to the Entry /
    // ParentEntry members of LDRP_LOAD_CONTEXT as declared in this document - confirm.
    LDR_DATA_TABLE_ENTRY* Entry = LoadContext->CurEntry;
    if (LoadContext->pvIAT)
    {
        // There is an IAT to bind: mark the module as being snapped.
        Entry->DdagNode->State = LdrModulesSnapping;
        if (LoadContext->PrevEntry)// if recursive call
        {
            // Dependency of another DLL: queue it for LdrpWorkCallback() / LdrpDrainWorkQueue().
            LdrpQueueWork(LoadContext); // !!!
        }
        else
        {
            // Top-level DLL: snap in the calling thread.
            status = LdrpSnapModule(LoadContext);
        }
    }
    else
    {
        // No IAT: nothing to bind, the module counts as snapped already.
        Entry->DdagNode->State = LdrModulesSnapped;
    }
    

    So, if LoadContext->PrevEntry is set, we do not call LdrpSnapModule(LoadContext) directly but call LdrpQueueWork(LoadContext) instead. For example, say we load user32.dll: in the first call to LdrpMapAndSnapDependency(), LoadContext->PrevEntry is always 0 (CurEntry points to user32.dll), but when LdrpMapAndSnapDependency() is called recursively for its dependency gdi32.dll, PrevEntry refers to user32.dll and CurEntry to gdi32.dll.

  8. LdrpQueueWork() is simply:

    // Queues LoadContext for snapping.
    //
    // The context is pushed onto LdrpWorkQueue under LdrpWorkQueueLock. A pool
    // callback is posted only when the loader work item exists and the process
    // is not shutting down; otherwise the item just stays in the queue.
    // Nothing is queued when the load has already failed (*pstatus < 0).
    void LdrpQueueWork(LDRP_LOAD_CONTEXT* LoadContext)
    {
        // pstatus is a pointer: test the NTSTATUS it points to, not the pointer.
        if (0 <= *(LoadContext->pstatus))
        {
            EnterCriticalSection(&LdrpWorkQueueLock);

            InsertHeadList(&LdrpWorkQueue, &LoadContext->WorkQueueListEntry);

            LeaveCriticalSection(&LdrpWorkQueueLock);

            if (LdrpMapAndSnapWork && !RtlGetCurrentPeb()->Ldr->ShutdownInProgress)
            {
                SubmitThreadpoolWork(LdrpMapAndSnapWork);//TpPostWork
            }
        }
    }
    

    We insert LoadContext to LdrpWorkQueue and if “Parallel loader” is started (LdrpMapAndSnapWork != 0) and not ShutdownInProgress – we submit work to loader pool. But even if the pool is not initialized (say because Detours exist) – there will be no error – we process this task in LdrpDrainWorkQueue().

  9. In a worker thread callback, this is executed:

    // Thread-pool callback of the loader work item: takes one LDRP_LOAD_CONTEXT
    // from LdrpWorkQueue and processes it as a non-owner. Does nothing when a
    // detour was detected or the queue is empty.
    void LdrpWorkCallback()
    {
        if (LdrpDetourExist) return;

        EnterCriticalSection(&LdrpWorkQueueLock);

        // RemoveHeadList() hands back the first queued entry, or the list head
        // itself when the queue is empty. (RemoveEntryList(&LdrpWorkQueue) would
        // unlink the head from its own list and returns a BOOLEAN.)
        PLIST_ENTRY Entry = RemoveHeadList(&LdrpWorkQueue);

        if (Entry != &LdrpWorkQueue)
        {
            // Account for the item while the lock is still held.
            ++LdrpWorkInProgress;
            LdrpUpdateStatistics();
        }

        LeaveCriticalSection(&LdrpWorkQueueLock);

        if (Entry != &LdrpWorkQueue)
        {
            LdrpProcessWork(CONTAINING_RECORD(Entry, LDRP_LOAD_CONTEXT, WorkQueueListEntry), FALSE);
        }
    }
    

    We simply pop an entry from LdrpWorkQueue, convert it to LDRP_LOAD_CONTEXT* (CONTAINING_RECORD(Entry, LDRP_LOAD_CONTEXT, WorkQueueListEntry)) and call void LdrpProcessWork(LDRP_LOAD_CONTEXT* LoadContext, BOOLEAN LoadOwner).

  10. void LdrpProcessWork(LDRP_LOAD_CONTEXT* LoadContext, BOOLEAN LoadOwner)
    in general calls LdrpSnapModule(LoadContext), and at the end the following code is executed:

    // Tail of LdrpProcessWork(): a caller that is not the load owner gives its
    // in-progress count back and may signal LdrpWorkCompleteEvent.
    if (!LoadOwner)
    {
        BOOLEAN bSetEvent = FALSE;

        EnterCriticalSection(&LdrpWorkQueueLock);
        // The queue is inspected only when the decremented counter equals 1.
        if (--LdrpWorkInProgress == 1)
        {
            bSetEvent = IsListEmpty(&LdrpWorkQueue);
        }
        LeaveCriticalSection(&LdrpWorkQueueLock);

        // Signal outside the lock.
        if (bSetEvent)
        {
            ZwSetEvent(LdrpWorkCompleteEvent, 0);
        }
    }
    

    So, if we are not the LoadOwner (i.e. we are in a worker thread), we decrement LdrpWorkInProgress, and if the count drops to 1 and LdrpWorkQueue is empty, we signal LdrpWorkCompleteEvent (the LoadOwner can wait on it).

  11. And finally, LdrpDrainWorkQueue() is called from the LoadOwner (primary thread) to “drain” the work queue. It may pop and directly execute tasks that were pushed to LdrpWorkQueue by LdrpQueueWork() and have not yet been popped by worker threads – either because the workers have not got to them yet or because the parallel loader is disabled (in that case LdrpQueueWork() still pushes the LDRP_LOAD_CONTEXT but does not post work to a worker thread) – and finally it waits (if needed) on the LdrpWorkCompleteEvent or LdrpLoadCompleteEvent event.

    // Argument of LdrpDrainWorkQueue(): selects the event it waits on.
    enum DRAIN_TASK
    {
        WaitLoadComplete = 0, // wait on LdrpLoadCompleteEvent
        WaitWorkComplete = 1  // wait on LdrpWorkCompleteEvent
    };
    
    // Drains LdrpWorkQueue in the calling thread and then takes load ownership.
    //
    // Each pass of the loop, under LdrpWorkQueueLock, either
    //   - becomes the owner (LdrpWorkInProgress was 0: set it to 1, set the TEB
    //     LoadOwner bit and return), or
    //   - pops one LDRP_LOAD_CONTEXT and processes it inline, or
    //   - finds nothing to do and blocks on the event selected by `task`.
    //
    // task: WaitLoadComplete waits on LdrpLoadCompleteEvent,
    //       WaitWorkComplete waits on LdrpWorkCompleteEvent.
    void LdrpDrainWorkQueue(DRAIN_TASK task)
    {
        BOOLEAN LoadOwner = FALSE;
    
        // A nonzero task (WaitWorkComplete) selects the work-complete event.
        HANDLE hEvent = task ? LdrpWorkCompleteEvent : LdrpLoadCompleteEvent;
    
        for(;;)
        {
            PLIST_ENTRY Entry;
    
            EnterCriticalSection(&LdrpWorkQueueLock);
    
            if (LdrpDetourExist && task == WaitLoadComplete)
            {
                // Detour + waiting for load completion: the queue is not touched,
                // only ownership is attempted.
                if (!LdrpWorkInProgress)
                {
                    LdrpWorkInProgress = 1;
                    LoadOwner = TRUE;
                }
                // Pretend the queue was empty so the code below waits.
                Entry = &LdrpWorkQueue;
            }
            else
            {
                // Returns the list head itself when the queue is empty.
                Entry = RemoveHeadList(&LdrpWorkQueue);
    
                if (Entry == &LdrpWorkQueue)
                {
                    // Queue empty: ownership is free once nothing is in progress.
                    if (!LdrpWorkInProgress)
                    {
                        LdrpWorkInProgress = 1;
                        LoadOwner = TRUE;
                    }
                }
                else
                {
                    // Got an item: count it, except when a detour exists.
                    if (!LdrpDetourExist)
                    {
                        ++LdrpWorkInProgress;
                    }
                    LdrpUpdateStatistics();
                }
            }
            LeaveCriticalSection(&LdrpWorkQueueLock);
    
            if (LoadOwner)
            {
                NtCurrentTeb()->LoadOwner = 1;
                return;
            }
    
            if (Entry != &LdrpWorkQueue)
            {
                // NOTE(review): with LdrpDetourExist set the counter was not incremented
                // above, yet FALSE is passed here, and the LdrpProcessWork() excerpt in
                // this document decrements LdrpWorkInProgress for a FALSE argument -
                // confirm against the real binary.
                LdrpProcessWork(CONTAINING_RECORD(Entry, LDRP_LOAD_CONTEXT, WorkQueueListEntry), FALSE);
            }
            else
            {
                // Nothing to run: wait for the selected event, then re-check.
                ZwWaitForSingleObject(hEvent, 0, 0);
            }
        }
    }
    
  12. void LdrpDropLastInProgressCount()
    {
      // Gives up load ownership: clears the LoadOwner bit of the current TEB,
      // resets LdrpWorkInProgress to 0 under the queue lock and then signals
      // LdrpLoadCompleteEvent.
      NtCurrentTeb()->LoadOwner = 0;

      EnterCriticalSection(&LdrpWorkQueueLock);
      LdrpWorkInProgress = 0;
      LeaveCriticalSection(&LdrpWorkQueueLock);

      // ZwSetEvent takes (EventHandle, PreviousState); the previous state is
      // not needed, so 0 is passed as in LdrpProcessWork().
      ZwSetEvent(LdrpLoadCompleteEvent, 0);
    }
    

Leave a Comment