通过源码理解UST(用户栈回溯)

UST原理：如果gflags标志中包含了UST标志，堆管理器会为当前进程分配一块内存，这个内存区域就是UST数据库（user-mode stack trace database），并建立一个STACK_TRACE_DATABASE数据结构来管理这个数据库，下面是从WRK上找到的数据结构。

//
// Header of the per-process stack trace database. The header sits at the
// start of the database region (RtlInitializeStackTraceDataBase casts
// CommitBase to this type) and is followed in memory by the bucket array.
//
typedef struct _STACK_TRACE_DATABASE {

    union {
        RTL_CRITICAL_SECTION CriticalSection;
        ERESOURCE Resource;
        PVOID Lock; // real lock (the other two kept for compatibility)
    };

    PVOID Reserved[3]; // fields no longer used but kept for compatibility

    BOOLEAN PreCommitted;       // TRUE when initialized with CommitSize == ReserveSize; FALSE when init committed the first pages itself
    BOOLEAN DumpInProgress;     // presumably set while the database is being dumped -- not written in the visible code, TODO confirm

    PVOID CommitBase;           // base address of the database region
    PVOID CurrentLowerCommitLimit;  // set to CommitBase + CommitSize at init when not pre-committed
    PVOID CurrentUpperCommitLimit;  // set to CommitBase + ReserveSize at init when not pre-committed

    PCHAR NextFreeLowerMemory;  // next free byte at the low end; starts right after the bucket array
    PCHAR NextFreeUpperMemory;  // next free byte at the high end; starts at CommitBase + ReserveSize

    ULONG NumberOfEntriesLookedUp;  // statistics only; incremented per lookup without holding the lock
    ULONG NumberOfEntriesAdded;     // presumably maintained when the database is extended -- TODO confirm

    PRTL_STACK_TRACE_ENTRY *EntryIndexArray;    // Indexed by [-1 .. -NumberOfEntriesAdded]

    ULONG NumberOfBuckets;      // number of elements in Buckets below
    PRTL_STACK_TRACE_ENTRY Buckets [1];     // hash chain heads, indexed by (hash % NumberOfBuckets); declared [1] but NumberOfBuckets long

} STACK_TRACE_DATABASE, *PSTACK_TRACE_DATABASE;

下面是RTL_STACK_TRACE_ENTRY的数据结构（PRTL_STACK_TRACE_ENTRY是指向该结构的指针类型）

//
// One stored stack trace. Entries that hash to the same bucket are linked
// through HashChain.
//
typedef struct _RTL_STACK_TRACE_ENTRY {

    struct _RTL_STACK_TRACE_ENTRY * HashChain;  // next entry in the same hash bucket, NULL at the end of the chain
    ULONG TraceCount;                           // incremented each time this trace is logged
    USHORT Index;                               // value handed back to callers as the trace index
    USHORT Depth;                               // number of valid elements in BackTrace
    PVOID BackTrace [MAX_STACK_DEPTH];          // captured return addresses

} RTL_STACK_TRACE_ENTRY, *PRTL_STACK_TRACE_ENTRY;

继续……

看看初始化UST数据库的过程

NTSTATUS
RtlInitializeStackTraceDataBase(
    IN PVOID CommitBase,
    IN SIZE_T CommitSize,
    IN SIZE_T ReserveSize
    )
/*++

Routine Description:

    Initializes the stack trace database header that lives at CommitBase,
    sets up its lock and publishes the database through the global
    RtlpStackTraceDataBase pointer.

Arguments:

    CommitBase - base address of the region that holds the database.

    CommitSize - either 0, in which case this routine commits enough pages
        for the bucket array itself, or exactly ReserveSize, in which case
        no commit is performed here and the header and buckets are zeroed
        in place.

    ReserveSize - size of the whole region; CommitBase + ReserveSize is the
        upper end of the database.

Return Value:

    STATUS_SUCCESS on success. STATUS_INVALID_PARAMETER if CommitSize is
    neither 0 nor ReserveSize. Otherwise the failure status returned by
    ZwAllocateVirtualMemory or INITIALIZE_DATABASE_LOCK.

--*/
{
    NTSTATUS Status;
    PSTACK_TRACE_DATABASE DataBase;

    DataBase = (PSTACK_TRACE_DATABASE)CommitBase;

    if (CommitSize == 0) {

        //
        // Initially commit enough pages to accommodate the increased
        // number of hash chains (for improved performance we switched from ~100
        // to ~1000 in the hope that the hash chains will decrease ten-fold in
        // length).
        //
        // The commit size is the bucket array size rounded up to whole pages.
        //

        CommitSize = ROUND_TO_PAGES (NUMBER_OF_BUCKETS * sizeof (DataBase->Buckets[ 0 ]));

        //
        // Commit at the requested base; both CommitBase and CommitSize are
        // in/out parameters of the call.
        //

        Status = ZwAllocateVirtualMemory (NtCurrentProcess(),
                                          (PVOID *)&CommitBase,
                                          0,
                                          &CommitSize,
                                          MEM_COMMIT,
                                          PAGE_READWRITE);

        if (! NT_SUCCESS(Status)) {

            KdPrint (("RTL: Unable to commit space to extend stack "
                      "trace data base - Status = %lx\n",
                      Status));
            return Status;
        }

        DataBase->PreCommitted = FALSE;
    }
    else if (CommitSize == ReserveSize) {

        //
        // Only the header is cleared here; the bucket array is cleared
        // further down once NumberOfBuckets has been set.
        //

        RtlZeroMemory (DataBase, sizeof( *DataBase ));

        DataBase->PreCommitted = TRUE;
    }
    else {

        return STATUS_INVALID_PARAMETER;
    }

    //
    // Fill in the header. The low free pointer starts right after the
    // bucket array and the high free pointer starts at the end of the
    // region.
    //

    DataBase->CommitBase = CommitBase;
    DataBase->NumberOfBuckets = NUMBER_OF_BUCKETS;
    DataBase->NextFreeLowerMemory = (PCHAR)(&DataBase->Buckets[ DataBase->NumberOfBuckets ]);
    DataBase->NextFreeUpperMemory = (PCHAR)CommitBase + ReserveSize;

    if (! DataBase->PreCommitted) {

        //
        // Record how far the low end is committed and where the region ends.
        //

        DataBase->CurrentLowerCommitLimit = (PCHAR)CommitBase + CommitSize;
        DataBase->CurrentUpperCommitLimit = (PCHAR)CommitBase + ReserveSize;
    }
    else {

        //
        // Clear the bucket array so that every hash chain starts out empty.
        //

        RtlZeroMemory (&DataBase->Buckets[ 0 ],
                       DataBase->NumberOfBuckets * sizeof (DataBase->Buckets[ 0 ]));
    }

    DataBase->EntryIndexArray = (PRTL_STACK_TRACE_ENTRY *)DataBase->NextFreeUpperMemory;

    //
    // Initialize the database lock.
    //

    DataBase->Lock = &RtlpStackTraceDataBaseLock;

    Status = INITIALIZE_DATABASE_LOCK (DataBase->Lock);

    if (! NT_SUCCESS(Status)) {

        KdPrint(("RTL: Unable to initialize stack trace database lock (status %X)\n", Status));
        return Status;
    }

    //
    // Publish the initialized database through the global pointer.
    //

    RtlpStackTraceDataBase = DataBase;

    return STATUS_SUCCESS;
}

建立了数据库之后，当堆块分配函数再被调用的时候，堆管理器将当前栈回溯信息记录到UST数据库中。

堆块分配函数调用RtlLogStackBackTrace记录

RtlLogStackBackTrace(
    VOID
    )
/*++

Routine Description:

    This routine will capture the current stacktrace (skipping the
    present function) and will save it in the global (per process)
    stack trace database. It should be noted that we do not save
    duplicate traces.
    
    此函数跳过本函数捕获栈回溯并保存到每个进程的全局UST中

Arguments:

    None.

Return Value:

    Index of the stack trace saved. The index can be used by tools
    to access quickly the trace data. This is the reason at the end of
    the database we save downwards a list of pointers to trace entries.
    This index can be used to find this pointer in constant time.

    栈回溯的索引号。这个索引号可以被工具用来快速访问到trace数据。
    A zero index will be returned for error conditions (e.g. stack
    trace database not initialized).

    返回0为错误。
Environment:

    User mode.

--*/

{
    return RtlpLogStackBackTraceEx (1);
}

RtlpLogStackBackTraceEx函数

USHORT
RtlpLogStackBackTraceEx(
    ULONG FramesToSkip
    )
/*++

Routine Description:

    This routine will capture the current stacktrace (skipping the
    present function) and will save it in the global (per process)
    stack trace database. It should be noted that we do not save
    duplicate traces.

Arguments:

    FramesToSkip - no of frames that are not interesting and
    should be skipped.

Return Value:

    Index of the stack trace saved. The index can be used by tools
    to access quickly the trace data. This is the reason at the end of
    the database we save downwards a list of pointers to trace entries.
    This index can be used to find this pointer in constant time.

    A zero index will be returned for error conditions (e.g. stack
    trace database not initialized).

Environment:

    User mode.

--*/

{
    RTL_STACK_TRACE_ENTRY Trace;
    ULONG Hash;
    PSTACK_TRACE_DATABASE DataBase;

    //
    // Check the context in which we are running: the database must have
    // been published and it must be okay to take its lock.
    //

    DataBase = RtlpStackTraceDataBase;

    if (DataBase == NULL) {
        return 0;
    }

    if (! OKAY_TO_LOCK_DATABASE (DataBase->Lock)) {
        return 0;
    }

    //
    // Capture stack trace. Trace and Hash are outputs. One more frame is
    // skipped to account for this function, and FALSE selects the direct
    // (not user-stack-from-kernel-mode) capture path.
    //

    if (RtlpCaptureStackTraceForLogging (&Trace, &Hash, FramesToSkip + 1, FALSE) == FALSE) {
        return 0;
    }

    //
    // Add the trace if it is not already there.
    // Return trace index.
    //

    return RtlpLogCapturedStackTrace (&Trace, Hash);
}

现在要分析的是这两个函数

RtlpCaptureStackTraceForLogging
RtlpLogCapturedStackTrace

先从第一个RtlpCaptureStackTraceForLogging搞起

LOGICAL
RtlpCaptureStackTraceForLogging (
    PRTL_STACK_TRACE_ENTRY Trace,
    PULONG Hash,
    ULONG FramesToSkip,
    LOGICAL UserModeStackFromKernelMode
    )
/*++

Routine Description:

    Captures a stack trace into Trace (BackTrace and Depth) and produces
    a hash value for it.

Arguments:

    Trace - receives the return addresses and their count.

    Hash - receives the hash of the trace. On the kernel-mode path it is
        the 32-bit sum of the captured return addresses; on the other
        path it is whatever RtlCaptureStackBackTrace stores.

    FramesToSkip - frames to skip on the direct capture path (one more is
        added for this function). Not used on the kernel-mode path.

    UserModeStackFromKernelMode - FALSE selects the direct capture through
        RtlCaptureStackBackTrace; this is what RtlpLogStackBackTraceEx
        passes. Any other value walks the user-mode stack from kernel
        mode through RtlWalkFrameChain with Flags == 1.

Return Value:

    TRUE if at least one frame was captured, FALSE otherwise.

--*/
{
    ULONG Frame;

    if (UserModeStackFromKernelMode != FALSE) {

        //
        // Avoid weird situations.
        //

        if (KeAreAllApcsDisabled () == TRUE) {
            return FALSE;
        }

        //
        // Capture user mode stack trace. BackTrace receives the return
        // addresses found on the frame chain.
        //

        Trace->Depth = (USHORT) RtlWalkFrameChain(Trace->BackTrace,
                                                  MAX_STACK_DEPTH,
                                                  1);

        if (Trace->Depth == 0) {
            return FALSE;
        }

        //
        // Hash value: sum of the captured return addresses.
        //

        *Hash = 0;

        for (Frame = 0; Frame < Trace->Depth; Frame += 1) {
            *Hash += PtrToUlong (Trace->BackTrace[Frame]);
        }

        return TRUE;
    }

    //
    // Capture stack trace. The try/except was useful
    // in the old days when the function did not validate
    // the stack frame chain. We keep it just to be defensive.
    //

    try {

        Trace->Depth = RtlCaptureStackBackTrace (FramesToSkip + 1,
                                                MAX_STACK_DEPTH,
                                                Trace->BackTrace,
                                                Hash);
    }
    except(EXCEPTION_EXECUTE_HANDLER) {

        Trace->Depth = 0;
    }

    return (Trace->Depth != 0) ? TRUE : FALSE;
}

RtlWalkFrameChain函数

ULONG
RtlWalkFrameChain (
    OUT PVOID *Callers,
    IN ULONG Count,
    IN ULONG Flags
    )

/*++
Routine Description:
    This function tries to walk the EBP chain and fill out a vector of
    return addresses. It is possible that the function cannot fill the
    requested number of callers. In this case the function will just return
    with a smaller stack trace. In kernel mode the function should not take
    any exceptions (page faults) because it can be called at all sorts of
    irql levels.

    The `Flags' parameter is used for future extensions. A zero value will be
    compatible with new stack walking algorithms.

    A value of 1 for `Flags' means we are running in K-mode and we want to get
    the user mode stack trace.

Arguments:
    Callers - receives the return addresses, one per frame walked.
    Count - maximum number of return addresses to store in Callers.
    Flags - 0 or 1, see above.

Return value:
    The number of identified return addresses on the stack. This can be less
    than the Count requested. Zero is returned when the stack limits cannot
    be established or when the exception filter accepts an exception.
--*/

{

    ULONG_PTR Fp, NewFp, ReturnAddress;
    ULONG Index;
    ULONG_PTR StackEnd, StackStart;
    BOOLEAN Result;
    BOOLEAN InvalidFpValue;

    //
    // Get the current EBP pointer which is supposed to
    // be the start of the EBP chain.
    //

    _asm mov Fp, EBP;

    StackStart = Fp;        // default lower bound: the current frame pointer
    InvalidFpValue = FALSE;

    //
    // With Flags == 0 the limits come from RtlpCaptureStackLimits. The
    // kernel-mode path of RtlpCaptureStackTraceForLogging passes Flags == 1
    // and therefore skips this branch.
    //
    // NOTE(review): for any Flags value other than 0 or 1 nothing assigns
    // StackEnd before the loop below reads it -- confirm callers only pass
    // 0 or 1.
    //
    if (Flags == 0) {
        if (! RtlpCaptureStackLimits (Fp, &StackStart, &StackEnd)) {
            return 0;
        }
    }


    try {

        //
        // If we need to get the user mode stack trace from kernel mode
        // figure out the proper limits.
        //

        if (Flags == 1) {

            PKTHREAD Thread = KeGetCurrentThread ();
            PTEB Teb;
            PKTRAP_FRAME TrapFrame;
            ULONG_PTR Esp;

            // The trap frame supplies the saved Ebp from which the walk
            // starts (see the assignment to Fp below).
            TrapFrame = Thread->TrapFrame;
            Teb = Thread->Teb;

            //
            // If this is a system thread, it has no Teb and no kernel mode
            // stack, so check for it so we don't dereference NULL.
            //
            // If there is no trap frame (probably an APC), or it's attached,
            // or the irql is greater than dispatch, this code can't log a
            // stack.
            //

            // Any of these conditions means no trace can be taken: return 0.
            if (Teb == NULL || 
                IS_SYSTEM_ADDRESS((PVOID)TrapFrame) == FALSE || 
                (PVOID)TrapFrame <= Thread->StackLimit ||
                (PVOID)TrapFrame >= Thread->StackBase ||
                KeIsAttachedProcess() || 
                (KeGetCurrentIrql() >= DISPATCH_LEVEL)) {

                return 0;
            }
            // StackStart is the TEB's StackLimit and StackEnd is the TEB's
            // StackBase; the check below requires StackStart < StackEnd, so
            // StackStart is the low address and StackEnd the high address.
            StackStart = (ULONG_PTR)(Teb->NtTib.StackLimit);
            StackEnd = (ULONG_PTR)(Teb->NtTib.StackBase);
            Fp = (ULONG_PTR)(TrapFrame->Ebp);

            if (StackEnd <= StackStart) {
                return 0;
            }
            // Probe the whole [StackStart, StackEnd) range for read access
            // before dereferencing frame pointers inside it.
            ProbeForRead (StackStart, StackEnd - StackStart, sizeof (UCHAR));
        }
        // Walk at most Count frames.
        for (Index = 0; Index < Count; Index += 1) {

            // Stop when Fp is outside the stack limits or when there is no
            // room left for the two pointers (saved frame pointer and return
            // address) read below. Only on the first iteration may Fp be
            // equal to StackStart.
            if (Fp >= StackEnd || 
                ( (Index == 0)?
                      (Fp < StackStart):
                      (Fp <= StackStart) ) ||
                StackEnd - Fp < sizeof(ULONG_PTR) * 2) {
                break;
            }
            // The frame holds the caller's frame pointer at offset 0 and the
            // return address right after it.
            NewFp = *((PULONG_PTR)(Fp + 0));
            ReturnAddress = *((PULONG_PTR)(Fp + sizeof(ULONG_PTR)));

            //
            // Figure out if the new frame pointer is ok. This validation
            // should avoid all exceptions in kernel mode because we always
            // read within the current thread's stack and the stack is
            // guaranteed to be in memory (no page faults). It is also guaranteed
            // that we do not take random exceptions in user mode because we always
            // keep the frame pointer within stack limits.
            //

            if (! (Fp < NewFp && NewFp < StackEnd)) {

                InvalidFpValue = TRUE;
            }

            //
            // Figure out if the return address is ok. If return address
            // is a stack address or <64k then something is wrong. There is
            // no reason to return garbage to the caller therefore we stop.
            //

            if (StackStart < ReturnAddress && ReturnAddress < StackEnd) {
                break;
            }

            // With Flags == 0 a return address that IS_SYSTEM_ADDRESS
            // rejects ends the walk.
            // NOTE(review): presumably this check is meant for the kernel
            // build only -- confirm against the full source.
            if (Flags == 0 && IS_SYSTEM_ADDRESS((PVOID)ReturnAddress) == FALSE) {
                break;
            }

            //
            // Store new fp and return address and move on.
            // If the new FP value is bogus but the return address
            // looks ok then we still save the address.
            //

            Callers[Index] = (PVOID)ReturnAddress;

            if (InvalidFpValue) {

                // Count the address just stored, then stop walking.
                Index += 1;
                break;
            }
            else {

                Fp = NewFp;
            }
        }
    }
    except (RtlpWalkFrameChainExceptionFilter (_exception_code(), _exception_info())) {

        // An exception accepted by the filter discards the whole trace.
        Index = 0;
    }

    //
    // Return the number of return addresses identified on the stack.
    //
    return Index;

}

接着分析第二个函数RtlpLogCapturedStackTrace:

USHORT
RtlpLogCapturedStackTrace(
    PRTL_STACK_TRACE_ENTRY Trace,
    ULONG hash
    )
/*++

Routine Description:

    Looks up a captured stack trace in the global stack trace database
    and adds it if it is not there yet. The TraceCount of the entry that
    was found or added is incremented.

Arguments:

    Trace - captured trace; only Depth and the first Depth elements of
        BackTrace are read here.

    hash - hash value of the trace; selects the bucket
        (hash % NumberOfBuckets) whose chain is searched.

Return Value:

    Index of the matching or newly added entry. Zero if the database lock
    cannot be acquired, if the database cannot be extended, or if an
    exception is raised while searching.

--*/
{
    PSTACK_TRACE_DATABASE DataBase;
    PRTL_STACK_TRACE_ENTRY p, *pp;
    ULONG RequestedSize, DepthSize;
    USHORT ReturnValue;

    //
    // Global stack trace database; the caller has already checked that it
    // is not NULL.
    //

    DataBase = RtlpStackTraceDataBase;

    //
    // Update statistics counters. Since they are used only for reference and do not
    // control decisions we increment them without protection even if this means we may
    // have numbers slightly out of sync.
    //

    DataBase->NumberOfEntriesLookedUp += 1;

    //
    // Lock the global per-process stack trace database.
    //

    if (RtlpAcquireStackTraceDataBase() == NULL) {

        //
        // Fail the log operation if we cannot acquire the lock.
        // This can happen only if there is a dump in progress or we are in
        // an invalid context (process shutdown (Umode) or DPC routine (Kmode).
        //

        return 0;
    }

    try {

        //
        // We will try to find out if the trace has been saved in the past.
        // We find the right hash chain and then traverse it.
        //

        DepthSize = Trace->Depth * sizeof (Trace->BackTrace[0]);

        //
        // The bucket is selected by the `hash' parameter. Entries match
        // when both the depth and the captured addresses are equal.
        //

        pp = &DataBase->Buckets[ hash % DataBase->NumberOfBuckets ];

        while ((p = *pp) != NULL) {

            if (p->Depth == Trace->Depth) {

                if (RtlCompareMemory( &p->BackTrace[ 0 ], &Trace->BackTrace[ 0 ], DepthSize) == DepthSize) {

                    break;
                }
            }

            pp = &p->HashChain;
        }

        if (p == NULL) {

            //
            // If we get here we did not find a similar trace in the database. We need
            // to add it.
            //
            // We got the `*pp' value (address of last chain element) while the
            // database lock was acquired shared so we need to take into consideration
            // the case where another thread managed to acquire database exclusively
            // and add a new trace at the end of the chain. Therefore if `*pp' is no longer
            // null we continue to traverse the chain until we get to the end.
            //

            p = NULL;

            if (*pp != NULL) {

                //
                // Somebody added some traces at the end of the chain while we
                // were trying to convert the lock from shared to exclusive.
                //

                while ((p = *pp) != NULL) {

                    if (p->Depth == Trace->Depth) {

                        if (RtlCompareMemory( &p->BackTrace[ 0 ], &Trace->BackTrace[ 0 ], DepthSize) == DepthSize) {

                            break;
                        }
                    }

                    pp = &p->HashChain;
                }
            }

            if (p == NULL) {

                //
                // Nobody added the trace and now `*pp' really points to the end
                // of the chain either because we traversed the rest of the chain
                // or it was at the end anyway.
                //
                // Only the header fields plus the used part of BackTrace
                // are requested for the new entry.
                //

                RequestedSize = FIELD_OFFSET (RTL_STACK_TRACE_ENTRY, BackTrace) + DepthSize;

                p = RtlpExtendStackTraceDataBase (Trace, RequestedSize);

                if (p != NULL) {

                    //
                    // We added the trace, now chain it as the last element.
                    //

                    *pp = p;
                }
            }
            else {

                //
                // Some other thread managed to add the same trace to the database
                // while we were trying to acquire the lock exclusive. `p' has the
                // address to the stack trace entry.
                //
            }
        }
    }
    except(EXCEPTION_EXECUTE_HANDLER) {

        //
        // We should never get here if the algorithm is correct.
        //

        p = NULL;
    }

    //
    // Release locks and return. At this stage we may return zero (failure)
    // if we did not manage to extend the database with a new trace (e.g. due to
    // out of memory conditions).
    //
    // `p' is non-NULL both when an existing entry matched and when a new
    // entry was just added; in either case count the hit and return its index.
    //

    if (p != NULL) {

        p->TraceCount += 1;

        ReturnValue = p->Index;
    }
    else {

        ReturnValue = 0;
    }

    RtlpReleaseStackTraceDataBase();

    return ReturnValue;
}