声明

本文将借助正则表达式，采用Python2.7编写脚本，自动对C代码工程中的函数添加调试跟踪语句。
正则表达式的基础知识可参考《Python正则表达式指南》一文，本文将不再赘述。
本文同时也发布于作业部落。

一. 问题提出

作者负责的模块包含数十万行C代码，调用关系复杂。若能对关键函数添加调试跟踪语句，运行时输出当前文件名、函数名、行号等信息，会有助于维护者和新手更好地掌握模块流程。

考虑到代码规模，手工添加跟踪语句不太现实。因此，作者决定编写脚本自动实现这一功能。为保证不影响代码行号信息，跟踪语句与函数左花括号{位于同一行。

二. 代码实现

2.1 函数匹配测试

匹配C函数时，主要依赖函数头的定义模式，即返回类型、函数名、参数列表及相邻的左花括号{。注意，返回类型和参数列表可以为空，{可能与函数头同行或位于其后某行。

函数头模式的组合较为复杂。为正确匹配C函数，首先列举疑似函数头的典型语句，作为测试用例：

# Sample lines that resemble C function headers; test input for ParseFuncHeader().
funcList = [
    # Function definitions (opening brace on the same line, or not present yet).
    ' static unsigned int test(int a, int b) { ',
    'INT32U* test (int *a, char b[]/*names*/) ',
    ' void test()',
    # Macro definition and control-flow statements.
    '#define MACRO(m) {m=5;}',
    'while(bFlag) {',
    ' else if(a!=1||!b)',
    'if(IsTimeOut()){',
    ' else if(bFlag)',
    # Prototype terminated by a semicolon.
    'static void test(void);',
]

然后，构造恰当的正则表达式，以匹配funcList中的C函数：

import sys, os, re
def ParseFuncHeader():
    """Print the regex groups of every funcList entry that matches the header pattern.

    For each matching entry two lines are printed: the entry index with the
    tuple of captured groups, then the dictionary of named groups
    (Ret, Func, Args).  Entries that do not match are silently skipped.
    """
    # Verbose-mode pattern: return type, function name, parenthesised argument
    # list, then an optional '{' before the end of the line.  The argument
    # class whitelists commas, comment characters, '*', square brackets, dots,
    # whitespace and word characters.
    regPattern = re.compile(r'''(?P<Ret>[\w\s]+[\w*]+)           #return type
                                \s+(?P<Func>\w+)                 #function name
                                \s*\((?P<Args>[,/*\[\].\s\w]*)\) #args
                                \s*({?)\s*$''', re.X)

    for i, funcHeader in enumerate(funcList):
        regMatch = regPattern.match(funcHeader)
        if regMatch is not None:
            # Single-argument parenthesised print: same output under Python 2.7.
            print('[%d] %s' % (i, regMatch.groups()))
            print('     %s' % regMatch.groupdict())

为简化正则表达式，未区分函数返回类型的限定符(如const, static等)和关键字(如int, INT32U等)。若有需要，可在初步匹配后进一步细化子串的匹配。注意，args项也可使用排除字符集，如排除各种运算符[^<>&|=)]。但C语言运算符繁多，故此处选择匹配函数参数中的合法字符，即逗号、注释符、指针运算符、数组括号、空白和单词字符等。

执行ParseFuncHeader()函数后，运行结果如下：

[0] (' static unsigned int', 'test', 'int a, int b', '{')
     {'Args': 'int a, int b', 'Ret': ' static unsigned int', 'Func': 'test'}
[1] ('INT32U*', 'test', 'int *a, char b[]/*names*/', '')
     {'Args': 'int *a, char b[]/*names*/', 'Ret': 'INT32U*', 'Func': 'test'}
[2] (' void', 'test', '', '')
     {'Args': '', 'Ret': ' void', 'Func': 'test'}
[7] (' else', 'if', 'bFlag', '')
     {'Args': 'bFlag', 'Ret': ' else', 'Func': 'if'}

可见，除正确识别合法的函数头外，还误将 else if(bFlag)识别为函数头。要排除这种组合，可修改上述正则表达式，或在匹配后检查Func分组是否包含if子串。

2.2 插入跟踪语句

构造出匹配C函数头的正则表达式后，稍加修改即可用于实际工程中函数的匹配。

由于工程中C函数众多，尤其是短小的函数常被频繁调用，因此需控制待处理的函数体规模。亦即，仅对超过特定行数的函数插入跟踪语句。这就要求统计函数行数，思路是从函数头开始，向下查找函数体末尾的}。具体实现如下：

#查找复合语句的一对花括号{}，返回右花括号所在行号
def FindCurlyBracePair(lineList, startLineNo):
    """Find the line on which the outermost brace pair opened from startLineNo closes.

    Every '{' and '}' character is counted, including several on one line and
    any that appear inside comments or string literals.

    Args:
        lineList: list of source lines.
        startLineNo: 0-based index of the line at which scanning starts.

    Returns:
        The 0-based index of the first line at which the running totals of
        '{' and '}' are equal and non-zero.  If that never happens,
        startLineNo + 1 is returned as a fallback end line.
    """
    leftBraceNum = 0
    rightBraceNum = 0

    for i in range(startLineNo, len(lineList)):
        line = lineList[i]
        # Count every brace: a line such as "}}" closes two scopes at once.
        leftBraceNum += line.count('{')
        rightBraceNum += line.count('}')
        # Equal, non-zero totals mean the outermost pair is closed.
        if leftBraceNum != 0 and leftBraceNum == rightBraceNum:
            return i

    # No matching pair found: treat the line after the start as the end.
    return startLineNo + 1

接下来是本文的重头戏，即匹配当前文件中满足行数条件的函数，并为其插入跟踪语句。代码如下：

# Functions spanning fewer lines than this are not instrumented.
FUNC_MIN_LINE = 10

# Running totals updated by AddFuncTrace() and reported by WalkDir().
totalFileNum = 0
totalFuncNum = 0
procFileNum = 0
procFuncNum = 0
def AddFuncTrace(dir, file):
    """Insert a printf() trace statement into the long functions of one C file.

    The file is read, modified in memory and written back in place.  The
    trace statement is placed directly after the opening '{' of a function,
    on the same line, so the line numbering of the file does not change.

    Module-level counters are updated: totalFileNum for every call,
    procFileNum for every '.c' file that could be read, totalFuncNum for
    every recognised function header and procFuncNum for every function
    that actually received a trace statement.

    Args:
        dir: directory containing the file.
        file: file name; it is also the name printed by the trace statement.
    """
    global totalFileNum, totalFuncNum, procFileNum, procFuncNum
    totalFileNum += 1

    filePath = os.path.join(dir, file)

    # Only C source files are handled.
    if os.path.splitext(filePath)[1] != '.c':
        return

    try:
        fileObj = open(filePath, 'r')
    except IOError:
        print('Cannot open file (%s) for reading!' % filePath)
        return  # nothing was read, so there is nothing to process
    with fileObj:
        lineList = fileObj.readlines()

    procFileNum += 1

    # Patterns are compiled once instead of on every loop iteration.
    commentPattern = re.compile(r'^.*/(?:/|\*)+.*?(?:/\*)*\s*$')
    headerPattern = re.compile(r'''^\s*(\w+\s*[\w*]+)    #return type
                                   \s+(\w+)              #function name
                                   \s*\([,/*\[\].\s\w]*  #partial args
                                   \)?[^;]*$''', re.X)
    bracePattern = re.compile(r'^(.*?){(.*)$')
    # Control-flow keywords that the header pattern can mistake for a function
    # name, e.g. "else if(bFlag)".  Exact comparison keeps functions such as
    # notify() eligible.
    pseudoFuncNames = ('if', 'while', 'for', 'switch')

    lineNo = 0
    while lineNo < len(lineList):
        curLine = lineList[lineNo]
        # Skip lines that contain a comment marker ("//" or "/*") or no '{'.
        if commentPattern.match(curLine) is not None or '{' not in curLine:
            lineNo += 1
            continue

        # The function header is taken to be the nearest line, at or above
        # the '{' line, that contains a left parenthesis.
        funcStartLine = lineNo
        while funcStartLine >= 0 and '(' not in lineList[funcStartLine]:
            funcStartLine -= 1
        if funcStartLine < 0:
            # No '(' above this brace: it cannot belong to a function.
            lineNo += 1
            continue

        regMatch = headerPattern.match(lineList[funcStartLine])
        if regMatch is None or regMatch.group(2) in pseudoFuncNames:
            lineNo += 1
            continue

        totalFuncNum += 1
        funcName = regMatch.group(2)
        funcEndLine = FindCurlyBracePair(lineList, funcStartLine)
        # The brace search may report a line at or above lineNo (unbalanced
        # braces, or braces between the header and this line); never move
        # backwards, otherwise the same line would be visited forever.
        nextLineNo = max(funcEndLine, lineNo) + 1

        # Skip functions shorter than FUNC_MIN_LINE lines.
        if (funcEndLine - funcStartLine) < FUNC_MIN_LINE:
            lineNo = nextLineNo
            continue

        # Split at the first '{' (the function's opening brace); any code
        # following it on the same line is kept after the trace statement.
        braceMatch = bracePattern.match(curLine)
        lineList[lineNo] = '%s{printf("%s() at %s, %s.\\n"); %s\n' \
            % (braceMatch.group(1), funcName, file, lineNo + 1, braceMatch.group(2))
        print('-[%d] %s' % (lineNo + 1, lineList[lineNo]))
        procFuncNum += 1
        lineNo = nextLineNo

    try:
        fileObj = open(filePath, 'w')
    except IOError:
        print('Cannot open file (%s) for writing!' % filePath)
        return
    with fileObj:
        fileObj.writelines(lineList)

因为实际工程中函数头模式更加复杂，AddFuncTrace()内匹配函数时所用的正则表达式与ParseFuncHeader()略有不同。正确识别函数头并添加跟踪语句后，会直接跳至函数体外继续向下处理。但未识别出函数头时，正则表达式可能会错误匹配函数体内"else if(bFlag)"之类的语句，因此需要防护这种情况。

注意，目前添加的跟踪语句形如printf("func() at file.c, line.\n")。读者可根据需要自行定制跟踪语句，如添加打印开关。

因为源代码文件可能以嵌套目录组织，还需遍历目录以访问所有文件：

def ValidateDir(dirPath):
    """Return dirPath if it names an existing directory, otherwise ''.

    A diagnostic line is printed when the path does not exist or is not a
    directory.

    Args:
        dirPath: path to check.

    Returns:
        dirPath unchanged when it is an existing directory, else ''.
    """
    # The path must exist ...
    if not os.path.exists(dirPath):
        print(dirPath + ' is non-existent!')
        return ''

    # ... and must be a directory rather than a file.
    if not os.path.isdir(dirPath):
        print(dirPath + ' is not a directory!')
        return ''
    return dirPath

def WalkDir(dirPath):
    """Run AddFuncTrace() on every file below dirPath and print a summary.

    Nothing is done when ValidateDir() rejects dirPath.  After the walk, one
    summary line reports processed/total functions and processed/total files
    taken from the module-level counters.

    Args:
        dirPath: root directory of the source tree.
    """
    dirPath = ValidateDir(dirPath)
    if not dirPath:
        return

    # Visit the files of the directory and of all its sub-directories.
    for root, dirs, files in os.walk(dirPath):
        for file in files:
            AddFuncTrace(root, file)

    # Parenthesised expression: no backslash continuation is needed.
    print('############## %d/%d functions in %d/%d files processed##############'
          % (procFuncNum, totalFuncNum, procFileNum, totalFileNum))

最后，添加可有可无的命令行及帮助信息：

# Help text printed for the --help option.
usage = (
    'Usage:\n'
    '  AddFuncTrace(.py) [options] [minFunc] [codePath]\n'
    '  This program adds trace code to functions in source code.\n'
    '  Options include:\n'
    '      --version : show the version number\n'
    '      --help    : show this help\n'
    '  Default minFunc is 10, specifying that only functions with\n'
    '  more than 10 lines will be processed.\n'
    '  Default codePath is the current working directory.'
)

if __name__ == '__main__':
    # No arguments besides the script name: process the current working
    # directory with the default FUNC_MIN_LINE.
    if len(sys.argv) == 1:
        WalkDir(os.getcwd())
        sys.exit()

    # "--" options only print information and exit.
    if sys.argv[1].startswith('--'):
        option = sys.argv[1][2:]
        if option == 'version':
            print('Version 1.0 by xywang')
        elif option == 'help':
            print(usage)
        else:
            print('Unknown Option.')
        sys.exit()

    # The first positional argument is minFunc; report a bad value instead of
    # terminating with a ValueError traceback.
    try:
        FUNC_MIN_LINE = int(sys.argv[1])
    except ValueError:
        print('Invalid minFunc (%s): an integer is required.' % sys.argv[1])
        print(usage)
        sys.exit(1)

    # The optional second positional argument is codePath.
    if len(sys.argv) >= 3:
        WalkDir(os.path.abspath(sys.argv[2]))
    else:
        WalkDir(os.getcwd())
    sys.exit()

上述命令行参数解析比较简陋，也可参考《Python实现Linux命令xxd -i功能》一文中的optionparser解析模块。

三. 效果验证

为验证上节的代码实现，建立test调试目录。该目录下包含test.c及两个文本文件。其中，test.c内容如下：

#include <stdio.h>
/* {{{ Local definitions/variables */

unsigned int test0(int a, int b){
    int a0; int b0;}

 unsigned int test1 (int a, int b) {
    int a1; int b1;
    a1 = 1;
    b1 = 2;}

  int test2 (int a, int b)
{
    int a2; int b2;
    a2 = 1;
    b2 = 2;
}


/* {{{ test3 */
int test3(int a,
          int b)
{   int a3 = 1; int b3 = 2;
    if(a3)
    {
        a3 = 0;
    }
    else if(b3) {
        b3 = 0;
    }
}
/* }}} */

static void test4(A *aaa,
       B bbb,
       C ccc[]
       ) {
    int a4; int b4;
}

static void test5(void);
struct T5 {
  int t5;
};

考虑到上述函数较短，故指定函数最短行数为1，运行AddFuncTrace.py：

E:\PyTest>python AddFuncTrace.py 1 test
-[4] unsigned int test0(int a, int b){printf("test0() at test.c, 4.\n");

-[7]  unsigned int test1 (int a, int b) {printf("test1() at test.c, 7.\n");

-[13] {printf("test2() at test.c, 13.\n");

-[23] {printf("test3() at test.c, 23.\n");    int a3 = 1; int b3 = 2;

-[37]        ) {printf("test4() at test.c, 37.\n");

############## 5/5 functions in 1/3 files processed##############

查看test.c文件内容如下：

#include <stdio.h>
/* {{{ Local definitions/variables */

unsigned int test0(int a, int b){printf("test0() at test.c, 4.\n"); 
    int a0; int b0;}

 unsigned int test1 (int a, int b) {printf("test1() at test.c, 7.\n"); 
    int a1; int b1;
    a1 = 1;
    b1 = 2;}

  int test2 (int a, int b)
{printf("test2() at test.c, 13.\n"); 
    int a2; int b2;
    a2 = 1;
    b2 = 2;
}


/* {{{ test3 */
int test3(int a,
          int b)
{printf("test3() at test.c, 23.\n");    int a3 = 1; int b3 = 2;
    if(a3)
    {
        a3 = 0;
    }
    else if(b3) {
        b3 = 0;
    }
}
/* }}} */

static void test4(A *aaa,
       B bbb,
       C ccc[]
       ) {printf("test4() at test.c, 37.\n"); 
    int a4; int b4;
}

static void test5(void);
struct T5 {
  int t5;
};

可见，跟踪语句的插入完全符合期望。

接着，在实际工程中运行python AddFuncTrace.py 50，截取部分运行输出如下：

-[1619] {printf("bcmGetQuietCrossTalk() at bcm_api.c, 1619.\n"); 

-[1244] {printf("afeAddressExist() at bcm_hmiLineMsg.c, 1244.\n"); 

-[1300] {printf("afeAddressMask() at bcm_hmiLineMsg.c, 1300.\n"); 

-[479] uint32 stpApiCall(uint8 *payload, uint32 payloadSize, uint32 *size) {printf("stpApiCall() at bcm_stpApi.c, 479.\n"); 

############## 291/1387 functions in 99/102 files processed##############

查看处理后的C函数，插入效果也符合期望。