用python做含有中文的正则表达式模式匹配

#!/usr/bin/python
# -*- coding: gbk -*-
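'''
spec: split usrdict into two parts according to whether each entry hits the
      1.26M (126W) person-name list
parms:
[IN]  keywords.txt    key (person name) list, one key per line
[IN]  file.txt        input records, tab-separated, 4 fields per line
[OUT] matched.txt     records whose first field begins or ends with a key
                      (the key is appended as a 5th field)
[OUT] notmatched.txt  records that hit no key ("_" is appended as a 5th field)
[OUT] ufuwfoverflow   records whose 3rd or 4th field is <= 0
author: liuyusi0121@sogou-inc.com date 20120808
'''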
import re;
import sys;
def LoadKeys(filename):
    '''
    Load the keys from a file into memory.

    Every line of the file becomes one entry, with leading and trailing
    whitespace (including the line terminator) removed. Blank lines are
    kept as empty strings, so the result has exactly one entry per line.

    filename: path of the key file, one key per line.
    returns:  list of stripped lines, in file order.
    raises:   IOError (OSError on Python 3) if the file cannot be opened
              or read.
    '''
    strip_ws = re.compile(r'^\s+|\s+$')
    # The with-block guarantees the handle is closed even if reading fails;
    # open() replaces the Python-2-only file() builtin.
    with open(filename, "r") as fid:
        return [strip_ws.sub('', line) for line in fid]
def PrintUsage():
    '''
    Print the command-line usage line to stdout and terminate the program.

    returns: never returns.
    raises:  SystemExit with exit status 1, always.
    '''
    print('program [IN] keywords.txt [IN]file.txt [OUT] matched.txt [OUT] notmatched.txt [OUT] ufuwfoverflow')
    # sys.exit rather than the bare exit(): the latter is a convenience name
    # injected by the site module and is missing under "python -S" or in
    # frozen/embedded interpreters.
    sys.exit(1)

def _compile_key_patterns(keys):
    '''
    Compile the anchored pattern "(^key|key$)" once for every key.

    The result is a list parallel to keys, holding (key, pattern) tuples.
    pattern is None for keys that cannot be used: empty keys (their pattern
    "(^|$)" would match every input), keys whose pattern is not valid GBK,
    and keys whose pattern is not a valid regular expression. A one-line
    summary of skipped keys is written to stderr.

    Compiling up front trades memory (one pattern object per key) for not
    recompiling every key on every input line.
    '''
    compiled = []
    skipped = 0
    for key in keys:
        pattern = None
        if key:
            patternstr = "(^" + key + "|" + key + "$)"
            try:
                pattern = re.compile(unicode(patternstr, "gbk"))
            except (UnicodeDecodeError, re.error):
                pattern = None
        if pattern is None:
            skipped += 1
        compiled.append((key, pattern))
    if skipped:
        sys.stderr.write("skipped %d unusable key(s): empty, not valid GBK, "
                         "or not a valid regex\n" % skipped)
    return compiled


def _find_matching_key(useg, key_patterns):
    '''
    Return the first key (in key-file order) whose pattern matches useg,
    i.e. useg begins or ends with the key; return None when no key matches.

    Prints a scan-progress message every 100000 keys.
    '''
    for count, (key, pattern) in enumerate(key_patterns):
        if count % 100000 == 0:
            print('模式已经扫描%d个' % count)
        if pattern is not None and pattern.search(useg):
            return key
    return None


def main(argv):
    '''
    Split the records of the input file by whether their first field hits a key.

    argv: [program, keyfile, inputfile, matched_out, notmatched_out,
          overflow_out]; any other length prints the usage and exits.

    Each non-blank input line is stripped and split on tabs; lines that do
    not have exactly 4 fields are ignored. Records whose 3rd or 4th field is
    <= 0 are copied unchanged to overflow_out. Of the rest, a record whose
    first field begins or ends with a key is written to matched_out with the
    key appended as a 5th field; every other record is written to
    notmatched_out with "_" appended.
    '''
    if len(argv) != 6:
        PrintUsage()
    delim = "\t"
    strip_ws = re.compile(r"(^\s+|\s+$)")

    keys = LoadKeys(argv[1])
    print(len(keys))
    key_patterns = _compile_key_patterns(keys)

    handles = []
    try:
        fout1 = open(argv[3], 'w')
        handles.append(fout1)
        fout2 = open(argv[4], 'w')
        handles.append(fout2)
        fout3 = open(argv[5], 'w')
        handles.append(fout3)
        fid = open(argv[2], "r")
        handles.append(fid)

        linecount = 0
        for raw in fid:
            line = strip_ws.sub('', raw)
            if not line:
                continue
            if linecount % 100000 == 0:
                print('语料已经处理%d行' % linecount)
            # Count every non-blank line; the original only incremented
            # inside the progress branch, so the counter stuck at 1.
            linecount += 1

            linesegs = line.split("\t")
            if len(linesegs) != 4:
                continue
            # int() raises ValueError on non-numeric counts; this is left
            # unhandled so malformed data is not silently dropped.
            if int(linesegs[2]) <= 0 or int(linesegs[3]) <= 0:
                fout3.write(line)
                fout3.write("\n")
                continue

            try:
                useg = unicode(linesegs[0], 'gbk')
            except UnicodeDecodeError:
                # A first field that is not valid GBK cannot hit any key;
                # it is routed to the not-matched output.
                hit = None
            else:
                hit = _find_matching_key(useg, key_patterns)

            if hit is not None:
                print(line)
                linesegs.append(hit)
                fout1.write(delim.join(linesegs))
                fout1.write("\n")
            else:
                linesegs.append("_")
                fout2.write(delim.join(linesegs))
                fout2.write("\n")
    finally:
        # Close whatever was opened, even when processing fails part-way.
        for handle in handles:
            handle.close()


if __name__ == "__main__":
    main(sys.argv)
原文地址:https://www.cnblogs.com/finallyliuyu/p/2629020.html