# 使用Python提取中文字符

#功能:国际化测试,用于提取应用设计包中的中文字符,并输出report
#解压---筛选---整理路径---提取中文---输出报告

################################################################# 
#author: 陈月白 
#_blogs: http://www.cnblogs.com/chenyuebai/ 
#################################################################

#功能:国际化测试,用于提取应用设计包中中文字符,并输出report
#yuebai 20160328
#解压---筛选---整理路径---提取中文---输出报告

#-*- coding: utf-8 -*-

import os
import shutil
import sys
import re
import zipfile
import glob

# Directory that holds the zip packages to be checked (also receives run.log).
workPath = r"C:\users\yuebai\Desktop\国际化测试包"
# Directory that receives the generated report files.
reportPath = r"C:\users\yuebai\Desktop\国际化输出报告"

#Logging helper
def logInfo(info):
    """Append one ``[Info]`` line to ``run.log`` inside ``workPath``.

    Args:
        info: Message to record; it is interpolated with ``%s``.
    """
    # The context manager closes the handle even when the write fails.
    # An explicit UTF-8 encoding keeps the Chinese log messages writable
    # regardless of the operating-system locale.
    with open("%s\\run.log"%workPath,"a",encoding="utf-8") as log:
        log.write("[Info] %s\n"%info)
    
#Unzip helper
def extractZip(fileName,extraPath):
    """Extract every member of a zip archive into a directory, then log it.

    Args:
        fileName: Path of the zip archive to read.
        extraPath: Directory that receives the extracted members.
    """
    # ``with`` guarantees the archive handle is released even if
    # extractall() raises part-way through.
    with zipfile.ZipFile(fileName,"r") as archive:
        archive.extractall(extraPath)
    logInfo("%s文件解压完成"%fileName)

#Worker: find the lines of a file that contain Chinese characters
def getChinese(fileFullPath):
    """Append every line of a file that contains Chinese text to the report.

    For each matching line two entries are appended to ``chineseTxt.txt``
    under ``reportPath``: a marker line carrying the file path, then the
    line itself. A log entry is written once the file has been scanned.

    Args:
        fileFullPath: Path of the file to scan; it is read as UTF-8.
    """
    # One or more characters in the U+4E00..U+9FA5 range.
    hasChinese = re.compile("[\u4e00-\u9fa5]+")
    reportFile = "%s\\chineseTxt.txt"%reportPath

    # Both files stay open for the whole scan and are closed exactly once,
    # whether or not any line matches. Closing them inside the loop made the
    # second matching line fail with "I/O operation on closed file".
    # The report keeps the platform default encoding because the script
    # re-reads chineseTxt.txt later without naming an encoding.
    with open(fileFullPath,"r",encoding="UTF-8") as src, open(reportFile,"a") as report:
        for line in src:
            if hasChinese.search(line):
                report.write("发现中文字符(╯' - ')╯︵ ┻━┻ ,文件路径为%s\n"%fileFullPath)
                report.write("%s\n"%line)

    logInfo("查找完成,输出报告路径:%s"%reportPath)

#Worker: list every entry directly under a path
def listAny(workPath):
    """Return the names of the entries directly inside ``workPath``.

    Args:
        workPath: Directory to list.

    Returns:
        The list produced by ``os.listdir``, or an empty list when the
        path does not exist (an error message is printed in that case).
    """
    if not os.path.exists(workPath):
        print("Error,no such directory %s,plz check"%workPath)
        # Return early: falling through to os.listdir() would raise
        # FileNotFoundError right after reporting the problem.
        return []
    return os.listdir(workPath)

#Delete every file in the work directory that is not a zip package
# The extension is tested explicitly: the former glob "*[!p]" only looked at
# the last character, so it kept files such as "x.bmp" and deleted "x.ZIP".
# Directories are skipped because os.remove() cannot delete them.
notZipList = [
    entry for entry in glob.glob("%s\\*"%workPath)
    if os.path.isfile(entry) and not entry.lower().endswith(".zip")
]
logInfo("notZipList =%s,prepare to delete"%notZipList)
for i in notZipList:
    os.remove(i)
logInfo("删除非zip包完成")

#Collect the zip packages to process
# Only regular *.zip files are kept: run.log (created by logInfo) and any
# other stray entry in the work directory would make zipfile fail.
zipList = [
    name for name in listAny(workPath)
    if name.lower().endswith(".zip") and os.path.isfile(os.path.join(workPath,name))
]

#Extract each package, then scan the files under its Plans directory
for zipPackage in zipList:
    # splitext() drops the ".zip" suffix; os.path.split(...)[0] returned the
    # (empty) directory part, which made the target equal to workPath itself.
    zipName = os.path.splitext(zipPackage)[0]
    extraPath = os.path.join(workPath,zipName)      # extraction target: a folder named after the package
    os.makedirs(extraPath,exist_ok=True)
    logInfo("构造解压路径完成,extraPath =%s"%extraPath)

    extraFilePath = os.path.join(workPath,zipPackage)   # absolute path of the archive

    # Extract the archive, then delete the source zip file
    extractZip(extraFilePath, extraPath)
    os.remove(extraFilePath)

    # Bound for every package, so the scan below never reuses the previous
    # package's path or hits an undefined name when Extend is absent.
    tmpPlanPath = ("%s\\Plans"%extraPath)
    tmpExtendPath = ("%s\\Plans\\Extend"%extraPath)

    # Gather the flow files from the Extend sub-directory into Plans
    if os.path.exists(tmpExtendPath):
        for t in os.listdir(tmpExtendPath):
            # No trailing separator: with one, isfile() is never true.
            t_FullPath = os.path.join(tmpExtendPath,t)
            if os.path.isfile(t_FullPath):
                shutil.move(t_FullPath,tmpPlanPath)

    if not os.path.isdir(tmpPlanPath):
        logInfo("Plans directory not found,skip %s"%extraPath)
        continue

    # Run getChinese on every file under Plans
    for y in os.listdir(tmpPlanPath):
        y_fullPath = os.path.join(tmpPlanPath,y)
        if not os.path.isfile(y_fullPath):
            # Sub-directories (e.g. the emptied Extend folder) cannot be opened as text.
            continue

        logInfo("开始检查文件%s,检查结果路径:%s"%(y_fullPath,reportPath))
        getChinese(y_fullPath)
        
#Separate comment lines from the real findings
# chineseTxt.txt holds every collected line; lines that start with "//" are
# appended to comment.txt and all other lines to result.txt.
# NOTE(review): only lines with "//" in column 0 count as comments, so an
# indented "//" comment lands in result.txt - confirm this is intended.
chineseTxtPath = "%s\\chineseTxt.txt"%reportPath

# The report file only exists once getChinese() has run at least once, so a
# run that scanned nothing must not crash here.
if os.path.exists(chineseTxtPath):
    # The context manager closes all three handles even if a write fails.
    with open(chineseTxtPath,"r") as f_in, \
         open("%s\\comment.txt"%reportPath,"a") as f_comment, \
         open("%s\\result.txt"%reportPath,"a") as f_result:
        for line in f_in:
            if line.startswith("//"):
                f_comment.write("%s\n"%line)
            else:
                f_result.write("%s\n"%line)

print("提取完成,结果路径:%s"%reportPath)
 
 
 

 
# 原文地址:https://www.cnblogs.com/chenyuebai/p/5384185.html