# GbFileToUTF8File - convert GB2312/GBK encoded text files in a folder tree to UTF-8

import os
import shutil
import re
import sys

# Raw strings are required here: in a normal literal "\b" is a backspace and
# "\a" is a bell character, which silently corrupts these Windows paths.
src = r"S:\date\before"       # folder to convert; nested sub-folders are supported
ddn = r"S:\date\after"        # folder that receives the converted files

def ReadFileandSave(filepath):
    """Print the position of the marker string in each of the first 100 lines.

    For every one of the first 100 lines of *filepath*, the result of
    ``line.find(marker)`` is printed: the index at which the marker text
    (Chinese for "file serial number") starts, or ``-1`` when the line does
    not contain it. Nothing is written to disk and nothing is returned.

    Args:
        filepath: Path of the text file to scan. It is opened with the
            default text mode and encoding.
    """
    max_lines = 100
    # ``with`` guarantees the handle is closed, even if reading fails.
    with open(filepath) as fh:
        for count, line in enumerate(fh, 1):
            if count > max_lines:
                # Only the first 100 lines are reported, so stop reading
                # instead of consuming the rest of the file.
                break
            print(line.find("文件序号"))

# Detect the character encoding of a file.
def obtainFileType(filepath):
    """Guess the character encoding of a file from its first line.

    Only the first line (read as raw bytes) is handed to ``chardet.detect``,
    so the guess reflects that line alone.

    Args:
        filepath: Path of the file to inspect.

    Returns:
        The ``'encoding'`` entry of the dictionary returned by
        ``chardet.detect``. NOTE(review): per the chardet documentation this
        can be ``None`` when no encoding is recognised - confirm callers
        cope with that.
    """
    import chardet  # third-party package, imported only when this function runs

    # ``with`` closes the handle even if reading or detection raises.
    with open(filepath, 'rb') as tt:
        # read(5) was also found to work here, but readlines() raised an
        # error (it returns a list of lines rather than one byte string).
        ff = tt.readline()
    return chardet.detect(ff)['encoding']

#####################################
def search(src, handler):
    """Walk *src* recursively and call *handler* for every file found.

    Args:
        src: Directory to scan. Sub-directories are descended into.
        handler: Callable invoked as ``handler(file_path, ddn)`` for each
            entry that is not a directory, where ``ddn`` is the
            module-level destination root.
    """
    for entry in os.listdir(src):
        # os.path.join inserts the correct separator for the platform.
        cf = os.path.join(src, entry)
        if os.path.isdir(cf):
            # Directory: keep searching inside it.
            search(cf, handler)
        else:
            # Regular entry: hand it to the file-processing callback.
            handler(cf, ddn)
##########################################################

def copy(sfn, ddn):
    """Copy one file into the destination tree, converting ``.txt`` files to UTF-8.

    The file's directory relative to the module-level ``src`` root is
    recreated under *ddn* (and created on disk if missing). Then:

    * Names that do not contain ``".txt"`` are copied unchanged.
    * ``.txt`` files detected as UTF-8 (with or without BOM) are copied
      unchanged.
    * ``.txt`` files detected as GB2312 have ``charset=gb2312`` rewritten to
      ``charset=utf-8``, are decoded as GBK, and are written as UTF-8 to a
      file named ``utf8_<original name>``; the output path is printed.
    * Any other ``.txt`` file is decoded as GBK and written as UTF-8 under
      its original name.

    Args:
        sfn: Path of the source file; expected to live under ``src``.
        ddn: Root of the destination tree.

    Raises:
        SystemExit: If reading, decoding or writing a ``.txt`` file fails.
            The failing path and the error are reported on stderr.
    """
    # Mirror the file's location below ``src`` inside the destination root.
    rel_dir = os.path.relpath(os.path.dirname(sfn), src)
    ddn = os.path.normpath(os.path.join(ddn, rel_dir))
    if not os.path.isdir(ddn):
        # Destination folder does not exist yet, so create it.
        os.makedirs(ddn)

    fn = os.path.basename(sfn)
    print("Processing file name  %s" % fn)

    if ".txt" not in fn:
        # Not a text file: copy it as-is.
        shutil.copy2(sfn, ddn)
        return

    try:
        # Detect once; normalise case because the reported spelling varies,
        # and tolerate a ``None`` result by treating it as "unknown".
        enc = (obtainFileType(sfn) or "").upper()

        if enc in ("UTF-8", "UTF-8-SIG"):
            # Already UTF-8: decoding it as GBK would corrupt it.
            shutil.copy2(sfn, ddn)
            return

        # Work on raw bytes so the code behaves the same on Python 2 and 3
        # and line endings are preserved exactly.
        with open(sfn, "rb") as sf:
            raw = sf.read()

        if enc == "GB2312":
            # Web-page style content: update the declared charset as well.
            raw = raw.replace(b"charset=gb2312", b"charset=utf-8")
            target = os.path.join(ddn, "utf8_" + fn)
        else:
            # Unknown encoding: treat it as GBK and convert to UTF-8.
            target = os.path.join(ddn, fn)

        text = raw.decode("gbk")
        with open(target, "wb") as df:
            df.write(text.encode("utf-8"))

        if enc == "GB2312":
            print(target)
    except (UnicodeError, EnvironmentError) as err:
        # Abort the whole run as before, but say why and exit non-zero.
        sys.exit("Failed to convert %s: %s" % (sfn, err))


# Script entry point: walk the ``src`` tree and pass every file found to
# ``copy``, which writes it into the destination tree.
if __name__ == "__main__":
    search(src, copy)




    
# Original article: https://www.cnblogs.com/hgonlywj/p/4842681.html