python 自定义 遍历文件夹下所有文件

  1. 两种方式返回的列表顺序不一样,内容一样
import os

# Method 1 (recursive: a subdirectory is fully expanded at the point where it
# appears in its parent's listing)
def get_process_files(root_dir):
    """Return the absolute paths of all regular files under *root_dir*.

    The directory is listed with ``os.listdir``; regular files are appended
    as they are met and each subdirectory is recursed into immediately, so
    its files land in the result at the subdirectory's position in the
    listing.

    Args:
        root_dir: Directory to scan. It is made absolute with
            ``os.path.abspath`` first, so every returned path is absolute.

    Returns:
        A list of file paths (possibly empty). Entries that are neither a
        regular file nor a directory are skipped.

    Raises:
        OSError: Propagated from ``os.listdir`` when *root_dir* cannot be
            listed (missing, not a directory, no permission).
    """
    cur_dir = os.path.abspath(root_dir)
    process_list = []
    for name in os.listdir(cur_dir):
        # os.path.join picks the right separator for the platform instead of
        # a hard-coded backslash.
        fullfile = os.path.join(cur_dir, name)
        if os.path.isfile(fullfile):
            process_list.append(fullfile)
        elif os.path.isdir(fullfile):
            process_list.extend(get_process_files(fullfile))
    return process_list

# Demo call; the raw string keeps the backslash in the Windows path literal.
print(get_process_files(r'C:\test'))


# Method 2 (all files of one directory are returned before the next
# directory is visited)
def get_process_files(path):
    """Collect the paths of every file found by ``os.walk(path)``.

    Each result is ``os.path.join(folder, name)`` where *folder* is the
    directory reported by ``os.walk`` and *name* one of its file names, in
    the order ``os.walk`` yields them.

    Args:
        path: Directory to walk.

    Returns:
        A list of file paths; empty when the walk yields no files.
    """
    # To keep only some files, add a condition such as
    # ``if '.txt' in name`` to the comprehension.
    return [
        os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(path)
        for name in names
    ]

# Demo call; the raw string keeps the backslash in the Windows path literal.
print(get_process_files(r'C:\test'))

  2. 查找文件内容工具(txt,doc,pdf)

import os
import re
import sys
from docx import Document
import PyPDF2



# File extensions that make a file eligible for searching.
FileTypeList = [
    ".xml",
    ".tpl",
    ".txt",
    ".doc",
    ".docx",
    ".pdf",
]
# Substrings a file name must contain to be searched; an empty list means
# "search every file". Example: FileNameFilterList = ['1.txt']
FileNameFilterList = []
# Paths of the files waiting to be searched.
fileNameList = []


# Does the folder exist?
def isFolderExist(dir):
    """Return True only when *dir* is an existing directory.

    ``os.path.isdir`` is used rather than ``os.path.exists`` so that a path
    pointing at a regular file is rejected instead of being reported as a
    folder.

    Args:
        dir: Path to check.

    Returns:
        True if *dir* exists and is a directory, otherwise False.
    """
    return os.path.isdir(dir)


# Search restricted to given file names
def isFileNameContainStr(fileNameFilterStrList, filename):
    """Tell whether *filename* passes the file-name filter.

    Args:
        fileNameFilterStrList: Substrings to look for; when it is empty no
            filtering is applied and every name passes.
        filename: File name to test.

    Returns:
        True if the filter list is empty or at least one of its entries is a
        substring of *filename*, otherwise False.
    """
    filter_is_empty = len(fileNameFilterStrList) == 0
    return filter_is_empty or any(
        fragment in filename for fragment in fileNameFilterStrList
    )


# Restrict to given file types
def isFileNameContainType(typeList, filename):
    """Tell whether the extension of *filename* is one of *typeList*.

    The comparison ignores case, so ``NOTES.TXT`` matches ``".txt"``.

    Args:
        typeList: Extensions including the leading dot (e.g. ``".txt"``);
            when it is empty every file name passes.
        filename: File name (or path) to test.

    Returns:
        True if *typeList* is empty or the extension of *filename* equals one
        of its entries, ignoring case; otherwise False.
    """
    if len(typeList) == 0:  # nothing specified: accept everything
        return True
    # The extension does not depend on the loop variable, so split it once.
    extension = os.path.splitext(filename)[1].lower()
    return any(extension == wanted.lower() for wanted in typeList)


# Return the file type
def file_type(filename):
    """Return the extension of *filename* in upper case.

    The extension is whatever ``os.path.splitext`` reports (leading dot
    included), e.g. ``'.TXT'``; it is the empty string when there is none.
    """
    _stem, extension = os.path.splitext(filename)
    return extension.upper()


# Build the list of files to search, keeping those that pass the filters
def listFile(path, fileNameFilterList, typeList):
    """Recursively collect matching files under *path* into ``fileNameList``.

    Every regular file whose name passes both ``isFileNameContainStr`` and
    ``isFileNameContainType`` has its full path appended to the module-level
    ``fileNameList``; subdirectories are processed recursively with the same
    filters.

    Args:
        path: Directory to scan.
        fileNameFilterList: File-name substrings forwarded to
            ``isFileNameContainStr``.
        typeList: Extensions forwarded to ``isFileNameContainType``.

    Returns:
        False when ``isFolderExist(path)`` is false (nothing is collected),
        otherwise True.
    """
    if not isFolderExist(path):
        return False
    for filename in os.listdir(path):
        # os.path.join instead of a hard-coded backslash separator.
        fullpath = os.path.join(path, filename)
        if os.path.isdir(fullpath):
            # Forward the caller's filter rather than the module-level
            # default, so nested folders are filtered the same way.
            listFile(fullpath, fileNameFilterList, typeList)
        elif os.path.isfile(fullpath):
            if (isFileNameContainStr(fileNameFilterList, filename)
                    and isFileNameContainType(typeList, filename)):
                fileNameList.append(fullpath)
    return True



# FindStrList        = ['test2','t2']            # 文件内容匹配列表
# # 搜索txt
# def findFromFile(filename, strlist):
#     file = open(filename)
#     count = 0
#     for line in file:
#         #if '$' in line:
#         #    continue
#         count = count+1
#         isContained = True
#         for str in strlist:
#             if str not in line:
#                 isContained = False
#                 break
#         if isContained == True:
#             print (f'{filename}; line:{count},{line}')
#     file.close()

# # 搜索过滤后 fileNameList 列表
# def findFromDir(strlist):
#     for name in fileNameList:
#         findFromFile(name, strlist)


# Search a text file (case-sensitive)
def find_txt(filename, str):
    """Print every line of the text file *filename* that contains *str*.

    The file is opened in text mode without an explicit encoding, so the
    platform default applies. Each hit is printed as
    ``<filename>; line:<n> (<line>)`` with a 1-based line number and the line
    stripped of surrounding whitespace.

    Args:
        filename: Path of the file to search.
        str: Substring to look for; matching is case-sensitive.

    Raises:
        OSError: If the file cannot be opened.
    """
    # ``with`` closes the file even if printing fails; errors="replace"
    # keeps one undecodable byte from aborting the whole search.
    with open(filename, errors="replace") as file:
        for count, line in enumerate(file, start=1):
            if str in line:
                line = line.strip()
                print (f'{filename}; line:{count} ({line})')


# Search a Word document (case-sensitive)
def find_doc(filename, str):
    """Print every paragraph of the document *filename* that contains *str*.

    The file is opened with ``Document`` and the text of all paragraphs is
    collected first. Each hit is printed as ``<filename>; line:<n> (<text>)``
    where ``n`` is the 1-based position of the paragraph and the text is
    printed as stored (not stripped).

    Args:
        filename: Path of the document to search.
        str: Substring to look for; matching is case-sensitive.
    """
    texts = [paragraph.text for paragraph in Document(filename).paragraphs]
    for count, line in enumerate(texts, start=1):
        if str in line:
            print(f'{filename}; line:{count} ({line})')

# Search a PDF
def find_pdf(filename, str):
    """Print every extracted text line of the PDF *filename* containing *str*.

    Each page's text is extracted once and split on newlines. Each hit is
    printed as ``<filename>; page:<p>,line:<i> (<line>)`` where ``p`` is the
    1-based page number and ``i`` the 0-based index of the line within that
    page's extracted text.

    Args:
        filename: Path of the PDF to search.
        str: Substring to look for; matching is case-sensitive.

    Raises:
        OSError: If the file cannot be opened.
    """
    # NOTE(review): PdfFileReader / numPages / getPage / extractText are the
    # legacy PyPDF2 names; confirm they exist in the installed release.
    # ``with`` guarantees the handle is closed once the search is finished.
    with open(filename, 'rb') as file:
        fileReader = PyPDF2.PdfFileReader(file)
        for i in range(fileReader.numPages):
            # Extract once per page; extraction is the expensive step.
            contents = fileReader.getPage(i).extractText().split('\n')
            # enumerate gives the index of the line actually matched, which
            # stays correct when the same text appears on several lines.
            for index, line in enumerate(contents):
                if str in line:
                    print(f'{filename}; page:{i+1},line:{index} ({line})')



# Search every file in the filtered fileNameList
def findFromDir(str):
    """Search each path in ``fileNameList`` for *str*.

    The upper-cased extension returned by ``file_type`` selects the searcher:
    ``.TXT``/``.XML``/``.TPL`` use ``find_txt``, ``.DOC``/``.DOCX`` use
    ``find_doc`` and ``.PDF`` uses ``find_pdf``. Files with any other
    extension are skipped.

    Args:
        str: Substring to look for; forwarded unchanged to the searchers.
    """
    for name in fileNameList:
        kind = file_type(name)  # computed once per file
        # '.TPL' is included because FileTypeList collects '.tpl' files;
        # without it they would be gathered but never searched.
        if kind in ('.TXT', '.XML', '.TPL'):
            find_txt(name, str)
        elif kind in ('.DOC', '.DOCX'):
            # NOTE(review): confirm Document() can open legacy .doc files.
            find_doc(name, str)
        elif kind == '.PDF':
            find_pdf(name, str)


# Interactive input
def askInput():
    """Prompt for the folder path, then the search text, on the console.

    A blank line is printed after both answers have been read.

    Returns:
        A ``(text, path)`` tuple: the search text first, the folder second.
    """
    folder = input('What is folder path ? ')
    wanted = input('Searching for what? ')
    print()
    return (wanted, folder)



if __name__ == "__main__":
    # Script entry point: ask for a folder and a search string, collect the
    # matching files, then search each of them.

    # Hard-coded alternative to the interactive prompt:
    # DIR = r"C:\test"      # directory to search
    # FindStrList = 'test'  # text to look for in the files
    FindStrList, DIR = askInput()
    if not listFile(DIR, FileNameFilterList, FileTypeList):
        print ("FILE PATH ERROR")
        # Exit with a non-zero status so callers can detect the failure.
        sys.exit(1)
    findFromDir(FindStrList)
    print ("FIND END")

原文地址:https://www.cnblogs.com/amize/p/14228504.html