python读xml写到txt

读取xml信息,写到txt中。

下面的脚本在当前工作目录下执行,只处理该目录中的 .xml 文件(不包含子目录),生成的 .txt 文件也写到当前目录。

# -*- coding: utf-8 -*-

import os
import xml.dom.minidom

def extract_xml_to_txt(srcdir, dstdir):
    """Convert every ``.xml`` file directly inside ``srcdir`` to a ``.txt`` file in ``dstdir``.

    Each ``LineInfo`` element found in an XML file becomes one comma-separated
    line in the output, in this column order:
    ``ptLTX,ptLTY,ptLBX,ptLBY,ptRTX,ptRTY,ptRBX,ptRBY,Chars``.
    Newline characters are stripped from both ends of ``Chars``. An attribute
    that is absent from an element is written as an empty field.

    Args:
        srcdir: Directory listed (non-recursively) for files ending in ``.xml``.
        dstdir: Directory that receives the ``.txt`` files; it must already exist.

    Raises:
        OSError: If a directory or file cannot be read or written.
        xml.parsers.expat.ExpatError: If an XML file is not well-formed.
    """
    # (XML attribute name, label printed to the console), in output column order.
    fields = (
        ("ptLTX", "LTX:"),
        ("ptLTY", "LTY:"),
        ("ptLBX", "LBX:"),
        ("ptLBY", "LBY:"),
        ("ptRTX", "RTX:"),
        ("ptRTY", "RTY:"),
        ("ptRBX", "RBX:"),
        ("ptRBY", "RBY:"),
        ("Chars", "Chars:"),
    )
    for filename in os.listdir(srcdir):
        # Require the dot so that names such as "fooxml" are not picked up.
        if not filename.endswith('.xml'):
            continue
        srcfile = os.path.join(srcdir, filename)
        # Swap only the extension; str.replace would also rewrite a ".xml"
        # that happens to appear earlier in the file name.
        dstfile = os.path.join(dstdir, os.path.splitext(filename)[0] + '.txt')
        print("processing file", dstfile)
        # Parse before opening the output so that a malformed XML file does
        # not leave an empty txt file (or an unclosed handle) behind.
        content = xml.dom.minidom.parse(srcfile).documentElement
        print(content)
        with open(dstfile, 'w', encoding='utf-8') as file_lineinfo:
            for lineinfo in content.getElementsByTagName('LineInfo'):
                values = []
                for attr, label in fields:
                    # getAttribute returns "" when the attribute is missing,
                    # so no value can leak over from the previous element.
                    value = lineinfo.getAttribute(attr)
                    if attr == "Chars":
                        value = value.strip('\n')
                    if lineinfo.hasAttribute(attr):
                        print(label, value)
                    values.append(value)
                file_lineinfo.write(','.join(values) + '\n')


if __name__ == '__main__':
    # Read the XML files from, and write the txt files to, the current working directory.
    working_dir = os.getcwd()
    extract_xml_to_txt(srcdir=working_dir, dstdir=working_dir)

下面的脚本会递归处理当前工作目录及其所有子目录中的 .xml 文件。

# -*- coding: utf-8 -*-

import os
import xml.dom.minidom

def getFiles(path, suffix):
    """Return the paths of all files under ``path`` whose names end with ``suffix``.

    The directory tree rooted at ``path`` is traversed with ``os.walk``; every
    matching file is reported as its containing directory joined with its name.
    """
    matches = []
    for folder, _subdirs, names in os.walk(path):
        for name in names:
            if name.endswith(suffix):
                matches.append(os.path.join(folder, name))
    return matches

def extract_xml_to_txt(srcdir, dstdir):
    """Convert every ``.xml`` file under ``srcdir`` (recursively) to a ``.txt`` file under ``dstdir``.

    The files are located with ``getFiles(srcdir, '.xml')``. The output for
    each file is placed at the same relative location under ``dstdir`` as the
    XML file has under ``srcdir``, with the extension changed to ``.txt``;
    missing sub-directories of ``dstdir`` are created.

    Each ``LineInfo`` element becomes one comma-separated line, in this
    column order: ``ptLTX,ptLTY,ptLBX,ptLBY,ptRTX,ptRTY,ptRBX,ptRBY,Chars``.
    Newline characters are stripped from both ends of ``Chars``. An attribute
    that is absent is treated as an empty string. Elements whose ``Chars`` is
    empty or whose ``ptRBY`` equals ``'-1'`` are skipped.

    Args:
        srcdir: Root directory searched for ``.xml`` files.
        dstdir: Root directory that receives the ``.txt`` files.

    Raises:
        OSError: If a directory or file cannot be read, created or written.
        xml.parsers.expat.ExpatError: If an XML file is not well-formed.
    """
    # (XML attribute name, label printed to the console), in output column order.
    fields = (
        ("ptLTX", "LTX:"),
        ("ptLTY", "LTY:"),
        ("ptLBX", "LBX:"),
        ("ptLBY", "LBY:"),
        ("ptRTX", "RTX:"),
        ("ptRTY", "RTY:"),
        ("ptRBX", "RBX:"),
        ("ptRBY", "RBY:"),
        ("Chars", "Chars:"),
    )
    # getFiles already returns paths that start with srcdir, so they are used
    # as-is; joining them with srcdir again would duplicate a relative prefix.
    for srcfile in getFiles(srcdir, '.xml'):
        # Mirror the layout below srcdir under dstdir and swap only the final
        # extension, so that ".xml" inside a directory name is left untouched.
        relative = os.path.relpath(srcfile, srcdir)
        dstfile = os.path.join(dstdir, os.path.splitext(relative)[0] + '.txt')
        print("processing file", dstfile)
        # Parse before opening the output so that a malformed XML file does
        # not leave an empty txt file (or an unclosed handle) behind.
        content = xml.dom.minidom.parse(srcfile).documentElement
        print(content)
        dst_parent = os.path.dirname(dstfile)
        if dst_parent:
            os.makedirs(dst_parent, exist_ok=True)
        with open(dstfile, 'w', encoding='utf-8') as file_lineinfo:
            for lineinfo in content.getElementsByTagName('LineInfo'):
                record = {}
                for attr, label in fields:
                    # getAttribute returns "" when the attribute is missing,
                    # so no value can leak over from the previous element.
                    value = lineinfo.getAttribute(attr)
                    if attr == "Chars":
                        value = value.strip('\n')
                    if lineinfo.hasAttribute(attr):
                        print(label, value)
                    record[attr] = value
                # Drop elements that carry no text or whose ptRBY is '-1'.
                if record["Chars"] == "" or record["ptRBY"] == '-1':
                    continue
                file_lineinfo.write(','.join(record[attr] for attr, _ in fields) + '\n')


if __name__ == '__main__':
    # Both the search root and the output root are the current working directory.
    base_dir = os.getcwd()
    extract_xml_to_txt(srcdir=base_dir, dstdir=base_dir)
原文地址:https://www.cnblogs.com/juluwangshier/p/13266461.html