过滤xml文件内容

  将xml文件改写成想要的txt文件。

    原xml文件:

  1 <?xml version="1.0" encoding="UTF-8"?>
  2 
  3 -<ANNOTATION_DOCUMENT xsi:noNamespaceSchemaLocation="http://www.mpi.nl/tools/elan/EAFv2.6.xsd" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" VERSION="2.6" FORMAT="2.6" DATE="" AUTHOR="QNZS">
  4 
  5 
  6 -<HEADER TIME_UNITS="milliseconds" MEDIA_FILE="">
  7 
  8 <MEDIA_DESCRIPTOR MIME_TYPE="audio/mpeg" MEDIA_URL=""/>
  9 
 10 </HEADER>
 11 
 12 
 13 -<TIME_ORDER>
 14 
 15 <TIME_SLOT TIME_VALUE="0" TIME_SLOT_ID="TS0"/>
 16 
 17 <TIME_SLOT TIME_VALUE="1740" TIME_SLOT_ID="TS0001"/>
 18 
 19 <TIME_SLOT TIME_VALUE="0" TIME_SLOT_ID="TS1"/>
 20 
 21 <TIME_SLOT TIME_VALUE="0" TIME_SLOT_ID="TS1001"/>
 22 
 23 </TIME_ORDER>
 24 
 25 
 26 -<TIER TIER_ID="EDU" LINGUISTIC_TYPE_REF="EDU" DEFAULT_LOCALE="ru">
 27 
 28 
 29 -<ANNOTATION>
 30 
 31 
 32 -<ALIGNABLE_ANNOTATION TIME_SLOT_REF2="TS0001" TIME_SLOT_REF1="TS0" ANNOTATION_ID="EDU0">
 33 
 34 <ANNOTATION_VALUE>0</ANNOTATION_VALUE>
 35 
 36 </ALIGNABLE_ANNOTATION>
 37 
 38 </ANNOTATION>
 39 
 40 
 41 -<ANNOTATION>
 42 
 43 
 44 -<ALIGNABLE_ANNOTATION TIME_SLOT_REF2="TS1001" TIME_SLOT_REF1="TS1" ANNOTATION_ID="EDU1">
 45 
 46 <ANNOTATION_VALUE>1</ANNOTATION_VALUE>
 47 
 48 </ALIGNABLE_ANNOTATION>
 49 
 50 </ANNOTATION>
 51 
 52 </TIER>
 53 
 54 
 55 -<TIER TIER_ID="角色" LINGUISTIC_TYPE_REF="EDUProp" DEFAULT_LOCALE="ru" PARENT_REF="EDU">
 56 
 57 
 58 +<ANNOTATION>
 59 
 60 
 61 
 62 
 63 
 64 
 65 
 66 
 67 -<ANNOTATION>
 68 
 69 
 70 -<REF_ANNOTATION ANNOTATION_ID="People1" ANNOTATION_REF="EDU1">
 71 
 72 <ANNOTATION_VALUE>客户</ANNOTATION_VALUE>
 73 
 74 </REF_ANNOTATION>
 75 
 76 </ANNOTATION>
 77 
 78 </TIER>
 79 
 80 
 81 -<TIER TIER_ID="文本" LINGUISTIC_TYPE_REF="EDUProp" DEFAULT_LOCALE="ru" PARENT_REF="EDU">
 82 
 83 
 84 -<ANNOTATION>
 85 
 86 
 87 -<REF_ANNOTATION ANNOTATION_ID="Text0" ANNOTATION_REF="EDU0">
 88 
 89 <ANNOTATION_VALUE>人家不愿意愤怒的您没办法</ANNOTATION_VALUE>
 90 
 91 </REF_ANNOTATION>
 92 
 93 </ANNOTATION>
 94 
 95 
 96 -<ANNOTATION>
 97 
 98 
 99 -<REF_ANNOTATION ANNOTATION_ID="Text1" ANNOTATION_REF="EDU1">
100 
101 <ANNOTATION_VALUE>_end_</ANNOTATION_VALUE>
102 
103 </REF_ANNOTATION>
104 
105 </ANNOTATION>
106 
107 </TIER>
108 
109 
110 -<TIER TIER_ID="规则信息" LINGUISTIC_TYPE_REF="EDUProp" DEFAULT_LOCALE="ru" PARENT_REF="EDU">
111 
112 
113 -<ANNOTATION>
114 
115 
116 -<REF_ANNOTATION ANNOTATION_ID="Comment1" ANNOTATION_REF="EDU1">
117 
118 <ANNOTATION_VALUE/>
119 
120 </REF_ANNOTATION>
121 
122 </ANNOTATION>
123 
124 </TIER>
125 
126 <LINGUISTIC_TYPE TIME_ALIGNABLE="true" LINGUISTIC_TYPE_ID="EDU" GRAPHIC_REFERENCES="true"/>
127 
128 <LINGUISTIC_TYPE TIME_ALIGNABLE="false" LINGUISTIC_TYPE_ID="EDUProp" GRAPHIC_REFERENCES="false" CONSTRAINTS="Symbolic_Association"/>
129 
130 </ANNOTATION_DOCUMENT>

  生成后的TXT文件:

  坐席 : 人家不愿意愤怒的您没办法
  客户 : _end_

  可以通过以下代码实现。

import codecs
import xml.etree.ElementTree as ET
import sys,re
import csv
import os

def file_name(file_dir):
    """Return the names of the files located directly inside ``file_dir``.

    Args:
        file_dir: Path of the directory to list.

    Returns:
        A list of file names (no directory part) found at the top level of
        ``file_dir``. An empty list is returned when the directory does not
        exist or cannot be walked.
    """
    # os.walk yields the top directory first. Returning on that first entry
    # avoids picking up the file list of whichever sub-directory happens to
    # be visited last, and avoids an unbound ``files`` when nothing is yielded.
    for _root, _dirs, files in os.walk(file_dir):
        return files
    return []

file_dir = 'D:/untitled/test/fcc'  # directory that holds the source .xml files
list_name = []  # not used below; kept so the module still defines the name

# Only lines containing this literal opening tag are considered; a
# self-closing ``<ANNOTATION_VALUE/>`` does not contain it and is skipped.
_VALUE_TAG = '<ANNOTATION_VALUE>'
# Matches any single XML tag so it can be stripped from a line.
_TAG_RE = re.compile(r'<.*?>')


def _extract_values(lines):
    """Collect the non-numeric ANNOTATION_VALUE texts from an iterable of lines.

    Each line containing ``<ANNOTATION_VALUE>`` has all tags removed and is
    stripped of surrounding whitespace. Purely numeric results are dropped;
    everything else (including an empty string) is kept, in file order.
    """
    values = []
    for line in lines:
        if _VALUE_TAG not in line:
            continue
        text = _TAG_RE.sub('', line).strip()
        if not text.isnumeric():
            values.append(text)
    return values


def _pair_values(values):
    """Build the ``"<first> : <second>"`` output lines from extracted values.

    With ``half = (len(values) - 1) // 2``, entry ``k`` is joined with entry
    ``k + half`` for ``k`` in ``range(half)``.
    """
    # NOTE(review): the "- 1" leaves one trailing entry unused -- presumably
    # the value of the last tier in the file; confirm against real input.
    half = (len(values) - 1) // 2
    return [values[k] + " : " + values[k + half] + '\n' for k in range(half)]


# Keep only XML files and work with real file names instead of round-tripping
# the listing through str()/eval(), which breaks on names containing quotes.
xml_names = [
    name for name in file_name(file_dir)
    if os.path.splitext(name)[1].lower() == '.xml'
]
print([os.path.splitext(name)[0] for name in xml_names])

# The output directory must exist before any file is opened inside it.
os.makedirs('./csv', exist_ok=True)

for xml_name in xml_names:
    stem = os.path.splitext(xml_name)[0]
    # Read from the same directory that was listed above.
    xml_01 = os.path.join(file_dir, xml_name)
    csv_01 = "./csv/{}.txt".format(stem)

    with open(xml_01, 'r', encoding='utf-8') as xmlfile:
        txt = _extract_values(xmlfile)

    # 'a+' appends: re-running the script adds to an existing output file.
    with open(csv_01, 'a+', encoding='utf-8', newline='') as txtfile:
        txtfile.writelines(_pair_values(txt))

  

原文地址:https://www.cnblogs.com/mtfan01/p/13321208.html