# tred_extract_EDED_new

# -*- coding:utf-8 -*-
import re

'''
Adapted to the new version.
'''

 10 year='17a'#用户自定义
 11 ss='./data/'#根目录
 12 filename = ss+'EDED%s.txt'%year#输入文件名
 13 
 14 
 15 
 16 
 17 def tred_nonote():
 18 
 19     p1 = r"^(?:s{5}|Xs{4}|Ws{4})(dddd)ss[A-Z].+]$"#匹配1001
 20     p2 = r"^(?:s{5}|Xs{4}|Ws{4})ddddss([A-Z].+)s+[[A-Z]]$"
 21     p3 = r"^(?:s{5}|Xs{4}|Ws{4})ddddss[A-Z].+s+[([A-Z])]$"
 22     p4 = r"^s{5}Desc:s(.+ww.)
"
 23 
 24     p5 = r"^s{5}Desc:s(.+[^.]|.+.g.)
"#非以.结尾的Desc
 25     p6 = r"^s{11}(.+.)
"#非以.结尾的Desc的第二行
 26     p7 = r"^s{5}Repr:s(.+)
"#Repr
 27 
 28     pattern1 = re.compile(p1)
 29     pattern2 = re.compile(p2)
 30     pattern3 = re.compile(p3)
 31     pattern4 = re.compile(p4)
 32     pattern5 = re.compile(p5)
 33     pattern6 = re.compile(p6)
 34     pattern7 = re.compile(p7)
 35 
 36     fr = open(filename)
 37     temp = ();
 38     flag = 0
 39     for line in fr.readlines():
 40         matcher1 = re.findall(pattern1,line)
 41         matcher2 = re.findall(pattern2,line)
 42         matcher3 = re.findall(pattern3,line)
 43         matcher4 = re.findall(pattern4,line)
 44         matcher5 = re.findall(pattern5,line)
 45         matcher6 = re.findall(pattern6,line)
 46         matcher7 = re.findall(pattern7,line)
 47 
 48         w2 = open(ss+'tred_nonote%s.txt'%year,'a')#a代表追加 w代表重写
 49         if matcher1:
 50             flag = 1
 51             w2.write("
")
 52             for j in matcher1:
 53                 for k in j:
 54                     w2.write(k)
 55 
 56         if ((matcher2!=[])and(flag ==1)):
 57             flag = 2
 58 
 59             w2.write(",")
 60             for j in matcher2:
 61                 for k in j:
 62                     w2.write(k)
 63         if ((matcher3!=[])and(flag ==2)):
 64             flag = 3
 65             # w2.write(",")
 66             for j in matcher3:
 67                 for k in j:
 68                     w2.write(k)
 69         if ((matcher4!=[])and(flag ==3)):
 70             flag = 4
 71             w2.write(","")
 72             for j in matcher4:
 73                 for k in j:
 74                     w2.write(k)
 75             w2.write(""")
 76         if ((matcher5!=[])and(flag ==3 or 5)):
 77             flag = 5
 78             w2.write(","")
 79             for j in matcher5:
 80                 for k in j:
 81                     w2.write(k)
 82         if ((matcher6!=[])and(flag ==5)):
 83             flag = 6
 84             w2.write(" ")
 85             for j in matcher6:
 86                 for k in j:
 87                     w2.write(k)
 88             w2.write(""")
 89         if ((matcher7!=[])and(flag ==4 or 6)):
 90             flag = 7
 91             w2.write(",")
 92             for j in matcher7:
 93                 for k in j:
 94                     w2.write(k)
 95 
 96         w2.close( )
 97 
 98 
 99 def tred_note():
100 
101     p1 = r"^(?:s{5}|Xs{4}|Ws{4})(dddd)ss[A-Z].+]$"#匹配1001
102     p2 = r"^s{5}Note:s
"#Note
103     p3= r"^s{11}([^ ].+)
"#Note内容
104     p4= r"^(?:-|컴)+
"
105     pattern1 = re.compile(p1)
106     pattern2 = re.compile(p2)
107     pattern3 = re.compile(p3)
108     pattern4 = re.compile(p4)
109 
110 
111     fr = open(filename)
112     w2 = open(ss+'tred_note%s.txt'%year,'a')#a代表追加 w代表重写
113     # temp = ();
114     flag = 0
115     flag1=0
116     for line in fr.readlines():
117         matcher1 = re.findall(pattern1,line)
118         matcher2 = re.findall(pattern2,line)
119         matcher3 = re.findall(pattern3,line)
120         matcher4 = re.findall(pattern4,line)
121 
122        
123         #print matcher
124 
125         if matcher1!=[]:
126             flag = 1
127             w2.write("
")
128             # for j in matcher1:
129                 
130             #     w2.write(j)
131 
132         if ((matcher2!=[])and(flag == 1)):
133             flag = 2
134             flag1=1
135             # w2.write(",")
136         if flag1==1:
137             if ((matcher3!=[])and(flag ==2 or 3)):
138                 flag = 3
139                 w2.write(" ")
140                 for j in matcher3:
141                     
142                     w2.write(j)
143             # w2.write(")
144             if ((matcher4!=[])and(flag == 3)):
145                 flag=0
146                 flag1=0
147     w2.write("
")
148     w2.close( )
149     fr.close()
150 
def join(note_path=None, nonote_path=None, out_path=None):
    """Merge the record file and the note file line by line into a CSV file.

    Line *i* of the record file is written followed by ``,"<note>"`` where
    ``<note>`` is line *i* of the note file; whitespace-only note lines
    become an empty note.  Newlines are stripped from both parts.

    Args:
        note_path: Path of the note file.  Defaults to
            ``ss + 'tred_note%s.txt' % year``.
        nonote_path: Path of the record file.  Defaults to
            ``ss + 'tred_nonote%s.txt' % year``.
        out_path: Path of the CSV output.  Defaults to
            ``ss + 'tred%s.csv' % year``.  Opened in append mode.
    """
    if note_path is None:
        note_path = ss + 'tred_note%s.txt' % year
    if nonote_path is None:
        nonote_path = ss + 'tred_nonote%s.txt' % year
    if out_path is None:
        out_path = ss + 'tred%s.csv' % year

    with open(note_path) as note_file:
        notes = ['' if raw.isspace() else raw for raw in note_file]

    with open(nonote_path) as records, open(out_path, 'a') as out:
        for index, record in enumerate(records):
            # A record file longer than the note file used to raise
            # IndexError; a missing note is now treated as empty.
            note = notes[index] if index < len(notes) else ''
            out.write("%s,\"%s\"\n" % (record.strip('\n'), note.strip('\n')))

if __name__ == '__main__':
    # join() reads the two files written by tred_nonote() and tred_note(),
    # so it has to run last.
    tred_nonote()
    tred_note()
    join()
# Original post: https://www.cnblogs.com/smuxiaolei/p/7427670.html