# trmd_b1_ok

# -*- coding:utf-8 -*-
'''
从11c开始提取
'''
import os
import re

import numpy as np
  8 year = '17A'
  9 ss="./data/edmd/"
 10 # filename=ss+"/EDMDI1.17A"
 11 def get_tag():
 12     try:
 13         os.rename(ss+"/EDMDI1.17A",ss+"/EDMDI1.txt")
 14     except:
 15         pass
 16     f1=open(ss+"/EDMDI1.txt")
 17     p1=re.compile(r"^(?:s{3}|Xs{2}|Ws{2})([A-Z]{6})s.+
")
 18     list_tag=list()
 19     for line in f1.readlines():
 20         # print(line)
 21         match1=re.findall(p1,line)
 22         # print(match1)
 23         if match1:
 24             for j in match1:
 25                 list_tag.append(j)
 26     # filename_w1= ss+'%s'%list_tag[MM]
 27     print(list_tag)
 28     return list_tag
 29 def trmd_b1_nonote(list_tag):
 30     if not os.path.exists('./data/edmd/new/'):
 31         os.makedirs('./data/edmd/new/')
 32 
 33     for MM in range(len(list_tag)):
 34         try:
 35             os.rename(ss+'%s_D.17A'%list_tag[MM],ss+'%s.txt'%list_tag[MM])
 36         except:
 37             break
 38 
 39         filename_w= ss+'new/%s_w.txt'%list_tag[MM]
 40         if os.path.exists(filename_w):
 41             os.remove(filename_w)
 42         # import os
 43 
 44         # os.rename('./data/CODECO_D.02A','./data/CODECO_D.txt')
 45         filename_r = ss+'%s.txt'%list_tag[MM]  # txt文件和当前脚本在同一目录下,所以不用写具体路径
 46         #00010   UNH Message header      M   1
 47         pattern1   =  re.compile(r"(^d{4,5})s{3}[A-Z]{3}.+[CM]s{3}d*s{1,}|{0,}
")#00010
 48         pattern1_2 =  re.compile(r"^d{4,5}s{3}([A-Z]{3}).+[CM]s{3}d*s{1,}|{0,}
")#UNH
 49         #pattern1_3 =  re.compile(r"^d{5}s{3}[A-Z]{3}(.+)[CM]s{3}d*s{1,}|{0,}
")#Message header
 50         pattern1_4 =  re.compile(r"^d{4,5}s{3}[A-Z]{3}.+([CM])s{3}d*s{1,}|{0,}
")#C
 51         pattern1_5 =  re.compile(r"^d{4,5}s{3}[A-Z]{3}.+[CM]s{3}(d*)s{1,}|{0,}
")#1
 52         #pattern2 = re.compile(r"^d{5}.+Segmentsgroups(d)*.+[CM]s{3}d*-++
" )#+结尾
 53         #00050       ---- Segment group 1  ------------------ C   9----------------+
 54         pattern4_1 = re.compile(r"(^d{4,5}).+Segmentsgroupsd*.+[CM]s{3}d*.+
")
 55         pattern4_2 = re.compile(r"^d{4,5}.+Segmentsgroups(d*).+[CM]s{3}d*.+
")
 56         pattern4_3 = re.compile(r"^d{4,5}.+Segmentsgroupsd*.+([CM])s{3}d*.+
")
 57         pattern4_4 = re.compile(r"^d{4,5}.+Segmentsgroupsd*.+[CM]s{3}(d*).+
")
 58         #匹配每组的单独结尾的一行即没有Segment group的以+、+|、+||、+|||……结尾的的每个字段
 59         #如00280   RNG Range details                            C   1---------------+|
 60         pattern5_1 = re.compile(r"(^d{4,5})s{3}[A-Z]{3}.+[CM]s{3}d*-++{1,10}|{0,20}
" )
 61         pattern5_2 = re.compile(r"^d{4,5}s{3}([A-Z]{3}).+[CM]s{3}d*-++{1,10}|{0,20}
" )
 62         pattern5_3 = re.compile(r"^d{4,5}s{3}[A-Z]{3}.+([CM])s{3}d*-++{1,10}|{0,20}
" )
 63         pattern5_4 = re.compile(r"^d{4,5}s{3}[A-Z]{3}.+[CM]s{3}(d*)-++{1,10}|{0,20}
" )
 64         #以下是确定层级关系
 65         #匹配每组的单独结尾的一行即没有Segment group的以+、+|、+||、+|||……结尾的
 66         pattern5 = re.compile(r"^d{5}s{3}[A-Z]{3}.+[CM]s{3}d*-++|{0,10}
" )
 67         #匹配每组的开头一行即有Segment group的以+、+|、+||、+|||……结尾的
 68         pattern2_1 = re.compile(r"^d{5}.+Segmentsgroups(d*).+[CM]s{3}d*-++
" )#+结尾
 69         pattern2_2 = re.compile(r"^d{5}.+Segmentsgroups(d*).+[CM]s{3}d*-++|
" )#+|结尾
 70         pattern2_3 = re.compile(r"^d{5}.+Segmentsgroups(d*).+[CM]s{3}d*-++||
" )#+||结尾
 71         pattern2_4 = re.compile(r"^d{5}.+Segmentsgroups(d*).+[CM]s{3}d*-++|||
" )
 72         pattern2_5 = re.compile(r"^d{5}.+Segmentsgroups(d*).+[CM]s{3}d*-++||||
" )
 73         pattern2_6 = re.compile(r"^d{5}.+Segmentsgroups(d*).+[CM]s{3}d*-++|||||
" )
 74         pattern2_7 = re.compile(r"^d{5}.+Segmentsgroups(d*).+[CM]s{3}d*-++||||||
" )
 75         #匹配有同时多个组同时结束的情况,即以++、++|、++||……++、++|、++||……等结尾的
 76         pattern3_1 = re.compile(r"^d{5}.+[CM]s{3}d*-++{2}|{0,20}
")# 匹配++、++|、++||……等结尾
 77         pattern3_2 = re.compile(r"^d{5}.+[CM]s{3}d*-++{3}|{0,20}
")# 匹配+++、+++|、+++||……等结尾
 78         pattern3_3 = re.compile(r"^d{5}.+[CM]s{3}d*-++{4}|{0,20}
")
 79         pattern3_4 = re.compile(r"^d{5}.+[CM]s{3}d*-++{5}|{0,20}
")
 80         pattern3_5 = re.compile(r"^d{5}.+[CM]s{3}d*-++{6}|{0,20}
")
 81         pattern3_6 = re.compile(r"^d{5}.+[CM]s{3}d*-++{7}|{0,20}
")
 82 
 83 
 84         flag = 0
 85         #listgr中第一个不为0的点
 86         pos = -1
 87         listgr =[0,0,0,0,0,0,0,0,0,0]
 88 
 89         fr = open(filename_r)
 90         w2 = open(filename_w,'a')#a代表追加 w代表重写
 91         # w2.write("code_pos,parent,TRSD_tag,year,list_tag[MM],S,R")
 92         for line in fr.readlines():
 93             matcher1 = re.findall(pattern1,line)
 94             matcher1_2 = re.findall(pattern1_2,line)
 95             #matcher1_3 = re.findall(pattern1_3,line)
 96             matcher1_4 = re.findall(pattern1_4,line)
 97             matcher1_5 = re.findall(pattern1_5,line)
 98             matcher2_1 = re.findall(pattern2_1,line)
 99             matcher2_2 = re.findall(pattern2_2,line)
100             matcher2_3 = re.findall(pattern2_3,line)
101             matcher2_4 = re.findall(pattern2_4,line)
102             matcher2_5 = re.findall(pattern2_5,line)
103             matcher2_6 = re.findall(pattern2_6,line)
104             matcher2_7 = re.findall(pattern2_7,line)
105             matcher3_1 = re.findall(pattern3_1,line)
106             matcher3_2 = re.findall(pattern3_2,line)
107             matcher3_3 = re.findall(pattern3_3,line)
108             matcher3_4 = re.findall(pattern3_4,line)
109             matcher3_5 = re.findall(pattern3_5,line)
110             matcher3_6 = re.findall(pattern3_6,line)
111             matcher4_1 = re.findall(pattern4_1,line)
112             matcher4_2 = re.findall(pattern4_2,line)
113             matcher4_3 = re.findall(pattern4_3,line)
114             matcher4_4 = re.findall(pattern4_4,line)
115             matcher5   = re.findall(pattern5,line)
116             matcher5_1 = re.findall(pattern5_1,line)
117             matcher5_2 = re.findall(pattern5_2,line)
118             matcher5_3 = re.findall(pattern5_3,line)
119             matcher5_4 = re.findall(pattern5_4,line)
120 
121             if matcher4_1!=[]:
122                 w2.write("
")
123                 for j in matcher4_1:
124                     for k in j:
125                         w2.write(k)
126             if matcher4_2!=[]:
127                 w2.write(",")
128                 #写入parent列
129                 if pos!= -1:
130                     numgr =listgr[pos]
131                 else:
132                     numgr = 0
133                 if numgr ==0:
134                     w2.write("SG0,")
135                 else:
136                     w2.write("SG"+str(numgr)+",")
137                 for j in matcher4_2:
138                     for k in j:
139                         w2.write("SG"+str(k))
140             if matcher4_3!=[]:
141                 flag = 3
142                 w2.write(",")
143                 #默认写入year,list_tag[MM]两列
144                 w2.write(year+","+list_tag[MM]+",")
145                 for j in matcher4_3:
146                     for k in j:
147                         w2.write(k)
148             if matcher4_4!=[]:
149                 w2.write(",")
150                 for j in matcher4_4:
151                     for k in j:
152                         w2.write(k)
153             if matcher5_1!=[]:
154                 w2.write("
")
155                 for j in matcher5_1:
156                     for k in j:
157                         w2.write(k)
158             if matcher5_2!=[]:
159                 w2.write(",")
160                 #写入parent列
161                 if pos!= -1:
162                     numgr =listgr[pos]
163                 else:
164                     numgr = 0
165                 if numgr ==0:
166                     w2.write("SG0,")
167                 else:
168                     w2.write("SG"+str(numgr)+",")
169                 for j in matcher5_2:
170                     for k in j:
171                         w2.write(k)
172             if matcher5_3!=[]:
173                 flag = 3
174                 w2.write(",")
175                 #默认写入year,list_tag[MM]两列
176                 w2.write(year+","+list_tag[MM]+",")
177                 for j in matcher5_3:
178                     for k in j:
179                         w2.write(k)
180             if matcher5_4!=[]:
181                 w2.write(",")
182                 for j in matcher5_4:
183                     for k in j:
184                         w2.write(k)
185             #确定层级关系,也就是确定listgr
186             if(matcher5!=[]):
187                 for i in listgr:
188                     if i==0:
189                         pos = listgr.index(i)-1
190                         break
191                 listgr[pos]=0
192             if (matcher2_1!=[]):
193                 # print "2_1"
194                 for j in matcher2_1:
195                     # print j
196                     if(listgr[0]==0):
197                         listgr[0]=j
198                     else:
199                         listgr[0]=0
200                 # print listgr
201             if (matcher2_2!=[]):
202                 for j in matcher2_2:
203                     #numgr_d = j
204                     if(listgr[1]==0):
205                         listgr[1]=j
206                     else:
207                         listgr[1]=0
208             if (matcher2_3!=[]):
209                 for j in matcher2_3:
210                     if(listgr[2]==0):
211                         listgr[2]=j
212                     else:
213                         listgr[2]=0
214             if (matcher2_4!=[]):
215                 for j in matcher2_4:
216                     if(listgr[3]==0):
217                         listgr[3]=j
218                     else:
219                         listgr[3]=0
220             if (matcher2_5!=[]):
221                 for j in matcher2_5:
222                     if(listgr[4]==0):
223                         listgr[4]=j
224                     else:
225                         listgr[4]=0
226             if (matcher2_6!=[]):
227                 for j in matcher2_6:
228                     if(listgr[5]==0):
229                         listgr[5]=j
230                     else:
231                         listgr[5]=0
232             if (matcher2_7!=[]):
233                 for j in matcher2_7:
234                     if(listgr[6]==0):
235                         listgr[6]=j
236                     else:
237                         listgr[6]=0
238             if (matcher3_1!=[]):
239                 for i in listgr:
240                     if i==0:
241                         pos = listgr.index(i)-1
242                         break
243                 listgr[pos]=0
244                 listgr[pos-1]=0
245             if (matcher3_2!=[]):
246                 for i in listgr:
247                     if i==0:
248                         pos = listgr.index(i)-1
249                         break
250                 for k in range((pos-2),(pos+1)):
251                     listgr[k]=0
252             if (matcher3_3!=[]):
253                 for i in listgr:
254                     if i==0:
255                         pos = listgr.index(i)-1
256                         break
257                 for k in range((pos-3),(pos+1)):
258                     listgr[k]=0
259             if (matcher3_4!=[]):
260                 for i in listgr:
261                     if i==0:
262                         pos = listgr.index(i)-1
263                         break
264                 for k in range(pos-4,pos+1):
265                     listgr[k]=0
266             if (matcher3_5!=[]):
267                 for i in listgr:
268                     if i==0:
269                         pos = listgr.index(i)-1
270                         break
271                 for k in range(pos-5,pos+1):
272                     listgr[k]=0
273             if (matcher3_6!=[]):
274                 for i in listgr:
275                     if i==0:
276                         pos = listgr.index(i)-1
277                         break
278                 for k in range(pos-6,pos+1):
279                     listgr[k]=0
280              #确定层级关系结束
281             if (matcher1!=[]):
282                 flag = 1
283                 w2.write("
")
284                 for j in matcher1:
285                     for k in j:
286                         w2.write(k)
287             #print listgr
288             #判断当前lit不为0的位置
289             for i in listgr:
290                 if i==0:
291                     pos = listgr.index(i)-1
292                     break
293             if matcher1_2!=[]:
294                 flag = 2
295                 w2.write(",")
296                 #写入parent列
297                 if pos!= -1:
298                     numgr =listgr[pos]
299                 else:
300                     numgr = 0
301                 if numgr ==0:
302                     w2.write("SG0,")
303                 else:
304                     w2.write("SG"+str(numgr)+",")
305                 for j in matcher1_2:
306                     for k in j:
307                         w2.write(k)
308         #    if matcher1_3!=[]:
309         #        flag = 3
310         #        w2.write(",")
311         #        for j in matcher1_3:
312         #            for k in j:
313         #                w2.write(k)
314             if matcher1_4!=[]:
315                 flag = 4
316                 w2.write(",")
317                 #默认写入year,list_tag[MM]两列
318                 w2.write(year+","+list_tag[MM]+",")
319                 for j in matcher1_4:
320                     for k in j:
321                         w2.write(k)
322             if ((matcher1_5!=[])and(flag ==4)):
323                 flag = 5
324                 w2.write(",")
325                 for j in matcher1_5:
326                     for k in j:
327                         w2.write(k)
328         w2.close()
329         fr.close()
# Indent widths (whitespace count after the position code) tried, in this
# order, when looking for a heading line.
_NOTE_INDENTS = (3, 6, 9, 12, 15, 18, 21, 24, 27, 30)

# Heading line: 4-5 digit position code, exactly `indent` whitespace
# characters, then a non-space character.
_NOTE_HEADINGS = [
    (indent, re.compile(r"^(\d{4,5})\s{%d}[^ ].+\n" % indent))
    for indent in _NOTE_INDENTS
]

# Note text line: indented five columns deeper than the current heading
# indent (0 before the first heading has been seen).
_NOTE_BODIES = dict(
    (indent, re.compile(r"^\s{%d}([^ ].+)\n" % (indent + 5)))
    for indent in (0,) + _NOTE_INDENTS
)

# Start of the structure table; reading stops here so table rows are not
# mistaken for headings.
_NOTE_STOP = re.compile(
    r"(?:4\.3\s{4}Message\sstructure)|(?:Pos\s+Tag\sName\s+S\s+R)")


def trmd_b1_note(list_tag):
    """Extract the descriptive notes of every tag into ``new/<tag>_wnote.txt``.

    ``<tag>.txt`` under ``ss`` is read up to the start of the message
    structure table. Each heading line starts a new double-quoted output
    line; the note lines below it are appended inside the quotes, each
    followed by a space. Finally every ``"`` on the first output line is
    replaced by the header ``code_pos,note``.

    The ``new/`` directory must already exist.

    Changes relative to the original: the input file is now closed (it was
    leaked), and the patterns are compiled once instead of once per line.

    Args:
        list_tag: message tags, as returned by ``get_tag``.
    """
    for tag in list_tag:
        filename_r = ss + '%s.txt' % tag
        filename_w = ss + 'new/%s_wnote.txt' % tag
        # Drop stale output first so a failure to open the input does not
        # leave an old result behind.
        if os.path.exists(filename_w):
            os.remove(filename_w)

        heading_indent = 0
        with open(filename_r) as fr, open(filename_w, 'w') as w2:
            for line in fr:
                for indent, heading in _NOTE_HEADINGS:
                    if heading.match(line):
                        heading_indent = indent
                        # Close the previous quoted note and open the next.
                        w2.write("\"\n")
                        w2.write("\"")
                        break

                body = _NOTE_BODIES[heading_indent].match(line)
                if body:
                    w2.write(body.group(1))
                    w2.write(" ")

                if _NOTE_STOP.search(line):
                    break
            w2.write("\"")

        # Turn the opening quote on the first line into the header.
        with open(filename_w, 'r') as written:
            lines = written.readlines()
        if lines and '"' in lines[0]:
            lines[0] = lines[0].replace('"', 'code_pos,note')
        with open(filename_w, 'w') as rewritten:
            rewritten.write("".join(lines))
def join(list_tag):
    """Append the structure rows and notes of every tag to ``new/b1<year>.csv``.

    For each tag, line i of ``new/<tag>_w.txt`` is joined with line i of
    ``new/<tag>_wnote.txt`` as ``"<w line>,<note line>\\n"``, with newlines
    stripped from both ends of each part. The CSV is opened in append mode,
    so repeated runs add to the existing file.

    Changes relative to the original:
      * all files are closed (the output used to be reopened per tag and
        closed only once, after the loop);
      * an empty `list_tag` is a no-op instead of a NameError;
      * pairing stops at the shorter of the two files instead of raising
        IndexError when the note file has more lines.

    Args:
        list_tag: message tags, as returned by ``get_tag``.
    """
    csv_path = ss + 'new/b1%s.csv' % year
    for tag in list_tag:
        with open(ss + 'new/%s_w.txt' % tag) as f1, \
                open(ss + 'new/%s_wnote.txt' % tag) as f2:
            rows = ["%s,%s\n" % (line1.strip('\n'), line2.strip('\n'))
                    for line1, line2 in zip(f1, f2)]
        with open(csv_path, 'a') as f2_w:
            f2_w.writelines(rows)
if __name__ == '__main__':
    # Pipeline: read the tag index, extract the structure table and the
    # notes of every tag, then merge both into one CSV.
    list_tag = get_tag()
    trmd_b1_nonote(list_tag)
    trmd_b1_note(list_tag)
    join(list_tag)
"""
    特殊情况



    """
# 原文地址 (original source): https://www.cnblogs.com/smuxiaolei/p/7427674.html