re模块详解

  1 #!/usr/bin/env python
  2 #-*- coding:UTF-8 -*-
  3 #####################################################
  4 # Author: sunfx   xingrhce@163.com
  5 # Last modified:  2014/11/18
  6 # Filename:  re.py
  7 # Q  Q  群:  236147801
  8 #####################################################
  9  
import codecs
import re
import sys
 11  
 12 #1.查找文本中的字符
 13  
 14 pattern = 'this'
 15 text = 'Does this text match the pattern?'
 16  
 17 match = re.search(pattern,text)
 18  
 19 s = match.start()
 20 e = match.end()
 21  
 22 print 'Found "%s"
in "%s"
from %d to %d ("%s")' %
 23       (match.re.pattern,match.string,s,e,text[s:e])
 24  
 25 '''
 26 match.re.pattern 要匹配的内容
 27 match.string 匹配的字符
 28 s  匹配到内容开始索引
 29 d  匹配到内容结束索引
 30 text[s:e] 匹配字符
 31 '''
 32  
 33 #2.编译表达式
 34  
 35 regexes = [ re.compile(p)
 36             for p in ['this','that']              
 37 ] #把字符转换Regexobject格式
 38  
 39  
 40  
 41 print 'Text: %r
' % text #输出text内容
 42  
 43 for regex in regexes:
 44  
 45     print 'Seeking "%s"->' % regex.pattern,  #regex.pattern 要匹配的字符
 46  
 47     if regex.search(text): #在text中搜索this or that
 48  
 49         print 'match!'
 50  
 51     else:
 52  
 53         print 'no match'
 54  
 55 #3.多重匹配
 56  
 57 text = 'abbaaabbbbaaaaa'
 58  
 59 pattern = 'ab'
 60  
 61 for match in re.findall(pattern,text):
 62  
 63     print 'Found: "%s"' % match
 64  
 65 #findall 直接返回字符串
 66  
 67  
 68 for match in re.finditer(pattern,text):
 69     s = match.start()
 70     e = match.end()
 71     print 'Found "%s" at %d:%d' % (text[s:e],s,e)
 72  
 73 #finditer 返回原输入文字在字符串的位置
 74  
 75 #4.模式语法
 76  
 77 def test_patterns(text,patterns=[]):
 78  
 79     for pattern,desc in patterns: 
 80         print 'Pattern %r (%s) 
' %(pattern,desc) 
 81         print '   %r' % text
 82         for match in re.finditer(pattern,text):
 83             s = match.start()
 84             e = match.end()
 85             substr = text[s:e] #匹配到的字符
 86             n_backslashes = text[:s].count('\') #查找文本:s坐标之前的包含多少\
 87             prefix = '.' * ( s + n_backslashes ) 
 88             print '    %s%r' % (prefix,substr) 
 89         print
 90     return
 91  
 92 test_patterns('abbaaabbbbaaaaa',
 93             [('ab',"'a' followed by 'b'")]
 94     )
 95  
 96 #贪婪模式 这种模式会减少单个匹配减少
 97 '''
 98      *                '匹配一次到多次'
 99      +                '至少匹配一次到多次'
100      ?                '只匹配一次'
101      ab*,             'a followerd by zero or more b'),  #匹配0次或者更多次
102      ab+,             'a followerd by one or mrore b'),  #最少匹配一次或者更多次
103      ab?,             'a followerd by zero or one b'),   #匹配0最多一次
104      ab{3},           'a followerd by three b'),         #最少匹配三次
105      ab{2,3},           'a followerd by two to three b')   #匹配两至三次
106  
107  
108      ab*?,             'a followerd by zero or more b'),  #匹配0次或者更多次
109      ab+?,             'a followerd by one or mrore b'),  #最少匹配一次或者更多次
110      ab??,             'a followerd by zero or one b'),   #匹配0最多一次
111      ab{3}?,           'a followerd by three b'),         #最少匹配三次
112      ab{2,3}?,           'a followerd by two to three b')   #匹配两至三次
113 '''
114  
115 #用法如下:
116  
117 str = 'absdsdsdsdsd'
118  
119 print re.findall('ab*',str)
120 #['ab']
121  
122 print re.findall('ab*?',str)
123 #['a']
124  
# 5. Character sets
#
#   [ab]       either a or b
#   a[ab]+     a followed by one or more a or b
#   a[ab]+?    a followed by one or more a or b, not greedy
#   [^...]     any character that is NOT in the set
#   [a-z]      lowercase ASCII letters
#   [A-Z]      uppercase ASCII letters
#   [a-zA-Z]   lowercase or uppercase ASCII letters
#   [A-Za-z]   uppercase or lowercase ASCII letters

# "sample" is used instead of "str" so the builtin is not rebound.
sample = 'aaaaaaaaaaaaaaaaaabbbbbbbbbbbbbbbbbbbbabbbbbbbbbbbasbsbab,a_baba'

print(re.findall('[ab]', sample))
print(re.findall('a[ab]+', sample))
print(re.findall('a[ab]+?', sample))
print(re.findall('[^_]', sample))

sample = 'China,lovE'

print(re.findall('[a-z][A-Z]', sample))   # ['vE']
print(re.findall('[A-Z][a-z]', sample))   # ['Ch']

print(re.findall('[A-Z][a-z]+', sample))  # ['China']
print(re.findall('[a-z][A-Z]+', sample))  # ['vE']

print(re.findall('[A-Z][a-z]*', sample))  # ['China', 'E']
print(re.findall('[a-z][A-Z]*', sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']

print(re.findall('[A-Z][a-z]?', sample))  # ['Ch', 'E']
print(re.findall('[a-z][A-Z]?', sample))  # ['h', 'i', 'n', 'a', 'l', 'o', 'vE']

# The metacharacter "." matches any single character:
#   a.      a followed by any one character
#   b.      b followed by any one character
#   a.*b    a, then anything, then b (greedy)
#   a.*?b   a, then anything, then b (non-greedy)

c = 'woaizhongguoawsb,wasssssssssssssdsdsdsdbsdddddddbaaabbbbbbbsd'

print(re.findall('a.', c))  # ['ai', 'aw', 'as', 'aa', 'ab']
print(re.findall('b.', c))  # ['b,', 'bs', 'ba', 'bb', 'bb', 'bb', 'bs']
# Greedy: one match running from the first a to the last b.
print(re.findall('a.*b', c))
# Non-greedy: "?" ends the greed of "*", so each match stops at the first
# b it reaches and is as short as possible.
print(re.findall('a.*?b', c))
173  
174  
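# 6. Escape codes
#
#   Code   Meaning
#   \d     a digit
#   \D     a non-digit
#   \s     whitespace (tab, space, newline)
#   \S     non-whitespace (symbols, letters, digits)
#   \w     alphanumeric
#   \W     non-alphanumeric (symbols, tab, space, newline)

# 7. Anchoring
#
#   Code   Meaning
#   ^      start of string, or of a line
#   $      end of string, or of a line
#   \A     start of string
#   \Z     end of string
#   \b     empty string at the beginning or end of a word
#   \B     empty string not at the beginning or end of a word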
# 8. Constraining the search: match() versus search()

text = 'This is some text --with punctuation.'

pattern = 'is'

print('Text    : %s' % text)
print('pattern: %s' % pattern)

# match() only tries at the very start of the text; the text does not
# begin with "is", so the result is None.
m = re.match(pattern, text)
print('Match : %s' % m)

# search() scans the whole text and finds the "is" inside "This".
s = re.search(pattern, text)
print('Search : %s' % s)

pattern = re.compile(r'\w*is\w*')  # any run of word characters containing "is"

print('Text: %s' % text)

# The search() method of a compiled pattern accepts a start position, so
# every match can be visited by resuming just past the previous one.
pos = 0
while True:
    match = pattern.search(text, pos)
    if not match:
        break
    s = match.start()
    e = match.end()
    print('  %d : %d = "%s"' % (s, e - 1, text[s:e]))
    pos = e
227  
# 9. Dissecting matches with groups. Any regular expression can be turned
# into a group and nested inside a larger expression.
# \b anchors the "t" to the start of a word.
regex = re.compile(r'(\bt\w+)\W+(\w+)')

print('Input  text      : %s' % text)

print('Pattern          : %s' % regex.pattern)

match = regex.search(text)
# group(0) is the text matched by the whole expression; sub-groups are
# numbered from 1.
print('Entire match     : %s' % match.group(0))
print('World start with "t": %s' % match.group(1))   # first group
print('World after "t" word : %s' % match.group(2))  # second group

# Python extends the basic grouping syntax with named groups:
# (?P<name>pattern)

print(text)
print('')
for pattern in [r'^(?P<first_word>\w+)',
                r'(?P<last_word>\w+)\S*$',
                r'(?P<t_word>\bt\w+)\W+(?P<other_word>\w+)',
                r'(?P<ends_with_t>\w+t)\b',
                ]:
    regex = re.compile(pattern)
    match = regex.search(text)
    print('Matching "%s"' % pattern)
    print('  %s' % (match.groups(),))     # values of all groups, in order
    print('  %s' % (match.groupdict(),))  # group name -> matched substring
    print('')
255  
256 def test_patterns(text,patterns=[]):
257     '''Given source text and a list of patterns,look for 
258     matches for each pattern within the text and print
259     them to stdout.
260     '''
261     #look for each pattern in the text and print the resuls
262  
263     for pattern,desc in patterns:
264         print 'Pattern %r (%s)
' % (pattern,desc)
265         print '   %r' % text
266     for match in re.finditer(pattern,text):
267         s = match.start()
268         e = match.end()
269         prefix = ' ' * (s) #'空格 X 次数'
270         print '   %s%r%s' % (prefix,text[s:e],' '*(len(text)-e)),
271         print match.groups()
272         if match.groupdict():
273             print '%s%s' % (' ' * (len(text) -s),match,groupdict())
274             print
275     return
276  
# test_patterns() prints its report itself and returns None, so it is
# called directly rather than printed.
test_patterns(text, [(r'(a(a*)(b*))', 'a followerd by 0-n a and 0-n b')])

# |            Alternation: matches either the expression on its left or
#              the one on its right. The left side is tried first, and
#              once it matches the right side is skipped. When | is not
#              enclosed in ( ), it applies to the whole regular expression.
# (?:pattern)  A non-capturing group.
284  
285  
# 10. Search options: case-insensitive matching
#
# re.IGNORECASE makes letters in the pattern match both upper and lower case.

text = 'This is some text  -- with punctuation.'
# \b keeps the match to whole words that begin with T.
pattern = r'\bT\w+'
with_case = re.compile(pattern)
whitout_case = re.compile(pattern, re.IGNORECASE)

print('Text: \n  %r' % text)
print('Pattern:\n %s' % pattern)
print('Case-sensitive:')
for match in with_case.findall(text):
    print('  %r' % match)
print('Case-insensitive:')
for match in whitout_case.findall(text):
    print(' %r' % match)
304  
# 11. Input with multiple lines
#
# re.MULTILINE makes ^ and $ match at the start and end of every line,
# not only at the start and end of the whole string.

text = 'This is some text  -- with punctuation.\nA secone lines.'
pattern = r'(^\w+)|(\w+\S*$)'
single_line = re.compile(pattern)
multiline = re.compile(pattern, re.MULTILINE)
print('Text:\n %r' % text)
print('Pattern:\n  %s' % pattern)
print('Single Line :')
for match in single_line.findall(text):
    print('  %r' % (match,))
print('MULTILINE  :')
for match in multiline.findall(text):
    print('  %r' % (match,))

# re.DOTALL lets the dot match the newline character as well.

pattern = r'.+'
no_newlines = re.compile(pattern)
dotall = re.compile(pattern, re.DOTALL)

print('Text :\n   %r' % text)
print('Pattern:\n %s' % pattern)
print('No newlines :')
for match in no_newlines.findall(text):
    print('  %r' % match)
print('Dotall    :')
for match in dotall.findall(text):
    print('  %r' % match)
339  
# 12. Unicode matching
#
# re.UNICODE makes \w and the other character classes follow the Unicode
# character database.

# Set standard output encoding to UTF-8. The writer has to be installed
# as sys.stdout to take effect. On Python 3 it must wrap the underlying
# byte stream (sys.stdout.buffer); Python 2's sys.stdout has no such
# attribute and is wrapped directly.
sys.stdout.flush()
sys.stdout = codecs.getwriter('UTF-8')(getattr(sys.stdout, 'buffer', sys.stdout))

# A u'' literal with an explicit backslash is valid on Python 2 and 3
# alike (the ur'' prefix exists only on Python 2).
pattern = u'\\w+'
ascii_pattern = re.compile(pattern)
unicde_pattern = re.compile(pattern, re.UNICODE)

print(u'Text    : %s' % text)
print(u'Pattern : %s' % pattern)
print(u'ASCII   : %s' % u', '.join(ascii_pattern.findall(text)))
print(u'Unicode : %s' % u', '.join(unicde_pattern.findall(text)))
361  
# re.VERBOSE makes a regular expression easier to read: whitespace in the
# pattern is ignored and "#" starts a comment.

address = re.compile(
    r'''
    [\w\d.+-]+       # username
    @
    ([\w\d.]+\.)+    # domain name prefix
    (com|org|edu)    # TODO: support more top-level domains
    ''',
    re.UNICODE | re.VERBOSE)

candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',
]

for candidate in candidates:
    match = address.search(candidate)
    print('%-30s %s' % (candidate, 'Matche' if match else 'no match'))
385  
386  
address = re.compile(
    r'''
    # A name is made up of letters, and may include "."
    # for title abbreviations and middle initials.
    ((?P<name>
        ([\w.,]+\s+)*[\w.,]+)
        \s*
        # Email addresses are wrapped in angle
        # brackets: < > but only if a name is
        # found, so keep the start bracket in this
        # group.
        <
    )?  # the entire name is optional

    # The address itself: username@domain.tld
    (?P<email>
        [\w\d.+-]+       # username
        @
        ([\w\d.]+\.)+    # domain name prefix
        (com|org|edu)    # TODO: support more top-level domains
    )
    >?  # optional closing angle bracket
    ''',
    re.UNICODE | re.VERBOSE)

# Every entry ends with a comma: adjacent string literals without one are
# silently concatenated into a single candidate.
candidates = [
    u'first.last@example.com',
    u'first.last+category@gmail.com',
    u'valid-address@mail.example.com',
    u'not-valid@example.foo',
    u'Fist Last <first.last@example.com>',
    u'NO Brackets first.last@example',
    u'First Last',
    u'First Middle Last <first.last@example.com>',
    u'First M. Last <first.last@example.com>',
    u'<first.last@example.com>',
]

for candidate in candidates:
    print('candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print(' Name: %s' % match.groupdict()['name'])
        print(' Email: %s' % match.groupdict()['email'])
    else:
        print('   No match')
433  
# Regular-expression flags and their abbreviations
#
#   Flag         Abbreviation   Meaning
#   IGNORECASE   i              ignore case
#   MULTILINE    m              match across multiple lines
#   DOTALL       s              let the dot match newline too
#   UNICODE      u              match Unicode
#   VERBOSE      x              make the expression easier to read
#
# Embedding a tag such as (?imu) in the pattern turns on the
# corresponding options.
text = 'This is  some text -- with punctuation.'
# (?i) switches on IGNORECASE from inside the pattern; \b keeps the match
# to whole words that begin with t or T.
pattern = r'(?i)\bT\w+'
regex = re.compile(pattern)

print('Text   : %s' % text)
print('Pattern    : %s' % pattern)
print('Matches   : %s' % regex.findall(text))
453  
# 13. Looking ahead or behind

address = re.compile(
    r'''
    # A name is made up of letters, and may include "."
    # for title abbreviations and middle initials
    ((?P<name>
        ([\w.,]+\s+)*[\w.,]+
        )
    \s+
    )  # name is no longer optional
    # LOOKAHEAD
    # Email addresses are wrapped in angle brackets, but only
    # if they are both present or neither is.
    (?= (<.*>$)
        |
        ([^<].*[^>]$)
    )
    <?  # optional opening angle bracket

    # The address itself: username@domain.tld
    (?P<email>
        [\w\d.+-]+
        @
        ([\w\d.]+\.)+
        (com|org|edu)
    )
    >?
    ''',
    re.UNICODE | re.VERBOSE)

candidates = [
    u'First Last <first.last@example.com>',
    u'No Brackets first.last@example.com',
    u'Open Brackets <first.last@example.com>',
    u'Close Brackets first.last@example.com',
    ]
for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print(' Name : %s' % match.groupdict()['name'])
        print(' Email : %s' % match.groupdict()['email'])
    else:
        print('  No match')
499  
# Automatically ignore the noreply addresses commonly used by systems.
#
#   (?!noreply@.*$)   negative lookahead: reject the address when it
#                     begins with noreply@
#   (?<!noreply)      negative lookbehind: the same check written after
#                     the username instead of before it
#   (?<=pattern)      positive lookbehind: find text that is preceded by
#                     the pattern
address = re.compile(
    r'''
    ^
    # An address: username@domain.tld

    # Ignore noreply address
    (?!noreply@.*$)

    [\w\d.+-]+       # username
    @
    ([\w\d.]+\.)+    # domain name prefix
    (com|org|edu)    # limit the allowed top-level domains

    $
    ''',
    re.UNICODE | re.VERBOSE)

candidates = [
    u'first.last@example.com',
    u'noreply@example.com',
]

for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print('  Match: %s' % candidate[match.start():match.end()])
    else:
        print('  No match')
536  
twitter = re.compile(
    r'''
    # A twitter handle: @username
    (?<=@)          # positive lookbehind: must be preceded by "@"
    ([\w\d_]+)      # username
    ''',
    re.UNICODE | re.VERBOSE)

text = ''' This text includes two Twitter handles.
One for @TheSF,and one for the author,@doughellmann.
'''
print(text)
for match in twitter.findall(text):
    print('handle: %s' % match)
551  
# 14. Self-referencing expressions: a group can be referred to later in
# the same expression by its number.

address = re.compile(
    r'''
    (\w+)             # first name
    \s+
    (([\w.]+)\s+)?    # optional middle name or initial
    (\w+)             # last name

    \s+
    <

    # The address: first_name.last_name@domain.tld
    (?P<email>
        \1            # first name
        \.
        \4            # last name
        @
        ([\w\d.]+\.)+
        (com|org|edu)
        )
    >
    ''',
    re.UNICODE | re.VERBOSE | re.IGNORECASE)

candidates = [
    u'First Last <first.last@example.com>',
    u'Different Name <first.last.example.com>',
    u'First Middle Last <first.last@example.com>',
]
for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    # Reported inside the loop so that every candidate gets a result line.
    if match:
        print('  Match name: %s %s' % (match.group(1), match.group(4)))
    else:
        print(' No match')
589  
# The expression syntax includes an extension, (?P=name), that refers to
# the value matched earlier in the expression by the named group.

address = re.compile(
    r'''

    # The regular name
    (?P<first_name>\w+)
    \s+
    (([\w.]+)\s+)?
    (?P<last_name>\w+)
    \s+
    <

    # The address: first_name.last_name@domain.tld
    (?P<email>
        (?P=first_name)
        \.
        (?P=last_name)
        @
        ([\w\d.]+\.)+
        (com|org|edu)
        )
    >
    ''',
    re.UNICODE | re.VERBOSE | re.IGNORECASE)

candidates = [
    u'First last <first.last@example.com>',
    u'Different Name <first.last@example.com>',
    u'First Middle last <first.last@example.com>',
    u'First M. Last<first.last@example.com>',
]

for candidate in candidates:
    print('Candidate: %s' % candidate)
    match = address.search(candidate)
    if match:
        print('  Match name: %s' % match.groupdict()['first_name'])
        print(match.groupdict()['last_name'])
        print('  Match email: %s' % match.groupdict()['email'])

    else:
        print('No match')
633  
# 15. Modifying strings with patterns
#
# re can use a regular expression as the search mechanism for modifying
# text, and the replacement may refer to groups matched by the expression.
bold = re.compile(r'\*{2}(.*?)\*{2}')  # the literal asterisks must be escaped
text = 'Make this **bold**. This **too**.'
print('Text: %s' % text)
print('Bold: %s' % bold.sub(r'<b>\1</b>', text))  # \1 is the first group

# Named groups can be used in the replacement with \g<name>.
# count limits the number of substitutions performed.
# subn() works like sub() but returns both the modified string and the
# number of substitutions made.

bold = re.compile(r'\*{2}(?P<bold_text>.*?)\*{2}', re.UNICODE)

print('Text: %s' % text)
print('Bold: %s' % bold.sub(r'<b>\g<bold_text></b>', text, count=1))
653  
# 16. Splitting with patterns
#
# str.split() is one of the most commonly used ways to break a string
# apart for parsing, but it only accepts a literal value as the separator.

text = '''Paragraph one
one tuo lines.

Paragraph two.

Paragraph three.'''

print('With findall:')
# A paragraph is the shortest run of text that is followed by two or more
# newlines or by the end of the input. The alternation is grouped so that
# "$" is an alternative to the newlines only and the last paragraph is
# not lost; the group is non-capturing so findall() returns just the
# paragraph text.
paragraphs = re.findall(r'(.+?)(?:\n{2,}|$)', text, flags=re.DOTALL)
for num, para in enumerate(paragraphs):
    print('%d %s' % (num, repr(para)))
    print('')

print('')
print('With split:')
# re.split() takes the separator pattern directly.
for num, para in enumerate(re.split(r'\n{2,}', text)):
    print('%d %s' % (num, repr(para)))
    print('')