正则表达式补充

# 2.正则

# 方法：findall | match | split | sub
# a = 10
# print(a.__hash__())
# def fn():
# pass
# print(fn.__name__)
# import json
# print(json.dumps([1,2,3]))
import re

# findall scans the whole string and returns every match as a list.
# The lazy quantifier in `\d*?` prefers the empty match, so most hits are ''.
# The pattern must be a raw string with `\d`; a bare 'd' would match the letter d.
res = re.findall(r'\d*?', 'd1')  # Python < 3.7: ['', '', ''] | Python >= 3.7: ['', '', '1', '']
print(res)

# Typical use of lazy (non-greedy) matching: combine it with a fixed
# start and/or end delimiter.
# Greedy: `.*` runs to the LAST '>' in the string.
res = re.findall(r'<.*>', '<a>abc</a>')
print(res)  # ['<a>abc</a>']
# Lazy: `.*?` stops at the FIRST '>' after each '<'.
res = re.findall(r'<.*?>', '<a>abc</a>')
print(res)  # ['<a>', '</a>']
# `\w*?>`: the shortest run of word characters that is followed by '>'.
res = re.findall(r'\w*?>', '<a>abc</a>')
print(res)  # ['a>', 'a>']

# Grouping: wrap part of the pattern in () to pull that piece out of each match.
# With more than one group, findall returns one tuple per match, groups
# numbered by the position of their opening parenthesis.
res = re.findall(r'((\w*?)>)', '<a>abc</a>')
print(res)  # [('a>', 'a'), ('a>', 'a')]

# Working with groups
# (?P<name>...): named group
# re.match returns a match object (or None when the start of the string does
# not match); call .group(index | name) on it to read one specific group.
# `\d` is required here: with a literal 'd' nothing matches and .group() fails on None.
res = re.match(r'(\d{3})(?P<center>\d{3})(\d{3})', '123456789')
print(res.group('center'))  # 456

# re.compile takes an ordinary string written in regex syntax and returns a
# compiled pattern object, on which the regex methods (findall, match, ...)
# can be called directly.
r = re.compile(r'(\w*?)>')
print(r.findall('<a>abc</a>'))  # ['a', 'a']

# res = re.findall(r'<([a-z]{1,3})>(\w*?)</[a-z]{1,3}>', '<a>abc</a><b>123</b>')
# print(res) # [('a', 'abc'), ('b', '123')]

# re.sub with backreferences: \1, \2, \3 in the replacement stand for the
# captured groups, so r'\2\1\3' swaps the first two 3-digit groups.
# Both pattern and replacement are raw strings so the backslashes reach `re` intact.
res = re.sub(r'(\d{3})(?P<center>\d{3})(\d{3})', r'\2\1\3', '<123456789>')
print(res)  # <456123789>

a|b == [ab]
[^msg]: 匹配任意一个不是 m、s、g 的字符（对字符集 [msg] 取反）

*
+
?

*?
+?
??
'''
import re
# Each pair is (pattern, text to scan); the matches for each pair are printed in order.
#   a[a-z]* : an 'a' followed by any run of lowercase letters
#   [^ab]   : any single character other than 'a' or 'b' (spaces included)
for pattern, text in (
    (r'a[a-z]*', 'a ab c abc def ab'),
    (r'[^ab]', ' aab c abc def ab'),
):
    print(re.findall(pattern, text))

# 身份证：18位
# 200000200808081111
# [1-7][0-9]{5}
# (?:19[0-9]{2}|20[01][0-9]): 1900 - 2019
# (?:0[1-9]|1[0-2]): 01-12
# (?:0[1-9]|[12][0-9]|3[01]) : 01-31
# [0-9]{3}
# [0-9Xx]
# [1-7][0-9]{5}(?:19[0-9]{2}|20[01][0-9])(?:0[1-9]|1[0-2])(?:0[1-9]|[12][0-9]|3[01])[0-9]{3}[0-9Xx]

# 邮箱
# 30000000@qq.com
# [a-zA-Z0-9]\w{,15}@(?:176|178|192)\.(?:com\.cn|com|cn)

# 获取百度首页图片地址
import requests
# Download the Baidu home page. An explicit timeout is passed because
# requests otherwise waits indefinitely when the server never responds.
responds = requests.get('https://www.baidu.com/', timeout=10)
# print(responds.text)
content = responds.text
# Sample of the kind of address being looked for: www.baidu.com/img/gs.gif
# Append one known image URL to the page text (presumably so the search
# below always has something to find -- confirm intent).
content += 'http://www.baidu.com/wimg/gs.gif'
# res_list = re.findall('www[^w]*?(?:gif|png)', content)
# 'www', then 1-30 characters of any kind except newline (greedy), then 'gif' or 'png'.
res_list = re.findall(r'www.{1,30}(?:gif|png)', content)
# print(len(res_list))
print(res_list)