Python 正则笔记

import re

"""
匹配单个字符
.       匹配任意1个字符（除了\n）
[ ]     匹配[ ]中列举的字符
\d      匹配数字，即0-9
\D      匹配非数字，即不是数字
\s      匹配空白，即 空格，tab键
\S      匹配非空白
\w      匹配非特殊字符，即a-z、A-Z、0-9、_、汉字
\W      匹配特殊字符，即非字母、非数字、非汉字
[\u4e00-\u9fa5] 匹配汉字

match 只从字符串开头开始匹配；search 会扫描整个字符串，返回第一处匹配
match/search 匹配失败,结果返回None
"""


def match_func(pattern, string):
    """Match ``pattern`` at the start of ``string`` and print the outcome.

    Prints the matched text when ``re.match`` succeeds. When there is no
    match, ``re.match`` returns ``None``; the resulting ``AttributeError``
    message ("'NoneType' object has no attribute 'group'") is printed
    instead, which is the failure text quoted throughout these notes.

    Args:
        pattern: Regular expression handed to ``re.match``.
        string: Text to match, starting at its first character.

    Returns:
        The text matched by the whole pattern, or ``None`` when the pattern
        does not match.
    """
    res = re.match(pattern, string)
    try:
        matched = res.group()
    except AttributeError as e:
        # Only the "no match" case (res is None) is handled here; any other
        # error is a genuine bug and is allowed to propagate.
        print(e)
        return None
    print(matched)
    return matched

# Match a complete <a> tag: \s+ / \s* allow flexible whitespace around
# "href" and "=", and the two groups capture the link target and the text.
# The patterns are raw strings so the regex escapes reach the re module intact.
match_func(r'<a\s+href\s*=\s*""?([^"" >]+)""?>(.+)</a>', '<a href="javascript:void(0);">发现</a>')  # <a href="javascript:void(0);">发现</a>
match_func("t.+o", "qt0o")  # 'NoneType' object has no attribute 'group'
match_func("t.+o", "t0o")   # t0o
# Match Chinese characters: [\u4e00-\u9fa5] is the range U+4E00..U+9FA5.
match_func(r"t.+o[\u4e00-\u9fa5]+", 't0o武士se精神sw')    # t0o武士


def search_func(pattern, string):
    """Search ``string`` for the first match of ``pattern`` and print it.

    Unlike ``re.match``, ``re.search`` scans the whole string, so the match
    may start at any position. When nothing matches, ``re.search`` returns
    ``None``; the resulting ``AttributeError`` message ("'NoneType' object
    has no attribute 'group'") is printed instead, which is the failure text
    quoted throughout these notes.

    Args:
        pattern: Regular expression handed to ``re.search``.
        string: Text to scan.

    Returns:
        The text of the first match, or ``None`` when nothing matches.
    """
    res = re.search(pattern, string)
    try:
        found = res.group()
    except AttributeError as e:
        # Only the "no match" case (res is None) is handled here; any other
        # error is a genuine bug and is allowed to propagate.
        print(e)
        return None
    print(found)
    return found


# search() may start matching in the middle of the string, so the leading
# "q" does not prevent a match here.
search_func(pattern="t.+o", string="qt0o")  # t0o
# No match: the text ends right after "o", so nothing can follow it.
# NOTE(review): "os" may originally have been "o\s" (a lost backslash);
# the printed result is the same either way - confirm against the source.
search_func(pattern="t.+os", string="t0o")  # 'NoneType' object has no attribute 'group'

"""
匹配多个字符
*       匹配前一个字符出现0次或者无限次，即可有可无
+       匹配前一个字符出现1次或者无限次，即至少有1次
?       匹配前一个字符出现1次或者0次，即要么有1次，要么没有
{m}     匹配前一个字符出现m次
{m,n}   匹配前一个字符出现从m到n次
"""

# "t{2,}" needs at least two consecutive "t"; "s?" makes the final "s" optional.
match_func(pattern="ht{2,}ps?", string="httttps")  # httttps

"""
匹配开头和结尾
^       匹配字符串开头
$       匹配字符串结尾
"""

# Match an e-mail address: 4-20 word characters followed by "@163.com".
# The dot is escaped so it only matches a literal "."; "$" anchors the end.
match_func(r"[a-zA-Z0-9_]{4,20}@163\.com$", "hello@163.com")  # hello@163.com

# Match a mobile number: "1", one digit in 3-9, eight more digits, and a
# final digit that is neither 4 nor 7.
match_func(r"1[3-9]\d{8}[0-35689]$", "13666777888")  # 13666777888


# ".*" is greedy: it consumes as much as it can and only backtracks far
# enough to reach the LAST "#", so the "#" in the middle is swallowed too.
match_func("#.*#", "#幸福是奋斗#出来的#山东省")    # #幸福是奋斗#出来的#
# "[^#]*" matches a run of characters that contains no "#", so the match
# stops at the first closing "#".
match_func("#[^#]*#", "#幸福是奋斗#出来的#扫地")  # #幸福是奋斗#


num = ["12", "2", "3", "4", "5", "65"]
for i in num:
    # "|" matches either the expression on its left or the one on its right;
    # (2|4|6)$ succeeds when the string ends in 2, 4 or 6.
    match_func(".*(2|4|6)$", i)  # "12", "2" and "4" match; "3", "5", "65" fail


match_func(".*(2d|4|6)", "ad12d3f4567")  # ad12d3f456
# Named `names` rather than `list` so the builtin list type is not shadowed
# for the rest of the module.
names = ['黄蓉', '刘亦菲', '黄月英', '蓉祖儿', '王菲', '月华']
for i in names:
    # Match names that contain '蓉' or end with '菲'.
    match_func(".*蓉.*|.*菲$", i)  # '黄月英' and '月华' fail; the rest match

# Expected match for both calls: <html><div>hh</div></html>
# \2 and \1 are backreferences to the second and first capture groups, a
# shorthand for "the same tag name again". The patterns must be raw strings:
# in a normal string literal "\2" and "\1" are octal escapes (chr(2) and
# chr(1)), so the regex engine would never see the backreferences.
match_func(r"<([a-zA-Z1-6]+)><([a-zA-Z1-6]+)>.*</\2></\1>", "<html><div>hh</div></html><p>都是对的读书")
search_func(r"<([a-zA-Z1-6]+)><([a-zA-Z1-6]+)>.*</\2></\1>", "<Meta>标题:人民日报:今日快报>..<html><div>hh</div></html><p>都是对的读书...")


# search() returns only the first run of digits.
search_func(r"\d+", "拍摄时10，方法12，收发22")    # 10
# ".*?" is lazy: it consumes as little as possible before the first digits.
match_func(r".*?(\d+)", "拍摄时10，方法12，收发22")    # 拍摄时10

# NOTE: findall returns every non-overlapping match as a list of strings
# (an empty list when nothing matches).
result = re.findall(r"\d+", "苹果10个，橘子12个，一共22个")
if result:
    print(result)   # ['10', '12', '22']
else:
    print("匹配失败")


# NOTE: sub/subn replace matches from left to right; count limits how many
# are replaced (the default 0 means "replace all").
# subn is used because it also reports how many replacements were made:
# sub alone returns the unchanged input when nothing matches, so the
# truthiness of its result cannot signal a failed match.
result, replaced = re.subn(r"\d+", "15", "苹果10个，橘子12个，一共22个", count=1)
if replaced:
    print(result)   # 苹果15个，橘子12个，一共22个
else:
    print("匹配失败")


def replace(obj):
    """Return the number matched by ``obj`` increased by 100, as a string.

    Intended as the replacement callback for ``re.sub``: ``obj`` is the match
    object for one match, and its whole matched text must parse as an integer.
    """
    return str(int(obj.group()) + 100)


# NOTE: the replacement may be a function; it is called with each match
# object and its return value is substituted for that match.
# Here every number in the text is increased by 100.
# subn is used because it also reports how many replacements were made:
# sub alone returns the unchanged input when nothing matches, so the
# truthiness of its result cannot signal a failed match.
result, replaced = re.subn(r"\d+", replace, "苹果10个，橘子12个，一共22个")
if replaced:
    print(result)   # 苹果110个，橘子112个，一共122个
else:
    print("匹配失败")


# NOTE: split cuts the string at every match of the pattern; here the
# separators are ":", "," or ";".
mystr = "a:b,e;c:d"
result = re.split(":|,|;", mystr)
# split() always returns at least one element (the whole string when no
# separator is found), so there is no failure case to report.
print(result)   # ['a', 'b', 'e', 'c', 'd']

# JSONP-style payload ending in a newline. The variable is deliberately not
# called `str`: rebinding that name would shadow the builtin for the rest of
# the module and break any later call such as str(...).
jsonp_text = 'callback( {"client_id":"YOUR_APPID","openid":"YOUR_OPENID"} )\n'
# \( and \) match the literal parentheses, \W the space after "(", and the
# capture group takes everything up to the whitespace before ")".
result = re.search(r"\(\W(.*)\s+\)", jsonp_text)
if result:
    # group() returns the text matched by the whole pattern.
    print(result.group())  # ( {"client_id":"YOUR_APPID","openid":"YOUR_OPENID"} )
    # groups() returns a tuple with the text of every capture group.
    print(result.groups())  # ('{"client_id":"YOUR_APPID","openid":"YOUR_OPENID"}',)
    # group(n) returns the text of the n-th capture group only.
    print(result.group(1))  # {"client_id":"YOUR_APPID","openid":"YOUR_OPENID"}
else:
    print("匹配失败")
other

关于正则表达式语法,可参考下面两篇文章
最全的常用正则表达式大全
一文掌握开发利器：正则表达式
<人追求理想之时,便是坠入孤独之际.> By 史泰龙