Spider--补充--Re模块

Spider--补充--Re模块_2

# @ Author : Collin_PXY

# Python 正则表达式的应用(二)

# 正则表达式之所以让人头疼，很大程度是因为表达式里有大量的符号及它们的组合，还有很多匹配模式，想要记住比较困难。
# 总结一下，方便用到的时候查询。

# 1--特殊字符表：
# \d       0~9 的数字字符
# \D       除 0~9 数字字符以外的其他字符
# \s       空白字符：空格、Tab键、换行、回车、换页字符
# \S       除了空白字符（空格、Tab键、换行、回车、换页）之外的字符
# \w       数字、大小写字母和下划线 _ 字符，即：[A-Za-z0-9_]
# \W       除了数字、大小写字母和下划线 _ 之外的字符，即：[^A-Za-z0-9_]

# 速记：小写： \d--数字  \s--空白字符  \w--数字、大小写字母和 _
#       大写： 与对应小写互补的字符集合（\D、\S、\W）。

# 2--在上面的字符后面添加 + 表示不限长度（至少1次），添加 * 表示不限长度（至少0次），添加 ? 表示可有可无（至多1次）
# 比如：
# \d+       表示不限长度的数字（至少1次）
# \w+       表示不限长度的数字、大小写字母和下划线 _ 字符（至少1次）
# \w*       表示不限长度的数字、大小写字母和下划线 _ 字符（至少0次）
# \w?       表示数字、大小写字母和下划线 _ 字符 可有可无 （至多1次）

# '.' 单字符通配符，'.*' 任意数量字符通配符（'.' 默认不匹配换行符，Python 的 re 模块提供参数 re.DOTALL 使 '.' 也能匹配换行符）
# '?'（至多1次）、'*'（至少0次）、'+'（至少1次）数量修饰符
# '^'、'$' 位置修饰符（eg, pattern = '^John', pattern = 'John$'）；[^] 排除修饰符，表示不在方括号所列的字符之中。




# 示例：
import re
# Test 1 -- split a sentence into its individual words.
msg = 'John, Johnson, Johnnason and Johnnathan will attend my party tonight.'
# Raw string so the backslash reaches the regex engine: \w+ means
# "one or more word characters".  Without the backslash the pattern would
# only match runs of the literal letter "w".
pattern = r'\w+'
txt = re.findall(pattern, msg)     # list of all non-overlapping matches
print(txt)

# ['John', 'Johnson', 'Johnnason', 'and', 'Johnnathan', 'will', 'attend', 'my', 'party', 'tonight']
# "," and the space are not matched by \w, so they act as separators.


# Test 2 -- extract every word that starts with "John".
msg = 'John, Johnson, Johnnason and Johnnathan will attend my party tonight.'
# "John" followed by zero or more word characters; the raw string keeps
# the backslash of \w intact.
pattern = r'John\w*'
txt = re.findall(pattern, msg)     # list of all non-overlapping matches
print(txt)
# ['John', 'Johnson', 'Johnnason', 'Johnnathan']
# "," is not matched by \w, so the first match stops in front of the comma.


# Test 3 -- extract "John" plus at most one following word character.
msg = 'John, Johnson, Johnnason and Johnnathan will attend my party tonight.'
# \w? makes the trailing word character optional (zero or one occurrence).
pattern = r'John\w?'
txt = re.findall(pattern, msg)     # list of all non-overlapping matches
print(txt)
# ['John', 'Johns', 'Johnn', 'Johnn']



# 3--字符分类[]
# [a-z]          表示a-z的小写字符
# [A-Z]          表示A-Z的大写字符
# [aeiouAEIOU]   元音字符
# [2-6]          代表2~6的数字


# 4--字符分类[^]
# [^a-d]          表示不在 a-d 范围内的任意字符
# [^A-D]          表示不在 A-D 范围内的任意字符
# [^aeiouAEIOU]   表示不是元音字符的字符
# [^2-6]          表示不在 2~6 范围内的任意字符（不限于数字）


# 5--正则表示法的 ^ 符号
# 表示目标字符串必须出现在被搜索字符串的起始位置；否则 search() 返回 None，findall() 返回空列表 []。

import re
# Test 1 -- "John" sits at the very beginning of the string.
msg = 'John will attend my party tonight.'
pattern = '^John'
txt = re.compile(pattern).search(msg)
print(txt.group())                      # John
txt = re.compile(pattern).findall(msg)
print(txt)                              # ['John']


# Test 2 -- "John" is present but not at the beginning of the string.
import re
msg = 'My best friend is John'
pattern = '^John'
txt = re.compile(pattern).search(msg)
# search() returns None here.  Calling .group() on None raises an error,
# so guard with a None check before using the result.
print(txt)                              # None
txt = re.compile(pattern).findall(msg)
print(txt)                              # []

# Test 3 -- is the first character a digit?  (character-class form)
msg = '21 John will attend my party 28 tonight.'
pattern = '^[0-9]'
txt = re.compile(pattern).findall(msg)  # only the leading digit can match
print(txt)                              # ['2']

# Test 4 -- is the first character a digit?  (\d form)
msg = '21 John will attend my party 28 tonight.'
# \d is the digit class; the raw string preserves the backslash.  A bare
# 'd' would look for the literal letter "d" instead.
pattern = r'^\d'
txt = re.findall(pattern, msg)     # only the leading digit can match
print(txt)   # ['2']


# 6--正则表示法的 $ 字符
# 与 ^相反，正则表示法(包括分类字符的外面)的末端放置$字符时，表示正则表示法的字符串必须出现在被搜索的字符串的最后位置。

import re
# Test 1 -- check that "John" is at the very end of the string.
msg = 'My best friend is John'
pattern = 'John$'
txt = re.search(pattern, msg)      # Match object, or None when not found
if txt is not None:                # identity test is the idiomatic None check
    print(txt.group())             # John
txt = re.findall(pattern, msg)
print(txt)                         # ['John']

# Test 2 -- is the last character something other than a letter, digit
# or underscore?
msg = 'John will attend my party 28 tonight.'
# \W is the complement of \w; the raw string preserves the backslash.
pattern = r'\W$'
txt = re.findall(pattern, msg)
print(txt)   # ['.']

# Test 3 -- is the last character something other than a letter, digit
# or underscore?  Here the string ends with a digit, so nothing matches.
msg = 'I am 28'
# \W is the complement of \w; the raw string preserves the backslash.
pattern = r'\W$'
txt = re.findall(pattern, msg)
print(txt)  # []

# Test 4 -- is the last character a digit?
msg = 'I am 28'
# \d is the digit class; a bare 'd' would match the literal letter "d".
pattern = r'\d$'
txt = re.findall(pattern, msg)
print(txt)  # ['8']

# Test 5 -- is the last character a digit?  The string ends with ".",
# so nothing matches.
msg = 'I am 28 year old.'
# \d is the digit class; a bare 'd' would match the literal letter "d".
pattern = r'\d$'
txt = re.findall(pattern, msg)
print(txt)  # []

# Test 6 -- is the last character a digit?  (character-class form)
# The string ends with ".", so the result is empty.
msg = 'I am 28 year old.'
pattern = '[0-9]$'
txt = re.compile(pattern).findall(msg)
print(txt)  # []

# Test 7 -- is the last character a digit?  (character-class form)
msg = 'I am 28 year old 30'
pattern = '[0-9]$'
txt = re.compile(pattern).findall(msg)
print(txt)  # ['0']



# 8 -- the "." wildcard matches any single character
import re
msg = 'cat hat sat at matter flat'
pattern = '.at'
txt = re.search(pattern, msg)      # search(): first match only
if txt is not None:                # identity test is the idiomatic None check
    print(txt.group())             # cat

txt = re.findall(pattern, msg)     # findall(): every non-overlapping match
print(txt)  # ['cat', 'hat', 'sat', ' at', 'mat', 'lat']

# To search for a literal "." instead of using it as a wildcard, escape it
# with a backslash: '\.'
import re
msg = 'cat hat s.at at m.atter flat'
# Raw string so the backslash reaches the regex engine; an unescaped '.at'
# would also match 'cat', 'hat', ' at' and 'lat'.
pattern = r'\.at'
txt = re.search(pattern, msg)      # search(): first match only
if txt is not None:
    print(txt.group())             # .at

txt = re.findall(pattern, msg)     # findall(): every non-overlapping match
print(txt)    # ['.at', '.at']


# 9 -- the ".*" wildcard matches any number of arbitrary characters
# (by default "." does not match a newline, so a match never crosses one)

# Example 1:
import re
msg = 'cat hat sat at matter flat'
pattern = 'h.* at'
txt = re.search(pattern, msg)      # search(): first match only
if txt is not None:                # identity test is the idiomatic None check
    print(txt.group())             # hat sat at


# Example 2: capture two fields with parenthesised groups.
import re
msg = 'Name: Jiin-Kwei Hung Address: 8F, Nan-Jing E. Rd, Taipei'
pattern = 'Name: (.*) Address: (.*)'
txt = re.compile(pattern).search(msg)
# Pull each captured group out by its index.
Name = txt.group(1)
Address = txt.group(2)
print("Name:    ", Name)
print("Address: ", Address)
# To search for a literal ".*" rather than using it as a wildcard, escape
# both characters with backslashes: '\.\*'


# 综合示例：
# excel里有一列储存了用户的 qq邮箱账号，而我们想要得到的是 qq号放在一个新的列中，我们可以使用 pandas和正则表达式：
# step 1:读取 excel 获取列数据 （这里省略，这里使用pandas 手动建立一个 存有邮箱账号的 DataFrame）:

import pandas as pd
from pandas import DataFrame

# Hand-built frame standing in for the e-mail column read from Excel.
df = DataFrame(
    {
        'email': [
            '1234567@qq.com',
            '2234567@qq.com',
            '3234567@qq.com',
            '4234567@qq.com',
        ],
    }
)

# Series.str.findall applies the regex to every element, so the re module
# does not have to be called directly.  Each element yields a list of the
# captured text in front of "@"; .str.get(0) keeps the first entry.
qq_matches = df['email'].str.findall(pat='(.*?)@')
df['qq'] = qq_matches.str.get(0)
df
# step 2: 将 df存储在 excel文件中。（略）




# 10--换行字符的处理
# 使用'.*'搜索时遇到换行符就结束了，Python的 re 模块提供参数 re.DOTALL,功能是包括搜索换行字符，可以将此参数放在
# compile(),search(),findall()里。

# Example:
import re
# Test 1 -- by default "." matches every character except the newline.
# The line break is written as the \n escape; a single-quoted string
# literal cannot span two physical lines.
msg = 'Name: Jiin-Kwei Hung \nAddress: 8F'
pattern = '.*'
txt = re.search(pattern, msg)      # match stops in front of the newline
print(txt.group())  # Name: Jiin-Kwei Hung

# Test 2 -- with re.DOTALL the "." also matches the newline character.
# The line break is written as the \n escape; a single-quoted string
# literal cannot span two physical lines.
msg = 'Name: Jiin-Kwei Hung \nAddress: 8F'
pattern = '.*'
txt = re.search(pattern, msg, re.DOTALL)   # match runs across the newline
print(txt.group())

# Output:
# Name: Jiin-Kwei Hung
# Address: 8F


# 11--再来看 MatchObject对象：

# 11-1)--解读 MatchObject对象：
# <re.Match object; span=(0, 4), match='John'>
# span=(0,4)表示起始索引位置是0，结束索引位置是4

# 11-2)--获取 MatchObject对象 ： re.match()和re.search()方法
# re.match()和re.search()都可以生成MatchObject对象。
# re.match()只搜寻字符串开始的字符，若匹配则传回MatchObject对象，若失败则传回None.

# 11-3)--MatchObject对象的几个重要方法：
# MatchObject.group()    传回搜寻到的字符串
# MatchObject.start()    传回搜寻到的字符串的起始位置
# MatchObject.end()      传回搜寻到的字符串的结束位置
# MatchObject.span()     传回搜寻到的字符串的(起始，结束）位置元组


# 12--re.sub(pattern,newstr,msg)方法
# 用于搜寻并替代给定字符串里的目标字符串，但并不会改变原来的字符串，会返回一个替代过的新的字符串。（因为字符串是不可变的）
# 成功时，返回新的字符串，失败时将旧的字符串返回给变量：

# Example 1: basic re.sub() usage -- replace every occurrence of the
# pattern in the string.
import re
# Test 1 -- a replacement that succeeds.
msg = 'Eli Nan will attend my party tonight. My best friend is Eli Nan'
pattern = 'Eli Nan'
newstr = 'Kevin Thomson'            # replacement text
txt = re.compile(pattern).sub(newstr, msg)
# sub() returns the input unchanged when the pattern is not found, so an
# identical result means nothing was replaced.
if txt == msg:
    print("取代失败: ", txt)
else:
    print("取代成功: ", txt)


# 示例2  re.sub()的进阶应用--取取代匹配成功的所有字符串中的 指定分组内的内容：

# Test 1: parentheses in the pattern, a numbered backreference in newstr.
import re
# Mask each agent name, keeping only part of it.
msg = 'CIA Mark told CIA Linda that secret USB had given to CIA Peter.'
# "CIA ", then the first letter of the name captured as group 1, then the
# rest of the name (\w* stops at the first non-word character).
pattern01 = r'CIA (\w)\w*'
# \1 inserts the text captured by group 1; the whole match is replaced by
# that letter followed by "***".
newstr = r'\1***'
txt = re.sub(pattern01, newstr, msg)
print("取代成功: ", txt)

# \1 stands for the content of the first group in pattern01.
# M*** told L*** that secret USB had given to P***.

# Test 2: two groups in the pattern, backreference to the second one.
import re
# Mask each agent name, keeping only part of it.
msg = 'CIA Mark told CIA Linda that secret USB had given to CIA Peter.'
# "CIA ", then the first and second letters of the name captured as
# groups 1 and 2, then the rest of the name (\w* stops at the first
# non-word character).
pattern02 = r'CIA (\w)(\w)\w*'
# \2 inserts the text captured by group 2; the whole match is replaced by
# that letter followed by "***".
newstr = r'\2***'
txt = re.sub(pattern02, newstr, msg)
print("取代成功: ", txt)

# \2 stands for the content of the second group in pattern02.
# a*** told i*** that secret USB had given to e***.
参考资料：《Python 王者归来》洪锦魁