python常用语法

详情参考：https://www.runoob.com/python/python-tutorial.html

'''python抓取数据方式>>>开始'''

# 第一种：response 获取

data = response.text

# 第二种：requests 获取

data = requests.get(link)

data = data.text

# 第三种：urlopen 获取

data = urlopen(link).read()

# Beautiful Soup自动将输入文档转换为Unicode编码，输出文档转换为utf-8编码

data = BeautifulSoup(data, "html.parser")

# 第四种：xpath 解析获取

data = response.xpath('//div[@id="endText"]').get()

# Beautiful Soup自动将输入文档转换为Unicode编码，输出文档转换为utf-8编码

data = BeautifulSoup(data, 'html.parser')

print(data)

'''python抓取数据方式>>>结束'''

字符串是否包含

if 'ce' in nice:

去除第一个字符

nice = nice[1:]

去除最后一个字符

nice = nice[:-1]

去除字符串左边的空白字符（lstrip() 返回新字符串，不会修改原字符串，需要重新赋值，例如 nice = nice.lstrip()）

nice.lstrip()

去除字符串右边的空白字符（rstrip() 返回新字符串，不会修改原字符串，需要重新赋值，例如 nice = nice.rstrip()）

nice.rstrip()

数组的长度

length = len(array)

nice转字符串:

nice = ''.join(nice)

或者

nice = repr(nice)

nice转json:

json.loads(s)：将 JSON 格式的字符串解码为 Python 对象（dict、list 等）

json.load(f)：从已打开的文件对象中读取 JSON 内容并解码为 Python 对象

循环遍历:

for str in list:

print(str)

if 'nice' in str:

continue

break

替换字符串中的反斜杠

str = eval(repr(str).replace('\\', '@'))

字符串str转换成int

int_value = int(str_value)

int转换成字符串

str_value = str(int_value)

分割字符串

nice.rsplit(",")

# 解决urlopen乱码开始

typeEncode = sys.getfilesystemencoding() ##系统默认编码

infoencode = chardet.detect(html).get('encoding', 'utf-8') ##通过第3方模块来自动提取网页的编码

html = html.decode(infoencode, 'ignore').encode(typeEncode) ##先转换成unicode编码，然后转换系统编码输出

# 解决urlopen乱码结束

soup = BeautifulSoup(html, "html.parser")

scripts = soup.select("script") # CSS 选择器

延时抓取

import time

time.sleep(3)

时间戳转时间格式(时间戳的长度为10位才可以,否则会报此异常:OSError: [Errno 22] Invalid argument)

timeArray = time.localtime(upload_time)

upload_time = time.strftime("%Y-%m-%d %H:%M:%S", timeArray)

# 若时间戳的长度为13位,则需要变成10位的

# timeArray = time.localtime(int(upload_time/1000))

#多个if判断

if 'pic-group clear' == divclass:

    print('4张图')

elif 'pic img-do left' == divclass:

    print('1张图')

else:

    print('无图')

#python爬虫去除网页中的script结构

import re

clear = re.compile(r'<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I)

content = clear.sub("", content)

# 去除id="content-ad"的div（注意：在 re.S 模式下 (.*) 是贪婪匹配，会一直匹配到文档中最后一个 </div>；如果只想匹配到最近的 </div>，应改用 (.*?)）

clear = re.compile(r'<div id="content-ad">(.*)</div>', re.S)

content = clear.sub("", content)