A Three-Step Recipe for requests GET Crawlers (Quick Coding): Summary

1. Background

Every time I build a GET-request crawler with Python's requests library, I end up writing it from scratch, which is too slow. I needed a way to produce the code quickly.

2. Approach

The plan: summarize the code snippets once, then pull them back, piece them together, and adapt them whenever needed. Over time, as the collection grows, it can be rolled up into a summary for a larger project.

3. Implementation

(1) Step 1: quickly build the request headers

(2) Step 2: quickly build the request itself (a requests GET request)

(3) Step 3: quickly parse the data (HTML, JSON, etc.) and save it to a file

(1) Step 1: quickly build the request headers

def denghao2maohao(cookie_str):
    # Split the cookie string into key=value pairs
    list1 = cookie_str.split(";")
    # Initialize the dict
    cookie_dict_str = {}
    for item in list1:
        list2 = item.split("=", 1)  # split on '=' only once
        dict_key = list2[0].strip()
        dict_value = list2[1].strip()
        cookie_dict_str[dict_key] = dict_value
    return cookie_dict_str
 
 
def maohao2yinhao(maohao_str):
    list1 = maohao_str.strip().splitlines()
    maohao_str_dict = {}
    for item in list1:
        if item.strip().startswith(":"):
            list2 = item.strip().split(":", 2)  # split on ':' at most twice
            new_key = list2[1]
            new_value = list2[2].strip()
            # maohao_str_dict[":" + new_key] = new_value  # keep the leading colon
            maohao_str_dict[new_key] = new_value  # drop the leading colon
            print("'%s':'%s'," % (new_key, new_value))
        else:
            list2 = item.split(":", 1)  # split on ':' only once
            new_key = list2[0].strip()
            new_value = list2[1].strip()
            maohao_str_dict[new_key] = new_value
            print("'%s':'%s'," % (new_key, new_value))  # print the formatted key-value pair
 
    return maohao_str_dict
 
 
if __name__ == '__main__':
    # # Cookie string: convert 'key=value' pairs into a dict
    # cookie_str = "ss_lang=cs; product=WGSN; ss_udid=0ed9a26e6dd6bb892c796cda69bca4a3; PHPSESSID=ci56j78njjgdqde5tjepslaah5; exclusionChecked=True; ss_token=f77dcbc5a65f43977e02b61e9d6ff947; trwv.uid=stylesight-1525165098107-fd45157e%3A2; trwsa.sid=stylesight-1525177471085-3d01fa38%3A2; _ga=GA1.2.1824486173.1525165097; _gid=GA1.2.1794994253.1525165097; cp_browStat=Logged In; cp_UserID=-1; cp_hybridBrowStat=Logged In; cp_SubStat=Subscriber"
    # # print(cookie_str)
    # cookie_dict_str = denghao2maohao(cookie_str)
    # print("======[1] Cookie: '=' pairs to dict ========")
    # print(cookie_str)
    # print()
    # print(cookie_dict_str)
 
    # Request headers: quote each 'key: value' pair and drop the leading pseudo-header colon
    maohao_str = """
    :authority:www.wgsnchina.cn
    :method:POST
    :path:/api/cherry/search/query
    :scheme:https
    accept:application/json, text/plain, */*
    accept-encoding:gzip, deflate, br
    accept-language:zh-CN,zh;q=0.9
    content-length:149
    content-type:application/json;charset=UTF-8
    cookie:ss_lang=cs; product=WGSN; ss_udid=0ed9a26e6dd6bb892c796cda69bca4a3; PHPSESSID=ci56j78njjgdqde5tjepslaah5; exclusionChecked=True; ss_token=f77dcbc5a65f43977e02b61e9d6ff947; _gat_UA-1004012-2=1; cp_SubStat=Subscriber; cp_browStat=Logged In; cp_UserID=-1; cp_hybridBrowStat=Logged In; _dc_gtm_UA-1004012-2=1; _ga=GA1.2.1824486173.1525165097; _gid=GA1.2.1794994253.1525165097; trwv.uid=stylesight-1525165098107-fd45157e%3A3; trwsa.sid=stylesight-1525179968287-e61a7bc2%3A2
    origin:https://www.wgsnchina.cn
    referer:https://www.wgsnchina.cn/library/results/ab745207e8ed3dcfa16b4814748beead
    user-agent:Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.89 Safari/537.36
    """
    print("======[2] Headers: quote the pairs, drop the leading colon ========")
    maohao_str_dict = maohao2yinhao(maohao_str)
    # print(maohao_str)
    print()
    print(maohao_str_dict)

Source: https://www.cnblogs.com/andy9468/p/8977406.html
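To see how the two helpers plug into an actual request, here is a minimal usage sketch (the raw strings are shortened placeholders, not real credentials, and it assumes both functions above are defined in the same module):

import requests

# Shortened placeholder strings copied from the browser's DevTools panel
raw_headers = """
accept:application/json, text/plain, */*
user-agent:Mozilla/5.0
"""
raw_cookie = "ss_lang=cs; product=WGSN"

headers = maohao2yinhao(raw_headers)  # colon-separated lines -> header dict
cookies = denghao2maohao(raw_cookie)  # 'k=v; k=v' string -> cookie dict

response = requests.get("https://www.wgsnchina.cn", headers=headers, cookies=cookies)
print(response.status_code)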

(2) Step 2: quickly build the request itself (a requests GET request)

import requests

url = 'https://www.baidu.com'
headers = {
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'accept-encoding': 'gzip, deflate, br',
    'accept-language': 'zh-CN,zh;q=0.9',
    'upgrade-insecure-requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/65.0.3325.181 Safari/537.36',
    # 'cookie': 'xxx;yyy;zzz',
    # 'referer': 'https://xxx.yyy.zzz'
}
 
# Send the GET request
response = requests.get(url, headers=headers, verify=True)
# Decode the response body into HTML text
html_data = response.content.decode()
print(html_data)
print(len(html_data))

Source: https://www.cnblogs.com/andy9468/p/11492910.html
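In practice it is worth hardening this call a little. Below is a minimal sketch with a timeout, a status check, and charset detection; these additions are my own habits, not part of the original snippet:

import requests

url = 'https://www.baidu.com'
headers = {'User-Agent': 'Mozilla/5.0'}

try:
    # Fail fast instead of hanging on a dead host
    response = requests.get(url, headers=headers, timeout=10)
    # Raise for 4xx/5xx instead of silently parsing an error page
    response.raise_for_status()
    # Let requests guess the charset from the body when the header lies
    response.encoding = response.apparent_encoding
    print(response.text[:200])
except requests.RequestException as exc:
    print("request failed:", exc)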

(3) Step 3: quickly parse the data (HTML, JSON, etc.) and save it to a file

1. HTML parsing

from lxml import etree

# Extract all meaningful body text
html_str = """<div>hah<a>六六六</a>cccc收拾收拾</div>"""
html_etree = etree.HTML(html_str)  # parse into an lxml Element tree
all_content = html_etree.xpath('string(.)').strip()
print(all_content)

Source: https://www.cnblogs.com/andy9468/p/10144867.html

XPath syntax:

https://www.cnblogs.com/andy9468/p/10144867.html
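For quick reference, a few common XPath patterns applied to a snippet like the one above (a minimal sketch; the href attribute is added purely for illustration):

from lxml import etree

html_str = """<div>hah<a href="/a">六六六</a>cccc收拾收拾</div>"""
tree = etree.HTML(html_str)

print(tree.xpath('//a/text()'))     # text of every <a>: ['六六六']
print(tree.xpath('//a/@href'))      # attribute values: ['/a']
print(tree.xpath('//div//text()'))  # all text nodes under <div>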

HTML parser class wrapper:

https://www.cnblogs.com/andy9468/p/8060372.html
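The linked post wraps the parsing logic into a class. As a rough idea of the shape such a wrapper can take (my own minimal sketch, not the code from that post):

from lxml import etree


class HtmlParser:
    """Thin wrapper around lxml.etree for common crawler queries."""

    def __init__(self, html_str):
        self.tree = etree.HTML(html_str)

    def text(self):
        # All meaningful body text, whitespace-trimmed
        return self.tree.xpath('string(.)').strip()

    def select(self, xpath_expr):
        # Raw XPath query; always returns a list
        return self.tree.xpath(xpath_expr)


parser = HtmlParser("<div>hah<a>六六六</a></div>")
print(parser.text())
print(parser.select('//a/text()'))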

2. JSON parsing

Converting a JSON string into a dict:

import json

dict_data = json.loads(json_data)  # json_data is a JSON-formatted string

Source: https://www.cnblogs.com/andy9468/p/8252897.html
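A runnable round trip for reference (a small sketch; the sample payload is made up):

import json

json_data = '{"name": "andy", "tags": ["python", "requests"]}'
dict_data = json.loads(json_data)  # JSON string -> dict
print(dict_data["name"])

# dict -> JSON string; ensure_ascii=False keeps non-ASCII characters readable
print(json.dumps(dict_data, ensure_ascii=False))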

3. Saving data to a plain text file

Writing a file with the with statement:

save_file = "1.txt"
str_data = "123a\nbc"  # '\n' replaces the literal line break, which would be a SyntaxError
with open(save_file, 'a', encoding="utf-8") as f:
    f.write(str_data)

Source: https://www.cnblogs.com/andy9468/p/11493062.html
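When a crawler appends one record per loop iteration, I add the newline explicitly and read the file back to verify (a minimal sketch):

save_file = "1.txt"
records = ["123a", "bc"]

# Open once and write one record per line
with open(save_file, 'a', encoding="utf-8") as f:
    for record in records:
        f.write(record + "\n")

# Read the file back to check the result
with open(save_file, 'r', encoding="utf-8") as f:
    print(f.read())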

4. Saving data to an Excel spreadsheet

Example code for writing an Excel spreadsheet:

from openpyxl import Workbook
 
 
def main():
    sheet_name = "Sheet1"
    row_count = 6  # number of rows
    info_result = []
    page = 1
    while page <= row_count:
        info = ['a', 'b', 'c']  # contents of each row
        info_result.append(info)
        page += 1
    # Write to the Excel spreadsheet
    wb = Workbook()
    ws1 = wb.active
    ws1.title = sheet_name  # sheet title
    for row in info_result:
        ws1.append(row)
    wb.save('lagou_job_info.xlsx')  # openpyxl writes the .xlsx format, not .xls
 
 
if __name__ == '__main__':
    main()

Source: https://www.cnblogs.com/andy9468/p/10999135.html
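To verify the output, the workbook can be read back with openpyxl's load_workbook (a small sketch; the file name matches the example above):

from openpyxl import load_workbook

wb = load_workbook('lagou_job_info.xlsx')
ws = wb.active
for row in ws.iter_rows(values_only=True):
    print(row)  # each row comes back as a tuple, e.g. ('a', 'b', 'c')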

Original post: https://www.cnblogs.com/andy9468/p/12666866.html