Web Scraping Examples

Scraping job postings from Lagou

Lagou: https://www.lagou.com/

Suppose we want to search for Python positions.

previous_url (the search page you open in the browser):

https://www.lagou.com/jobs/list_python/p-city_3?&cl=false&fromSearch=true&labelWords=&suginput=

craw_url (the Ajax endpoint that actually returns the job data as JSON):

https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false

import requests
import json
import time

previous_url = "https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput="
craw_url = "https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false"

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'referer': 'https://www.lagou.com/jobs/list_python/p-city_3?&cl=false&fromSearch=true&labelWords=&suginput=',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}



# Create a session so cookies persist across requests
s = requests.Session()
# Visit the search page first to pick up its cookies
s.get(previous_url, headers=header, timeout=3)
# The cookies obtained here stay in the session and are sent automatically
cookie = s.cookies
# Fetch the job listings page by page
for i in range(1,16):
    payload = {
        'first': 'true',
        'pn': str(i),        # page number
        'kd': 'python',      # search keyword
    }
    res = s.post(craw_url, data=payload, headers=header, timeout=5).text
    recruit = json.loads(res)
    print(recruit)
    position_info = recruit.get('content').get('positionResult').get('result')
    with open('position.txt', mode='ab+') as fw:
        fw.write(json.dumps(position_info, ensure_ascii=False).encode('utf-8'))
        fw.write('\n'.encode('utf-8'))
    time.sleep(20)
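
The loop above writes each page's raw result list to position.txt. If only a few fields are of interest, they can be picked out first. A minimal sketch; the field names ('positionName', 'companyFullName', 'salary') are assumptions about Lagou's JSON and may need adjusting:

def slim_down(position_info):
    # Keep only a few fields from one page of results; the field names below
    # are assumptions about Lagou's JSON and may need adjusting.
    return [
        {
            'position': p.get('positionName'),
            'company': p.get('companyFullName'),
            'salary': p.get('salary'),
        }
        for p in position_info
    ]

# usage inside the loop above: print(slim_down(position_info))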

When scraping the job listings, the request must carry the cookies from the search page; that is why the script creates a Session and visits previous_url before posting to craw_url.
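
If you prefer not to rely on a Session, the same effect can be had by fetching the search page once and passing its cookies explicitly. A minimal sketch, reusing the previous_url, craw_url and header defined above:

import requests

# Fetch the search page once to obtain its cookies ...
first = requests.get(previous_url, headers=header, timeout=3)
# ... and pass them explicitly on the Ajax request
res = requests.post(craw_url,
                    data={'first': 'true', 'pn': '1', 'kd': 'python'},
                    headers=header,
                    cookies=first.cookies,
                    timeout=5)
print(res.json())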

Scraping the novel Dream of the Red Chamber (Hongloumeng)

The novel is available at http://www.shicimingju.com/book/hongloumeng.html

import requests

from bs4 import BeautifulSoup

header = {
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3;q=0.9',
}

response = requests.get('https://www.shicimingju.com/book/hongloumeng.html', headers=header)

soup = BeautifulSoup(response.text, 'lxml')
# Build the list of chapter links from the table of contents
link_list = [
    'https://www.shicimingju.com' + li.find('a').get('href')
    for li in soup.select('.book-mulu li')
]
with open('hlm.txt', mode='ab+') as fw:
    for link in link_list:
        res = requests.get(link, headers=header)
        soup2 = BeautifulSoup(res.text, 'lxml')
        # chapter title
        fw.write((soup2.select('.bookmark-list h1')[0].text).encode('utf-8'))
        fw.write('\n'.encode('utf-8'))
        # chapter body
        fw.write((soup2.select('.bookmark-list p')[0].text).encode('utf-8'))
        fw.write('\n'.encode('utf-8'))
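
The chapter loop fires one request per chapter with no pause and assumes every request succeeds. As in the Lagou example, it can be worth pacing the requests and retrying failures; a small hypothetical helper (not part of the original) along these lines could be used in place of the bare requests.get:

import time
import requests

def fetch(url, headers, retries=3, pause=1):
    # Hypothetical helper: GET with a few retries and a pause between attempts
    for attempt in range(retries):
        try:
            resp = requests.get(url, headers=headers, timeout=5)
            resp.raise_for_status()
            return resp
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(pause)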

Scraping KFC store information

import requests

res = requests.get("http://www.kfc.com.cn/kfccda/storelist/index.aspx")

with open('text2.html',mode='wb') as fw:
    for line in res.iter_content():
        fw.write(line)
        

import requests
import json

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
}

res = requests.post(
    "http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx",
    params={
        'op': 'cname'        # search by city name
    },
    data={
        'cname': '上海',      # city: Shanghai
        'pid': '',
        'keyword': '',
        'pageIndex': 1,
        'pageSize': 500
    },
    headers=header
)

kfc_info = json.loads(res.text).get('Table1')
kfc_list = [
    {
        "storeName": kfc.get('storeName') + '餐厅',   # append '餐厅' ("restaurant") to the store name
        "addressDetail": kfc.get("addressDetail"),
        "pro": kfc.get("pro")
    }
    for kfc in kfc_info
]

print(kfc_list)
print(len(kfc_list)) #455
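
As in the earlier examples, the collected list could be written to disk. A minimal sketch (the filename is arbitrary):

import json

# Persist the collected store list (the filename is arbitrary)
with open('kfc_shanghai.json', mode='w', encoding='utf-8') as fw:
    json.dump(kfc_list, fw, ensure_ascii=False, indent=2)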

Scraping jokes from Qiushibaike

Qiushibaike: https://www.qiushibaike.com/

import requests

from bs4 import BeautifulSoup

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.163 Safari/537.36',
    'Cookie': '_xsrf=2|c757820a|8689eab698fb588fb9f2057ccf7d7ff7|1596541908; _qqq_uuid_="2|1:0|10:1596541909|10:_qqq_uuid_|56:N2E0ODM0MzQ0MzhhMmQ0ODhiN2VkOWEzZjZlNjgwZWIwYjFhYmUyOQ==|628d31f1d77ddca4ff48407bae2999366c0a036422afa9a71656a0f181373394"; gr_user_id=48d9d1c7-67fb-403b-8bec-b830ce07b762; ff2672c245bd193c6261e9ab2cd35865_gr_session_id=706bbad7-66f7-4880-8b06-7c39369518e2; Hm_lvt_2670efbdd59c7e3ed3749b458cafaa37=1596541910; _ga=GA1.2.2084709124.1596541910; _gid=GA1.2.298303643.1596541910; ff2672c245bd193c6261e9ab2cd35865_gr_session_id_706bbad7-66f7-4880-8b06-7c39369518e2=true; grwng_uid=62b3537d-3023-4060-a4fe-9a45f7e07d67; Hm_lpvt_2670efbdd59c7e3ed3749b458cafaa37=1596542096',
}

details_list = []
for i in range(1,14):
    url = f'https://www.qiushibaike.com/8hr/page/{i}/'
    res = requests.get(url, headers=header)
    soup = BeautifulSoup(res.text, 'lxml')
    div_list = soup.select('.recmd-right')
    for div in div_list:
        # If the expected <span> is missing, fall back to 0
        try:
            comment = div.find_all('span')[3].text
        except Exception:
            comment = 0
        details = {
            'subject': div.find('a').text,
            'link': 'http://www.qiushibaike.com' + div.find('a').get('href'),
            'support': div.find_all('span')[0].text,
            'comment': comment,
            'author': div.select('.recmd-name')[0].text
        }
        details_list.append(details)

print(details_list)
print(len(details_list))  # 189
Original post: https://www.cnblogs.com/surpass123/p/13435806.html