Web Scraping Homework

1. Scrape Romance of the Three Kingdoms (三国演义): http://www.shicimingju.com/book/sanguoyanyi.html

Code:

import requests
from bs4 import BeautifulSoup

res = requests.get('https://www.shicimingju.com/book/sanguoyanyi.html')
res.encoding = 'utf-8'  # the pages are UTF-8; without this, requests may guess the wrong charset
soup = BeautifulSoup(res.text, 'lxml')
# Extract the book title first
book_name = soup.find(class_='bookmark-list').find(name='h1').text
# Collect the URLs of every chapter from the table of contents
url_list = soup.select('.book-mulu ul li a')
for line in url_list:
    url = 'https://www.shicimingju.com' + line.attrs.get('href')
    # Request each chapter page in turn
    res1 = requests.get(url)
    res1.encoding = 'utf-8'
    soup1 = BeautifulSoup(res1.text, 'lxml')
    # Extract the chapter title
    title = soup1.select('.bookmark-list h1')[0].text
    # Extract the chapter body
    content = soup1.find(class_='chapter_content').text
    with open('%s.txt' % book_name, 'a', encoding='utf-8') as f:
        # Append the chapter title, then its body, separated by newlines
        f.write(title + '\n')
        f.write(content + '\n')
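
A small hardening pass on the chapter loop, as a sketch: adding a timeout, an HTTP status check, and a short delay keeps the scraper from hammering the site or silently saving error pages. The helper name fetch_soup is illustrative, not something from the original post.

import time
import requests
from bs4 import BeautifulSoup

def fetch_soup(url, delay=0.5):
    # Pause briefly between requests to stay polite to the server
    time.sleep(delay)
    res = requests.get(url, timeout=5)
    res.raise_for_status()   # stop early on HTTP errors instead of saving an error page
    res.encoding = 'utf-8'   # set the site's encoding explicitly
    return BeautifulSoup(res.text, 'lxml')

Inside the loop above, the res1/soup1 pair would then collapse to soup1 = fetch_soup(url).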

2. Scrape KFC store information: http://www.kfc.com.cn/kfccda/storelist/index.aspx

import requests
import json

# POST form data: query stores in Shanghai (上海), up to 1000 results in one page
data = {
    'cname': '上海',
    'pid': '',
    'keyword': '',
    'pageIndex': 1,
    'pageSize': 1000
}
header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
    'Referer': 'http://www.kfc.com.cn/kfccda/storelist/index.aspx',
}
res = requests.post('http://www.kfc.com.cn/kfccda/ashx/GetStoreList.ashx', params={'op': 'cname'}, data=data, headers=header)

# The endpoint returns JSON rather than HTML, so parse it directly
kfc_info = json.loads(res.text).get('Table1')
kfc_list = [
    {
        "storeName":kfc.get('storeName')+'餐厅',
        "addressDetail":kfc.get("addressDetail"),
        "pro":kfc.get("pro")
    }
    for kfc in kfc_info
]

print(kfc_list)
print(len(kfc_list))  # 455 stores returned when this was run
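
Printing the list is enough for the assignment, but persisting it is a one-liner. A minimal sketch that writes the result to disk (the file name kfc_shanghai.json is an arbitrary choice):

import json

# ensure_ascii=False keeps the Chinese store names readable in the output file
with open('kfc_shanghai.json', 'w', encoding='utf-8') as f:
    json.dump(kfc_list, f, ensure_ascii=False, indent=2)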

3. Scrape Lagou (拉勾网) job listings

import requests

# The Ajax endpoint that actually returns the job data
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'

# Search parameters: first request, page 1, keyword 'python'
payload = {
    'first': 'true',
    'pn': '1',
    'kd': 'python',
}

header = {
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36',
    'Referer': 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput=',
    'Accept': 'application/json, text/javascript, */*; q=0.01'
}
# The original search page URL, visited first to pick up cookies
urls = 'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='
# Create a session
s = requests.Session()
# Visit the search page so the session acquires the cookies Lagou requires
s.get(urls, headers=header, timeout=3)
# Cookies picked up by that request (the session would also send them automatically)
cookie = s.cookies
# POST to the Ajax endpoint with those cookies; the response body is JSON text
response = s.post(url, data=payload, headers=header, cookies=cookie, params={'city': '上海'}, timeout=5).text
print(response)
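
The printed response is raw JSON. To pull out individual positions, a hedged sketch, assuming the endpoint still nests results under content.positionResult.result as it did when such tutorials were written (a third-party API detail that may have changed):

import json

data = json.loads(response)
# NOTE: this key path is an assumption about Lagou's response shape
positions = data.get('content', {}).get('positionResult', {}).get('result', [])
for p in positions:
    print(p.get('positionName'), p.get('companyFullName'), p.get('salary'))
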
Original article: https://www.cnblogs.com/baicai37/p/13429806.html