分析Ajax请求并抓取今日头条街拍美图

分析网页

以上作为data参数，offset不是页码，页面下滑，新的内容刷新，offset值变大。

找到请求地址：

请求网址高亮部分为固定内容，后半在data中。

如果请求中不加cookie会遇到下列问题：

取出来的数据不完整。

所以要找到Cookie，将其放入请求头中。

requests.get请求内容出错最有是cookie问题，下载图片时不要有header。

  1 import json
  2 import os
  3 import re
  4 from urllib.parse import urlencode
  5 import requests
  6 from bs4 import BeautifulSoup
  7 from requests.exceptions import RequestException
  8 import config
  9 import pymongo
 10 from hashlib import md5
 11 from multiprocessing import Pool
 12 
 13 client = pymongo.MongoClient(config.MONGO_URL, connect=False)
 14 db = client[config.MONGO_DB]
 15 
 16 def get_page_index(offset, keyword):
 17     data = {
 18         'aid': 24,
 19         'app_name': 'web_search',
 20         # 'offset': 0,
 21         'offset': offset,
 22         'format': 'json',
 23         # 'keyword': '人像摄影',
 24         'keyword': keyword,
 25         'autoload': 'true',
 26         'count': 20,
 27         'en_qc': 1,
 28         'cur_tab': 1,
 29         'from': 'search_tab',
 30         'pd': 'synthesis',
 31         # 'timestamp': 1560048654702,
 32     }
 33     url = 'https://www.toutiao.com/api/search/content/?' + urlencode(data)
 34     # print(url)
 35     headers = {
 36         'Cookie': 'UM_distinctid=16b3761e1ad453-0595b1308b57738-70226752-100200-16b3761e1ae16e;'
 37                   'tt_webid=6700158723510421000;csrftoken=9529582d3a82b38c3de771c3142adee4;'
 38                   'tt_webid=6700158723510421000;'
 39                   'WEATHER_CITY=%E5%8C%97%E4%BA%AC;'
 40                   'CNZZDATA1259612802=627663064-1560000420-%7C1560062522;'
 41                   '__tasessionId=3e436xfba1560065889568;'
 42                   's_v_web_id=fc9f6b68f10a508f45a92c3bc7ca0400',
 43         'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
 44         'Host': 'www.toutiao.com',
 45     }
 46     try:
 47         response = requests.get(url, headers=headers)
 48         if response.status_code == 200:
 49             # print(response.text)
 50             return response.text
 51         return None
 52     except RequestException:
 53         print('请求索引页出错')
 54         return None
 55 
 56 # 取出具体文章的地址
 57 def parse_page_index(html):
 58     data = json.loads(html)  # 请求得到的是json格式
 59     if data and 'data' in data.keys():   # 判断获取是否成功, 键中是否有data
 60         try:
 61             for item in data.get('data'):    # 从响应中得到data数据
 62                 yield item.get('article_url')   # 在页面响应 中找到 文章的地址
 63         except TypeError:
 64             pass
 65 
 66 # 详情
 67 def get_page_detail(url):
 68     headers = {
 69         'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:67.0) Gecko/20100101 Firefox/67.0',
 70         'Host': 'www.toutiao.com',
 71     }
 72     try:
 73         response = requests.get(url, headers=headers)
 74         if response.status_code == 200:
 75             # print(response.text)
 76             return response.text
 77         return None
 78     except RequestException:
 79         print('请求详情页出错', url)
 80         return None
 81 
 82 # 解析详情页
 83 def parse_page_detail(html, url):
 84     # print(html)
 85     soup = BeautifulSoup(html, 'lxml')
 86 
 87     title = soup.select('title')[0].get_text()
 88     # print("title:", title)
 89     image_pattern = re.compile('pgc-img&quot;&gt.*?(http.*?)&quot', re.S)
 90     results = re.findall(image_pattern, html)
 91     for image in results:
 92         print(image)
 93         download_image(image)
 94     if results:
 95         return {
 96             'title': title,
 97             'url': url,
 98             'image': results
 99         }
100 
101 
102 # 存储
103 def save_to_mongo(result):
104     if db[config.MONGO_TABLE].insert(result):
105         print('存储MongoDB成功', result)
106         return True
107     return False
108 
109 def download_image(url):  # 传入的是图片url
110     print("保存的图片：", url)
111     try:
112         response = requests.get(url)
113         if response.status_code == 200:
114 
115             save_image(response.content)  # 二进制内容
116         return None
117     except RequestException:
118         print('请求图片出错', url)
119         return None
120 
121 def save_image(content):            #当前路径
122     file_path = '{0}/picture/{1}.{2}'.format(os.getcwd(), md5(content).hexdigest(), 'jpg')
123     if not os.path.exists(file_path):
124         with open(file_path, 'wb') as f:
125             f.write(content)
126             f.close()
127 
128 def main(offset):
129     html = get_page_index(offset, '人像摄影')
130     print("offset:", offset)
131     for url in parse_page_index(html):
132         if url == None:  #  url文章url
133             continue
134         # print(url)
135         html_detail = get_page_detail(url)    # 得到该文章具体页面内容
136 
137         if html_detail:
138             re = parse_page_detail(html_detail, url)    # 找出标题与图片
139             if re == None:
140                 continue
141             save_to_mongo(re)
142 
143 
144 if __name__ == '__main__':
145     groups = [x * 20 for x in range(config.GROUP_START, config.GROUP_END + 1)]
146 
147     pool = Pool()
148     pool.map(main, groups)

配置文件：

MONGO_URL='localhost'
MONGO_DB='toutiao'
MONGO_TABLE='toutiao'
GROUP_START=1
GROUP_END=20

可以将数据保存在mongodb中，并将图片保存在picture下。

可视化数据库robomongo