练习3-微博爬取

微博分页采用 since_id：请求下一页所需的 since_id 位于上一页 response 的 data.cardlistInfo.since_id 字段中，第一页请求时 since_id 传空字符串即可。

from urllib.parse import urlencode
import requests
from pyquery import PyQuery as PQ
# API endpoint; the urlencoded query string is appended directly after the '?'.
base_url = 'https://m.weibo.cn/api/container/getIndex?'

# HTTP headers attached to every request made against the endpoint above.
headers = {
    'Host': 'm.weibo.cn',
    'Referer': 'https://m.weibo.cn/u/2830678474',
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/90.0.4430.212 Safari/537.36'
    ),
    'X-Requested-With': 'XMLHttpRequest',
}

def get_page(since_id):
    """Fetch one page of the weibo list and return its decoded JSON body.

    Args:
        since_id: Pagination cursor taken from the previous page's response;
            pass an empty string to request the first page.

    Returns:
        The decoded JSON body when the server answers with status 200,
        otherwise ``None`` (non-200 status, network failure, timeout, or a
        body that is not valid JSON). Failures are reported on stdout.
    """
    param = {
        'type': 'uid',
        'value': '2830678474',
        'containerid': '1076032830678474',
        'since_id': since_id,
    }
    url = base_url + urlencode(param)
    try:
        # Without a timeout a stalled connection would block the crawl forever.
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code != 200:
            print('Error:', 'unexpected status code', response.status_code)
            return None
        return response.json()
    except (requests.RequestException, ValueError) as e:
        # RequestException covers connection/timeout errors; ValueError covers
        # JSON decoding failures on older versions of requests.
        print('Error:', e)
        return None

def parse_page(json):
    """Yield a summary dict for every weibo post found in one page of JSON.

    Args:
        json: Decoded JSON body of one page (a dict), or a falsy value when
            the page could not be fetched.

    Yields:
        A dict with the keys ``id``, ``text`` (the post's HTML reduced to
        plain text), ``attitudes``, ``comments`` and ``reposts``.

    Cards that carry no ``mblog`` entry are skipped, and a page without
    ``data`` or ``cards`` yields nothing instead of raising.
    """
    if not json:
        return
    cards = (json.get('data') or {}).get('cards') or []
    for card in cards:
        mblog = card.get('mblog')
        if not mblog:
            # Not every card is a post; without this guard the .get() calls
            # below would raise AttributeError on None.
            continue
        raw_text = mblog.get('text')
        yield {
            'id': mblog.get('id'),
            # Only hand non-empty markup to PyQuery; a missing body maps to ''.
            'text': PQ(raw_text).text() if raw_text else '',
            'attitudes': mblog.get('attitudes_count'),
            'comments': mblog.get('comments_count'),
            'reposts': mblog.get('reposts_count'),
        }

if __name__ == '__main__':
    # Crawl up to 10 pages. Each page's response carries the since_id cursor
    # needed to request the following page; the first page uses an empty one.
    since_id = ''
    for _ in range(10):
        results = get_page(since_id)
        if not results:
            # The fetch failed, so there is no cursor to continue from.
            break
        for result in parse_page(results):
            print(result)
        card_info = (results.get('data') or {}).get('cardlistInfo') or {}
        since_id = card_info.get('since_id')
        if not since_id:
            # No cursor in the response: there are no further pages.
            break
原文地址:https://www.cnblogs.com/tingshu/p/14770945.html