# Exercise 4: Toutiao (今日头条) image crawler

import requests
from urllib.parse import urlencode
from pyquery import PyQuery as pq
import re,os
from hashlib import md5

def get_page(page_num, search_id, timeout=10):
    """Fetch one page of Toutiao search results for the keyword '街拍'.

    Args:
        page_num: Page index. Page 0 is requested as a fresh search; any
            other value is requested as a pagination call.
        search_id: Search id sent along with pagination requests. It is not
            sent for page 0.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        The response body as text when the server answers with status 200,
        otherwise None (non-200 status, or a request error, which is printed).
    """
    # Only build the query that is actually going to be sent.
    if page_num == 0:
        params = {
            'dvpf': 'pc',
            'source': 'input',
            'keyword': '街拍',
        }
    else:
        params = {
            'keyword': '街拍',
            'pd': 'synthesis',
            'source': 'pagination',
            'dvpf': 'pc',
            'aid': 4916,
            'page_num': page_num,
            'search_id': search_id,
        }
    url = 'https://so.toutiao.com/search?' + urlencode(params)
    try:
        # Without a timeout, requests waits indefinitely on a stalled server.
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as e:
        print('ERROR1:', e)
        return None
    if response.status_code == 200:
        return response.text
    return None

def parse_pg(html):
    """Yield the ``src`` of each ``<img>`` matched by ``.abs-fill img``.

    Args:
        html: HTML text of a result page. None or an empty string yields
            nothing instead of raising.

    Yields:
        The ``src`` attribute of each matching image. Images without a
        ``src`` are skipped. Each yielded value is also printed.
    """
    # A failed page fetch hands us None; there is nothing to parse then.
    if not html:
        return
    doc = pq(html)
    for img in doc('.abs-fill img').items():
        src = img.attr('src')
        if not src:
            # No usable URL on this tag, so there is nothing to download.
            continue
        print(src)
        yield src


def save_img(img, save_dir=r'D:pycharm_projects街拍', timeout=10):
    """Download one image and store it in *save_dir*, named by content MD5.

    Args:
        img: URL of the image to download.
        save_dir: Directory the file is written to; created if missing.
            NOTE(review): the default looks like a Windows path that lost
            its backslashes -- confirm the intended location.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        None. Request and file-system errors are printed, not raised. A
        non-200 response is ignored silently.
    """
    try:
        # exist_ok avoids the check-then-create race of exists() + mkdir().
        os.makedirs(save_dir, exist_ok=True)
        response = requests.get(img, timeout=timeout)
        if response.status_code != 200:
            return
        content = response.content
        # Naming by content hash means identical images map to one file.
        file_name = '{}.{}'.format(md5(content).hexdigest(), 'jpg')
        file_path = os.path.join(save_dir, file_name)
        if os.path.exists(file_path):
            print('alredy download')
            return
        with open(file_path, 'wb') as f:
            f.write(content)
    except (requests.RequestException, OSError) as e:
        print('ERROR2:', e)


def _extract_search_id(html):
    """Return the ``search_id`` query value from a result link, or ''."""
    href = pq(html)('.result-content:last-child a:first-child').attr('href')
    if not href:
        return ''
    # [^&]* stops at the next query parameter; a greedy .* would swallow
    # everything up to the end of the URL.
    match = re.search(r'search_id=([^&]*)', href)
    return match.group(1) if match else ''


def main():
    """Crawl the first two result pages and download every image found.

    Page 0 is fetched first and the search id is read from it; page 1 is
    then requested with that id. If no id could be obtained, the crawl
    stops after page 0.
    """
    search_id = ''
    for page_num in range(2):
        if page_num > 0 and not search_id:
            # Pagination requests carry the id taken from the first page.
            break
        html = get_page(page_num, search_id)
        if not html:
            # The fetch failed or returned nothing; skip this page.
            continue
        if page_num == 0:
            search_id = _extract_search_id(html)
        for img in parse_pg(html):
            print(img)
            save_img(img)

# Run the crawler only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()

# Original article: https://www.cnblogs.com/tingshu/p/14773354.html