爬取表情

#coding=utf-8
import os
from time import sleep

import requests
import re
from bs4 import BeautifulSoup

# HTTP request headers passed to requests.get() by get_img_list(); they mimic
# a desktop Chrome browser.
# No 'Host' entry is set on purpose: requests derives it from the requested
# URL, and forcing an unrelated host name (previously 'music.163.com') makes
# the request address the wrong virtual host.  No 'Cookie' entry is set either,
# so that a session cookie belonging to another site is not sent to the
# crawled site.
headers = {
    'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
    'Accept-Encoding': 'gzip, deflate',
    'Accept-Language': 'zh-CN,zh;q=0.9',
    'Connection': 'keep-alive',
    'Referer': 'https://fabiaoqing.com/',
    'Upgrade-Insecure-Requests': '1',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                  'Chrome/66.0.3359.181 Safari/537.36',
}


def get_img_list(url, timeout=10):
    """Download an HTML page and return the ``<img>`` tags selected from it.

    Args:
        url: Address of the page to fetch.
        timeout: Seconds to wait for the server before giving up.  Without a
            timeout a stalled connection would block the crawl forever.

    Returns:
        The result of ``find_all`` for ``<img>`` tags matching
        ``class_='ui image lazy'``; it is empty when nothing matches.

    Raises:
        requests.RequestException: If the request fails or times out.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    soup = BeautifulSoup(response.content, 'lxml')
    return soup.find_all('img', class_='ui image lazy')



def validateTitle(title):
    """Turn an image title into a string usable as a file name.

    Every character of the set ``/ \\ : * ? " < > |`` and every newline is
    replaced with an underscore, then the result is cut to 20 characters.

    Args:
        title: The raw title text (a ``str``).

    Returns:
        The sanitized title, at most 20 characters long.
    """
    # Single quotes delimit the literal so the double quote inside the
    # character class does not terminate the string.
    rstr = r'[/\\:*?"<>|\n]'  # / \ : * ? " < > | and newline
    new_title = re.sub(rstr, "_", title)  # replace with underscores
    return new_title[:20]


# Script body: walk the tag listing pages and save every listed image under
# `path`, using the sanitized image title as the file name.
try:
    path = "d:/crawl1/"
    # Listing URLs look like:
    #   https://fabiaoqing.com/tag/detail/id/2.html            (first page)
    #   https://fabiaoqing.com/tag/detail/id/2/page/227.html   (later pages)
    _url = 'https://fabiaoqing.com/tag/detail/id/{page}.html'

    # open() does not create missing directories, so make sure the target exists.
    os.makedirs(path, exist_ok=True)

    for tag_id in range(1, 54673 + 1):
        for i in range(1, 300):
            if i == 1:
                child_url = _url.format(page=tag_id)
            else:
                child_url = ("https://fabiaoqing.com/tag/detail/id/" + str(tag_id)
                             + "/page/" + str(i) + ".html")
            print('crawl url ' + child_url)

            try:
                img_list = get_img_list(child_url)
            except requests.RequestException as e:
                # A single failed page must not abort the whole crawl.
                print(str(e))
                continue

            if not img_list:
                # No images on this page: stop paging through this tag
                # instead of requesting the remaining page numbers.
                break

            for img in img_list:
                try:
                    image = img.get('data-original')
                    title = img.get('title')
                    if not image or not title:
                        print('skip image without url or title')
                        continue
                    title = validateTitle(title)

                    # Download before opening the file so that a failed
                    # request does not leave an empty file behind.
                    image_response = requests.get(image, timeout=30)
                    image_response.raise_for_status()

                    with open(path + title + os.path.splitext(image)[-1], 'wb') as f:
                        f.write(image_response.content)
                except Exception as e:
                    # Best effort: report the problem and go on with the next image.
                    print(str(e))

except Exception as e:
    print(str(e))
原文地址:https://www.cnblogs.com/brady-wang/p/12409280.html