爬取图片(三)

源码:

 1 import requests
 2 import json
 3 import re
 4 import os
 5 from urllib import request
 6 
 7 # 获取图集链接
 8 def get_urls(offset,headers):
 9     url = 'https://www.toutiao.com/search_content/?offset={}&format=json&keyword=%E8%A1%97%E6%8B%8D&autoload=true&count=20&cur_tab=3&from=gallery'.format(offset)
10     response = requests.get(url,headers=headers)
11     res = response.json()['data']
12     url_list = []
13     for i in res:
14         if 'article_url' in i:
15             article_url = i['article_url']
16             url_list.append(article_url)
17     return url_list
18 
19 # 下载图片
def download_pictures(url, headers):
    """Download every image of one gallery article into a local folder.

    The page ``<title>`` becomes a sub-directory of ``街拍图/`` and each image
    is stored there as ``<last segment of the image URL>.jpg``. Files that
    already exist are skipped. This is best effort: every failure is
    reported with ``print`` and the function returns (or moves on to the
    next image) instead of raising.

    Args:
        url: Address of the gallery article page.
        headers: HTTP headers passed through to ``requests.get``.

    Returns:
        None.
    """
    try:
        # A timeout keeps a stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=headers, timeout=10)
    except requests.RequestException as exc:
        print('请求失败:' + url + ' (' + str(exc) + ')')
        return
    print(url)

    title_match = re.search(r'<title>(.*?)</title>', response.text)
    # The parentheses of the JavaScript call must be escaped; unescaped they
    # form an (always empty) regex group and the payload is never captured.
    gallery_match = re.search(r'gallery: JSON\.parse\((.*?)\),', response.text)
    if title_match is None or gallery_match is None:
        print('跳过(未找到图集数据):' + url)
        return
    title = title_match.group(1)
    print(title)

    try:
        # The captured text is a JSON string literal whose content is itself
        # JSON, hence the two decoding passes.
        gallery = json.loads(json.loads(gallery_match.group(1)))
        images = gallery['sub_images']
    except (ValueError, TypeError, KeyError) as exc:
        print('跳过(图集数据解析失败):' + url + ' (' + str(exc) + ')')
        return

    # Replace characters that are invalid in file names or would silently
    # create nested directories.
    safe_title = re.sub(r'[\\/:*?"<>|]', '_', title).strip() or 'untitled'
    dir_name = '街拍图/' + safe_title
    try:
        os.makedirs(dir_name, exist_ok=True)
    except OSError as exc:
        print('无法创建目录:' + dir_name + ' (' + str(exc) + ')')
        return

    for image in images:
        try:
            image_url = image['url']
        except (KeyError, TypeError):
            continue
        filename = dir_name + '/' + image_url.split('/')[-1] + '.jpg'
        if os.path.exists(filename):
            continue
        print('正在下载:' + filename)
        try:
            request.urlretrieve(image_url, filename)
        except (OSError, ValueError) as exc:
            # One broken image must not abort the rest of the gallery.
            print('下载失败:' + image_url + ' (' + str(exc) + ')')
47 
48 
if __name__ == '__main__':
    # Send a desktop-browser user agent with every request.
    headers = {'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'}

    # Walk three result pages (offsets 0, 20 and 40) and download every
    # gallery found on each page before requesting the next one.
    for offset in range(0, 60, 20):
        for url in get_urls(offset, headers):
            download_pictures(url, headers)
原文地址:https://www.cnblogs.com/zhxd-python/p/9501326.html