python爬取站长之家植物图片

import os
import time
import urllib.parse
import urllib.request
from urllib import request

from lxml import etree
 6 
 7 
 8 def handle_request(url,page):
 9     if page == 1:
10         url = url.format('')
11     else:
12         url = url.format('_'+str(page))
13     headers = {
14         "User-Agent":"Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
15     }
16     request = urllib.request.Request(url=url,headers=headers)
17 
18     return request
19 
def download_img(image_src, dirpath=r'G:/untitled/zhiwu'):
    """Download one image and save it inside ``dirpath``.

    The saved file is named after the last segment of the URL path, so a
    query string or fragment on ``image_src`` never ends up in the filename.

    Args:
        image_src: Absolute URL of the image to fetch.
        dirpath: Target directory; created together with any missing
            parents. Defaults to the directory this script originally used.

    Raises:
        ValueError: If ``image_src`` has no filename component (for example
            it ends with ``/``). Raised before any network access.

    Errors raised by ``urllib.request.urlopen`` or by writing the file
    propagate to the caller unchanged.
    """
    # File name: taken from the URL path only.
    filename = os.path.basename(urllib.parse.urlsplit(image_src).path)
    if not filename:
        raise ValueError('no file name in image URL: {!r}'.format(image_src))

    os.makedirs(dirpath, exist_ok=True)
    # File path
    filepath = os.path.join(dirpath, filename)

    # Send the request and save the image.
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
    }
    img_request = urllib.request.Request(url=image_src, headers=headers)
    # Read the body first so a failed request does not leave an empty file,
    # and close the response as soon as the bytes are in memory.
    with urllib.request.urlopen(img_request) as response:
        data = response.read()
    with open(filepath, 'wb') as fp:
        fp.write(data)
    print(filepath)
37 
def parse_content(content, base_url='http://sc.chinaz.com/'):
    """Find the image URLs in one listing page and download each of them.

    Args:
        content: HTML text of a listing page.
        base_url: URL used to resolve relative or scheme-relative ``src``
            values into absolute URLs. Absolute ``src`` values are passed
            through unchanged.
    """
    # Parse the page and collect the image ``src`` attributes.
    tree = etree.HTML(content)
    image_list = tree.xpath('//div[@id="container"]/div/div/a/img/@src')
    for raw_src in image_list:
        src = str(raw_src).strip()
        if not src:
            continue
        # download_img needs the complete URL to fetch the image; it derives
        # the local file name itself, so the URL must not be cut down to its
        # last path segment here.
        image_src = urllib.parse.urljoin(base_url, src)
        download_img(image_src)
45 
def main() -> None:
    """Prompt for a page range and scrape every listing page in it.

    Reads the first and last page numbers from standard input, then for each
    page (inclusive) fetches the listing HTML, hands it to ``parse_content``
    and waits one second before moving on.
    """
    url = 'http://sc.chinaz.com/tupian/huadetupian{}.html'
    start_page = int(input('请输入起始页码:'))
    end_page = int(input('请输入结束页码:'))
    for page in range(start_page, end_page + 1):
        page_request = handle_request(url, page)
        # Close the connection as soon as the page body has been read.
        with urllib.request.urlopen(page_request) as response:
            content = response.read().decode()
        parse_content(content)
        # Throttle: pause one second between listing pages.
        time.sleep(1)
55 
56 
# Run the scraper only when this file is executed as a script.
if __name__ == '__main__':
    main()
原文地址:https://www.cnblogs.com/erlchixiha/p/11805319.html