# 最简单的爬虫——小白 (the simplest crawler — for beginners)

import requests
from lxml import etree
import os
#页数
# Page counter (immediately overwritten by the loop variable; kept for parity).
page = 0
# Running count of downloaded images, used only for the progress message.
num = 1
# List-page URL template; {} is filled with the page number.
start_url = "http://pic.netbian.com/index_{}.html"

# `os` was imported but never used in the original: the output directory was
# never created, so the first open(..., 'wb') raised FileNotFoundError.
os.makedirs('./不知火', exist_ok=True)

# Pages 1..1168 — per the original comment the site has one page fewer
# than 1169.
# NOTE(review): on this site page 1 is usually "index.html" and numbered
# pages start at index_2.html, so index_1.html may 404 — verify.
for page in range(1, 1169):
    url = start_url.format(page)

    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/78.0.3904.87 Safari/537.36'}

    # The site serves GBK-encoded HTML; decode explicitly to avoid mojibake.
    response = requests.get(url, headers=headers).content.decode('gbk')
    etree_html = etree.HTML(response)
    # class="slist" ul li a href — relative links to each picture's detail page.
    xpath_url = etree_html.xpath('//div[@ class="slist"]/ul/li/a/@href')
    for k in xpath_url:
        x_url = 'http://pic.netbian.com/' + k

        inner_html = requests.get(x_url, headers=headers).content.decode('gbk')

        xpath_img_url = etree.HTML(inner_html)
        # class="photo-pic" a img title — picture title used as the filename.
        # Renamed from `img_title`: the original rebound that same name to a
        # string inside the zip loop below, shadowing the list.
        img_titles = xpath_img_url.xpath('//div[@ class="photo-pic"]/a/img/@title')
        # class="photo-pic" a img src — relative URL of the full-size image.
        img_resource = xpath_img_url.xpath('//div[@ class="photo-pic"]/a/img/@src')
        for title, img in zip(img_titles, img_resource):
            img_content = 'http://pic.netbian.com/' + img
            image_content = requests.get(img_content, headers=headers).content
            # NOTE(review): saved with a .png extension although the site
            # appears to serve JPEGs; kept as-is to preserve filenames.
            with open('./不知火/{}.png'.format(title), 'wb') as f:
                f.write(image_content)
            print('已完成{}下载, 第{}张图片'.format(title, num))
            num += 1
# Original source (原文地址): https://www.cnblogs.com/LQ970811/p/11821199.html