爬取图虫网 示例网址 https://wangxu.tuchong.com/23892889/

#coding=gbk
import requests
from fake_useragent import UserAgent
from lxml import etree
import urllib
import re
import os

# Captures the first host label of the page URL, e.g. "wangxu" in
# https://wangxu.tuchong.com/23892889/ ; it is used to name the download folder.
# The dot is escaped on purpose: an unescaped "." after a lazy group makes the
# group stop after a single character.
pattern = r'https?://([^./]+)\.'


def extract_name(url):
    """Return the first host label of *url* (the account name on Tuchong).

    Args:
        url: Page address such as ``https://wangxu.tuchong.com/23892889/``.

    Returns:
        The text between the scheme and the first dot of the host.

    Raises:
        ValueError: If *url* is not an http(s) URL with a dotted host.
    """
    match = re.search(pattern, url)
    if match is None:
        raise ValueError("cannot extract a site name from URL: {!r}".format(url))
    return match.group(1)


def download_images(url):
    """Download every ``<article>`` image of a Tuchong page into a folder.

    The images are written to ``./图虫_<name>/图<n>.png`` where ``<name>`` comes
    from :func:`extract_name` and ``<n>`` counts from 1 in page order.

    Args:
        url: Address of the Tuchong page to scrape.

    Returns:
        The number of images saved.

    Raises:
        ValueError: If no folder name can be derived from *url*.
        requests.RequestException: On network failure or a non-2xx response.
    """
    # Validate the URL before any network traffic happens.
    name = extract_name(url)
    headers = {
        'User-Agent': UserAgent().chrome
    }

    response = requests.get(url, headers=headers, timeout=30)
    response.raise_for_status()
    img_urls = etree.HTML(response.text).xpath('//article//img/@src')

    # The folder is the same for every image, so create it once up front.
    os.makedirs('图虫_{}'.format(name), exist_ok=True)

    for num, img_url in enumerate(img_urls, start=1):
        img_response = requests.get(img_url, headers=headers, timeout=30)
        # Fail loudly instead of saving an HTML error page as a .png file.
        img_response.raise_for_status()
        # Reuse the bytes already fetched rather than downloading a second time.
        with open('./图虫_{0}/图{1}.png'.format(name, num), 'wb') as img_file:
            img_file.write(img_response.content)
        print("第{}张图片下载完毕".format(num))

    return len(img_urls)


if __name__ == "__main__":
    # Example address: https://wangxu.tuchong.com/23892889/
    download_images(input("请输入图虫网图片地址:"))

 


2020-07-15

原文地址:https://www.cnblogs.com/hany-postq473111315/p/13306056.html