Python 爬取图片演练

import requests
from pyquery import PyQuery as pq
import time
import os
import random

# Custom request headers: present a desktop Chrome User-Agent on every request.
header = {
    'User-Agent': (
        'Mozilla/5.0 (Windows NT 10.0; WOW64) '
        'AppleWebKit/537.36 (KHTML, like Gecko) '
        'Chrome/76.0.3809.132 Safari/537.36'
    ),
}
# Custom proxy address, passed as the ``proxies`` argument of requests.get().
# NOTE(review): only an 'https' entry is defined, while the listing URLs in
# this file use http:// -- confirm whether those are meant to be proxied too.
proxyip = dict(https='180.109.124.30:4216')
def page_url():
    """Build the list of listing-page URLs to crawl.

    Returns:
        A new list of nine URLs: the category home page first, followed by
        the ``index_2.htm`` through ``index_9.htm`` pages under it.
    """
    base = "http://www.netbian.com/meinv/"
    # Pages 2-9 follow the "index_<n>.htm" naming scheme; page 1 is the
    # bare category URL itself.
    numbered_pages = [base + "index_" + str(n) + ".htm" for n in range(2, 10)]
    return [base] + numbered_pages

# Fetch each listing page, extract the image URLs, and save every image.
# Use ``response.text`` for text/HTML and ``response.content`` for binary
# payloads such as images and files.
def start_request(save_dir="F:\\image"):
    """Download every image referenced on the listing pages into *save_dir*.

    Each URL returned by ``page_url()`` is fetched and decoded as GBK, and
    ``<img>`` elements matching ``div.list ul li img`` are collected. Each
    image is saved as ``<n>.jpg``, where ``n`` is a running counter that
    starts at 0 and continues across pages.

    Args:
        save_dir: Directory the images are written to. It is created
            (including missing parents) if it does not exist. Defaults to
            the original hard-coded location ``F:\\image``.
    """
    # Create the target directory once up front instead of re-checking it
    # for every image; exist_ok avoids the check-then-create race.
    os.makedirs(save_dir, exist_ok=True)

    count = 0
    for listing_url in page_url():
        response = requests.get(listing_url, headers=header, proxies=proxyip)
        response.encoding = 'GBK'
        doc = pq(response.text)

        # Coarse match: every <img> inside the thumbnail list.
        for image in doc('div.list ul li img').items():
            # Exact image URL taken from the src attribute.
            img_url = image.attr('src')
            if not img_url:
                # An <img> without src would make requests.get() fail.
                continue
            print(img_url)

            # Binary payload, so read .content rather than .text.
            img = requests.get(img_url, headers=header, proxies=proxyip).content

            # os.path.join avoids hand-written path separators; a trailing
            # backslash before the closing quote escapes the quote and
            # breaks the string literal.
            path = os.path.join(save_dir, str(count) + ".jpg")
            with open(path, 'wb') as f:
                f.write(img)

            # Throttle between downloads; done after the file is closed so
            # the handle is not held open while sleeping.
            time.sleep(2)
            print('正在下载第{}张图片'.format(count))
            print("写入完成")
            count += 1

def main():
    """Script entry point: run the image download routine."""
    start_request()


if __name__ == "__main__":
    main()

  

效果如下

原文地址:https://www.cnblogs.com/lixinliang/p/13795818.html