Notes from what I'm currently learning: a small image-scraping script

import os
import threading

import requests
from lxml import etree


all_img_urls = []    # holds the {name: img_link} dicts scraped from the list pages

g_lock = threading.Lock()      # one global lock shared by every thread

# The producer class below keeps fetching image detail-page addresses
# and appends them to the all_img_urls list.

all_urls = []    # the list-page URLs waiting to be crawled

class Spider(object):
    # Constructor: store the data the spider needs
    def __init__(self, target_url, headers):
        self.target_url = target_url
        self.headers = headers

    # Build every list-page URL we want to crawl, from start_page
    # up to (but not including) page_num
    def getUrls(self, start_page, page_num):
        for i in range(start_page, page_num):
            url = self.target_url % i
            all_urls.append(url)

if __name__ == '__main__':
    headers = {
        # No "Host" entry on purpose: requests derives it from the URL, and the
        # original hard-coded "eclick.baidu.com" did not match the target site
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
    }
    target_url = "http://www.xiaohuar.com/list-1-%d.html"   # template for the list-page links

    spider = Spider(target_url, headers)    # pass the template and request headers to the spider
    spider.getUrls(0, 14)    # collect the links for pages 0 through 13
    # print(all_urls)


class Producer(threading.Thread):    # producer thread: "produces" image links in bulk

    def run(self):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
        }

        while True:    # keep looping until every list-page URL has been claimed
            g_lock.acquire()     # lock so two threads can never pop the same link
            if not all_urls:     # test *inside* the lock, otherwise a racing thread
                g_lock.release() # could empty the list between the test and the pop
                break
            url = all_urls.pop()    # claim one link
            g_lock.release()      # release so the other threads can grab the next one
            response = requests.get(url, headers=headers).text    # headers must be passed by keyword

            selector = etree.HTML(response)    # parse so we can query with XPath

            mods = selector.xpath("//div[@class='item_t']")    # every item block on the page

            for i in mods:
                img_link = i.xpath("div[@class='img']/a/img/@src")
                name = i.xpath("div[@class='img']/span/text()")
                if not img_link or not name:    # skip items missing either field
                    continue
                name = name[0]
                img_link = img_link[0]

                if img_link.startswith("/"):    # some links are site-relative, so prefix
                    img_link = "http://www.xiaohuar.com" + img_link    # the host to make them fetchable
                all_img_urls.append({name: img_link})    # append exactly once per item

producer_threads = []
for x in range(10):     # spawn 10 threads to crawl the list pages
    down = Producer()
    down.start()    # start(), not run(): run() would just execute in the main thread
    producer_threads.append(down)
for t in producer_threads:
    t.join()    # wait until every list page has been scraped before downloading
# print(all_img_urls)
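The original version called down.run(), which runs the method in the calling thread and so crawled the pages one by one; start() is what actually spawns a worker. A tiny standalone check of the difference (the Hello class is ours, just for illustration):

import threading

class Hello(threading.Thread):
    def run(self):
        print("running in:", threading.current_thread().name)

Hello().run()    # prints "MainThread": no new thread was created
Hello().start()  # prints a worker name such as "Thread-1"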




class DownPic(threading.Thread):      # consumer thread: downloads the scraped images

    def run(self):
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.62 Safari/537.36",
        }
        os.makedirs("xiaohua", exist_ok=True)    # make sure the output folder exists

        while True:  # keep pulling from the image list until it is empty
            g_lock.acquire()
            if len(all_img_urls) == 0:   # no images left: unlock and stop this thread
                g_lock.release()         # (the producers were joined above, so nothing more is coming)
                break
            else:
                img = all_img_urls.pop()
                g_lock.release()
                # each entry is a one-item {name: img_link} dict
                for key, value in img.items():
                    path = "xiaohua/%s.jpg" % key
                    response = requests.get(value, headers=headers)
                    # print(path)
                    with open(path, "wb") as f:    # the with block closes the file for us
                        f.write(response.content)
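For larger files, a streamed download avoids holding the whole body in memory before writing. A minimal sketch (the save_image name is ours, not part of the original code):

import requests

def save_image(url, path):
    # stream=True fetches the body in chunks instead of one big read
    with requests.get(url, stream=True, timeout=10) as r:
        r.raise_for_status()    # fail loudly on 4xx/5xx instead of saving an error page
        with open(path, "wb") as f:
            for chunk in r.iter_content(chunk_size=8192):
                f.write(chunk)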
downloader_threads = []
for x in range(10):     # spawn 10 downloader threads
    down = DownPic()
    down.start()
    downloader_threads.append(down)
for t in downloader_threads:
    t.join()
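For completeness, the download side also maps cleanly onto the standard library's concurrent.futures, which manages the worker threads for us. A sketch, assuming all_img_urls still holds the {name: img_link} dicts that Producer builds (the download helper name is ours):

from concurrent.futures import ThreadPoolExecutor
import requests

def download(item):
    # item is one {name: img_link} dict, in the same shape Producer builds
    for name, link in item.items():
        data = requests.get(link, timeout=10).content
        with open("xiaohua/%s.jpg" % name, "wb") as f:
            f.write(data)

with ThreadPoolExecutor(max_workers=10) as pool:    # waits for all tasks on exit
    pool.map(download, list(all_img_urls))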
Original post: https://www.cnblogs.com/stfei/p/10149120.html