# Crawler strategy (爬虫策略): rotating proxy pool with cool-down queues

import time
import queue
import random
import threading
import requests

class V2ProxyItem(object):
    """A single proxy address together with the time it may next be used.

    After every request the item updates ``start_timestamp`` and hands
    itself back to ``handler`` via ``handler.push(item, code)``, using code
    200 after a successful request and 503 after a failed one.
    """

    # The literal is split with implicit concatenation so that the header
    # value is a single line (a raw newline inside it is a syntax error).
    USER_AGENT_LIST = [
        "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 "
        "(KHTML, like Gecko) Chrome/85.0.4183.83 Safari/537.36"
    ]

    # Seconds a proxy is kept out of use after a failed request (10 minutes).
    FAILURE_COOLDOWN = 10 * 60

    def __init__(self, handler, proxy, start_timestamp=None):
        """Store the owning handler, the proxy address and the ready time.

        Args:
            handler: Object exposing ``push(item, code)``.
            proxy: Proxy address passed to ``requests`` for both http and
                https; ``None`` is used by the handler for placeholder items.
            start_timestamp: Epoch seconds from which the proxy may be used;
                defaults to the current time.
        """
        self.handler = handler
        self.proxy = proxy
        # ``is None`` (rather than ``or``) keeps an explicit 0 intact.
        self.start_timestamp = (
            time.time() if start_timestamp is None else start_timestamp
        )

    def _mark_failed(self):
        """Start the failure cool-down and return this item to the handler."""
        self.start_timestamp = time.time() + self.FAILURE_COOLDOWN
        self.handler.push(self, 503)

    def request(self, url):
        """GET ``url`` through this proxy, trying each user agent in turn.

        Args:
            url: Address to fetch.

        Returns:
            ``(True, body)`` on the first HTTP 200 response. Otherwise
            ``(False, text)`` where ``text`` is the exception message if the
            request raised, or the body of the last non-200 response (empty
            string when ``USER_AGENT_LIST`` is empty).
        """
        proxies = {"http": self.proxy, "https": self.proxy}
        # Pre-initialised so the final return is valid even when the loop
        # body never runs.
        last_text = ""
        for agent in self.USER_AGENT_LIST:
            try:
                res = requests.get(
                    url=url,
                    headers={"User-Agent": agent},
                    proxies=proxies,
                    timeout=10,
                )
                try:
                    status_code = res.status_code
                    last_text = res.text
                finally:
                    res.close()
            except Exception as e:
                # Best effort: any failure while fetching sidelines the proxy.
                self._mark_failed()
                return False, str(e)

            if status_code == 200:
                # Short random pause (2-5 s) before this proxy is reused.
                self.start_timestamp = time.time() + random.randint(2, 5)
                self.handler.push(self, 200)
                return True, last_text

        self._mark_failed()
        return False, last_text


class V2ProxyHandler(object):
    """Hands out proxy items from a "ready" queue and parks failed ones.

    Items pushed with code 200 go to ``QUEUE_200``; every other code goes to
    ``QUEUE_503``. When ``QUEUE_200`` is empty, ``pop`` moves the items of
    ``QUEUE_503`` whose cool-down has expired back into ``QUEUE_200``.
    """

    # Declared on the class, so every V2ProxyHandler instance shares the
    # same two queues.
    QUEUE_200 = queue.Queue()
    QUEUE_503 = queue.Queue()

    # Seconds to pause after re-queuing an item that is still cooling down,
    # so that pop() does not spin at full CPU while waiting.
    RETRY_INTERVAL = 0.05

    def __init__(self):
        # lock_status is not read anywhere in this class; kept for callers.
        self.lock_status = False
        self.lock = threading.RLock()

    def initial(self, proxy_list):
        """Wrap each address of ``proxy_list`` in an item and queue it as ready."""
        for ele in proxy_list:
            self.QUEUE_200.put(V2ProxyItem(self, ele))

    def pop(self, timeout=10):
        """Return the next proxy item whose ``start_timestamp`` has passed.

        Args:
            timeout: Seconds to wait for an item to appear in ``QUEUE_200``
                before giving up.

        Returns:
            A ready item, or a placeholder ``V2ProxyItem(self, None)`` when
            ``QUEUE_200`` stays empty for ``timeout`` seconds.
        """
        # Check again under the lock so that only one thread does the refill.
        if self.QUEUE_200.empty():
            with self.lock:
                if self.QUEUE_200.empty():
                    self.get_503_to_200()
        try:
            while True:
                item = self.QUEUE_200.get(timeout=timeout)
                remaining = item.start_timestamp - time.time()
                if remaining <= 0:
                    return item
                # Not usable yet: put it back and wait briefly before the
                # next attempt.
                self.push(item, 200)
                time.sleep(min(remaining, self.RETRY_INTERVAL))
        except queue.Empty:
            return V2ProxyItem(self, None)

    def push(self, item, code):
        """Queue ``item`` by ``code``; items without a proxy are dropped."""
        if not item.proxy:
            return
        target = self.QUEUE_200 if code == 200 else self.QUEUE_503
        target.put(item)

    def get_503_to_200(self):
        """Move items whose cool-down has expired from QUEUE_503 to QUEUE_200.

        Each item present when the call starts is inspected at most once;
        items that are still cooling down are put back into ``QUEUE_503``.
        """
        for _ in range(self.QUEUE_503.qsize()):
            try:
                item = self.QUEUE_503.get(block=False)
            except queue.Empty:
                # Fewer items than counted (taken elsewhere); nothing left.
                break
            if item.start_timestamp <= time.time():
                self.QUEUE_200.put(item)
            else:
                self.QUEUE_503.put(item)


if __name__ == "__main__":
    # Demo: load two proxies, take one and fetch a page through it.
    # Guarded so that importing this module performs no network request.
    proxy_object = V2ProxyHandler()
    proxy_object.initial(["47.22.1.20:8080", "127.0.0.1:5000"])
    proxy_item_object = proxy_object.pop()
    flag, text = proxy_item_object.request("https://www.amazon.cn/dp/B07XZR8GJZ/ref=lp_1397971071_1_14?s=music-players&ie=UTF8&qid=1599142215&sr=1-14")
    print(flag, text)
# Original article (原文地址): https://www.cnblogs.com/xuqidong/p/13611227.html