IP_POOL

#!/usr/bin/python
# -*- coding: utf-8 -*-
"""
@file : demo.py
@time: 2017/11/15 15:58
"""


from multiprocessing import Process

import requests
import redis
import time
import re

from fake_useragent import UserAgent

# aiohttp relocated these exception classes between major versions.
try:
    from aiohttp.errors import (ProxyConnectionError, ServerDisconnectedError,
                                ClientResponseError, ClientConnectorError)
except ImportError:
    from aiohttp import (ClientProxyConnectionError as ProxyConnectionError,
                         ServerDisconnectedError, ClientResponseError,
                         ClientConnectorError)


HOST = 'localhost'
PORT = 6379

TEST_API = 'http://www.baidu.com'


POOL_LOWER_THRESHOLD = 300  # when checking the pool: below this count, run the crawler to fetch more; above it, just wait
POOL_UPPER_THRESHOLD = 10000  # upper limit: while crawling, the fetch loop stops once the pool exceeds this count

POOL_LEN_CHECK_CYCLE = 10  # how often (seconds) to check whether the pool has enough IPs
VALID_CHECK_CYCLE = 5 * 60  # how often (seconds) to re-check that pooled proxies still work


def get_page(url, options=None):
    # `options=None` avoids a mutable default argument; extra headers are
    # merged into the base set below.
    ua = UserAgent()
    base_headers = {
        'User-Agent': ua.random,
        'Accept-Encoding': 'gzip, deflate, sdch',
        'Accept-Language': 'zh-CN,zh;q=0.8'
    }
    headers = dict(base_headers, **(options or {}))
    print('Getting', url)
    try:
        # A timeout keeps one dead source from hanging the crawler.
        r = requests.get(url, headers=headers, timeout=10)
        if r.status_code == 200:
            return r.text
    except requests.ConnectionError:
        print('Crawling Failed', url)
    return None
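

# Usage note (added): get_page returns the page body on HTTP 200 and None on
# any failure, e.g. html = get_page('http://www.baidu.com').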


class ResourceDepletionError(Exception):

    def __init__(self):
        Exception.__init__(self)

    def __str__(self):
        return repr('The proxy source is exhausted')


class FreeProxyGetter(object):

    def get_raw_proxies(self):
        proxies = []

        start_url = 'http://localhost:8080'
        html = get_page(start_url)
        if html is None:
            return proxies
        # Match host:port pairs such as 127.0.0.1:8888 in the page body.
        ip_address = re.compile(r'\d+\.\d+\.\d+\.\d+:\d+')
        re_ip_address = ip_address.findall(html)
        for address in re_ip_address:
            proxies.append(address)

        return proxies


class RedisClient(object):
    def __init__(self, host=HOST, port=PORT):
        self._db = redis.Redis(host=host, port=port)

    def get(self, count=1):
        # Pop the first `count` proxies: read them, then LTRIM the list so it
        # keeps only the given range; everything outside that range is dropped.
        proxies = self._db.lrange("proxies", 0, count - 1)
        self._db.ltrim("proxies", count, -1)
        return proxies

    def put(self, proxy):
        self._db.rpush("proxies", proxy)

    @property
    def queue_len(self):
        return self._db.llen("proxies")
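

# A minimal consumption sketch (added for illustration, not part of the original
# script): pull one proxy out of the Redis pool above and route a request
# through it. `fetch_via_pool` is a hypothetical helper name; it assumes the
# scheduler below is running and that "proxies" stores host:port strings.
def fetch_via_pool(url):
    conn = RedisClient()
    candidates = conn.get(count=1)
    if not candidates:
        return None  # pool is empty; PoolAdder below will refill it
    proxy = candidates[0].decode('utf-8')  # redis-py returns bytes by default
    return requests.get(url, proxies={'http': 'http://' + proxy}, timeout=10)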


class PoolAdder(object):

    def __init__(self, threshold):
        self._threshold = threshold  # upper limit for the pool size
        self._conn = RedisClient()
        self._tester = ValidityTester()
        self._crawler = FreeProxyGetter()

    def is_over_threshold(self):
        return self._conn.queue_len >= self._threshold

    def add_to_queue(self):
        proxy_count = 0
        while not self.is_over_threshold():
            # The tester pushes working proxies into Redis as a side effect.
            raw_proxies = self._crawler.get_raw_proxies()
            self._tester.set_raw_proxies(raw_proxies)
            self._tester.test()
            proxy_count += len(raw_proxies)
            if proxy_count == 0:
                raise ResourceDepletionError


class ValidityTester(object):
    test_api = TEST_API

    def __init__(self):
        self._raw_proxies = None
        self._usable_proxies = []

    def set_raw_proxies(self, proxies):
        self._raw_proxies = proxies
        self._conn = RedisClient()

    def test(self):
        for proxy in self._raw_proxies:
            if isinstance(proxy, bytes):
                proxy = proxy.decode('utf-8')
            real_proxy = 'http://' + proxy
            try:
                response = requests.get(self.test_api,
                                        proxies={'http': real_proxy},
                                        timeout=10)
                if response.status_code == 200:
                    self._conn.put(proxy)
                    print('Valid proxy', proxy)
                else:
                    print('Invalid proxy', proxy)
            except requests.RequestException:
                # A dead proxy raises instead of returning a status code.
                print('Invalid proxy', proxy)
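
# Example use of the tester on its own (added for illustration; the address is
# a placeholder):
# >>> tester = ValidityTester()
# >>> tester.set_raw_proxies(['127.0.0.1:8888'])
# >>> tester.test()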


class Schedule(object):
    @staticmethod
    def valid_proxy(cycle=VALID_CHECK_CYCLE):
        # Periodically pull half of the pool and re-test it; the tester pushes
        # the proxies that still work back onto the queue.
        conn = RedisClient()
        tester = ValidityTester()
        while True:
            count = int(0.5 * conn.queue_len)
            if count == 0:
                print('Waiting for adding')
                time.sleep(cycle)
                continue
            raw_proxies = conn.get(count=count)
            tester.set_raw_proxies(raw_proxies)
            tester.test()
            time.sleep(cycle)

    @staticmethod
    def check_pool(upper_threshold=POOL_UPPER_THRESHOLD,
                   lower_threshold=POOL_LOWER_THRESHOLD,
                   cycle=POOL_LEN_CHECK_CYCLE):
        # Whenever the pool falls below the lower threshold, crawl new proxies
        # until it reaches the upper threshold again.
        conn = RedisClient()
        adder = PoolAdder(upper_threshold)
        while True:
            if conn.queue_len < lower_threshold:
                adder.add_to_queue()
            time.sleep(cycle)

    def run(self):
        print('IP processing running')
        valid_process = Process(target=Schedule.valid_proxy)
        check_process = Process(target=Schedule.check_pool)
        valid_process.start()
        check_process.start()


def main():
    s = Schedule()
    s.run()


if __name__ == '__main__':
    main()
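
# Note (added): running this script assumes a Redis server on localhost:6379
# (HOST/PORT above) and a proxy source serving plain host:port entries at
# http://localhost:8080 (see FreeProxyGetter); adjust the constants at the top
# to point elsewhere.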
Original article: https://www.cnblogs.com/liyugeng/p/7845567.html