Practical tips! With a user-level crawler, who dares block your IP?

Generally speaking, when we crawl data from other websites we sooner or later run into IP restrictions. That is when a proxy IP pool comes in handy.
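Before building the pool itself, here is a minimal sketch of what "using a proxy" means with requests: you pass a proxies dict that maps URL schemes to proxy addresses. The address 1.2.3.4:8080 below is a placeholder, not a working proxy.

import requests

# Minimal sketch: route a single request through an HTTP(S) proxy.
# '1.2.3.4:8080' is a placeholder address, not a real proxy.
proxies = {
    'http': 'http://1.2.3.4:8080',
    'https': 'http://1.2.3.4:8080',
}
resp = requests.get('https://httpbin.org/ip', proxies=proxies, timeout=5)
print(resp.text)  # shows the IP address the target site sees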

1. How to fetch proxy IPs

import json

import requests

def getProxyIp():
    header = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'
    }
    proxy = []
    try:
        # Fetch the raw proxy list and parse it into JSON
        url1 = 'https://raw.githubusercontent.com/fate0/proxylist/master/proxy.list'
        resq = requests.get(url1, headers=header)
        # The file has one JSON object per line; join them into a JSON array
        res = '[' + resq.text.replace('}\n{', '},{') + ']'
        _s = json.loads(res)
        ips = []
        for i in _s:
            if i['type'] == 'https':
                ip = str(i['host']) + ':' + str(i['port'])
                ips.append(ip)
        # Verify each candidate proxy, then add the working ones to the pool.
        # url2 is the site you plan to crawl; the original snippet left it
        # undefined, so a generic test URL is assumed here.
        url2 = 'https://www.baidu.com'
        for x in ips:
            try:
                requests.get(url2, proxies={'http': 'http://' + x}, timeout=2)
                proxy.append(x)
            except Exception as e:
                print(str(e))
    except Exception as e:
        print(str(e))
    return proxy
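As a quick sanity check, the function can be called on its own; assuming the url2 placeholder above, it returns a list of 'host:port' strings that passed the test request:

# Hypothetical usage: build the pool once and inspect it
pool = getProxyIp()
print(len(pool), 'working proxies')
print(pool[:5])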

2. When the crawler uses a proxy, first check whether the proxy IP is still usable. If it is, crawl the page with it; if not, remove it from the pool and call the pool-fetching method to add fresh proxy IPs.

# Take proxies from getProxyIp() (step 1); use the ones that still work for the
# next request, drop dead ones from the pool, and refill it with fresh proxies.
import random
import time

import requests

def getHrefInfo(url, a):
    url_h = url
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.36'}
    proxy_list = getProxyIp()
    # 'a' is the index of the proxy to try first
    proxy_ip = proxy_list[a]
    # Keep trying until one proxy succeeds; whenever the current proxy fails,
    # remove it, pull fresh proxies into the pool, and pick another one.
    while True:
        # Proxy check: an HTTP 200 response means the proxy is usable
        try:
            resq = requests.get(url_h, headers=headers, proxies={'https': 'https://' + proxy_ip})
            if resq.status_code == 200:
                break
            # Drop proxies that do not return 200 and add proxies not yet in the pool
            else:
                proxy_list.remove(proxy_ip)
                _new_ip = getProxyIp()
                for ip in _new_ip:
                    if ip not in proxy_list:
                        proxy_list.append(ip)
                        break
                proxy_ip = random.choice(proxy_list)
        # Drop proxies that fail to respond at all and add proxies not yet in the pool
        except Exception as e:
            proxy_list.remove(proxy_ip)
            _new_ip = getProxyIp()
            for ip in _new_ip:
                if ip not in proxy_list:
                    proxy_list.append(ip)
                    break
            proxy_ip = random.choice(proxy_list)
            print('Dead proxy removed from the pool, switched to ' + proxy_ip + ': ' + str(e))
        time.sleep(3)
    return resq
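A possible way to drive the function, assuming the return resq line above; the target URL is a placeholder, and a is simply the index of the first proxy to try:

# Hypothetical usage: fetch one page, starting from the first proxy in the pool
resp = getHrefInfo('https://example.com/page.html', 0)
print(resp.status_code)
print(resp.text[:200])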

3. Finally, the proxy-checking module

import requests
import time
import traceback
from requests.exceptions import ProxyError, ConnectionError
from db.mongo_db import MongoDB
from multiprocessing.pool import ThreadPool


def valid_many(proxy_list, method):
    # Check proxies concurrently on a small thread pool
    pool = ThreadPool(16)
    for proxy in proxy_list:
        pool.apply_async(valid_one, args=(proxy, method))
    pool.close()
    pool.join()


def valid_one(proxy, method, url='https://www.baidu.com'):
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.106 Safari/537.36'
    }
    proxies = {
        'http': 'http://' + proxy['proxy'],
        'https': 'http://' + proxy['proxy']
    }
    try:
        start_time = time.time()
        # requests.packages.urllib3.disable_warnings()
        resp = requests.get(url, headers=headers, proxies=proxies, timeout=5, verify=False)
        delay = round(time.time() - start_time, 2)
        if resp.status_code == 200:
            # Record the latency: insert new proxies, or refresh existing ones
            proxy['delay'] = delay
            if method == 'insert':
                MongoDB().insert(proxy)
            elif method == 'check':
                MongoDB().update({'proxy': proxy['proxy']}, {'delay': proxy['delay']})
        else:
            # A non-200 response means the proxy is no longer usable
            if method == 'check':
                MongoDB().delete({'proxy': proxy['proxy']})
    except (ProxyError, ConnectionError):
        if method == 'check':
            MongoDB().delete({'proxy': proxy['proxy']})
    except Exception:
        traceback.print_exc()
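To give a rough idea of how the checker is driven (the proxy addresses below are placeholders, and the MongoDB wrapper comes from db.mongo_db, which is not shown here):

if __name__ == '__main__':
    # Placeholder proxies; in practice they come from the crawler or from MongoDB
    candidates = [
        {'proxy': '1.2.3.4:8080'},
        {'proxy': '5.6.7.8:3128'},
    ]
    valid_many(candidates, 'insert')  # validate and store the survivors
    valid_many(candidates, 'check')   # re-check stored proxies, drop dead ones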
Original article: https://www.cnblogs.com/jiguangdongtaiip/p/13536403.html