python3获取代理IP

在GitHub 上找了个获取代理IP的脚本,发现已经失效了,所以自己改了下
使用python3.8
# -*- coding:UTF-8 -*-
from bs4 import BeautifulSoup
import subprocess as sp
import requests
import random
import re
import logging

# Configure the root logger once at import time: INFO level, "timestamp - level - message" format.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s- %(message)s')


def get_proxys():
    """Scrape high-anonymity proxy IPs from xiladaili.com.

    Returns:
        list[str]: dotted-quad IP strings parsed from the first column of
        the proxy table (may be empty if the page layout changed).
    """
    # A requests Session keeps cookies automatically; no manual cookie handling.
    session = requests.Session()
    # High-anonymity proxy listing page.
    target_url = 'http://www.xiladaili.com/gaoni/'
    # A realistic User-Agent so the site serves the normal page.
    target_headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) '
                                    'Chrome/88.0.4324.146 Safari/537.36'}
    target_response = session.get(url=target_url, headers=target_headers)
    target_response.encoding = 'utf-8'
    target_html = target_response.text
    # The IPs live in the first <td> of each <tr> inside <tbody>.
    soup = BeautifulSoup(target_html, 'lxml')
    rows = soup.find('tbody').find_all('tr')

    # BUG FIX: the original pattern r'd+.d+.d+.d+' had lost its backslashes
    # and could not match an IP; \d and \. are required for a dotted quad.
    ip_pattern = re.compile(r'\d+\.\d+\.\d+\.\d+')

    proxys_list = []
    for row in rows:
        iptxt = row.find('td').string
        mo = ip_pattern.search(iptxt or '')
        if mo is None:
            # BUG FIX: the original returned None on the first row that
            # failed to parse, discarding every IP already collected.
            # Skip the bad row and keep scraping instead.
            continue
        proxys_list.append(mo.group())
        logging.debug(proxys_list)
    return proxys_list


def check_ip(ip, lose_time, waste_time):
    """Ping *ip* and return its average round-trip time in milliseconds.

    Args:
        ip: proxy IP address to test (string).
        lose_time: compiled regex capturing the packet-loss count from a
            Chinese-locale Windows ping report ("丢失 = N").
        waste_time: compiled regex capturing the average time ("平均 = Nms").

    Returns:
        int: average RTT in ms, or the sentinel 1000 when the host is
        considered unreachable (more than 2 packets lost, or no loss/timing
        line matched in the output).
    """
    # Windows ping flags: -n <count> echo requests, -w <ms> timeout per reply.
    # SECURITY FIX: the IP comes from a scraped web page (untrusted input);
    # the original interpolated it into a shell=True command string, allowing
    # command injection. Pass an argument list with no shell instead.
    # Using sp.run also waits for the child and closes its pipes (the
    # original Popen handle was never reaped).
    proc = sp.run(["ping", "-n", "4", "-w", "4", ip],
                  stdin=sp.PIPE, stdout=sp.PIPE, stderr=sp.PIPE)
    # Chinese-locale Windows consoles emit GBK-encoded output.
    out = proc.stdout.decode("gbk")

    # Packet-loss count; if the loss line is absent, assume everything was
    # lost (original convention: default the count to 3).
    lost = lose_time.findall(out)
    lose = int(lost[0]) if lost else 3
    if lose > 2:
        # Too many drops -> treat as a 1000 ms timeout.
        return 1000
    average = waste_time.findall(out)
    if not average:
        # Replies arrived but no average line matched -> also treat as timeout.
        return 1000
    return int(average[0])


def initpattern():
    """Compile the regexes that parse Chinese-locale Windows ping output.

    Returns:
        tuple: (lose_time, waste_time) compiled patterns — the packet-loss
        count ("丢失 = N") and the average round-trip time ("平均 = Nms").
    """
    # BUG FIX: both original patterns read "(d+)" — the backslashes were
    # lost in transit, so they matched a literal letter 'd' and never
    # captured a number. Restore \d+ (and use raw strings).
    lose_time = re.compile(r"丢失 = (\d+)", re.IGNORECASE)
    waste_time = re.compile(r"平均 = (\d+)ms", re.IGNORECASE)
    return lose_time, waste_time


def proxyip():
    """Pick a responsive proxy IP (average ping < 300 ms) from the scraped list.

    Returns:
        str | None: a usable proxy IP address, or None when the scrape
        produced no candidates or every candidate timed out.
    """
    lose_time, waste_time = initpattern()
    proxys_list = get_proxys()

    # Keep sampling random candidates until one responds fast enough.
    # BUG FIX: the original `while True` looped forever (random.choice on an
    # emptied list raises IndexError; get_proxys() returning None raised
    # TypeError). Looping on truthiness terminates cleanly in both cases.
    while proxys_list:
        candidate = random.choice(proxys_list)
        logging.debug(candidate)
        average_time = check_ip(candidate, lose_time, waste_time)
        logging.debug(average_time)
        # BUG FIX: the original tested `> 300` then `< 300`, so a result of
        # exactly 300 ms matched neither branch and the loop spun without
        # removing the candidate. `>= 300` closes the gap.
        if average_time >= 300:
            # Too slow / unreachable: discard the candidate and retry.
            proxys_list.remove(candidate)
            continue
        # Consume the chosen IP so it is not handed out twice.
        proxys_list.remove(candidate)
        return candidate
    # All candidates exhausted (or none scraped).
    return None

if __name__ == '__main__':
    # BUG FIX: the original line read "proxyip(" — an unclosed parenthesis,
    # a syntax error that prevented the whole script from running.
    # Fetch one working proxy IP and report it.
    proxyip()
 
原文地址:https://www.cnblogs.com/fanpiao/p/15273086.html