用代理池 + redis 刷博客浏览量(2)

用大才哥的代理池 地址:https://github.com/Germey/ProxyPool 

1.get_proxy() :从redis的端口随机拿一个代理
2.使用selenium模块,配置上代理信息,用webdriver刷浏览器地址
在此程序中,缺少了一个检验代理是否能有效快速连接博客,有的代理很慢,而还是用这些很慢的代理刷了一圈,浪费了很多时间。其实可以添加一段代码,如果连接第一篇博客很慢,就跳过这个代理地址。然而我很懒 嘿嘿
import time
import requests
import json
import re
from selenium import webdriver
from selenium.webdriver.support.wait import WebDriverWait
import redis
import time

# --- Configuration -------------------------------------------------------
proxy_url = 'http://localhost:5000/get'   # proxy-pool API: returns one random proxy
url = 'https://i.cnblogs.com/categories'  # JSON list of the blog's categories
base_url = 'https://i.cnblogs.com/posts?' # per-category post listing page
views = 0       # running total of view counts
url_list = []   # collected post URLs

headers = {
    # Put your own cnblogs cookie here so the logged-in admin pages load.
    'cookie': '你的cookie',
    'user-agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36',
    'upgrade-insecure-requests': '1'
}

# View count in the row after the "发布" (publish) column.  The count can be
# more than one digit, hence \d{1,}.  NOTE: the original post lost the
# backslashes of "\d" during HTML extraction; they are restored here,
# otherwise the pattern only matches a literal letter 'd'.
pattern1 = re.compile(r'<td>发布</td>.*?\d.*?(\d{1,})', re.S)
# Post URL taken from the title cell's href attribute.
pattern2 = re.compile(r'<td class="post-title"><a href="(.*?)"', re.S)
# Fetch the category list (JSON) and, for every category, scrape the
# per-category post listing for view counts and post URLs.
response = requests.get(url=url, headers=headers)
data = json.loads(response.text)  # list of category dicts
categories = (i['CategoryId'] for i in data)
for category in categories:
    cate_url = base_url + 'categoryid=' + str(category)  # listing page of this category
    headers['referer'] = cate_url  # the site expects a matching referer
    response = requests.get(cate_url, headers=headers)
    html = response.text
    results1 = re.findall(pattern1, html)  # view counts on this page
    results2 = re.findall(pattern2, html)  # post URLs on this page
    if results1:
        views += sum(int(r) for r in results1)       # accumulate view counts
        url_list.extend('https://' + r for r in results2)  # build absolute URLs

print('总浏览量为:', views)
print('一共{}篇文章'.format(len(url_list)))
# Guard against ZeroDivisionError when no posts were found at all.
if url_list:
    print('文章平均浏览量', views / len(url_list))
print(url_list)
def get_proxy():
    """Fetch one random proxy address from the local proxy-pool API.

    Returns:
        str | None: the proxy as returned by the pool (typically ``ip:port``),
        or ``None`` when the pool answers with a non-200 status or the
        request fails / times out.
    """
    try:
        # A timeout keeps the caller's loop from hanging if the pool is down.
        response = requests.get(proxy_url, timeout=5)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None
# Endlessly rotate through proxies; with each proxy, open every collected
# post URL once in a proxy-configured Chrome instance.
while True:
    proxy = get_proxy()
    if not proxy:
        # Pool returned nothing usable -- wait briefly and try again.
        print('no proxy available, retrying...')
        time.sleep(1)
        continue
    print('the proxy is:', proxy)

    # Build a Chrome instance routed through the proxy.
    options = webdriver.ChromeOptions()
    options.add_argument('lang=zh_CN.UTF-8')  # Chinese locale
    options.add_argument(
        'user-agent="Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36"')
    options.add_argument('--proxy-server={0}'.format(proxy))
    driver = webdriver.Chrome(chrome_options=options)
    # NOTE(review): the original created an unused redis.StrictRedis() client
    # here every iteration; it was never referenced, so it has been removed.
    try:
        for url in url_list:
            driver.delete_all_cookies()  # fresh session for each page view
            print('connecting to', url)
            past = time.time()
            driver.get(url)
            now = time.time()
            print('成功打开网页,使用了{}秒'.format(now - past))
    except Exception as e:
        # A dead or very slow proxy raises from driver.get(); log it and
        # move on to the next proxy instead of silently swallowing.
        print('proxy failed:', e)
    finally:
        # Always close the browser so failed proxies don't leak Chrome processes
        # (the original only quit on the success path).
        driver.quit()

 
原文地址:https://www.cnblogs.com/francischeng/p/9664836.html