Python_爬虫_Scrapy设置代理

0、检测IP是否可用

# -*- coding: UTF-8 -*-
from urllib import request

if __name__ == "__main__":
    # Page that echoes the visiting IP address, so we can verify which
    # address the server actually sees.
    #url = 'http://2017.ip138.com/ic.asp'
    url = 'http://www.whatismyip.com.tw'
    # Proxy to test.
    # BUG FIX: the target URL uses the http scheme, but the original code
    # only mapped the 'https' scheme ({'https': ...}), so ProxyHandler
    # never applied the proxy and the request went out directly — the
    # "check" always reported the local IP as reachable. Register the
    # proxy for both schemes so it is used regardless of the URL scheme.
    proxy_address = '218.26.217.77:3128'
    proxy = {'http': proxy_address, 'https': proxy_address}
    # Build a ProxyHandler from the mapping above
    proxy_support = request.ProxyHandler(proxy)
    # Build an opener that routes through the proxy
    opener = request.build_opener(proxy_support)
    # Add a browser-like User-Agent (and Host) so the site serves the page
    opener.addheaders = [
        ('User-Agent','Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/62.0.3202.75 Safari/537.36'),
        ('Host','www.whatismyip.com.tw')
    ]
    # Install the opener globally so urlopen() uses it
    request.install_opener(opener)
    # Fetch the page through the (now installed) proxy
    response = request.urlopen(url)
    # Read the response body and decode it
    html = response.read().decode("utf-8")
    # Print the page — it should show the proxy's IP, not the local one
    print(html)
检测代理IP是否可用

1、在 middlewares.py 中添加如下代码,找到 IP 后粘贴在对应位置【找不到可以去淘宝买】 

# -*- coding: utf-8 -*-
# 导入随机模块
import random
# 导入有关IP池有关的模块
from scrapy.downloadermiddlewares.httpproxy import HttpProxyMiddleware
# 导入有关用户代理有关的模块
from scrapy.downloadermiddlewares.useragent import UserAgentMiddleware

# IP池
# Rotating-proxy downloader middleware: attaches a random proxy from the
# module-level IPPOOL to every outgoing request.
class HTTPPROXY(HttpProxyMiddleware):
    # The ip='' default is required so Scrapy can instantiate the
    # middleware without arguments.
    def __init__(self, ip=''):
        self.ip = ip

    def process_request(self, request, spider):
        # Pick one pool entry at random for this request.
        chosen = random.choice(IPPOOL)
        try:
            # Lookup stays inside the try so a malformed pool entry is
            # logged instead of crashing the crawl.
            address = chosen["ipaddr"]
            print("当前的IP是:" + address)
            request.meta["proxy"] = "http://" + address
        except Exception as e:
            print(e)


# 设置IP池
# Proxy IP pool consumed by HTTPPROXY.process_request.
# BUG FIX: the third entry was "1222.94.128.49" — 1222 is not a valid
# IPv4 octet (max 255); corrected to "122.94.128.49".
IPPOOL = [
    {"ipaddr": "182.117.102.10:8118"},
    {"ipaddr": "121.31.102.215:8123"},
    {"ipaddr": "122.94.128.49:8118"}
]


# 用户代理
# Rotating User-Agent downloader middleware: sets a random agent string
# from the module-level UPPOOL on every outgoing request.
class USERAGENT(UserAgentMiddleware):
    # The user_agent='' default is required so Scrapy can instantiate the
    # middleware without arguments.
    def __init__(self, user_agent=''):
        self.user_agent = user_agent

    def process_request(self, request, spider):
        # Pick one agent string at random for this request.
        agent = random.choice(UPPOOL)
        try:
            print("当前的User-Agent是:" + agent)
            # setdefault keeps any User-Agent a spider set explicitly.
            request.headers.setdefault('User-Agent', agent)
        except Exception as e:
            print(e)


# 设置用户代理池
# User-Agent pool consumed by USERAGENT.process_request — one browser
# signature per line for readability.
UPPOOL = [
    "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:52.0) Gecko/20100101 Firefox/52.0",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36",
    "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/51.0.2704.79 Safari/537.36 Edge/14.14393",
]

2、在 settings.py 中添加以下代码(注意根据项目名修改指向,如这里的工程名是“demo”)

# Enable the custom User-Agent middleware ahead of Scrapy's built-in one
# (lower number = higher priority, closer to the engine).
DOWNLOADER_MIDDLEWARES = {
    # "demo" is the project name — adjust the dotted path to your project.
    'demo.middlewares.USERAGENT': 1,
    'scrapy.downloadermiddlewares.useragent.UserAgentMiddleware': 2,
    # Uncomment to also enable the rotating IP-pool middleware:
    # 'scrapy.downloadermiddlewares.httpproxy.HttpProxyMiddleware': 123,
    # 'demo3.middlewares.HTTPPROXY': 125,
}
原文地址:https://www.cnblogs.com/hellangels333/p/8690080.html