Crawl proxy IPs from Xici (xicidaili.com) and store them in a MySQL database

Requirements:

  Grab proxy IP information from Xici, including the IP address, port, and IP type

  Xici: http://www.xicidaili.com/nn/

So, how do we tackle this?

  From a look at the page structure and the URL scheme:

    Everything we need is on the listing page itself; there is no separate detail page to visit

    The next page is reached simply by changing the trailing number in the URL, so building the URL per page number takes care of pagination (see the small sketch below)
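
  For example, page 2 of the listing lives at http://www.xicidaili.com/nn/2, so a minimal sketch of the URL construction looks like this (the same pattern is used in the full script further down):

# Build listing-page URLs by formatting the page number into the path
base_url = 'http://www.xicidaili.com/nn/{page}'
for page in range(1, 4):
    print(base_url.format(page=page))
# prints http://www.xicidaili.com/nn/1, .../2, .../3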

And what does the script need to run?

    python3.5

    scrapy

    twisted

    requests

    pymysql

  Everything above except Python itself is a third-party package and can be installed with pip (a quick import check is sketched below)

  A running MySQL server
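
  Before launching the crawler, a small sanity-check sketch to confirm the third-party packages are importable:

# Sanity check: confirm the required third-party packages are installed
for pkg in ('scrapy', 'twisted', 'requests', 'pymysql'):
    try:
        __import__(pkg)
        print(pkg, 'ok')
    except ImportError:
        print(pkg, 'missing - install it with pip')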

The db, user, and password values below should be changed to match your own setup
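
The script also assumes an ip_server table already exists in that database, with a unique key so that the "on duplicate key update" in the insert has something to act on. A possible definition, created here with pymysql (the column types and the (ip, port) unique key are my assumption; adjust to taste):

import pymysql

# Assumed schema for the ip_server table used by the crawler below
conn = pymysql.connect(host='localhost', user='user_name',
                       password='password', db='db_name', charset='utf8')
with conn.cursor() as cursor:
    cursor.execute("""
        create table if not exists ip_server (
            ip      varchar(32) not null,
            port    int         not null,
            ip_type varchar(16) not null,
            unique key uk_ip_port (ip, port)
        )
    """)
conn.commit()
conn.close()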

#!/usr/bin/python3

__author__ = 'beimenchuixue'
__blog__ = 'http://www.cnblogs.com/2bjiujiu/'

import requests
import pymysql
from time import sleep
from random import randint, choice
from scrapy.selector import Selector
from twisted.enterprise import adbapi
from twisted.internet import reactor

# Basic database settings; fill in your own values
db_settings = {
    'host': 'localhost',
    'db': 'db_name',
    'user': 'user_name',
    'password': 'password',
    'charset': 'utf8',
    'use_unicode': True
}
# conn = pymysql.connect(**db_settings)
# cursor = conn.cursor()

# Create an asynchronous connection pool through twisted's adbapi
db_conn = adbapi.ConnectionPool('pymysql', **db_settings)


def go_sleep():
    """进行随机io堵塞,模仿人访问"""
    while randint(0, 1):
        sleep(choice([0.1, 0.2, 0.3, 0.4, 0.5, 0.6]))


def get_sql(ip, port, ip_type):
    """获得sql语句"""
    if ip and port and ip_type:
        sql = """insert into
              ip_server(ip, port, ip_type)
               value (%s, %s, %s)
              on DUPLICATE key update ip=values(ip), port=values(port), ip_type=values(ip_type)"""
        try:
            params = (ip, int(port), ip_type)
        except Exception as e:
            print(e)
            return None
        return sql, params
    else:
        return None


def go_insert(cursor, sql, params):
    """数据库插入操作"""
    try:
        cursor.execute(sql, params)
    except Exception as e:
        print(e)


def get_ip():
    """爬取ip信息并存入数据库"""
    # 设置请求头
    headers = {
        'Referer': 'http://www.xicidaili.com/nn/',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/59.0.3071.115 Safari/537.36'
    }
    # Crawl the first 50 listing pages
    for page in range(1, 51):
        # Column positions of ip, port and type inside each table row, named for readability
        ip_index, port_index, type_index = 2, 3, 6
        # Listing page URL for this iteration
        url = 'http://www.xicidaili.com/nn/{page}'.format(page=page)
        
        go_sleep()
        
        response = requests.get(url, headers=headers)
        # Print the HTTP status code so failures are easy to spot
        print(response.status_code)
        # Parse the page with scrapy's Selector
        selectors = Selector(text=response.text)
        # Take every data row and skip the header; '#ip_list .odd' would only match every other row
        all_trs = selectors.css('#ip_list tr')[1:]
        for tr in all_trs:
            ip = tr.css('td:nth-child(%s)::text' % ip_index).extract_first()
            port = tr.css('td:nth-child(%s)::text' % port_index).extract_first()
            ip_type = tr.css('td:nth-child(%s)::text' % type_index).extract_first()
            sql_params = get_sql(ip, port, ip_type)
            if sql_params:
                sql, params = sql_params
                try:
                    # cursor.execute(sql, params)
                    # conn.commit()
                    # Hand the insert over to the twisted connection pool (runs asynchronously)
                    db_conn.runInteraction(go_insert, sql, params)
                except Exception as e:
                    print(e)
            else:
                break

if __name__ == '__main__':
    get_ip()
    # Give twisted's queued SQL operations a few seconds to finish, then stop the reactor
    reactor.callLater(4, reactor.stop)
    reactor.run()

  

Original post: https://www.cnblogs.com/2bjiujiu/p/7355234.html