python_crawler_module

import pymysql
from http import cookiejar
from urllib import request,parse
from urllib.error import HTTPError,URLError

class Session_cookie(object):
    def __init__(self): # initialize: build a cookie-aware opener
        cookie = cookiejar.CookieJar()
        handler = request.HTTPCookieProcessor(cookie)
        self.opener = request.build_opener(handler)
        # request.install_opener(opener) # this would make the opener global; use with caution, especially in large projects
    def main(self,url,headers=None,data=None):
        return main(url,headers,data,self.opener)
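
# A minimal usage sketch for Session_cookie (the URL and form fields below are
# hypothetical, for illustration only): because every request goes through the
# same cookie-carrying opener, a POST to a login form followed by a GET stays
# within the same logged-in session.
# session = Session_cookie()
# session.main('http://example.com/login', data={'user': 'u', 'pwd': 'p'})
# profile_html = session.main('http://example.com/profile')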

# Crawler requests
# ------------------------------------------------------------
def main(url,headers=None,data=None,opener=None): # dispatch helper
    if not data:
        print('no data payload')
        return get_response(url,headers=headers,opener=opener)
    else:
        print('data payload present')
        return get_response(url,headers=headers,data=data,opener=opener)

def get_response(url,data=None,headers=None,opener=None):
    if not headers:
        headers = {'User-Agent':get_random('p_useragent')[0][1]}
    html = '' # default to an empty string so there is always a value to return on error
    try:
        if data:
            data = parse.urlencode(data)
            data = bytes(data,encoding='utf-8')
            req = request.Request(url, data=data, headers=headers)
        else:
            req = request.Request(url,headers=headers)
        if not opener:
            response = request.urlopen(req)
        else:
            print('using opener')
            response = opener.open(req)
        html = response.read().decode()
    except HTTPError as e: # HTTPError is a subclass of URLError, so catch it first; printing e alone is too coarse for debugging
        print(e)
    except URLError as e:
        print(e)
    return html  # return outside the try block: if this sat inside it, an HTTPError would leave the caller with no value
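
# Because get_response() returns '' on HTTPError/URLError, callers can test the
# return value instead of wrapping every call in try/except; a sketch (the URL
# is a placeholder):
# html = main('http://example.com/maybe-broken')
# if not html:
#     pass  # retry, switch proxy, or log the failure here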

# Database handling
# ------------------------------------------------------------
def mysql_connect(table,sql,data=None): # open a connection and execute one statement
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456', database='PaChong', charset='utf8')
    cursor = conn.cursor()
    if not data:
        row = cursor.execute(sql) # read-only query
    else:
        row = cursor.execute(sql,data) # write that needs a commit
        conn.commit() # commit the write
    return cursor # the connection stays open so the caller can still fetch from the cursor

def get_random(table=None): # fetch one random row (proxy or user agent) from the given table
    sql = 'SELECT * FROM {0} WHERE id >= ((SELECT MAX(Id) FROM {0})-(SELECT MIN(Id) FROM {0})) * RAND() + (SELECT MIN(Id) FROM {0}) LIMIT 1'.format(table)
    cursor = mysql_connect(table,sql=sql)
    return cursor.fetchall()
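
# get_random() assumes tables like p_useragent / p_proxies exist with an
# auto-increment id as the first column; get_random('p_useragent')[0][1] reads
# the second column of the fetched row. A hypothetical schema sketch:
# CREATE TABLE p_useragent (
#     id INT AUTO_INCREMENT PRIMARY KEY,
#     useragent VARCHAR(512) NOT NULL
# );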

def proxies_save_mysql(data): # save a proxy record built from a dict of column:value pairs
    table = 'p_proxies'
    keys = ','.join(data.keys())
    values = ','.join(['%s']*len(data))
    sql = 'insert into {}({}) values({})'.format(table,keys,values)
    data = tuple(data.values())

    mysql_connect(table,sql=sql,data=data)
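
# Example call (the column names are assumptions about the p_proxies schema,
# shown for illustration only):
# proxies_save_mysql({'ip': '1.2.3.4', 'port': '8080', 'protocol': 'http'})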
# ------------------------------------------------------------

if __name__ == '__main__':
    # url = 'http://fanyi.baidu.com/sug'
    # data = {'kw':'中国'}
    # import json
    # res = json.loads(main(url,data=data))
    # print(res)

    # url = 'http://www.baidu.com'
    # res = main(url)
    # print(res)
    pass

Normally, every crawler you write goes through the same flow of analyze -> request -> response -> download (store), but many of those steps amount to reinventing the wheel: making the request, attaching request headers, encoding POST data. By collecting these functions into a single .py file, any new crawler script can simply import it and skip the repetitive work of typing out request headers, encoding POST parameters, and so on.
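
For example, assuming this file is saved as crawler_module.py (the filename is an assumption for illustration), another crawler script only needs to import and call main() or Session_cookie; the random User-Agent lookup, POST encoding, and error handling all happen inside the module:

import json
from crawler_module import main, Session_cookie  # hypothetical filename

# Plain GET: a random User-Agent is pulled from the p_useragent table.
html = main('http://www.baidu.com')

# POST: the data dict is URL-encoded and converted to bytes automatically.
res = json.loads(main('http://fanyi.baidu.com/sug', data={'kw': '中国'}))
print(res)

# Cookie-persistent session for sites that require login state.
session = Session_cookie()
page = session.main('http://www.baidu.com')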

Original article: https://www.cnblogs.com/hejianlong/p/9470438.html