import pymysql
from http import cookiejar
from urllib import request, parse
from urllib.error import HTTPError, URLError


class Session_cookie(object):
    def __init__(self):
        # Build an opener that keeps cookies across requests.
        cookie = cookiejar.CookieJar()
        handler = request.HTTPCookieProcessor(cookie)
        self.opener = request.build_opener(handler)
        # request.install_opener(opener) would make the opener global;
        # use that with caution, especially in larger projects.

    def main(self, url, headers=None, data=None):
        return main(url, headers, data, self.opener)


# Crawler requests
# ------------------------------------------------------------
def main(url, headers=None, data=None, opener=None):
    # Entry point: dispatch on whether POST data was supplied.
    if not data:
        print('no data payload')
        return get_response(url, headers=headers, opener=opener)
    else:
        print('data payload present')
        return get_response(url, headers=headers, data=data, opener=opener)


def get_response(url, data=None, headers=None, opener=None):
    if not headers:
        # Pick a random User-Agent from the p_useragent table.
        headers = {'User-Agent': get_random('p_useragent')[0][1]}
    html = ''  # start with an empty value so there is always something to return
    try:
        if data:
            data = parse.urlencode(data)
            data = bytes(data, encoding='utf-8')
            req = request.Request(url, data=data, headers=headers)
        else:
            req = request.Request(url, headers=headers)
        if not opener:
            response = request.urlopen(req)
        else:
            print('using opener')
            response = opener.open(req)
        html = response.read().decode()
    except HTTPError as e:
        # Catch-all error report; too coarse for real debugging.
        print(e)
    except URLError as e:
        print(e)
    # Return outside the try block: if it sat inside, an HTTPError would
    # leave the caller with no return value at all.
    return html


# Database handling
# ------------------------------------------------------------
def mysql_connect(table, sql, data=None):
    # Open a connection; recent pymysql versions require keyword arguments.
    # (table is accepted but unused here; the SQL string already names it.)
    conn = pymysql.connect(host='127.0.0.1', user='root', password='123456',
                           database='PaChong', charset='utf8')
    cursor = conn.cursor()
    if not data:
        cursor.execute(sql)        # plain query
    else:
        cursor.execute(sql, data)  # parameterized statement to commit
    conn.commit()
    return cursor


def get_random(table=None):
    # Fetch one random row (proxy, user agent, ...) from the given table.
    sql = ('SELECT * FROM {} WHERE id >= '
           '((SELECT MAX(Id) FROM {}) - (SELECT MIN(Id) FROM {})) * RAND() '
           '+ (SELECT MIN(Id) FROM {}) LIMIT 1').format(table, table, table, table)
    cursor = mysql_connect(table, sql=sql)
    return cursor.fetchall()


def proxies_save_mysql(data):
    # Save a proxy record; dict keys become columns, values become parameters.
    table = 'p_proxies'
    keys = ','.join(data.keys())
    values = ','.join(['%s'] * len(data))
    sql = 'insert into {}({}) values({})'.format(table, keys, values)
    data = tuple(data.values())
    mysql_connect(table, sql=sql, data=data)


# ------------------------------------------------------------
if __name__ == '__main__':
    # url = 'http://fanyi.baidu.com/sug'
    # data = {'kw': '中国'}
    # import json
    # res = json.loads(main(url, data=data))
    # print(res)

    # url = 'http://www.baidu.com'
    # res = main(url)
    # print(res)
    pass
Normally, every spider you write walks through the same analyze -> request -> response -> download (store) flow, and much of that work is just reinventing the wheel: issuing requests, attaching request headers, encoding POST data. The module above collects those pieces into a single .py file, so each new spider script can simply call it and skip the boilerplate of filling in headers, encoding POST parameters, and so on.
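As a quick illustration, here is a minimal sketch of what such a caller could look like. It assumes the module above is saved as spider_utils.py and that the MySQL tables it relies on (p_useragent, p_proxies) already exist; the file name and setup are assumptions, not something the original specifies.

# demo_spider.py -- hypothetical caller of the module above,
# assumed to be saved alongside it as spider_utils.py.
import json

from spider_utils import main, Session_cookie

# Plain GET: headers are omitted, so get_response falls back to a
# random User-Agent drawn from the p_useragent table.
html = main('http://www.baidu.com')

# POST: passing a data dict makes main() take the urlencode/bytes branch.
res = json.loads(main('http://fanyi.baidu.com/sug', data={'kw': '中国'}))

# Requests that must share cookies (e.g. after a login) go through one
# Session_cookie instance, which reuses the same CookieJar-backed opener.
session = Session_cookie()
page1 = session.main('http://www.baidu.com')
page2 = session.main('http://www.baidu.com')

Each spider script stays down to a few lines of its own logic, while header injection, POST encoding, and cookie handling all live in the shared module.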