解码url
from urllib import request

if __name__ == '__main__':
    url = "https://i.cnblogs.com/EditPosts.aspx?opt=1"
    # Fetch the page; urlopen returns a response object whose read()
    # yields raw bytes.
    response = request.urlopen(url)
    raw = response.read()
    # Turn the bytes into a str (default UTF-8 codec).
    text = raw.decode()
    print(text)
网页编码解决
- chardet
- 安装:pip install chardet
# chardet is a third-party package: pip install chardet
import chardet
from urllib import request

if __name__ == '__main__':
    url = "https://i.cnblogs.com/EditPosts.aspx?opt=1"
    # Bug fix: `import urllib` alone does not make `urllib.request`
    # available; import the submodule explicitly.
    rsp = request.urlopen(url)
    html = rsp.read()
    # Guess the page encoding from the raw bytes.
    cs = chardet.detect(html)
    # Bug fix: detect() can report {'encoding': None}; dict.get's default
    # only kicks in for a MISSING key, so use `or` for the fallback.
    html = html.decode(cs.get('encoding') or 'utf-8')
    print(html)
urlopen的返回对象
- geturl:返回请求对象的url
- info:请求返回对象的meta信息
- code:返回http code
from urllib import request

if __name__ == '__main__':
    url = "https://i.cnblogs.com/EditPosts.aspx?opt=1"
    rsp = request.urlopen(url)
    # geturl(): the URL actually retrieved (after any redirects).
    print('URL:{0}'.format(rsp.geturl()))
    # info(): the response headers / meta information.
    print('INFO:{0}'.format(rsp.info()))
    # Bug fix: `code` is an int attribute, not a method — `rsp.code()`
    # raised TypeError. Use getcode() (or plain `rsp.code`).
    print('code:{0}'.format(rsp.getcode()))
request.data的使用
from urllib import request, parse

if __name__ == '__main__':
    # Bug fix: the original opened the literal with a non-ASCII quote (‘),
    # which is a SyntaxError.
    url = 'http://www.baidu.com/s?'
    wd = input()
    qs = {
        'wd': wd
    }
    # urlencode percent-encodes the query string (handles non-ASCII input).
    qs = parse.urlencode(qs)
    fullurl = url + qs
    print(fullurl)
    rsp = request.urlopen(fullurl)
    html = rsp.read()
    html = html.decode()
    print(html)
from urllib import request, parse
import json

baseurl = 'https://fanyi.baidu.com/sug'
data = {
    'kw': 'girl'
}
# POST bodies must be bytes: urlencode, then encode.
data = parse.urlencode(data).encode('utf-8')
# Bug fix: the header name was wrapped in non-ASCII quotes (‘…’),
# which is a SyntaxError.
headers = {
    'Content-Length': len(data)
}
req = request.Request(url=baseurl, data=data, headers=headers)
rsp = request.urlopen(req)
# Bug fix: `decode(utf-8)` referenced an undefined name; the codec
# name must be a string.
json_data = rsp.read().decode('utf-8')
json_data = json.loads(json_data)
for item in json_data['data']:
    print(item['k'], '--', item['v'])
urllib.error
- URLError产生的原因:
- 没网
- 服务器链接失败
- 不知道指定服务器
- 是OSError的子类
- HTTPError,是URLError的一个子类
- 两者的区别
- HTTPError是对应的HTTP请求的返回码错误,如果返回错误码是400以上的,则引发HTTPError
- URLError对应的一般是网络出现问题,包括url问题
- 关系区别OSError-URLError-HTTPError
UserAgent
from urllib import request, error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        # Spoof a browser User-Agent so the request looks like a real client.
        headers = {
            'User-Agent': 'Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5',
        }
        req = request.Request(url, headers=headers)
        page = request.urlopen(req).read().decode()
        print(page)
    except error.HTTPError as e:
        # Server answered with an HTTP status >= 400.
        print(e)
    except error.URLError as e:
        # Network-level failure (no route, DNS, refused connection...).
        print(e)
    except Exception as e:
        print(e)
    print("Done")
ProxyHandler处理(代理服务器)
- 获取代理服务器网址
- www.xicidaili.com
- www.goubanjia.com
# coding=gbk
from urllib import request, error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    # Proxy address to route HTTP traffic through.
    proxy = {'http': '221.2.155.35:8060'}
    # Build an opener that knows about the proxy, then install it
    # globally so a plain request.urlopen() uses it.
    proxy_handler = request.ProxyHandler(proxy)
    opener = request.build_opener(proxy_handler)
    request.install_opener(opener)
    try:
        page = request.urlopen(url).read().decode()
        print(page)
    except error.HTTPError as e:
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
    print("Done")
使用cookie登录
- http模块包含一些关于cookie的模块。通过他们我们可以自动使用cookie
- CookieJar
- 管理存储cookie,向传出的http请求添加cookie
- cookie存储在内存中,CookieJar实例回收后cookie将消失
- FileCookieJar
- 使用文件管理cookie
- filename是保存cookie的文件
- MozillaCookieJar
- LwpCookieJar
# -*- coding:gbk -*-
from urllib import request, parse
from http import cookiejar

# In-memory cookie store; its cookies disappear when the CookieJar
# instance is garbage-collected.
cookie = cookiejar.CookieJar()
# Handler that attaches stored cookies to outgoing requests.
cookie_handler = request.HTTPCookieProcessor(cookie)
# Plain HTTP / HTTPS request handlers.
http_handler = request.HTTPHandler()
https_handler = request.HTTPSHandler()
# One shared opener so the login cookie is reused by later requests.
opener = request.build_opener(http_handler, https_handler, cookie_handler)


def login():
    '''
    Perform the initial sign-in.

    Posts the username/password so the shared CookieJar captures the
    session cookie for subsequent requests.
    '''
    # This URL must be taken from the login form's `action` attribute.
    url = 'http://www.renren.com/PLogin.do'
    credentials = {
        'email': '15651115780',
        'password': '13743426096'
    }
    # Encode the form data into a bytes body.
    body = parse.urlencode(credentials).encode()
    req = request.Request(url, data=body)
    # Send through the shared opener so the cookie gets stored.
    opener.open(req)


def getHomePage():
    url = 'http://www.renren.com/SysHome.do'
    # If login() already ran, the opener carries the session cookie.
    rsp = opener.open(url)
    html = rsp.read().decode()
    with open("rsp2.html", "w") as f:
        f.write(html)


if __name__ == '__main__':
    login()
    getHomePage()
- cookie的属性
- name:名称
- value
- domain:可以访问此cookie的域名
- path:可以访问此cookie的页面路径
- expires:过期时间
- size:大小
- Http字段
SSL
- SSL证书就是指遵守SSL安全套接层协议的服务器数字证书
- 美国网景公司开发
- CA是数字证书认证中心,是发放,管理,废除数字证书的收信人的第三方机构
- 遇到不信任的SSL证书,需要单独处理 V17
js加密
- 在线代码格式化
- tool.oschina.net/codeformat/js
- 有的反爬虫策略采用js对需要传输的数据进行加密处理(通常是取MD5值)
- 经过加密,传输的就是密文,但是加密函数或者过程一定是在浏览器完成,也就是一定会把代码(js代码)暴露给使用者
- 通过阅读加密算法,就可以模拟出加密过程,从而达到破解
- 过程v18,v19
# coding=gbk
# salt: "" + (new Date).getTime() + parseInt(10 * Math.random(), 10)
# sign: n.md5("fanyideskweb" + e + salt + "@6f#X3=cCuncYssPsuRUE")
def getSalt():
    """
    Build the `salt` request parameter.

    Mirrors the site's JS formula:
        "" + (new Date).getTime() + parseInt(10 * Math.random(), 10)
    i.e. current epoch milliseconds plus one random digit.
    """
    import time, random
    # Bug fix: randint(0, 10) could yield 10, but the JS
    # parseInt(10 * Math.random(), 10) is always in [0, 9].
    salt = int(time.time() * 1000) + random.randint(0, 9)
    return salt
def getMD5(v):
    """Return the 32-character hex MD5 digest of the string *v*."""
    import hashlib
    digest = hashlib.md5(v.encode('utf-8'))
    return digest.hexdigest()
def getSign(key, salt):
    """
    Compute the `sign` request parameter.

    Mirrors the JS: md5("fanyideskweb" + key + salt + secret), where the
    secret suffix is hard-coded in the site's client script.
    """
    import hashlib
    raw = "fanyideskweb" + key + str(salt) + "@6f#X3=cCuncYssPsuRUE"
    return hashlib.md5(raw.encode('utf-8')).hexdigest()
from urllib import request,parse
def youdao(key):
    """POST *key* to Youdao's translate endpoint with a faked signed form."""
    url = "http://fanyi.youdao.com/translate_o?smartresult=dict&smartresult=rule"
    salt = getSalt()
    # Form fields copied from a captured browser request; salt and sign
    # are recomputed per call the same way the site's JS does.
    form = {
        "i": key,
        "from": "AUTO",
        "to": "AUTO",
        "smartresult": "dict",
        "client": "fanyideskweb",
        "salt": str(salt),
        "sign": getSign(key, salt),
        "ts": "1558941232097",
        "bv": "242cf5b812990bc02d30b6b44aeafc41",
        "doctype": "json",
        "version": "2.1",
        "keyfrom": "fanyi.web",
        "action": "FY_BY_REALTlME",
    }
    body = parse.urlencode(form).encode()
    headers = {
        "Accept": "application/json,text/javascript,*/*;q=0.01",
        # "Accept-Encoding": "gzip,deflate",
        "Accept-Language": "zh-CN,zh;q=0.9",
        "Connection": "keep-alive",
        # Recomputed here because the body length varies with `key`.
        "Content-Length": len(body),
        "Content-Type": "application/x-www-form-urlencoded;charset=UTF-8",
        "Cookie": "OUTFOX_SEARCH_USER_ID=-706733229@10.169.0.82;JSESSIONID=aaaMY2QmJ5IRoeix5h4Rw;__guid=204659719.1294217786932070400.1558941222576.428;monitor_count=1;OUTFOX_SEARCH_USER_ID_NCOO=1649781088.13294;___rl__test__cookies=1558941232081",
        "Host": "fanyi.youdao.com",
        "Origin": "http://fanyi.youdao.com",
        "Referer": "http://fanyi.youdao.com/",
        "User-Agent": "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36",
        "X-Requested-With": "XMLHttpRequest",
    }
    req = request.Request(url=url, data=body, headers=headers)
    reply = request.urlopen(req).read().decode()
    print(reply)


if __name__ == '__main__':
    youdao('破冰行动')
Requests
- 继承了urllib的所有特征
- 底层使用的是urllib3
- 开源地址:https://github.com/requests/requests
- 中文文档:http://cn.python-requests.org/zh_CN/latest/、
- 安装:pip install requests
- get 请求
- requests.get()
- requests.request('get', url)
- 可以带有headers和params类
- post
- rsp = requests.post(url, data=data)
- data,headers要求dict类型
- proxy
- proxies = {"http":"address of proxy",
"https":"address of proxy"
}
- rsp = requests.request("get", "http:XXXX", proxies=proxies)
- 代理有可能报错,如果使用人数多,考虑安全问题,可能会被强行关闭
- 用户验证
可能需要使用HTTP basic Auth
格式为 用户名:密码@代理地址:端口地址
proxy = {"http":"china:123456@192.168.1.123:4444"}
rsp = requests.get("http://baidu.com", proxies=proxy)
- web客户端验证
- 如果遇到web客户端验证,需要添加auth=(用户名,密码)
- auth = ('test1', "123456")# 授权信息
- rsp = requests.get("http://www.baidu.com", auth=auth)
动态HTML
- 动态HTML介绍
- JavaScript
- jQuery
- Ajax
- DHTML
- Python采集动态数据
- 从Javascript代码入手采集
- Python第三方库运行JavaScript,直接采集你在浏览器看到的页面
- Selenium:web自动化测试工具
- PhantomJS
- Selenium库有一个WebDriver的API
- WebDriver可以跟页面上的元素进行各种交互,用它可以来进行爬取
- 案例v36
# coding=gbk
from selenium import webdriver
import time2isoz
#通过Keys模拟键盘
from selenium.webdriver.common.keys import Keys
# 操作那个浏览器就对哪个浏览器建一个实例
# 自动按照环境变量查找响应的浏览器
driver = webdriver.PhantomJS()
# 如果浏览器没有在响应环境变量中,需要指定浏览器位置
driver.get("http://www.baidu.com")
# 通过函数查找title标签
print('Title:{0}'.format(driver.title))
- chrome + chromedriver
- 下载安装chrome
- 下载安装chromedriver
- 把chromedriver放到chrome根目录中,以及python的scripts文件中
# coding=utf-8
from selenium import webdriver
# Bug fix: Keys is used below (Ctrl+A / Ctrl+X / Enter) but was never
# imported in this script, causing a NameError.
from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Chrome()
driver.get('http://www.baidu.com')
text = driver.find_element_by_id('wrapper').text
print(text)
print(driver.title)
# Take a snapshot of the current page.
driver.save_screenshot('index.png')
# id="kw" is Baidu's search box; type the query into it.
driver.find_element_by_id('kw').send_keys(u'大熊猫')
# id="su" is the search button; click() simulates a mouse click.
driver.find_element_by_id('su').click()
time.sleep(5)
driver.save_screenshot('panda.png')
# Dump the cookies of the current page.
print(driver.get_cookies())
# Send Ctrl+A (select all) ...
driver.find_element_by_id('kw').send_keys(Keys.CONTROL, 'a')
# ... then Ctrl+X (cut) to empty the box before the next query.
driver.find_element_by_id('kw').send_keys(Keys.CONTROL, 'x')
driver.find_element_by_id('kw').send_keys(u'美女')
driver.save_screenshot('beauty.png')
# Pressing Enter on the button also submits the search.
driver.find_element_by_id('su').send_keys(Keys.RETURN)
time.sleep(3)
driver.save_screenshot('beauty2.png')
# Clear the search box.
driver.find_element_by_id('kw').clear()
driver.save_screenshot('clear.png')
# Close the browser.
driver.quit()