The urllib library

from urllib import request
from urllib import parse

rsp = request.urlopen('http://www.baidu.com/')  # fetch the page at the given URL
print(rsp.getcode())  # HTTP status code of the response; 200 means success
print(rsp.read())  # rsp is a response object; read() returns the body as bytes
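The response object offers a few more inspection methods from the same urlopen response API, useful alongside getcode() and read():

print(rsp.geturl())  # the final URL, after any redirects
print(rsp.info())    # the response headers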

request.urlretrieve('http://www.baidu.com/', 'baidu.html')  # download the page and save it to 'baidu.html'
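urlretrieve() also accepts an optional reporthook callback that is invoked as chunks arrive, which can serve as a rough progress display; a minimal sketch:

def progress(block_num, block_size, total_size):
    # called after each chunk; total_size may be -1 if the server does not report it
    print(block_num * block_size, '/', total_size, 'bytes')

request.urlretrieve('http://www.baidu.com/', 'baidu.html', reporthook=progress)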

When a browser sends a request whose URL contains Chinese or other non-ASCII characters, it encodes them automatically. When sending the request from code, you must encode them yourself.
url = 'http://www.baidu.com/s'  # sending url = 'http://www.baidu.com/s?wd=刘德华' directly fails, because the unencoded Chinese cannot be transmitted
qs = {'wd':'刘德华'}
qs = parse.urlencode(qs)  # parse.urlencode() percent-encodes the parameters so the request is valid
url = url+'?'+qs
rsp = request.urlopen(url)
print(rsp.read().decode())
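To encode a single value rather than a whole dict of parameters, parse.quote() performs the same percent-encoding:

print(parse.quote('刘德华'))  # '%E5%88%98%E5%BE%B7%E5%8D%8E'
url = 'http://www.baidu.com/s?wd=' + parse.quote('刘德华')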

data = {'name': '周超', 'age': '18', 'greet': 'hello world'}
data = parse.urlencode(data)
print(type(data))
data = parse.parse_qs(data)  # parse.parse_qs() decodes an encoded query string back into a dict mapping each key to a list of values
print(data)
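A related helper, parse.parse_qsl(), returns the pairs as a list of tuples instead of a dict of lists, which round-trips cleanly through urlencode():

pairs = parse.parse_qsl('name=%E5%91%A8%E8%B6%85&age=18')
print(pairs)  # [('name', '周超'), ('age', '18')]
print(parse.urlencode(pairs))  # re-encodes the list of tuples back into a query string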

from urllib import parse

url = 'http://www.baidu.com/s?wd=python'
result = parse.urlparse(url)

print(result)
print('scheme:', result.scheme)
print('netloc:', result.netloc)
print('path:', result.path)
print('params:', result.params)
print('query:', result.query)
print('fragment:', result.fragment)

# parse.urlsplit() works almost the same way as parse.urlparse(): both break a URL
# into its component parts. The difference is that urlsplit() has no params field
# (its SplitResult lacks a .params attribute), so print its result separately.
result = parse.urlsplit(url)
print(result)
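A minimal demo of the difference, using a hypothetical URL that actually carries params: urlparse() splits ';type=a' off the last path segment into .params, while urlsplit() leaves it in .path.

demo = 'http://example.com/path;type=a?x=1'
print(parse.urlparse(demo).params)  # 'type=a'
print(parse.urlparse(demo).path)    # '/path'
print(parse.urlsplit(demo).path)    # '/path;type=a'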

To add request headers, use request.Request.
url = 'https://www.lagou.com/jobs/positionAjax.json?needAddtionalResult=false'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400',
'Referer':'https://www.lagou.com/jobs/list_python?labelWords=&fromSearch=true&suginput='}
data = {'first': 'true',
'pn': 1,
'kd': 'python'}

data = parse.urlencode(data).encode('utf-8')  # Python 3 strings are unicode by default, but the request body must be bytes, so encode() converts str to bytes
req = request.Request(url, headers=headers, data=data, method='POST')
res = request.urlopen(req)
print(res.read().decode())  # res.read() returns bytes; decode() converts it back to a unicode str
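Since this endpoint returns JSON, the decoded body can be parsed with the standard json module. Note that read() consumes the response, so in practice the body is read once into a variable (the response structure itself is Lagou's and not documented here):

import json
body = request.urlopen(req).read().decode('utf-8')  # re-issue the request and read the body once
result = json.loads(body)  # parse the JSON body into Python objects
print(result.keys())       # inspect the top-level fields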

7. Setting a proxy
from urllib import request
url = 'http://httpbin.org/ip'  # http://httpbin.org/ip echoes back the requesting IP address, handy for verifying the proxy
handler = request.ProxyHandler({'http': '183.129.244.17:45745'})  # request.ProxyHandler() builds a handler that routes HTTP traffic through the given proxy
opener = request.build_opener(handler)  # request.build_opener(handler) creates an opener that uses the handler
res = opener.open(url)
print(res.read())
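To make every subsequent request.urlopen() call go through the proxy as well, the opener can be installed globally with the standard install_opener():

request.install_opener(opener)  # register this opener as the global default
res = request.urlopen('http://httpbin.org/ip')  # now routed through the proxy too
print(res.read().decode())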

8. Cookies
from urllib import request
url = 'https://study.163.com/my?from=study'
headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.26 Safari/537.36 Core/1.63.6726.400 QQBrowser/10.2.2265.400',
'Referer':'https://chat.study.163.com/study-chat/index.htm?accid=s-1137692519&token=39b6d1cfc35c8e19c82ebdcee9e16fc3',
'Cookie':'EDUWEBDEVICE=3451afbac1a440369a55941240853d9c; _f=1536210599096; _ntes_nnid=a7b4d27203bf9eb9f82ba5bf0b410392,1536210592051; _ntes_nuid=a7b4d27203bf9eb9f82ba5bf0b410392; 1137692519=1137692519; hasVolume=true; videoVolume=0.4; sideBarPost=651; hb_MA-BFF5-63705950A31C_u=%7B%22utm_source%22%3A%20%22cp-1025897964%22%2C%22utm_medium%22%3A%20%22share%22%2C%22utm_campaign%22%3A%20%22commission%22%2C%22utm_content%22%3A%20%22%22%2C%22utm_term%22%3A%20%22%22%2C%22promotional_id%22%3A%20%22%22%7D; hb_MA-BFF5-63705950A31C_source=www.baidu.com; __utmz=129633230.1536484261.12.3.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; UTM_CPS=1025897964|2; videoRate=1.5; NTESSTUDYSI=7cd66f6bf5784901af063378928d2990; __utmc=129633230; STUDY_UUID=9b6b2ce6-e746-40c3-8135-4e867730acd2; STUDY_SESS="Tai3FvilqwjtCbmXjAJcj2j/mTqoSZ/RcrB9loI9ZOFaoycTtiOAgkXM1tMEL8CKyt/Wtmb3LKGs26Kt/uDsVeWnuHUwcBws771wcLIKkoBwwsCzhJWJ/TEjwxcoD+74K50kS2lTpm9BIjRvbEFmBGxfxNLia6E4X+R3124aKEcAFhqsm7+DHVfJhiFOprz2"; STUDY_INFO=5600144693|2|1137692519|1536545905785; NETEASE_WDA_UID=1137692519#|#1521627640058; NTES_STUDY_YUNXIN_ACCID=s-1137692519; NTES_STUDY_YUNXIN_TOKEN=39b6d1cfc35c8e19c82ebdcee9e16fc3; videoResolutionType=2; __utma=129633230.1332926595.1536210590.1536546039.1536546039.16; utm="eyJjIjoiIiwiY3QiOiIiLCJpIjoiIiwibSI6IiIsInMiOiIiLCJ0IjoiIn0=|aHR0cHM6Ly9zdHVkeS4xNjMuY29tL215P2Zyb209c3R1ZHk="; __utmb=129633230.3.10.1536546039'}

req = request.Request(url=url, headers=headers)
res = request.urlopen(req)

html = res.read().decode()  # read() consumes the response, so save the body once and reuse it

print(html)

with open('wangyiyun.html', 'w', encoding='utf-8') as f:
    f.write(html)
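Instead of copying a Cookie header out of the browser by hand, urllib can capture and resend cookies automatically via http.cookiejar; a minimal sketch, using httpbin.org as a stand-in test endpoint:

from urllib import request
from http import cookiejar

jar = cookiejar.CookieJar()  # holds any cookies the server sets
opener = request.build_opener(request.HTTPCookieProcessor(jar))
res = opener.open('http://httpbin.org/cookies/set?name=value')  # the server sets a cookie here
for cookie in jar:
    print(cookie.name, '=', cookie.value)  # cookies captured by the jar, resent on later requests through this opener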
