Python Crawler: A Detailed Guide to the Urllib Library

A Detailed Guide to the Urllib Library
1. Urllib is Python's built-in HTTP request library
It contains 4 modules: urllib.request (request module); urllib.error (exception handling module); urllib.parse (URL parsing module); urllib.robotparser (robots.txt parsing module)
@urlopen: basic usage
~ Fetching the response content (GET):

import urllib.request  # GET-style request
response = urllib.request.urlopen('http://www.xxx.com')
print(response.read().decode('utf-8'))

~ Sending a POST request:

import urllib.parse  # POST-style request
import urllib.request
data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf-8')
response = urllib.request.urlopen('http://www.xxxx.com', data=data)
print(response.read())

~ Timeout setting: add a timeout argument to the urlopen call

import socket
import urllib.request
import urllib.error
try:
    response = urllib.request.urlopen('http://www.xxxx.com', timeout=1)
except urllib.error.URLError as e:
    if isinstance(e.reason, socket.timeout):  # the request timed out
        print('timeout')

@Response and Request usage
~ Status code and response headers:

import urllib.request
response = urllib.request.urlopen('http://www.xxxx.com')
print(response.status)  # response status code; 200 means OK
print(response.getheaders())  # all response headers, as a list of tuples
print(response.getheader('Server'))  # pass a header name to get a single value, e.g. the server type
print(response.read().decode('utf-8'))  # read the response body
~ Request objects:
import urllib.request
request = urllib.request.Request('http://www.xxxx.com')
response = urllib.request.urlopen(request)  # pass the Request object to urlopen
print(response.read().decode('utf-8'))

from urllib import request, parse
url = 'http://www.xxxx.com'
headers = {
    'User-Agent': 'Mozilla/5.0',  # your browser's identification string
    'Host': 'httpbin.org'
}
form = {
    'name': 'Germany'
}
data = bytes(parse.urlencode(form), encoding='utf-8')
req = request.Request(url=url, data=data, headers=headers, method='POST')  # a fully parameterized Request
response = request.urlopen(req)
print(response.read().decode('utf-8'))

@Handler:
~ Proxy: disguise your IP address

import urllib.request
proxy_handler = urllib.request.ProxyHandler({
    'http': 'http://127.0.0.1:9743',  # proxy address and port
    'https': 'http://127.0.0.1:9743'
})
opener = urllib.request.build_opener(proxy_handler)
response = opener.open('http://www.baidu.com')
print(response.read())

~ Cookies

import http.cookiejar, urllib.request
cookie = http.cookiejar.CookieJar()  # declare a CookieJar object
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)  # pass the handler to the opener
response = opener.open('http://www.baidu.com')
for item in cookie:
    print(item.name + '=' + item.value)

! Saving cookies to a text file

import http.cookiejar, urllib.request
filename = 'cookie.txt'
cookie = http.cookiejar.MozillaCookieJar(filename)  # Mozilla-format cookie file
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
cookie.save(ignore_discard=True, ignore_expires=True)
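
To reuse the saved cookies in a later session, here is a minimal sketch of loading them back, assuming cookie.txt was written by the snippet above:

import http.cookiejar, urllib.request
cookie = http.cookiejar.MozillaCookieJar()
cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)  # load the cookies saved earlier
handler = urllib.request.HTTPCookieProcessor(cookie)
opener = urllib.request.build_opener(handler)
response = opener.open('http://www.baidu.com')
print(response.read().decode('utf-8'))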

@Exception handling

from urllib import request, error  # catch exceptions
try:
    response = request.urlopen('http://www.xxx.com')
except error.URLError as e:  # other handling can be added here
    print(e.reason)

~ Exceptions that can be handled:
URLError: reason attribute
HTTPError: code attribute; reason attribute; headers attribute
ContentTooShortError

from urllib import request, error  # catch exceptions
try:
    response = request.urlopen('http://www.xxx.com')
except error.HTTPError as e:  # HTTPError is a subclass of URLError, so catch it first
    print(e.reason, e.code, e.headers, sep='\n')
except error.URLError as e:
    print(e.reason)
else:
    print('Request succeeded')

@URL parsing
~ urlparse: split a URL into its components

urllib.parse.urlparse(urlstring, scheme='', allow_fragments=True)
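
A minimal sketch of what urlparse returns (the URL here is just illustrative):

from urllib.parse import urlparse
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(type(result))  # <class 'urllib.parse.ParseResult'>
print(result)  # ParseResult(scheme='http', netloc='www.baidu.com', path='/index.html', params='user', query='id=5', fragment='comment')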

~ urlunparse: assemble a URL from its components

from urllib.parse import urlunparse
data = ['http', 'www.xxxx.com', 'index.html', 'user', 'a=6', 'comment']  # scheme, netloc, path, params, query, fragment
print(urlunparse(data))  # http://www.xxxx.com/index.html;user?a=6#comment
~ urljoin: join a base URL with another URL (sketch below)
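
A brief sketch with placeholder URLs: urljoin resolves the second argument against the base, and an absolute second URL takes precedence.

from urllib.parse import urljoin
print(urljoin('http://www.baidu.com', 'FAQ.html'))  # http://www.baidu.com/FAQ.html
print(urljoin('http://www.baidu.com/about.html', 'https://www.xxxx.com/FAQ.html'))  # https://www.xxxx.com/FAQ.html
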
~ urlencode: convert a dict object into GET request parameters
from urllib.parse import urlencode
params = {
    'name': 'germany',
    'age': 28
}
base_url = 'http://www.baidu.com?'
url = base_url + urlencode(params)
print(url)  # http://www.baidu.com?name=germany&age=28

Original article: https://www.cnblogs.com/spencersun/p/9531122.html