Python Web Scraping Notes ==> Chapter 7: Basic Usage of the urllib Library

Learning goal:

  urllib provides URL-parsing functions, so it is worth learning
Main steps

Step1: What is urllib

  The urllib library ships with Python; it is the built-in HTTP request library

  It contains four modules:
  

>>> import urllib
>>> # urllib.request  request module
>>> # urllib.error  exception-handling module
>>> # urllib.parse  URL-parsing module
>>> # urllib.robotparser  robots.txt parsing module
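Of the four modules, urllib.robotparser is the only one not demonstrated later in this chapter. A minimal sketch of what it does (the robots.txt body and example.com URLs here are made up for illustration; parse() lets us feed the rules in directly instead of fetching them over the network):

```python
import urllib.robotparser

rp = urllib.robotparser.RobotFileParser()
# Feed a robots.txt body directly instead of fetching it with read()
rp.parse([
    'User-agent: *',
    'Disallow: /private/',
])
print(rp.can_fetch('*', 'http://example.com/index.html'))  # allowed -> True
print(rp.can_fetch('*', 'http://example.com/private/x'))   # disallowed -> False
```

In a real crawler you would call rp.set_url(...) and rp.read() against the target site's robots.txt, then check can_fetch() before each request.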

Step2: Usage walkthrough

  1. urlopen
    # -*- coding:utf-8 -*-
    
    import urllib.request
    
    '''
    The signature of urlopen is:
    urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT, *, cafile=None, capath=None, cadefault=False, context=None)
    '''
    # Example 1
    response = urllib.request.urlopen('http://www.baidu.com')
    print(response.read().decode('utf-8'))    # read() returns the response body as bytes; decode it to print readable text - without decode() it prints as one long bytes literal
    
    
    print('\n')
    print('urllib.parse example')
    print('\n')
    import urllib.request
    import urllib.parse
    
    data = bytes(urllib.parse.urlencode({'word': 'hello'}), encoding='utf-8')
    response = urllib.request.urlopen('http://httpbin.org/post',data=data)
    print(response.read())
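The learning goal above mentions urllib's URL-parsing functions, which urlencode() only touches on. A short sketch of urlparse() and urlunparse(), the core parsing pair in urllib.parse (the example URL is arbitrary):

```python
from urllib.parse import urlparse, urlunparse

# urlparse splits a URL into six parts: scheme, netloc, path, params, query, fragment
result = urlparse('http://www.baidu.com/index.html;user?id=5#comment')
print(result.scheme, result.netloc, result.path)    # http www.baidu.com /index.html
print(result.params, result.query, result.fragment) # user id=5 comment

# urlunparse reassembles the six parts back into the original URL
print(urlunparse(result))
```

This runs entirely offline, so it is a convenient way to experiment with how urllib splits query strings and fragments.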
    
    print('\n')
    print('timeout in urllib and the urllib.error exception-handling module')
    print('\n')
    
    import urllib.request
    import socket
    import urllib.error
    try:
        response = urllib.request.urlopen('http://httpbin.org/get',timeout=0.1)
    except urllib.error.URLError as e:
        if isinstance(e.reason, socket.timeout):
            print('TIMEOUT')
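urlopen() itself only accepts a URL string plus optional data and timeout. To send custom request headers, wrap the URL in a urllib.request.Request object first. A minimal sketch (the httpbin.org URL follows the examples above; nothing is actually sent here, the commented-out urlopen call would do that):

```python
import urllib.request

# Wrap the URL in a Request so headers can be attached before opening it
req = urllib.request.Request(
    'http://httpbin.org/get',
    headers={'User-Agent': 'Mozilla/5.0'},
)
print(req.get_method())              # GET - no data given, so the method defaults to GET
print(req.get_header('User-agent'))  # urllib stores header names with capitalize()
# response = urllib.request.urlopen(req)  # would send the request with this header
```

Setting a browser-like User-Agent this way is a common first step when a site rejects the default Python-urllib agent string.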
  2. The response object
    # -*-  coding:utf-8 -*-
    
    print('Response type example')
    
    import urllib.request
    
    response = urllib.request.urlopen('http://www.baidu.com')
    print(type(response))
  3. Status code and response headers
    # -*-  coding:utf-8 -*-
    
    print('Status code and response headers example')
    
    import urllib.request
    
    response = urllib.request.urlopen('http://www.baidu.com')
    print(response.status)
    print(response.getheaders())
    print(response.getheader('Content-Type'))
    print(response.getheader('Date'))
    print(response.getheader('Server'))

    Output
    Status code and response headers example
    200
    [('Date', 'Tue, 03 Apr 2018 14:29:52 GMT'), ('Content-Type', 'text/html; charset=utf-8'), ('Transfer-Encoding', 'chunked'), ('Connection', 'Close'), ('Vary', 'Accept-Encoding'), ('Set-Cookie', 'BAIDUID=6150350FD6AF7F0B4629DA49AEF7DEAE:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BIDUPSID=6150350FD6AF7F0B4629DA49AEF7DEAE; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'PSTM=1522765792; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com'), ('Set-Cookie', 'BDSVRTM=0; path=/'), ('Set-Cookie', 'BD_HOME=0; path=/'), ('Set-Cookie', 'H_PS_PSSID=1430_25809_13290_21093_20927; path=/; domain=.baidu.com'), ('P3P', 'CP=" OTI DSP COR IVA OUR IND COM "'), ('Cache-Control', 'private'), ('Cxy_all', 'baidu+66a85a47dcb1b7de8cd2d7ba25b3a1dc'), ('Expires', 'Tue, 03 Apr 2018 14:29:42 GMT'), ('X-Powered-By', 'HPHP'), ('Server', 'BWS/1.1'), ('X-UA-Compatible', 'IE=Edge,chrome=1'), ('BDPAGETYPE', '1'), ('BDQID', '0xa1de1b2000003abd'), ('BDUSERID', '0')]
    text/html; charset=utf-8
    Tue, 03 Apr 2018 14:29:52 GMT
    BWS/1.1
  4. Handlers and proxies
    # -*-  coding:utf-8 -*-
    import urllib.request
    
    proxy_handler = urllib.request.ProxyHandler(
        {'http': 'http://127.0.0.1:9743', 'https': 'https://127.0.0.1:9743'}
    )  # substitute your actual proxy address here
    opener = urllib.request.build_opener(proxy_handler)
    response = opener.open('http://www.baidu.com')
    print(response.read())
  5. Cookies (small text records that identify a user to the server)
    # -*-  coding:utf-8 -*-
    import urllib.request,http.cookiejar
    # Save cookies to a file
    filename = 'cookie.txt'
    cookie = http.cookiejar.LWPCookieJar(filename)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    cookie.save(ignore_discard=True,ignore_expires=True)

    Printing cookies

    # -*-  coding:utf-8 -*-
    
    import urllib.request,http.cookiejar
    
    # Declare cookie as a CookieJar object
    cookie = http.cookiejar.CookieJar()
    # The handler applies the jar's cookies to requests, as a browser would
    handler = urllib.request.HTTPCookieProcessor(cookie)
    # build_opener attaches the cookie handler to the opener
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    for i in cookie:
        print(i.name + '=' + i.value)

    Loading locally saved cookies back into requests

    # -*-  coding:utf-8 -*-
    
    import urllib.request,http.cookiejar
    
    cookie = http.cookiejar.LWPCookieJar()
    cookie.load('cookie.txt', ignore_discard=True, ignore_expires=True)
    handler = urllib.request.HTTPCookieProcessor(cookie)
    opener = urllib.request.build_opener(handler)
    response = opener.open('http://www.baidu.com')
    print(response.read().decode('utf-8'))
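Besides LWPCookieJar, http.cookiejar also provides MozillaCookieJar, which reads and writes the Netscape cookies.txt format that browsers and tools such as curl understand. A small offline sketch comparing the file headers the two classes produce (the filenames are arbitrary; the jars are saved empty just to show the formats):

```python
import http.cookiejar

# Save an (empty) jar in each format and compare the file headers
lwp = http.cookiejar.LWPCookieJar()
lwp.save('cookie_lwp.txt', ignore_discard=True, ignore_expires=True)

moz = http.cookiejar.MozillaCookieJar()
moz.save('cookie_moz.txt', ignore_discard=True, ignore_expires=True)

with open('cookie_lwp.txt') as f:
    print(f.readline().strip())   # LWP (libwww-perl) format header
with open('cookie_moz.txt') as f:
    print(f.readline().strip())   # Netscape cookies.txt format header
```

Apart from the on-disk format, MozillaCookieJar is used exactly like LWPCookieJar in the save/load examples above.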

Summary:

  The remaining built-in methods were not exercised here; moving straight on to the next section, on the requests library

Original post: https://www.cnblogs.com/wuzhiming/p/8700244.html