Python 2.x 中 urllib2 和 urllib 的使用

1.最简单用法

　　urlopen(url, data=None, timeout=socket._GLOBAL_DEFAULT_TIMEOUT,...)

 1 import urllib2
 2 import urllib
 3 
 4 
 5 response = urllib2.urlopen("http://www.baidu.com")
 6 
 7 print 'getcode():',response.getcode()
 8 print 'geturl():',response.geturl()
 9 print 'url:',response.url
10 print 'headers:
',response.headers
11 print 'msg:',response.msg
12 
13 #-------------------------------------out--------------------------------------
14 getcode(): 200
15 geturl(): http://www.baidu.com
16 url: http://www.baidu.com
17 headers:
18 Date: Thu, 29 Dec 2016 06:28:36 GMT
19 Content-Type: text/html; charset=utf-8
20 Transfer-Encoding: chunked
21 Connection: Close
22 Vary: Accept-Encoding
23 Set-Cookie: BAIDUID=9A1E663B4C3AB33D11266F0D865A1F59:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
24 Set-Cookie: BIDUPSID=9A1E663B4C3AB33D11266F0D865A1F59; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
25 Set-Cookie: PSTM=1482992916; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
26 Set-Cookie: BDSVRTM=0; path=/
27 Set-Cookie: BD_HOME=0; path=/
28 Set-Cookie: H_PS_PSSID=21858_1464_21112_17001_21553_20930; path=/; domain=.baidu.com
29 P3P: CP=" OTI DSP COR IVA OUR IND COM "
30 Cache-Control: private
31 Cxy_all: baidu+0ba0b09e0fa305471b5e3b42c352570f
32 Expires: Thu, 29 Dec 2016 06:27:54 GMT
33 X-Powered-By: HPHP
34 Server: BWS/1.1
35 X-UA-Compatible: IE=Edge,chrome=1
36 BDPAGETYPE: 1
37 BDQID: 0x889c1bcd00004be7
38 BDUSERID: 0
39 
40 msg: OK

View Code

获取html内容

1 print response.read()     #以str字符串形式返回整个页面
2 print response.readline() #每执行一次返回一行
3 print response.readlines() #以列表形式返回

View Code

2. 构造Request 设置headers

 1 def set_headers():
 2     #构造Request,设置headers
 3     #__init__(self, url, data=None, headers={},origin_req_host=None, unverifiable=False)
 4     import urllib2
 5     headers = {'User-Agent':'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
 6     request = urllib2.Request("http://localhost:5000/urllib2testget",headers=headers)
 7 
 8     response = urllib2.urlopen(request)
 9     print request.headers
10     #追加一个header
11     request.add_header("addheader","nice")
12     response = urllib2.urlopen(request)
13     print request.headers
14 
15 set_headers()
16 
17 #--------------------------------输出:
18 
19 {'User-agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
20 {"a": "1", "": "2"}
21 ------------------------------------------------
22 {'Addheader': 'nice', 'User-agent': 'liubi-Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36'}
23 {"a": "1", "": "2"}

View Code

3.发送get请求,发送post请求

 1 def get_post():
 2     #get方式
 3     import urllib2
 4     import urllib
 5     headers = {'User-Agent':'liu bi'}
 6     values = {"username":"diaosir_get","password":"diao123_get"}
 7     data = urllib.urlencode(values)
 8     print '---------------------get:'
 9     url = "http://localhost:5000/urllib2testget"
10     get_url=url+"?"+data
11     request = urllib2.Request(get_url,headers=headers)
12     response = urllib2.urlopen(request)
13     print json.loads(response.read())
14     print '---------------------post:'
15     url = "http://localhost:5000/urllib2testpost"
16     request = urllib2.Request(url,data,headers=headers)
17     response = urllib2.urlopen(request)
18     print json.loads(response.read())
19 
20 get_post()
21 
22 #---------------------------------------------------------输出:
23 ---------------------get:
24 {u'username': u'diaosir_get', u'password': u'diao123_get'}
25 ---------------------post:
26 {u'username': u'diaosir_get', u'password': u'diao123_get'}

post&get

4.代理模式设置

def set_proxies():
    #1.proxy_handler
    #2.创建operner
    #3.安装opener[非必须]
    #4.拿operner去请求url
    enable_proxy = True
    proxy_handler = urllib2.ProxyHandler({"http":'http://120.24.73.165:3128'})
    null_proxy_handler = urllib2.ProxyHandler({})
    if enable_proxy:
        opener = urllib2.build_opener(proxy_handler)#挂载opener
    else:
        opener = urllib2.build_opener(null_proxy_handler)
    request = urllib2.Request('http://www.baidu.com')
    print '---------------------不使用代理'
    response = urllib2.urlopen(request)
    print response.getcode(),request.host
    print '---------------------使用代理'
    response = opener.open(request)
    print response.getcode(),request.host

#----------------------------------------------------------输出
---------------------不使用代理
200 www.baidu.com
---------------------使用代理
200 120.24.73.165:3128

View Code

5. debug模式：httpsHandler 未在代码中定义，不能传给 urllib2.build_opener（否则会抛出 NameError）。

 1 def debug_set():
 2     #代理，调试
 3     import  urllib2,urllib
 4     proxy_handler = urllib2.ProxyHandler({"http":'http://192.168.1.108:89'})
 5 
 6     #debuglog的使用
 7     httpHandler = urllib2.HTTPHandler(debuglevel=1)
 8     opener = urllib2.build_opener(httpHandler, httpsHandler,)
 9     urllib2.install_opener(opener) 
10     request = urllib2.Request('http://127.0.0.1:5000/urllib2testget?a=2&b=3',headers={'User-Agent':'liubi00'})
11     response = opener.open(request)
12     print response.getcode(),response.read()
13 
14 
15 
16 
17 #-------------------------------------------输出:
18 send: 'GET /urllib2testget?a=2&b=3 HTTP/1.1
Accept-Encoding: identity
Host: 127.0.0.1:5000
Connection: close
User-Agent: liubi00

'
19 reply: 'HTTP/1.0 200 OK
'
20 header: Content-Type: text/html; charset=utf-8
21 header: Content-Length: 20
22 header: Server: Werkzeug/0.11.11 Python/2.7.12
23 header: Date: Fri, 30 Dec 2016 15:12:40 GMT
24 200 {"a": "2", "b": "3"}

View Code

6.获取cookie存到cookie.txt

import cookielib
import  urllib2

def get_cookie():
    filename = 'cookie.txt'
    #声明一个MozillaCookieJar对象实例来保存cookie，之后写入文件
    cookie = cookielib.MozillaCookieJar(filename)
    #利用urllib2库的HTTPCookieProcessor对象来创建cookie处理器
    handler = urllib2.HTTPCookieProcessor(cookie)
    #通过handler来构建opener
    opener = urllib2.build_opener(handler,)
    request = urllib2.Request('http://www.baidu.com')
    request.add_header('User-Agent','fuckyou')
    response = opener.open(request)
    #保存cookie到文件
    cookie.save(ignore_discard=True, ignore_expires=True)
    print response.getcode()

get_cookie()

#----------------------------------------------输出:
200

View Code

7.通过cookie请求，更多查看http://www.cnblogs.com/sysu-blackbear/p/3629770.html

 1 import cookielib
 2 import urllib2
 3 def use_cookie():
 4     #cookie--从cookies.txt读取cookies,携带cookies请求
 5     cookie_file = 'cookie.txt'
 6     #创建MozillaCookieJar实例对象
 7     cookie = cookielib.MozillaCookieJar(cookie_file)
 8     #从文件中读取cookie内容到变量
 9     cookie.load( ignore_discard=True, ignore_expires=True)
10     #创建请求的request
11     req = urllib2.Request("http://www.baidu.com")
12     #利用urllib2的build_opener方法创建一个opener
13     opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
14     response = opener.open(req)
15     print response.read()

View Code

8.异常处理

 1 def deal_errors():
 2     #异常处理
 3     import urllib2
 4     #HTTPError
 5     req = urllib2.Request('http://blog.csdn.net/cqcre')
 6     try:
 7         urllib2.urlopen(req)
 8     except urllib2.HTTPError, e:
 9         print e.code
10         print e.reason
11 
12     #URLError
13     requset = urllib2.Request('http://www.xxxxx.com')
14     try:
15         urllib2.urlopen(requset)
16     except urllib2.URLError, e:
17         print e.reason
18 
19     #HTTPERROR&URLERROR
20     req = urllib2.Request('http://blog.csdn.net/cqcre')
21     try:
22         urllib2.urlopen(req)
23     except urllib2.URLError, e:
24         if hasattr(e,"code"):
25             print e.code
26         if hasattr(e,"reason"):
27             print e.reason
28     else:
29         print "OK"

View Code