Python 爬虫学习 urllib2

用urllib2抓取被限制的网站页面

# coding:utf-8

import urllib2

url = "http://blog.csdn.net/troubleshooter"

html = urllib2.urlopen(url)

print html.read()

运行结果：服务器拒绝了该请求，urllib2.urlopen 抛出 HTTPError（HTTP Error 403: Forbidden）。

模拟用户访问

# coding:utf-8

import urllib2

url = "http://blog.csdn.net/troubleshooter"

url_headers = {'User-Agent':'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/38.0.2125.111 Safari/537.36',
				'Referer':'http://www.cnblogs.com/evilxr/p/4038902.html',
				'Host':'blog.csdn.net',
				'GET':url
				}



req = urllib2.Request(url, headers=url_headers)
html = urllib2.urlopen(req)
print html.getcode()

200
[Finished in 0.4s]

获取Cookie信息

import urllib2
import cookielib

cookie = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie))
response = opener.open('http://www.baidu.com')
In [12]: for i in cookie:
    print i.name,i.value
   ....:     
BAIDUID 4722B044786BAE8B1E484C0535706271:FG=1
BIDUPSID 4722B044786BAE8B1E484C0535706271
H_PS_PSSID 10299_16540_1430_16474_12824_10812_12868_14669_16520_16326_16662_16424_16514_15050_12386_13932
PSTM 1438398244
BDSVRTM 0
BD_HOME 0

打开调试功能

import urllib2

# debuglevel=1 makes the handlers print the outgoing request and the
# response status line and headers to stdout.
httpHandler = urllib2.HTTPHandler(debuglevel=1)
httpsHandler = urllib2.HTTPSHandler(debuglevel=1)
opener = urllib2.build_opener(httpHandler, httpsHandler)

# Install the opener globally so that plain urllib2.urlopen() calls use it.
urllib2.install_opener(opener)

# A single request is enough to produce the debug trace.
response = urllib2.urlopen('http://www.baidu.com')
send: 'GET / HTTP/1.1
Accept-Encoding: identity
Host: www.baidu.com
Connection: close
User-Agent: Python-urllib/2.7

'
reply: 'HTTP/1.1 200 OK
'
header: Date: Sat, 01 Aug 2015 03:14:07 GMT
header: Content-Type: text/html; charset=utf-8
header: Transfer-Encoding: chunked
header: Connection: Close
header: Vary: Accept-Encoding
header: Set-Cookie: BAIDUID=0E3FD673DED07D3DBB4D6048AB469A32:FG=1; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
header: Set-Cookie: BIDUPSID=0E3FD673DED07D3DBB4D6048AB469A32; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
header: Set-Cookie: PSTM=1438398847; expires=Thu, 31-Dec-37 23:55:55 GMT; max-age=2147483647; path=/; domain=.baidu.com
header: Set-Cookie: BDSVRTM=0; path=/
header: Set-Cookie: BD_HOME=0; path=/
header: Set-Cookie: H_PS_PSSID=13289_1441_10813_14432_12867_14667_16521_14951_16663_16427_16514_15291_12315_13932_10634; path=/; domain=.baidu.com
header: P3P: CP=" OTI DSP COR IVA OUR IND COM "
header: Cache-Control: private
header: Cxy_all: baidu+d4d7821ea11368a1cad938a4de84b7ab
header: Expires: Sat, 01 Aug 2015 03:13:12 GMT
header: X-Powered-By: HPHP
header: Server: BWS/1.1
header: X-UA-Compatible: IE=Edge,chrome=1
header: BDPAGETYPE: 1
header: BDQID: 0x8824b3dc0001bdbb
header: BDUSERID: 0