Python 爬虫学习 urllib

  1. 网页抓取
    # -*-coding: utf-8 -*-
    
    import urllib
    
    url = "http://www.cndzz.com/"
    
    html = urllib.urlopen(url)
    
    print html.read()
    

      对于网页编码为gb2312等格式的网页,使用如下方法

    # -*-coding: utf-8 -*-
    
    import urllib
    
    url = "http://www.sina.com.cn/"
    
    html = urllib.urlopen(url)
    
    print html.read().decode("gbk").encode("utf-8")
    

      如果有多种编码,可以使用如下方法

    # -*-coding: utf-8 -*-
    # Author:Evilxr
    
    import urllib
     
    url = "http://www.sina.com.cn/"
     
    html = urllib.urlopen(url)
     
    print html.read().decode("gbk", "ignore").encode("utf-8")
    

      

  2. 获取Web服务器头部信息
    # -*-coding: utf-8 -*-
    # Author:Evilxr
    
    import urllib
    
    url = "http://www.sina.com.cn/"
    
    html = urllib.urlopen(url)
    
    print html.info()
    

      返回信息:

    Server: nginx
    Date: Mon, 10 Nov 2014 12:54:50 GMT
    Content-Type: text/html
    Last-Modified: Mon, 10 Nov 2014 12:54:11 GMT
    Vary: Accept-Encoding
    Expires: Mon, 10 Nov 2014 12:55:50 GMT
    Cache-Control: max-age=60
    X-Powered-By: schi_v1.03
    Age: 27
    Content-Length: 563513
    X-Cache: HIT from cd31-151.sina.com.cn
    Connection: close
    
    
    [Finished in 0.2s]
    

      

  3. 获取网页状态码
    # -*-coding: utf-8 -*-
    # Author:Evilxr
    
    import urllib
    
    url = "http://www.sina.com.cn/"
    
    html = urllib.urlopen(url)
    
    # 200正常访问	301重定向	403 禁止访问 404页面不存在	500 服务器忙或者服务器无响应
    print html.getcode()
    
    # 获取用户传入的url
    print html.geturl()
    
    # 关闭文件
    html.close
    

      

  4. 保存网页内容
    # -*- coding: utf-8 -*-
    # Author:Evilxr
    # Download a page straight to a local file.

    import urllib

    url = "http://www.cdnzz.com/"

    # Use a raw string for the Windows path: "d:\evilxr.html" only worked
    # by luck because \e is not a recognized escape sequence; a path like
    # "d:\new.html" would silently corrupt (\n -> newline).
    urllib.urlretrieve(url, r"d:\evilxr.html")
    

      

  5. 获取网站编码类型
    # coding:utf8
    # Author:Evilxr
    
    import urllib
    
    url = "http://www.163.com"
    
    html = urllib.urlopen(url)
    
    print html.info().getparam('charset')
    html.close()
    

      返回:

    GBK
    [Finished in 0.6s]
    

      

    # coding:utf8
    # Author:Evilxr
    
    import urllib
    
    url = "http://www.cnblogs.com/Evilxr"
    
    html = urllib.urlopen(url)
    
    print html.info().getparam('charset')
    html.close()
    

      返回:

    utf-8
    [Finished in 0.3s]
    

      

  6. 自动获取网站编码 chardet[字符集检测]
    #先安装chardet
    #pip install chardet
    # coding:utf8
    
    import urllib 
    import chardet
    
    def automatic_detect(url):
    	"""" doc """
    	content = urllib.urlopen(url).read()
    	result= chardet.detect(content)
    	encoding = result['encoding']
    	return encoding
    
    url_list = ["http://www.sina.com.cn/", 
    			 "http://www.cnblogs.com/evilxr",
    			  "http://bbs.hackav.com/",
    			  "http://www.baidu.com/",
    			  "http://fuli.ba/"]
    for url in url_list:
    	print url, automatic_detect(url)
    http://www.sina.com.cn/ GB2312
    http://www.cnblogs.com/evilxr utf-8
    http://bbs.hackav.com/ GB2312
    http://www.baidu.com/ utf-8
    http://fuli.ba/ utf-8
    [Finished in 17.1s]
    

      

原文地址:https://www.cnblogs.com/evilxr/p/4036697.html