Python Learning (6)

Inside a Python 2 program you generally want unicode strings, while web pages usually arrive as UTF-8 encoded bytes.

Telling the two types apart:

>>> s = "你好"    # UTF-8 byte string
>>> l1 = [s]
>>> print l1
['\xe4\xbd\xa0\xe5\xa5\xbd']
>>> s = u"你好"  # unicode string
>>> l1 = [s]
>>> print l1
[u'\u4f60\u597d']
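The two forms convert into each other with decode() and encode(). A minimal sketch (Python 2, assuming the source file itself is saved as UTF-8):

#coding:utf-8
s = "你好"              # UTF-8 byte string
u = s.decode("utf-8")   # byte string -> unicode
print repr(u)           # u'\u4f60\u597d'
b = u.encode("utf-8")   # unicode -> UTF-8 byte string
print repr(b)           # '\xe4\xbd\xa0\xe5\xa5\xbd'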

Web spider

1. Start from one or a few seed URLs and analyze their content; the goal is to extract new URLs (seeds), which are then analyzed in turn (see the sketch after this list).

2. Fetch the pages and read out the content you need.
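A minimal in-memory sketch of that seed loop (Python 2; the starting URL is a placeholder, and the regex matches only absolute http:// links, as in the full script further down). Left unbounded it would crawl indefinitely; the database version below bounds the work with a status flag:

#coding:utf-8
import urllib2, re

seen, queue = set(), ["http://www.baidu.com"]   # placeholder seed
while queue:
    url = queue.pop(0)
    if url in seen:
        continue
    seen.add(url)
    try:
        buf = urllib2.urlopen(url).read()
    except Exception:
        continue
    # every absolute link found becomes a new seed
    queue.extend(re.findall(r'''<a.*?href.*?=["'](http://.*?)["'][> ]''', buf))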

Open a given page in the Chrome browser:

google-chrome ***.html

Fetching Baidu search results

#coding:utf-8
import urllib2, re

myinput = raw_input()  # search keyword typed by the user
httpaddr = "http://www.baidu.com/s?wd=%s" % myinput
f = urllib2.urlopen(httpaddr)
buf = f.read().replace("\n", "")    # strip newlines so the regex can span lines
#print buf
# greedily grab everything from the opening of the results div
# to the closing clear div
all_buf = re.findall('''<div id="content_left">.*<div style="clear:both;height:19px;">''', buf)
for i in all_buf:
    print i
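Run it, type a keyword at the prompt, and it prints the raw HTML of the results block (the .* in the pattern is greedy, so it matches the whole block in one piece). To pull out individual result titles instead, a non-greedy pattern appended to the script above could look like the following; the <h3 class="t"> markup is an assumption about Baidu's result page at the time and should be checked against the live HTML:

# hypothetical continuation of the script above:
# extract individual result titles non-greedily
titles = re.findall(r'<h3 class="t">.*?</h3>', buf)
for t in titles:
    print t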

Crawling a site's links in a loop and storing them in a database

#coding:utf-8
import urllib2, re
import zlib
import sys
from uliweb.orm import *

#db = get_connection("mysql://root:mysql@localhost/sprider?charset=utf8")

# fetch a page and extract all absolute http:// links
def get_url(httpaddr):
    f = urllib2.urlopen(httpaddr)
    buf = f.read()
    if f.headers.get('Content-Encoding') == 'gzip':
        buf = zlib.decompress(buf, 16 + zlib.MAX_WBITS)
    buf = buf.replace("\n", "")   # strip newlines
    #print buf
    all_buf = re.findall(r'''<a.*?href.*?=["'](http://.*?)["'][> ]''', buf)
    return all_buf

# store a new url in the database
def saved_url(url):
    newtab = spider_url.get(spider_url.c.url == url)  # skip duplicates
    if newtab:
        return
    newtab = spider_url()
    newtab.url = url
    newtab.status = "0"   # 0 = not yet crawled
    newtab.save()

# mark a url as crawled
def update_url(url):
    newtab = spider_url.get(spider_url.c.url == url)
    newtab.status = "1"
    newtab.save()

# fetch the next uncrawled url, or None when the queue is empty
def get_new_task():
    eachurl = spider_url.get(spider_url.c.status == "0")
    if eachurl:
        return eachurl.url
    return eachurl

if __name__ == "__main__":
    url = sys.argv[1]
    db = get_connection("mysql://root:mysql@localhost/sprider?charset=utf8")
    # note: the model definition must come after get_connection()
    # and before create_all()
    class spider_url(Model):
        url = Field(str)
        status = Field(str)

    db.metadata.drop_all()
    db.metadata.create_all()

    while 1:
        try:
            url_buf = get_url(url)
            for url in url_buf:
                saved_url(url)
        except Exception:   # a failed fetch just moves on to the next task
            pass
        url = get_new_task()
        if url is None:
            break
        #url_buf = get_url(url)
        update_url(url)
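To run it (hypothetical file name), pass the starting URL on the command line:

python spider.py http://www.baidu.com

Note that drop_all() wipes the spider_url table on every start, so each run begins from scratch; removing the drop_all()/create_all() pair would let a run resume where the previous one stopped.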