用python做爬虫的例子

主要用了 urllib、urllib2 和 BeautifulSoup 这几个库。

作用是从HTML中解析出解梦的查询词和具体的解释。

 1 # -*- coding: utf-8 -*-
 2 import urllib, urllib2
 3 import time, random
 4 from BeautifulSoup import BeautifulSoup
 5 
 6 def fetchURL(str_url):
 7 
 8     user_agent = 'Mozilla/5.0 (Windows NT 6.1; WOW64) 
 9                   AppleWebKit/537.36 (KHTML, like Gecko)'
10     values     = {}
11     headers    = {'User-Agent': user_agent}
12     data       = urllib.urlencode(values)
13 
14     content = ''
15     
16     try:
17         request = urllib2.Request(str_url)
18         response = urllib2.urlopen(request)
19         html = response.read().decode('gb2312')
20         content = parse_content_page(html)
21     except:
22         content = None
23 
24     return content
25 
def parse_content_page(html):
    """Extract [title, content] text from one dream-interpretation page.

    Returns None when the expected elements are missing from the HTML.
    """
    parsed_html = BeautifulSoup(html)
    try:
        # .find() returns None when the element is absent, so the .text
        # access raises AttributeError -- the only failure expected here.
        # Narrowed from a bare except so real bugs are not swallowed.
        title = parsed_html.body.find('h1', attrs={'class': 'art_title'}).text
        content = parsed_html.body.find('div', attrs={'class': 'dream_detail'}).text
    except AttributeError:
        return None

    return [title, content]
35 
36 
37 
if __name__ == '__main__':

    foutput = 'jiemeng.txt'
    with open(foutput, 'w') as fout:
        # Crawl pages 1..9 of the dream-interpretation index.
        for i in xrange(1, 10):
            request_url = 'http://tools.2345.com/zhgjm/%d.htm' % i
            x = fetchURL(request_url)
            # Idiom fix: identity test for None instead of `!= None`.
            if x is not None:
                # [3:-3] trims 3 bytes from each end of the UTF-8 title --
                # presumably the 《...》 brackets around it (3 bytes each
                # in UTF-8); TODO confirm against the live pages.
                print >>fout, x[0].encode('utf8')[3:-3]
                print >>fout, x[1].encode('utf8')

            # Sleep 2-12 seconds between requests to be polite to the
            # server and avoid being rate-limited.
            seconds = random.random() * 10 + 2
            time.sleep(seconds)
原文地址:https://www.cnblogs.com/naive/p/4306990.html