The Clever Use of Multiprocessing

The script below combines multiprocessing with multithreading: it starts one crawler process per CPU core, and each process runs up to five downloader threads that all pull URLs from a shared MongoDB-backed queue seeded with the Alexa top-1M site list.

#coding:utf-8
import time
import threading
from html_downLoader import HtmlDownLoader
import ParseAlexa
import multiprocessing
from MongoQueue import MongoQueue
import sys

# Python 2 hack: force the default encoding to UTF-8 so non-ASCII output prints cleanly
if sys.getdefaultencoding() != "utf-8":
    reload(sys)
    sys.setdefaultencoding("utf-8")
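html_downLoader, ParseAlexa, and MongoQueue are the author's own modules and their source is not shown in the post. From the way the crawler uses it, MongoQueue needs at least push(), pop(), complete(), and a truth test. Below is a minimal sketch of that interface, assuming a pymongo backend; the status values and field names are this sketch's own convention, not the author's:

#coding:utf-8
from datetime import datetime
from pymongo import MongoClient

class MongoQueue:
    # Sketch of a MongoDB-backed crawl queue, safe to share across processes
    OUTSTANDING, PROCESSING, COMPLETE = range(3)

    def __init__(self, client=None):
        self.client = client or MongoClient()
        self.db = self.client.cache

    def __nonzero__(self):
        # True while any URL is still waiting or in flight
        record = self.db.crawl_queue.find_one({'status': {'$ne': self.COMPLETE}})
        return record is not None

    def push(self, url):
        # Queue the URL unless it is already known
        if self.db.crawl_queue.find_one({'_id': url}) is None:
            self.db.crawl_queue.insert_one({'_id': url, 'status': self.OUTSTANDING})

    def pop(self):
        # Atomically claim one outstanding URL; raise once the queue is drained
        record = self.db.crawl_queue.find_one_and_update(
            {'status': self.OUTSTANDING},
            {'$set': {'status': self.PROCESSING, 'timestamp': datetime.now()}})
        if record is None:
            raise KeyError('queue is empty')
        return record['_id']

    def complete(self, url):
        self.db.crawl_queue.update_one({'_id': url},
                                       {'$set': {'status': self.COMPLETE}})

Because the queue state lives in MongoDB rather than in process memory, every thread in every process sees the same queue, which is what makes the multi-process design below work.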
SLEEP_TIME = 1
alexaCallback = ParseAlexa.AlexaCallback()
# Seed the shared queue with URLs from the Alexa top-1M list (a zipped CSV of rank,domain rows)
crawl_queue = alexaCallback("http://s3.amazonaws.com/alexa-static/top-1m.csv.zip")
max_threads = 5
result = {}
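ParseAlexa.AlexaCallback is not shown either. Judging from the call above, it downloads the zipped CSV, pushes each domain onto a MongoQueue, and returns that queue. A plausible minimal version; the max_urls cap is this sketch's assumption:

#coding:utf-8
import csv
import urllib2
import zipfile
from StringIO import StringIO
from MongoQueue import MongoQueue

class AlexaCallback:
    def __init__(self, max_urls=1000):
        self.max_urls = max_urls  # assumed cap on how many URLs get queued

    def __call__(self, url):
        queue = MongoQueue()
        data = urllib2.urlopen(url).read()       # download the ZIP archive
        with zipfile.ZipFile(StringIO(data)) as zf:
            csv_filename = zf.namelist()[0]      # the archive holds a single CSV
            pushed = 0
            for _, website in csv.reader(zf.open(csv_filename)):
                queue.push("http://" + website)  # rows look like: rank,domain
                pushed += 1
                if pushed >= self.max_urls:
                    break
        return queue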
def threaded_crawler():
    threads = []
    downloader = HtmlDownLoader()

    def process_queue():
        while True:
            try:
                url = crawl_queue.pop()
            except Exception, e:
                # pop() raises once the queue is drained: end this worker thread
                print e.message
                break
            else:
                print "Crawling %s" % url
                html = downloader.downLoad(url)
                result[url] = html
                # Mark the URL complete only after it has actually been downloaded
                crawl_queue.complete(url)

    # Keep the pool topped up until the queue is empty and every thread has exited
    while threads or crawl_queue:  # bool(crawl_queue) calls MongoQueue.__nonzero__
        while len(threads) < max_threads and crawl_queue:
            thread = threading.Thread(target=process_queue)
            thread.setDaemon(True)  # daemon threads won't block interpreter exit
            thread.start()
            threads.append(thread)
            time.sleep(SLEEP_TIME)  # stagger thread start-up
        # Prune threads that have finished so new ones can be started
        threads = [thread for thread in threads if thread.is_alive()]
    print result

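html_downLoader is the last of the unshown modules. All the crawler needs from it is a downLoad method that returns the page's HTML; a minimal stand-in, with no retries or caching:

#coding:utf-8
import urllib2

class HtmlDownLoader:
    def downLoad(self, url, timeout=10):
        # Fetch the page; return None instead of raising, so one bad URL
        # does not kill the worker thread that called us
        try:
            return urllib2.urlopen(url, timeout=timeout).read()
        except urllib2.URLError as e:
            print "Download error: %s - %s" % (url, e)
            return None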
def process_crawler():
    num_cpus = multiprocessing.cpu_count()
    print "Starting {} processes".format(num_cpus)
    processes = []
    for i in range(num_cpus):
        # One crawler process per CPU core, each running its own pool of threads
        p = multiprocessing.Process(target=threaded_crawler)
        p.daemon = True
        p.start()
        processes.append(p)
    # Wait for every crawler process to finish
    for p in processes:
        p.join()
if __name__ == '__main__':
    process_crawler()
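One caveat: result is an ordinary dict, so each child process fills in its own private copy and the parent never sees the downloaded pages (within one process the threads do share it, but nothing is shared across processes). If the HTML really needs to reach the parent, one option is a manager-backed dict; a minimal sketch, which assumes threaded_crawler is changed to take the dict as a parameter:

#coding:utf-8
import multiprocessing

def process_crawler_shared():
    manager = multiprocessing.Manager()
    shared_result = manager.dict()   # proxy dict visible to every process
    processes = []
    for i in range(multiprocessing.cpu_count()):
        # assumes threaded_crawler accepts the shared dict as an argument
        p = multiprocessing.Process(target=threaded_crawler, args=(shared_result,))
        p.daemon = True
        p.start()
        processes.append(p)
    for p in processes:
        p.join()
    print dict(shared_result)        # copy back to a plain dict for printing

In this particular crawler, though, a simpler route is to write each page into MongoDB next to its queue entry, so no data has to cross process boundaries at all.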
Original article: https://www.cnblogs.com/zhongshuiping/p/9815102.html