Python多进程

以抓取猫眼的Top100热门电影的信息为例:

# -*- coding: utf-8 -*-
import urllib
import urllib2
import re
import json
import lxml.html
import time
import datetime
from bs4 import BeautifulSoup
import multiprocessing
from multiprocessing import Pool
import sys
# Python 2 only: make utf-8 the implicit str/unicode conversion codec.
reload(sys)
sys.setdefaultencoding('utf8')
# Raw string: in a normal literal the '\r' in 'E:\result.txt' is read as a
# carriage return, which yields an invalid file name.
fd = open(r'E:\result.txt', 'w')
URL = 'http://maoyan.com/board/4'

def download(url, user_agent='wswp', num_try=2):
    """Fetch ``url`` and return the raw response body.

    Args:
        url: Address to request.
        user_agent: Value sent in the ``User-agent`` request header.
        num_try: How many more attempts are allowed after an HTTP 5xx
            response.

    Returns:
        The response body as returned by ``read()``, or ``None`` when the
        request failed and the error is not retryable or no retry is left.
    """
    # The header name needs a hyphen; 'User_agent' is a different header,
    # so the custom agent never replaced the library default.
    headers = {'User-agent': user_agent}
    request = urllib2.Request(url, headers=headers)
    try:
        return urllib2.urlopen(request).read()
    except urllib2.URLError as e:
        print('Download error %s' % (e.reason,))
        # Only HTTP errors carry a status code; a plain URLError does not,
        # so read it defensively instead of touching e.code directly.
        code = getattr(e, 'code', None)
        if num_try > 0 and code is not None and 500 <= code < 600:
            # Server-side errors may be transient: try again.
            return download(url, user_agent, num_try - 1)
        return None


def get_message(url):
    """Scrape one board page and append its movies to the result file.

    Downloads ``url`` with ``download``, extracts rank, title, cast line,
    release-time line and score from the HTML with regular expressions,
    prints each title and writes one line per movie to the module-level
    ``fd``.

    Args:
        url: Address of the board page to scrape.

    Returns:
        None. Nothing is written when the download fails.
    """
    html = download(url)
    if html is None:
        # download() returns None on failure; there is nothing to parse.
        print('Skip %s: download failed' % (url,))
        return

    flags = re.S | re.M
    ranks = re.findall(
        r'<i class="board-index board-index-.*?">(.*?)</i>', html)
    titles = re.findall(r'<p class="name"><.*?>(.*?)</a>', html, flags)
    majors = re.findall(r'<p class="star">(.*?)</p>', html, flags)
    dates = re.findall(r'<p class="releasetime">(.*?)</p>', html, flags)
    integers = re.findall(r'<i class="integer">(.*?)</i>', html, flags)
    fractions = re.findall(r'<i class="fraction">(.*?)</i>', html, flags)

    # zip() walks every entry found on the page and stops at the shortest
    # list; the former range(0, 9) wrote only nine entries and raised
    # IndexError whenever a page produced fewer matches.
    entries = zip(ranks, titles, integers, fractions, majors, dates)
    for rank, title, integer, fraction, major, date in entries:
        print(title)
        # NOTE(review): 'Rand:' looks like a typo for 'Rank:'; kept as-is
        # so the output file format does not change.
        fd.write(''.join([
            'Rand:' + rank,
            '电影:' + title,
            '评分 ' + integer + fraction,
            major.replace(' ', ''),
            date,
            '\n',
        ]))


def main(offset):
    """Scrape the board page that starts at ``offset``.

    Builds the page address from ``offset``, prints it and passes it to
    ``get_message``.
    """
    page_url = 'http://maoyan.com/board/4?offset={}'.format(offset)
    print(page_url)
    get_message(page_url)

if __name__ == '__main__':
    # Sequential crawl of the ten board pages (offsets 0, 10, ..., 90),
    # timed with wall-clock time.
    try:
        t = time.time()
        for i in range(10):
            main(i * 10)
        t1 = time.time()
        print('Total time:')
        print(t1 - t)
    finally:
        # Close the result file even when a page fails, so lines that were
        # already written are flushed to disk.
        fd.close()

单进程的代码所花费的时间是:

利用多进程的Pool的时间是:

使用Pool时需要改动的代码是(用下面两行替换主程序中的for循环):

# Fan the ten page offsets out over a pool of worker processes.
# NOTE(review): every worker writes through the module-level ``fd``;
# confirm the result file is not interleaved or truncated on your platform.
pool = Pool()
pool.map(main, [i * 10 for i in range(10)])
# map() has returned, so all pages are done: stop accepting work and wait
# for the worker processes to exit instead of leaving them running.
pool.close()
pool.join()
原文地址:https://www.cnblogs.com/chenyang920/p/7308348.html