python generator

1 异步IO模型

# Illustrative pseudocode for a single-threaded event loop.
# NOTE(review): get_event_loop, loop.get_event and process_event are not
# defined anywhere in this file; this fragment is a sketch, not runnable code.
loop = get_event_loop()
while True:
    event = loop.get_event()  # take the next event from the loop
    process_event(event)  # handle it, then go round for the next one

loop是一个事件集合，然后循环“取出一个事件—处理一个事件”。

一个线程在处理某个事件的过程中可能会阻塞；发生阻塞时，会将此刻的“状态”保存在loop中，然后进入下一次循环去处理其他事件，以此类推。

2 事件循环+回调

在事件循环的过程中，如果一个task执行完毕，就可以通过callback将result返回给另一个正在等待处理该结果的task2。

3 基于python generator的协程

python的generator不仅可以按需生成数据，它还可以把一件事情先执行一部分，等某个事件发生后（callback）再执行剩下的部分，从而实现异步。

3.1 生成器基本语法：
　　通过生成器表达式 (...) 形成，例如 (x * 2 for x in range(3))

　　通过yield关键字形成

3.2 生成器中的return:
　　在一个生成器中，如果没有return，则执行到函数末尾时会自动抛出StopIteration；

　　如果在执行过程中遇到return，则立即抛出 StopIteration 终止迭代；

　　如果return后面带有一个值，那么这个值不会作为普通的函数返回值交给调用方，而是保存在StopIteration异常中（Python 3.3+ 可通过异常的value属性取得；在Python 2的生成器中，return带值是语法错误）。

3.3 生成器中的send(self, value)

　　生成器函数最大的特点是可以接受外部传入的一个变量，并根据变量内容计算结果后返回。

　　第一次调用send时生成器尚未启动，参数必须是None（即gen.send(None)），否则会抛出TypeError。gen.next()（Python 3中为next(gen)）等价于 gen.send(None)。

def foo():
    """Generator that keeps a running total, starting at 5.

    Each ``yield`` hands the current total to the caller and then waits for
    the next ``send(value)``. The value sent in is added to the total, the
    new total is printed, and the loop yields it back.

    The generator never finishes on its own. Resuming it with ``None`` after
    it has started (e.g. a second ``next()``) fails in the addition with
    TypeError.
    """
    num = 5
    while True:
        s = yield num  # `s` receives whatever the caller passes to send()
        num = num + s
        # Single-argument parenthesised print behaves the same on Python 2
        # and is valid syntax on Python 3.
        print(num)


a = foo()
# A fresh generator has to be started with send(None) (or next(a)). That runs
# foo() up to its first `yield num`, which hands back the initial value 5.
# The assignment to `s` has not happened yet; it completes only when the next
# send() delivers a value.
print(a.send(None))
# Each send(100) resumes foo(): `s` receives 100, the new total is printed
# inside foo(), and the same total is yielded back (ignored here).
a.send(100)
a.send(100)

[out:]
5 # 输出的初始值
105
205

4 基于generator 生成器调度的crawler

# coding=utf-8
from collections import deque
import requests
import re
import time

# Numeric post ids to crawl; each one is formatted into Crawler's URL template.
p_list = [7647647, 7620172, 7591696]


class Crawler(object):
    """Fetch one blog post and print its title anchor, in generator steps.

    ``run()`` is a generator that pauses (bare ``yield``) after announcing the
    crawl and again after the download, so a scheduler can interleave several
    crawlers by calling ``next()`` on each in turn.
    """

    def __init__(self, p):
        """Build the post URL from the numeric post id ``p``."""
        # cnblogs is simply used as a convenient test target.
        self.url = 'http://www.cnblogs.com/fuzzier/p/%d.html' % p

    def get_html(self):
        """Download ``self.url`` and return the response body as text.

        Blocks for the HTTP request and then sleeps a further 0.5 seconds.
        """
        # The two adjacent literals are concatenated into one header value;
        # the second starts with a space so the result reads
        # "AppleWebKit/537.36 (KHTML, ..." rather than "537.36(KHTML, ...".
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                                 ' (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
        html = requests.get(self.url, headers=headers).text
        # NOTE(review): presumably a politeness delay / simulated slow I/O - confirm.
        time.sleep(0.5)
        return html

    def parser_html(self, html):
        """Return the post-title ``<a>`` element found in ``html``.

        Returns None when the page contains no such anchor, so that one
        unexpected page does not raise AttributeError and abort every other
        task driven by the same scheduler.
        """
        match = re.search(r'<a id="cb_post_title_url" class="postTitle2".*</a>', html)
        return match.group() if match else None

    def run(self):
        """Generator: announce, pause, download, pause, then parse and print.

        Yields None twice; the third ``next()`` prints the parsed title anchor
        (or None if it was not found) and ends with StopIteration.
        """
        print('start crawler ' + self.url)
        yield
        html = self.get_html()
        yield
        result = self.parser_html(html)
        print(result)


class Runner(object):
    """Round-robin scheduler that steps a set of generators until all finish."""

    def __init__(self, tasks):
        """Store ``tasks`` (an iterable of generators) in a deque."""
        self.tasks = deque(tasks)

    def my_pop(self):
        """Remove and return the task at the right-hand end of the queue."""
        return self.tasks.pop()

    def run(self):
        """Advance each queued task one step at a time until none remain.

        A task that yields is re-queued at the left end, so every other task
        gets a turn before it runs again. A task that raises StopIteration is
        dropped, and the number of tasks still queued is printed.
        """
        while self.tasks:
            task = self.my_pop()
            try:
                next(task)
            except StopIteration:
                # This generator is exhausted: do not re-queue it, just
                # report how many tasks are left.
                print(len(self.tasks))
            else:
                # The step succeeded, so the generator still has work to do;
                # keep it in the queue.
                self.tasks.appendleft(task)


# One Crawler.run() generator per post id, interleaved by Runner.
tasks = [Crawler(post_id).run() for post_id in p_list]
Runner(tasks).run()

[out:]

start crawler http://www.cnblogs.com/fuzzier/p/7591696.html
start crawler http://www.cnblogs.com/fuzzier/p/7620172.html
start crawler http://www.cnblogs.com/fuzzier/p/7647647.html
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7591696.html">Beautifulsoup模块的一些细节说明</a>
2
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7620172.html">requests源码框架浅析</a>
1
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7647647.html">flask0.1版本源码浅析——请求上下文</a>
0

View Code

基于generator的半协程的Crawler

# coding=utf-8
from collections import deque
import requests
import re
import time

# Numeric post ids to crawl; each one is formatted into Crawler's URL template.
p_list = [7647647, 7620172, 7591696]


class Crawler(object):
    """Producer/consumer pair of generators for crawling one blog post.

    ``get_html()`` is the producer generator that a scheduler steps with
    ``next()``; ``parser_html()`` is a consumer generator that receives the
    downloaded page through ``send()`` and prints the title anchor.
    """

    def __init__(self, p):
        """Build the post URL from the id ``p`` and create the parser generator."""
        self.url = 'http://www.cnblogs.com/fuzzier/p/%d.html' % p
        self.p = self.parser_html()  # consumer generator; started in get_html()

    def get_html(self):
        """Generator: start the parser, announce, pause, download, hand over.

        Yields None once (after printing the GET line). On the next step it
        downloads the page, sleeps 2 seconds, sends the HTML to the parser
        and then finishes.
        """
        # The two adjacent literals are concatenated into one header value;
        # the second starts with a space so the result reads
        # "AppleWebKit/537.36 (KHTML, ..." rather than "537.36(KHTML, ...".
        headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36'
                                 ' (KHTML, like Gecko) Chrome/51.0.2704.63 Safari/537.36'}
        self.p.send(None)  # the first send to a fresh generator must be None
        print('GET ' + self.url)
        yield
        html = requests.get(self.url, headers=headers).text
        # NOTE(review): presumably a politeness delay / simulated slow I/O - confirm.
        time.sleep(2)
        try:
            self.p.send(html)
        except StopIteration:
            # parser_html() has no second yield, so it always finishes during
            # this send(). Catch that here: letting StopIteration escape from
            # a generator body is turned into RuntimeError by PEP 479
            # (Python 3.7+). This generator then ends normally.
            pass

    def parser_html(self):
        """Generator: wait for one HTML string, then print its title anchor.

        Nothing is printed for an empty/None page. A non-empty page without
        the title anchor prints None instead of raising AttributeError.
        """
        html = yield
        if html:
            match = re.search(r'<a id="cb_post_title_url" class="postTitle2".*</a>', html)
            print(match.group() if match else None)


class Runner(object):
    """Round-robin scheduler that steps a set of generators until all finish."""

    def __init__(self, tasks):
        """Store ``tasks`` (an iterable of generators) in a deque."""
        self.tasks = deque(tasks)

    def my_pop(self):
        """Remove and return the task at the right-hand end of the queue."""
        return self.tasks.pop()

    def run(self):
        """Advance each queued task one step at a time until none remain.

        A task that yields is re-queued at the left end, so every other task
        gets a turn before it runs again. A task that raises StopIteration is
        dropped, and the number of tasks still queued is printed.
        """
        while self.tasks:
            task = self.my_pop()
            try:
                next(task)
            except StopIteration:
                # This generator is exhausted: do not re-queue it, just
                # report how many tasks are left.
                print(len(self.tasks))
            else:
                # The step succeeded, so the generator still has work to do;
                # keep it in the queue.
                self.tasks.appendleft(task)


# One Crawler.get_html() generator per post id, interleaved by Runner.
tasks = [Crawler(post_id).get_html() for post_id in p_list]
Runner(tasks).run()

[out:]

GET http://www.cnblogs.com/fuzzier/p/7591696.html
GET http://www.cnblogs.com/fuzzier/p/7620172.html
GET http://www.cnblogs.com/fuzzier/p/7647647.html
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7591696.html">Beautifulsoup模块的一些细节说明</a>
2
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7620172.html">requests源码框架浅析</a>
1
<a id="cb_post_title_url" class="postTitle2" href="http://www.cnblogs.com/fuzzier/p/7647647.html">flask0.1版本源码浅析——请求上下文</a>
0

View Code