# 节约内存,用一个迭代器来逐篇输出

import re
import pymongo
from tqdm import tqdm
import hashlib

# Module-level handle to the `text_articles` collection of the `weixin`
# database, reached through a MongoClient created with default arguments.
db = pymongo.MongoClient()['weixin']['text_articles']
def md5(s):
    """Return the hexadecimal MD5 digest of the byte string *s*."""
    return hashlib.md5(s).hexdigest()

def texts():
    """Yield text fragments from de-duplicated articles, one article at a time.

    Iterates over at most 3,000,000 documents of the module-level ``db``
    collection. A document is skipped when the MD5 digest of its UTF-8
    encoded ``text`` field has already been seen. Every other text is split
    on runs of characters that are neither in the range U+4E00-U+9FA5 nor
    ASCII digits or letters.

    Yields:
        Each non-empty fragment produced by the split.

    Side effects:
        Wraps the cursor in a tqdm progress bar and, once the cursor is
        exhausted, prints the number of distinct articles processed.
    """
    # The range bounds must be real \u escapes. Without the backslashes the
    # class degenerates to ASCII-only ranges and every Chinese character is
    # treated as a separator.
    splitter = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z]+')
    seen_digests = set()
    cursor = db.find(no_cursor_timeout=True).limit(3000000)
    try:
        for a in tqdm(cursor):
            # Hash once per document; only the digest is kept, not the text.
            digest = md5(a['text'].encode('utf-8'))
            if digest in seen_digests:
                continue
            seen_digests.add(digest)
            for t in splitter.split(a['text']):
                if t:
                    yield t
    finally:
        # A cursor opened with no_cursor_timeout=True is never reaped by the
        # server, so close it explicitly even if the consumer stops early.
        cursor.close()
    # Parenthesised single-argument form works as both the Python 2 print
    # statement and the Python 3 print function.
    print(u'最终计算了%s篇文章' % len(seen_digests))
# 原文地址: https://www.cnblogs.com/cupleo/p/11457844.html