节约内存，用一个迭代器来逐篇输出

import re
import pymongo
from tqdm import tqdm
import hashlib

# Handle to the `text_articles` collection of the `weixin` database, reached
# through a MongoClient constructed with no arguments (default connection
# settings). Created once at import time and shared by the functions below.
db = pymongo.MongoClient().weixin.text_articles
md5 = lambda s: hashlib.md5(s).hexdigest()

def texts():
    """Yield text fragments from de-duplicated articles, one at a time.

    Iterates over at most 3,000,000 documents of the module-level ``db``
    collection. Each document's ``'text'`` field is UTF-8 encoded and
    fingerprinted with ``md5``; a document whose digest has already been
    seen is skipped. The text of every remaining document is split on runs
    of characters that are not CJK ideographs (U+4E00-U+9FA5), ASCII digits
    or ASCII letters, and each non-empty piece is yielded.

    Only the digests are kept in memory, never the article texts themselves.
    Once the cursor is exhausted, the number of distinct articles is printed.

    Yields:
        Non-empty fragments consisting solely of CJK ideographs, ASCII
        digits and ASCII letters.
    """
    seen_digests = set()
    # Compile once instead of re-resolving the pattern for every article.
    splitter = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z]+')
    cursor = db.find(no_cursor_timeout=True).limit(3000000)
    try:
        for article in tqdm(cursor):
            text = article['text']
            # Hash once and reuse the digest for both the lookup and the insert.
            digest = md5(text.encode('utf-8'))
            if digest in seen_digests:
                continue
            seen_digests.add(digest)
            for fragment in splitter.split(text):
                if fragment:
                    yield fragment
    finally:
        # A cursor opened with no_cursor_timeout=True is not reaped by the
        # server, so it must be closed explicitly -- including when the
        # consumer stops iterating early or an error occurs.
        cursor.close()
    # Single-argument call form works on both Python 2 and Python 3.
    print(u'最终计算了%s篇文章' % len(seen_digests))

posted @ 2019-09-04 11:14 cup_leo 阅读(299) 评论(0) 收藏举报

刷新页面 返回顶部

cup_leo

节约内存，用一个迭代器来逐篇输出

公告