节约内存,用一个迭代器来逐篇输出:
# -*- coding: utf-8 -*-
# The encoding declaration above is required by Python 2 because this file
# contains non-ASCII string literals; it is harmless on Python 3.
import hashlib
import re

import pymongo
from tqdm import tqdm

# Collection the articles are read from: database "weixin", collection
# "text_articles" on the default MongoClient() connection.
db = pymongo.MongoClient().weixin.text_articles

# Matches runs of characters that are NOT in U+4E00-U+9FA5, 0-9, a-z or A-Z.
# Compiled once at import time instead of being looked up for every article.
_SEPARATORS = re.compile(u'[^\u4e00-\u9fa50-9a-zA-Z]+')


def md5(s):
    """Return the hexadecimal MD5 digest of the byte string *s*."""
    return hashlib.md5(s).hexdigest()


def texts(limit=3000000):
    """Yield text fragments from the stored articles, one article at a time.

    Each document's ``'text'`` field is split on runs of characters outside
    U+4E00-U+9FA5 / digits / ASCII letters, and every non-empty piece is
    yielded. Documents whose text has an MD5 digest that was already seen
    during this call are skipped, so exact duplicate texts are only emitted
    once. After the cursor is exhausted, the number of distinct texts
    processed is printed.

    Args:
        limit: Value passed to ``Cursor.limit``; caps how many documents are
            read from the collection. Defaults to 3000000.

    Yields:
        Non-empty fragments of each distinct article text.

    Raises:
        KeyError: If a document has no ``'text'`` field.
    """
    seen_digests = set()
    cursor = db.find(no_cursor_timeout=True).limit(limit)
    try:
        for article in tqdm(cursor):
            text = article['text']
            # Hash once and reuse the digest for both the lookup and the insert.
            digest = md5(text.encode('utf-8'))
            if digest in seen_digests:
                continue
            seen_digests.add(digest)
            for fragment in _SEPARATORS.split(text):
                # split() produces empty strings when the text starts or ends
                # with a separator; those are dropped.
                if fragment:
                    yield fragment
    finally:
        # The cursor was opened with no_cursor_timeout=True, so it is closed
        # explicitly here. This also runs when the consumer stops iterating
        # early (generator close) or an exception propagates.
        cursor.close()
    # Only reached when the cursor was fully consumed. The parenthesised
    # single-argument form prints the same on Python 2 and Python 3.
    print(u'最终计算了%s篇文章' % len(seen_digests))