Removing duplicates from MongoDB
For various reasons our MongoDB contains duplicate data, and it has even started to skew our statistics.
Before MongoDB 3.0 you could deduplicate directly while building a unique index: the dropDups option silently dropped duplicate documents as the index was created. That option has since been removed and no longer works:
```js
// The old pre-3.0 shell syntax; this no longer works:
db.collection.ensureIndex({keyword: 1, country_id: 1}, {unique: true, dropDups: true})
```
The general idea is to use an aggregation pipeline: first $group documents by the fields that should be unique, counting each group, then $match the groups whose count is greater than 1, i.e. with at least two copies. Each such group is a set of duplicates; keep one document from the group and delete the rest. A read-only preview is sketched below, followed by the full implementation.
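To see what the pipeline finds before deleting anything, here is a minimal standalone sketch. The connection string, database, and collection names are hypothetical placeholders; the keyword/country_id fields match the script that follows:

```python
from pymongo import MongoClient

# Hypothetical connection details; replace with your own deployment.
coll = MongoClient('mongodb://localhost:27017')['mydb']['keywords']

pipeline = [
    {"$match": {"keyword": {"$exists": True}, "country_id": {"$exists": True}}},
    {"$group": {"_id": {"keyword": "$keyword", "country_id": "$country_id"},
                "count": {"$sum": 1}}},
    {"$match": {"count": {"$gt": 1}}},
]

# Print each duplicated key combination and how many copies exist.
for group in coll.aggregate(pipeline, allowDiskUse=True):
    print(group['_id'], group['count'])
```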
The full implementation:

```python
from threading import Thread

from pymongo import DeleteOne

from Application.Utils.Log import Log


class DupKeywordRemove:
    def __init__(self):
        # mongoengine models whose collections should be deduplicated
        models = [monde1, monde2, monde3, monde4]
        self.pipeline = [
            # Only keep documents where every key field is present;
            # documents missing a field would otherwise be lumped
            # together by the $group stage below.
            {"$match": {
                "keyword": {"$exists": True},
                "country_id": {"$exists": True},
            }},
            # Group by the fields that define uniqueness and count each group.
            {"$group": {
                "_id": {
                    "keyword": "$keyword",
                    "country_id": "$country_id",
                },
                "count": {"$sum": 1},
            }},
            # Groups with count > 1 contain duplicates.
            {"$match": {
                "count": {"$gt": 1},
            }},
        ]
        self.main(models)

    def find_dup_id(self, model):
        try:
            _collection = model._get_collection()
            # allowDiskUse=True lets memory-hungry stages such as $group
            # spill to disk instead of failing on MongoDB's per-stage
            # memory limit.
            all_dup_key = list(_collection.aggregate(self.pipeline, allowDiskUse=True))
            delete_list = []
            # Each group's _id holds one duplicated key combination;
            # use it as a find() filter to fetch the duplicate documents.
            for dup_event in all_dup_key:
                dups = list(_collection.find(dup_event['_id']))
                if len(dups) >= 2:
                    remove_id_list = [doc['_id'] for doc in dups]
                    remove_id_list.pop()  # keep one document, delete the rest
                    for to_del in remove_id_list:
                        delete_list.append(DeleteOne({'_id': to_del}))
            print(_collection, len(delete_list))
            if delete_list:
                print('Deleting duplicates')
                _collection.bulk_write(delete_list)
            else:
                print('No duplicates found')
        except Exception as e:
            Log('keyword_dup_remove').info(e)

    def main(self, models):
        # Deduplicate each collection in its own thread.
        threads = [
            Thread(target=self.find_dup_id, kwargs={'model': _model})
            for _model in models
        ]
        for t in threads:
            t.start()
        for t in threads:
            t.join()


if __name__ == '__main__':
    DupKeywordRemove()
```
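Once the duplicates are gone, a unique compound index can keep new ones from being inserted. This is a sketch assuming the same keyword/country_id fields and hypothetical connection details; note that documents missing either field index as null, so a plain unique index tolerates at most one such document:

```python
from pymongo import ASCENDING, MongoClient

# Hypothetical connection details; replace with your own deployment.
coll = MongoClient('mongodb://localhost:27017')['mydb']['keywords']

# Inserts that duplicate an existing (keyword, country_id) pair
# will now fail with a DuplicateKeyError.
coll.create_index(
    [('keyword', ASCENDING), ('country_id', ASCENDING)],
    unique=True,
)
```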
Reference: http://yangcongchufang.com/remove-duplicate-from-mongodb.html