# MongoDB helper

import datetime

import pymongo
import click

# Basic database connection settings.
db_configs = {
    "type": "mongo",        # backend label; not read anywhere in the visible code
    "host": "127.0.0.1",    # server address
    "port": "27017",        # kept as a string; it is interpolated into the URI
    "user": "",             # empty credentials mean "connect without auth"
    "password": "",
    "db_name": "spider",    # database selected by the Mongo helper
}


class Mongo:
    """Small convenience wrapper around a single MongoDB database.

    Connection settings (host, port, credentials, database name) are read
    from the module-level ``db_configs`` dict. After construction,
    ``self.client`` is the ``pymongo.MongoClient`` and ``self.db`` is the
    selected database.
    """

    def __init__(self):
        """Build the client from ``db_configs`` and select the database."""
        self.db_name = db_configs.get("db_name")
        self.host = db_configs.get("host")
        self.port = db_configs.get("port")
        self.username = db_configs.get("user")
        # The config dict spells the key "password"; the legacy "passwd"
        # spelling is still honoured so older configs keep working.
        self.password = db_configs.get("password") or db_configs.get("passwd")

        client_options = {"connect": False, "maxPoolSize": 10}
        if self.username and self.password:
            # Database.authenticate() no longer exists in PyMongo 4, so the
            # credentials are handed to MongoClient instead. authSource keeps
            # authenticating against the working database, as before.
            client_options.update(
                username=self.username,
                password=self.password,
                authSource=self.db_name,
            )
        self.client = pymongo.MongoClient(
            f'mongodb://{self.host}:{self.port}', **client_options
        )
        self.db = self.client[self.db_name]

    @staticmethod
    def _duplicate_collection_name(col):
        """Return the name of the scratch collection holding duplicates of *col*."""
        return f'{col.split("_")[0]}_duplicate_result'

    def reset_status(self, col="dianping_seed_data"):
        """Set ``status`` back to 0 on every document whose status is 1 or 3.

        :param col: name of the collection to update.
        """
        item = {"status": 0, "update_time": datetime.datetime.now()}
        self.db[col].update_many({'status': {'$in': [1, 3]}}, {'$set': item})

    def reset_all_status(self, col="dianping_seed_data"):
        """Set ``status`` and ``count`` to 0 on every document in *col*.

        :param col: name of the collection to update.
        """
        item = {
            "status": 0,
            "count": 0,
            "update_time": datetime.datetime.now(),
        }
        self.db[col].update_many({}, {'$set': item})

    def add_index(self, col="dianping_seed_data"):
        """Create an ascending index on ``status``.

        The index is deliberately not unique: ``status`` is a state flag
        (0 = initial, 1 = download started, 2 = download finished) shared by
        many documents, so a unique index could not be built.

        :param col: name of the collection to index.
        """
        self.db[col].create_index([('status', pymongo.ASCENDING)])

    def get_index(self, col="dianping_seed_data"):
        """Print every index defined on *col*.

        :param col: name of the collection to inspect.
        """
        for index in self.db[col].list_indexes():
            print(index)

    def update(self, item, col="zol_spider_data"):
        """Upsert *item* into *col*, matching on its ``url`` field.

        :param item: document to store; must contain a ``"url"`` key.
        :param col: name of the target collection.
        :raises KeyError: if *item* has no ``"url"`` key.
        """
        coll = self.db[col]
        coll.update_one({"url": item["url"]}, {'$set': item}, upsert=True)

    def find_duplicate(self, col="dianping_seed_data"):
        """Group documents by ``url`` and write the duplicated groups out.

        The pipeline ends with ``$out``, so groups with more than one member
        are written to the ``<prefix>_duplicate_result`` collection, where
        ``<prefix>`` is the part of *col* before the first underscore.

        :param col: name of the collection to scan.
        :return: the value returned by ``aggregate``, or ``None`` when the
            aggregation failed.
        """
        group = {'$group': {
            '_id': {'url': "$url"},              # group by url
            '_id_list': {'$addToSet': "$_id"},   # collect the _id of every member
            'count': {'$sum': 1}                 # number of members in the group
        }}
        # Keep only the groups that really contain duplicates.
        match = {"$match": {"count": {"$gt": 1}}}
        # Write the aggregation result to the scratch collection.
        out = {'$out': self._duplicate_collection_name(col)}

        # Pre-bound so the failure path returns None instead of raising
        # UnboundLocalError.
        result = None
        try:
            result = self.db[col].aggregate(
                [group, match, out], allowDiskUse=True
            )
            print("聚合成功")
        except Exception as e:
            print("聚合失败", e.args)
        return result

    def delete_dup(self, col="dianping_seed_data"):
        """Delete duplicates recorded by :meth:`find_duplicate`, then drop the scratch collection.

        For every group the first ``_id`` in ``_id_list`` is kept and the
        rest are removed from *col*. Errors are reported on stdout rather
        than raised, and in that case the scratch collection is left in
        place.

        :param col: name of the collection to clean up.
        """
        dup = self._duplicate_collection_name(col)
        delete_data = self.db[dup].find()
        try:
            for d in delete_data:
                # Keep one document per group, delete the others in one call.
                surplus_ids = (d.get("_id_list") or [])[1:]
                if surplus_ids:
                    self.db[col].delete_many({'_id': {'$in': surplus_ids}})
            print("准备删除表")
            self.db[dup].drop()
            print("删除表成功")
        except Exception as e:
            print("删除的时候出现问题", e.args)


# Command-line entry point. Each option selects at most one maintenance
# action; unrecognised values are ignored. No docstring is used here on
# purpose: click would show it in the --help output.
@click.command()
@click.option('--s', type=str, help="状态:all表示全部重置为0,two:表示重置状态为1、3的重置为0")
@click.option('--i', type=str, help="a:增加索引 g:获取索引")
@click.option('--d', type=str, help="d:删除 f:查询并生成聚合之后的结果")
def run(s, i, d):
    helper = Mongo()

    # --s: status reset. The received value is echoed whenever it is given.
    if s:
        print("获取参数为:", s)
    if s == "all":
        print("所有数据状态重置为0:", s)
        helper.reset_all_status()
    elif s == "two":
        helper.reset_status()
        print("部分数据状态重置为0:", s)

    # --i: index maintenance (a = add, g = list).
    index_action = {"a": helper.add_index, "g": helper.get_index}.get(i)
    if index_action is not None:
        index_action()

    # --d: duplicate handling (d = delete, f = find and materialise).
    duplicate_action = {"d": helper.delete_dup, "f": helper.find_duplicate}.get(d)
    if duplicate_action is not None:
        duplicate_action()


# Run the click command only when this file is executed as a script.
if __name__ == "__main__":
    run()

# posted @ 2019-12-13 15:34  公众号python学习开发  阅读(208)  评论(0)  编辑  收藏  举报