flask设置mongodb索引:通过pymongo/flask_pymongo/mongoengine

为mongodb某集合建立索引:

一  脚本方式, 通过pymongo方式

version:pymongo 3.7 

连接本地mongodb的annosys数据库的image集合,为image建立索引 

from pymongo import ASCENDING, MongoClient

# Connect to the local MongoDB server and select the `image` collection of
# the `annosys` database.
client = MongoClient('mongodb://localhost:27017/')
db = client.annosys
image_db = db['image']

# Unique ascending index on `image_id`.
image_db.create_index([('image_id', ASCENDING)], unique=True)

# index_information() returns a dict keyed by index name, so sorting it
# directly yields the sorted index names (no intermediate list needed).
image_index = sorted(image_db.index_information())
print(image_index)

假如代码运行在docker的web服务(容器)内部,而mongodb运行在名为mongodb的另一个容器中,就需要让两个容器加入同一个docker网络(networks)。以docker-compose为例,此时代码内部的连接地址也要改成"mongodb://mongodb:27017/":

# docker-compose file (format version 2) declaring two services, `web` and
# `mongodb`, both attached to the `net_set` bridge network defined at the
# bottom of the file.
version: '2'
services:
  # Application container, built from the ./web directory.
  web:
    build: ./web
    container_name: "web_test"
    # host port 8080 -> container port 80
    ports:
     - 8080:80
    volumes:
     - /data/static:/data/static
    # `mongodb` is listed as a dependency of `web`.
    depends_on:
     - mongodb
    networks:
     - net_set

  # Database container; the service name `mongodb` is the host name used in
  # the connection string "mongodb://mongodb:27017/".
  mongodb:
    image: mongo:latest
    container_name: "mongodb"
    # NOTE(review): confirm that the mongo image actually reads these two
    # variables; they may have no effect.
    environment:
     - MONGO_DATA_DIR=/data/db
     - MONGO_LOG_DIR=/dev/null
    # Database files are kept on the host under ./db.
    volumes:
     - ./db:/data/db
    networks:
     - net_set

# Network shared by both services.
networks:
  net_set:
    driver: bridge

 

二  框架方式, 通过flask_pymongo方式

flask配置flask_pymongo:

models_second.py

# Shared PyMongo handle, created without an application; app.py binds it to
# the Flask app later through mongo.init_app(app).
from flask_pymongo import PyMongo
mongo = PyMongo()
app.py

# Flask application; static files are served from /data/www/web/static.
app = Flask(__name__, static_folder='/data/www/web/static')
# Connection URI (database `annosys` on the local server) stored under the
# MONGO_URI config key, which is the key Flask-PyMongo reads.
app.config["MONGO_URI"] = "mongodb://localhost:27017/annosys"
from models_second import mongo
# Bind the shared PyMongo instance from models_second.py to this app.
mongo.init_app(app)
views.py

from models_second import mongo

这下面一句是mongoengine的:
task_obj = Task.objects.get_or_404(id=id)

下面这句是flask_pymongo的查找和批量删除:
# Collect the _id of every image_annotation document that belongs to the
# task (only _id is projected, nothing else is needed), then delete them in a
# single call.  Collection.remove() is deprecated since PyMongo 3.0 and no
# longer exists in PyMongo 4; delete_many() is its replacement.
matching = mongo.db.image_annotation.find({"task_id": id}, {"_id": 1})
ias = [ia['_id'] for ia in matching]
mongo.db.image_annotation.delete_many({'_id': {'$in': ias}})

 

三  框架方式, 通过mongoengine方式

mongoengine不需要单独建立索引,索引都声明在models.py各个模型的meta当中

# -*- coding: utf-8 -*-

import pytz
import helper
import datetime
import time
import uuid
from flask_mongoengine import MongoEngine
from mongoengine import signals
from werkzeug.security import generate_password_hash, check_password_hash

# from datetime import datetime

# Shared flask_mongoengine handle; every model in this module subclasses
# db.Document and declares its fields through db.<FieldType>.
db = MongoEngine()


class User(db.Document):
    """
    User account document.

    ``username`` carries a unique index (see ``meta``).  ``updated_at`` is
    required but has no default; the module-level ``pre_save`` handler
    registered for this class assigns it before each save.
    """
    # user_id = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    username = db.StringField(required=True, max_length=100)
    password = db.StringField(required=True, max_length=100)
    role = db.StringField(required=True, max_length=50)
    status = db.StringField(required=True, max_length=50, default="enabled")

    # identified_id = db.StringField(max_length=50)
    # phone_num = db.StringField(max_length=20)
    # email = db.StringField(max_length=100)
    # wechat_id = db.StringField(max_length=100)
    # alipay_id = db.StringField(max_length=100)
    # position = db.StringField(max_length=100)
    #
    last_login_time = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    # last_login_city = db.StringField(max_length=50)
    # last_login_ip = db.StringField(max_length=50)
    #
    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)
    #
    # description = db.StringField(max_length=5000)

    # Earlier variant kept for reference: one compound unique index over
    # several contact fields.
    # meta = {
    #     'indexes': [
    #         {
    #             'fields': ['username', 'phone_num', 'email', 'wechat_id', 'alipay_id'],
    #             'unique': True
    #         }
    #     ]
    # }

    # Unique index on `username`.
    meta = {
        'indexes': [
            {
                'fields': ['username'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return ``helper.mongo_to_dict(self, [])`` for this document."""
        return helper.mongo_to_dict(self, [])

    def set_password(self, password):
        """
        Return the werkzeug hash of ``password``.

        Despite the name, nothing is stored on the instance; the caller is
        responsible for assigning the returned hash.
        """
        return generate_password_hash(password)

    def check_password(self, hash, password):
        """Return True when ``password`` matches the stored ``hash``."""
        return check_password_hash(hash, password)

    def get(self, id):
        """
        Look up a user by primary key.

        :param id: primary key of the wanted user.
        :return: the matching ``User``, or ``None`` when no document has
            that id.  If the query raises, the exception is not propagated;
            the dict ``{"code": 404, "message": "unexpected error",
            "data": str(exc)}`` is returned instead.
        """
        try:
            # `objects` has to be reached through the class: on an instance
            # the attribute resolves to the QuerySetManager itself, which has
            # no filter(), so `self.objects.filter(...)` always failed.
            user_obj = type(self).objects.filter(id=id).first()
            return user_obj
        except Exception as e:
            ret = {"code": 404, "message": "unexpected error", "data": str(e)}
            return ret


class Task(db.Document):
    """
    Parent task.

    The pair (``task_type``, ``task_title``) is unique (see ``meta``).
    ``updated_at`` is required but has no default; the module-level
    ``pre_save`` handler registered for this class assigns it before each
    save.
    """

    task_title = db.StringField(required=True, max_length=200)
    task_type = db.StringField(required=True, max_length=50)

    coin = db.FloatField(required=True, max_length=50)
    creater = db.StringField(required=True, max_length=50)
    # `default=list` builds a fresh list for every document; a literal `[]`
    # is a single object that all instances would share as their default.
    owners = db.ListField(default=list)
    images = db.ListField(default=list)

    dataset_id = db.StringField(required=True, max_length=200)
    anno_id = db.StringField(required=True, max_length=200)
    images_count = db.FloatField(max_length=100, default=0)

    # Status values noted by the author: created/working/finished
    status = db.StringField(required=True, max_length=50, default="created")
    # stock = db.StringField(required=True, max_length=50, default="on")

    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)

    description = db.StringField(max_length=50000, default=None)

    # Compound unique index: one title per task type.
    meta = {
        'indexes': [
            {
                'fields': ['task_type', 'task_title'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return ``helper.mongo_to_dict(self, [])`` for this document."""
        return helper.mongo_to_dict(self, [])


class UserTask(db.Document):
    """
    Sub task: the share of a task handled by one user.

    A (``task_id``, ``user_id``) pair may occur only once.  ``updated_at`` is
    required and has no default of its own.
    """

    # Compound unique index on (task_id, user_id).
    meta = dict(
        indexes=[
            dict(fields=['task_id', 'user_id'], unique=True),
        ],
    )

    task_id = db.StringField(max_length=200, required=True)
    user_id = db.StringField(max_length=200, required=True)
    status = db.StringField(default="created", max_length=50, required=True)
    mutex_status = db.BooleanField(default=False)
    # Values noted by the author:
    # created/working/finished/self-check/admin-check/unpaid/hitbacked/discarded

    worked_count = db.FloatField(default=0, max_length=50, required=True)
    coin = db.FloatField(max_length=50, required=True)
    task_salary = db.FloatField(default=0, max_length=50)

    created_at = db.DateTimeField(default=datetime.datetime.utcnow, required=True)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)

    reviews = db.StringField(max_length=50000)

    def to_dict(self):
        """Convert the document with ``helper.mongo_to_dict`` (empty list as second argument)."""
        as_dict = helper.mongo_to_dict(self, [])
        return as_dict


class Annotation(db.Document):
    """
    Attribute table: an attribute key and its values for a category of a
    task type.

    The combination (``task_type``, ``category``, ``attribute_key``,
    ``attribute_values``) is covered by one unique index (see ``meta``).
    ``updated_at`` is required but has no default; the module-level
    ``pre_save`` handler registered for this class assigns it before each
    save.
    """

    task_type = db.StringField(required=True, max_length=200)
    category = db.StringField(required=True, max_length=200)
    attribute_key = db.StringField(max_length=200, default=None)
    # `default=list` builds a fresh list for every document; a literal `[]`
    # is a single object that all instances would share as their default.
    attribute_values = db.ListField(default=list)

    status = db.StringField(required=True, max_length=50, default="created")
    tasks = db.ListField(default=list)

    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)

    # NOTE(review): `attribute_values` is a list field, so uniqueness here is
    # presumably enforced per list element - confirm this is intended.
    meta = {
        'indexes': [
            {
                'fields': ['task_type', 'category', 'attribute_key', 'attribute_values'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return ``helper.mongo_to_dict(self, [])`` for this document."""
        return helper.mongo_to_dict(self, [])


class ImageAnnotation(db.Document):
    """
    Image tag table: the annotation of one image within one task.

    The combination (``task_id``, ``subtask_id``, ``user_id``, ``image_id``)
    is unique.  ``updated_at`` is required and has no default of its own.
    """

    # Compound unique index over the four identifying fields.
    meta = dict(
        indexes=[
            dict(
                fields=['task_id', 'subtask_id', 'user_id', 'image_id'],
                unique=True,
            ),
        ],
    )

    task_id = db.StringField(max_length=200, required=True)
    image_id = db.StringField(max_length=200, required=True)

    subtask_id = db.StringField(default=None, max_length=200, required=False)
    user_id = db.StringField(default=None, max_length=200, required=False)

    definition = db.StringField(default=None, max_length=200)
    first = db.StringField(default=None, max_length=200)
    second = db.StringField(default=None, max_length=200)
    is_skip = db.BooleanField(default=False)

    user_worked = db.BooleanField(default=False)
    hitbacked = db.BooleanField(default=False)

    reviews = db.StringField(max_length=10000)
    timestamp = db.FloatField(default=0, max_length=100, required=False)

    created_at = db.DateTimeField(default=datetime.datetime.utcnow, required=True)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)

    def to_dict(self):
        """Convert the document with ``helper.mongo_to_dict`` (empty list as second argument)."""
        as_dict = helper.mongo_to_dict(self, [])
        return as_dict


class ImagePick(db.Document):
    """
    Image pick table: whether one image of a task matches a pick tag.

    The combination (``task_id``, ``subtask_id``, ``user_id``, ``image_id``)
    is unique.  ``updated_at`` is required and has no default of its own.
    """

    # Compound unique index over the four identifying fields.
    meta = dict(
        indexes=[
            dict(
                fields=['task_id', 'subtask_id', 'user_id', 'image_id'],
                unique=True,
            ),
        ],
    )

    task_id = db.StringField(max_length=200, required=True)
    image_id = db.StringField(max_length=200, required=True)

    subtask_id = db.StringField(default=None, max_length=200, required=False)
    user_id = db.StringField(default=None, max_length=200, required=False)

    pick_tag = db.StringField(default=None, max_length=200, required=False)

    # Final state after user/admin changes: does the image carry the tag or
    # not (e.g. diamond-inlaid vs. not diamond-inlaid).
    status = db.BooleanField(default=False)

    # The user's own verdict on whether the image carries the tag.
    # user_status = db.BooleanField(default=False)

    # The admin's corrected verdict on whether the image carries the tag.
    # admin_status = db.BooleanField(default=False)

    # Set on every batch submission.
    # user_worked = db.BooleanField(default=False)

    # Whether the user has self-reviewed the item.
    # user_checked = db.BooleanField(default=False)

    # Whether an admin has reviewed the item.
    # admin_id = db.StringField(required=False, max_length=200, default=None)
    # admin_checked = db.BooleanField(default=False)

    # Whether the item has been sent back to the user.
    hitbacked = db.BooleanField(default=False)
    # status = db.StringField(required=False, max_length=200, default=None)

    # Once a user picks the image it is locked and the acquisition time is
    # recorded; after a long period without activity, skip to the next image.
    # is_lock = db.BooleanField(default=False)
    # is_expired = db.BooleanField(default=False)

    reviews = db.StringField(default=None, max_length=10000)
    timestamp = db.FloatField(default=0, max_length=100, required=False)

    created_at = db.DateTimeField(default=datetime.datetime.utcnow, required=True)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)

    def to_dict(self):
        """Convert the document with ``helper.mongo_to_dict`` (empty list as second argument)."""
        as_dict = helper.mongo_to_dict(self, [])
        return as_dict


class Dataset(db.Document):
    """
    Dataset table: one uploaded dataset file and the tasks that use it.

    ``dataset_name`` carries a unique index (see ``meta``).  ``updated_at``
    is required but has no default; the module-level ``pre_save`` handler
    registered for this class assigns it before each save.
    """

    dataset_name = db.StringField(required=True, max_length=200)
    file_name = db.StringField(required=True, max_length=50)
    file_size = db.FloatField(required=True, max_length=50)
    file_path = db.StringField(required=True, max_length=200)
    # `default=list` builds a fresh list for every document; a literal `[]`
    # is a single object that all instances would share as their default.
    tasks = db.ListField(default=list)
    status = db.StringField(required=True, max_length=50, default="enabled")

    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    description = db.StringField(max_length=10000, default=None)

    # Unique index on `dataset_name`.
    meta = {
        'indexes': [
            {
                'fields': ['dataset_name'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return ``helper.mongo_to_dict(self, [])`` for this document."""
        return helper.mongo_to_dict(self, [])


class Image(db.Document):
    """
    Image table: one image and the datasets it belongs to.

    ``image_id`` is unique.  This model declares no ``created_at`` /
    ``updated_at`` fields.
    """
    image_id = db.StringField(required=True, unique=True)
    url = db.StringField(required=True, max_length=500)
    width = db.FloatField(required=True, max_length=50)
    height = db.FloatField(required=True, max_length=50)

    # `default=list` builds a fresh list for every document; a literal `[]`
    # is a single object that all instances would share as their default.
    datasets = db.ListField(default=list)
    description = db.StringField(max_length=10000, default=None)

    # NOTE(review): `image_id` is already declared `unique=True` on the field,
    # so this index repeats that declaration; kept to leave the index
    # definitions untouched.
    meta = {
        'indexes': [
            {
                'fields': ['image_id'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return ``helper.mongo_to_dict(self, [])`` for this document."""
        return helper.mongo_to_dict(self, [])


def update_timestamp(sender, document, **kwargs):
    """
    ``pre_save`` signal handler: stamp ``document.updated_at`` with the
    current naive UTC time.

    :param sender: class that emitted the signal (unused).
    :param document: instance about to be saved; modified in place.
    :param kwargs: extra keyword arguments delivered with the signal (for
        example those forwarded from ``save(signal_kwargs=...)``).  They are
        accepted and ignored so the handler never fails with ``TypeError``.
    """
    document.updated_at = datetime.datetime.utcnow()

# Register update_timestamp as the pre_save handler of every model that
# declares an `updated_at` field (Image has none and is left out).
for _model in (User, Task, UserTask, Annotation, ImageAnnotation, ImagePick, Dataset):
    signals.pre_save.connect(update_timestamp, sender=_model)
del _model
models.py

 

四:总结

(1)如果业务自始至终都是用mongoengine,索引都建立在models中即可

(2)如果mongoengine业务不够用,需要pymongo, 但是只用mongoengine连接数据库。

比如mongoengine的批量添加业务不灵活,我需要插入一万条数据,此时如下:

这个flask项目默认使用mongoengine,同时也用到了pymongo;但是没有用pymongo单独连接数据库,而是使用mongoengine的connect连接,再用get_db获取数据库对象,然后在它上面调用pymongo的方法(例如bulk_write)。这样的好处是不用pymongo重新连接数据库:mongoengine连接一次数据库,mongoengine和pymongo的方法都可以使用

# 第一步插入任务部分字段,得到任务ID,遇到重复NotUniqueError及时返回
                try:
                    task_obj = Task(task_title=task_title, task_type=task_type, coin=coin, anno_id=anno_id,
                                    creater=creater, dataset_id=dataset_id, description=description).save()

                except mongoengine.NotUniqueError as e:
                    result = formalReturn(40001, 'task already exists', data=str(e))
                    return jsonify(result), 200
                except Exception as e:
                    result = formalReturn(40004, 'unexpected error', data=str(e))
                    return jsonify(result), 200

                # 第二步,新任务插入图片表并且插入图片关联的数据集id
                s1 = time.time()
                try:
                    image_batch = []
                    anno_batch = []

                    exist = os.access(dataset_path, os.F_OK)
                    with open(dataset_path) as f:
                        lines = f.readlines()
                        for idx, line in enumerate(lines):
                            line = json.loads(line)
                            # print(line)

                            try:

                                image_batch.append(UpdateOne(
                                    {'image_id': line.get("id")}, {
                                        '$set': {
                                            'url': line.get("url"),
                                            'width': line.get("width"),
                                            'height': line.get("height")
                                        },
                                        '$addToSet': {
                                            'datasets': dataset_id
                                        }
                                    },
                                    upsert=True
                                ))

                                anno_batch.append(InsertOne(
                                    {'task_id': str(task_obj.id), 'image_id': line.get("id")}
                                ))

                            except Exception as e:
                                print("插入图片 unexpected error", e)
                                pass

                            if idx % 2000 == 0:
                                s3 = time.time()
                                try:
                                    get_db()['image'].bulk_write(image_batch)
                                    image_batch = []
                                    get_db()['image_annotation'].bulk_write(anno_batch)
                                    anno_batch = []
                                except BulkWriteError as bwe:
                                    print("mongo error", bwe.details)
                                    result = formalReturn(40004, 'unexpected error', data=str(bwe.details))
                                    return jsonify(result), 400
                                print("s3:", time.time() - s3)

                        s4 = time.time()
                        get_db()['image'].bulk_write(image_batch)
                        get_db()['image_annotation'].bulk_write(anno_batch)
                        print("s4:", time.time() - s4)


                except Exception as e:
                    result = formalReturn(40004, 'unexpected io error', data=str(e))
                    return jsonify(result), 200

                print("s1:",time.time()-s1)

      
View Code
views.py

from mongoengine.connection import get_db, connect
from pymongo import MongoClient, ASCENDING, UpdateOne, InsertOne
from pymongo.errors import BulkWriteError


connect()
with open(dataset_path) as f:
lines = f.readlines()
for idx, line in enumerate(lines):
    line = json.loads(line)
    # print(line)

    try:
        image_batch.append(UpdateOne(
            {'image_id': line.get("id")}, {
                '$set': {
                    'url': line.get("url"),
                    'width': line.get("width"),
                    'height': line.get("height")
                },
                '$addToSet': {
                    'datasets': dataset_id
                }
            },
            upsert=True
        ))
    except Exception as e:
        print("插入图片 unexpected error", e)
        pass

    if idx % 2000 == 0:
        try:
            get_db()['image'].bulk_write(image_batch)
        except BulkWriteError as bwe:
            print("mongo error", bwe.details)
            result = formalReturn(40004, 'unexpected error', data=str(bwe.details))
            return jsonify(result), 400

get_db()['image'].bulk_write(image_batch)

(3)如果mongoengine业务不够用,需要pymongo, 并且mongoengine使用mongoengine自己的数据库连接,pymongo使用pymongo自己的数据库连接。

如果没有docker隔离,以上设置就OK;如果mongodb运行在docker容器中,就需要像上文那样让两个容器连接到同一个networks。以上批量插入业务需要事先建立好索引,但是mongoengine这里设置了'connect': False(懒连接):

mongoengine:
# flask_mongoengine settings: database `annosys` on localhost:27017, with
# 'connect' set to False (the lazy connection described in the text above).
app.config['MONGODB_SETTINGS'] = dict(
    host='localhost',
    port=27017,
    db='annosys',
    connect=False,
)
pymongo:
# Separate PyMongo client pointing at the MongoDB server on localhost:27017.
client = MongoClient('mongodb://localhost:27017/')

mongoengine的索引还没有构建,因此需要pymongo事先建立索引,然后pymongo批量插入,其他业务可以继续使用mongoengine。

 

posted @ 2018-12-21 10:53  Adamanter  阅读(898)  评论(0编辑  收藏  举报