flask设置mongodb索引:通过pymongo/flask_pymongo/mongoengine
为mongodb某集合建立索引:
一 脚本方式, 通过pymongo方式
version:pymongo 3.7
连接本地mongodb的annosys数据库的image集合,为image建立索引
from pymongo import ASCENDING, MongoClient

# Connect to the local MongoDB server and select the `image` collection
# of the `annosys` database.
client = MongoClient('mongodb://localhost:27017/')
db = client.annosys
image_db = db['image']

# Unique ascending index on image_id: at most one document per image_id.
image_db.create_index([('image_id', ASCENDING)], unique=True)

# index_information() returns a dict keyed by index name, so sorting it
# directly yields the sorted index names (no intermediate list needed).
image_index = sorted(image_db.index_information())
print(image_index)
假如代码运行在名为web的docker容器内部,而mongodb运行在名为mongodb的容器中,就需要让两个容器加入同一个docker network。以docker-compose为例,代码内部的连接地址也要改成"mongodb://mongodb:27017/":
# docker-compose file: a web service and a MongoDB service on one shared
# bridge network, so the web code can reach the database by the name "mongodb".
version: '2'
services:
  web:
    build: ./web
    container_name: "web_test"
    ports:
      - 8080:80
    volumes:
      - /data/static:/data/static
    # Start the database container before the web container.
    depends_on:
      - mongodb
    networks:
      - net_set
  mongodb:
    image: mongo:latest
    container_name: "mongodb"
    environment:
      - MONGO_DATA_DIR=/data/db
      - MONGO_LOG_DIR=/dev/null
    volumes:
      - ./db:/data/db
    networks:
      - net_set
# Network shared by both services above.
networks:
  net_set:
    driver: bridge
二 框架方式, 通过flask_pymongo方式
flask配置flask_pymongo:
models_second.py
from flask_pymongo import PyMongo

# Create the extension object without an app; app.py binds it later
# with mongo.init_app(app).
mongo = PyMongo()
# app.py
from flask import Flask

from models_second import mongo

app = Flask(__name__, static_folder='/data/www/web/static')
# flask_pymongo reads the connection string (including the database name)
# from MONGO_URI.
app.config["MONGO_URI"] = "mongodb://localhost:27017/annosys"

# Bind the shared PyMongo extension object to this application.
mongo.init_app(app)
# views.py
from models_second import mongo

# mongoengine style: fetch the task by id, or abort with a 404.
task_obj = Task.objects.get_or_404(id=id)

# flask_pymongo style: look up the matching documents, then bulk-delete them.
ias = mongo.db.image_annotation.find({"task_id": id})
ias = [ia['_id'] for ia in ias]
# Collection.remove() is deprecated since PyMongo 3.0 and removed in 4.0;
# delete_many() is the supported bulk-delete call.
mongo.db.image_annotation.delete_many({'_id': {'$in': ias}})
三 框架方式, 通过mongoengine方式
mongoengine不需要单独建立索引,索引都设置在models.py中各个模型的meta里
# -*- coding: utf-8 -*-
import pytz
import helper
import datetime
import time
import uuid
from flask_mongoengine import MongoEngine
from mongoengine import signals
from werkzeug.security import generate_password_hash, check_password_hash
# from datetime import datetime

db = MongoEngine()


class User(db.Document):
    """User table; `username` carries a unique index."""
    # user_id = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    username = db.StringField(required=True, max_length=100)
    password = db.StringField(required=True, max_length=100)
    role = db.StringField(required=True, max_length=50)
    status = db.StringField(required=True, max_length=50, default="enabled")
    # identified_id = db.StringField(max_length=50)
    # phone_num = db.StringField(max_length=20)
    # email = db.StringField(max_length=100)
    # wechat_id = db.StringField(max_length=100)
    # alipay_id = db.StringField(max_length=100)
    # position = db.StringField(max_length=100)
    # last_login_time = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    # last_login_city = db.StringField(max_length=50)
    # last_login_ip = db.StringField(max_length=50)
    # created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    # updated_at is filled in by the pre_save signal handler (update_timestamp).
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)
    #
    # description = db.StringField(max_length=5000)
    # meta = {
    #     'indexes': [
    #         {
    #             'fields': ['username', 'phone_num', 'email', 'wechat_id', 'alipay_id'],
    #             'unique': True
    #         }
    #     ]
    # }
    meta = {
        'indexes': [
            {
                'fields': ['username'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])

    def set_password(self, password):
        """Return the werkzeug hash of `password`.

        NOTE: despite the name, this does not assign `self.password`;
        the caller has to store the returned hash.
        """
        return generate_password_hash(password)

    def check_password(self, hash, password):
        """Return True if `password` matches the stored `hash`."""
        return check_password_hash(hash, password)

    def get(self, id):
        """Return the User with the given id, or None if there is none.

        On any exception an error dict (code 404) is returned instead of
        raising.
        """
        try:
            # `objects` reached through an instance yields the
            # QuerySetManager itself rather than a QuerySet, so the query
            # has to go through the class.
            return type(self).objects.filter(id=id).first()
        except Exception as e:
            return {"code": 404, "message": "unexpected error", "data": str(e)}


class Task(db.Document):
    """Parent task; (task_type, task_title) is unique."""
    task_title = db.StringField(required=True, max_length=200)
    task_type = db.StringField(required=True, max_length=50)
    coin = db.FloatField(required=True, max_length=50)
    creater = db.StringField(required=True, max_length=50)
    owners = db.ListField(default=[])
    images = db.ListField(default=[])
    dataset_id = db.StringField(required=True, max_length=200)
    anno_id = db.StringField(required=True, max_length=200)
    images_count = db.FloatField(max_length=100, default=0)
    status = db.StringField(required=True, max_length=50, default="created")
    # stock = db.StringField(required=True, max_length=50, default="on")
    # created/working/finished
    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)
    description = db.StringField(max_length=50000, default=None)
    meta = {
        'indexes': [
            {
                'fields': ['task_type', 'task_title'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])


class UserTask(db.Document):
    """Sub task; (task_id, user_id) is unique."""
    task_id = db.StringField(required=True, max_length=200)
    user_id = db.StringField(required=True, max_length=200)
    status = db.StringField(required=True, max_length=50, default="created")
    mutex_status = db.BooleanField(default=False)
    # created/working/finished/self-check/admin-check/unpaid/hitbacked/discarded
    worked_count = db.FloatField(required=True, max_length=50, default=0)
    coin = db.FloatField(required=True, max_length=50)
    task_salary = db.FloatField(max_length=50, default=0)
    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)
    reviews = db.StringField(max_length=50000)
    meta = {
        'indexes': [
            {
                'fields': ['task_id', 'user_id'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])


class Annotation(db.Document):
    """Attribute table; unique on (task_type, category, attribute_key, attribute_values)."""
    task_type = db.StringField(required=True, max_length=200)
    category = db.StringField(required=True, max_length=200)
    attribute_key = db.StringField(max_length=200, default=None)
    attribute_values = db.ListField(default=[])
    status = db.StringField(required=True, max_length=50, default="created")
    tasks = db.ListField(default=[])
    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)
    meta = {
        'indexes': [
            {
                'fields': ['task_type', 'category', 'attribute_key', 'attribute_values'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])


class ImageAnnotation(db.Document):
    """Image tag table; unique on (task_id, subtask_id, user_id, image_id)."""
    task_id = db.StringField(required=True, max_length=200)
    image_id = db.StringField(required=True, max_length=200)
    subtask_id = db.StringField(required=False, max_length=200, default=None)
    user_id = db.StringField(required=False, max_length=200, default=None)
    definition = db.StringField(max_length=200, default=None)
    first = db.StringField(max_length=200, default=None)
    second = db.StringField(max_length=200, default=None)
    is_skip = db.BooleanField(default=False)
    user_worked = db.BooleanField(default=False)
    hitbacked = db.BooleanField(default=False)
    reviews = db.StringField(max_length=10000)
    timestamp = db.FloatField(required=False, max_length=100, default=0)
    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)
    meta = {
        'indexes': [
            {
                'fields': ['task_id', 'subtask_id', 'user_id', 'image_id'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])


class ImagePick(db.Document):
    """Image pick table; unique on (task_id, subtask_id, user_id, image_id)."""
    task_id = db.StringField(required=True, max_length=200)
    image_id = db.StringField(required=True, max_length=200)
    subtask_id = db.StringField(required=False, max_length=200, default=None)
    user_id = db.StringField(required=False, max_length=200, default=None)
    pick_tag = db.StringField(required=False, max_length=200, default=None)
    # Final state after user/admin edits: whether the image matches the tag
    # (e.g. diamond-inlaid or not).
    status = db.BooleanField(default=False)
    # Whether the user marked the image as matching the tag.
    # user_status = db.BooleanField(default=False)
    # Whether the admin changed it to matching the tag.
    # admin_status = db.BooleanField(default=False)
    # Set on every batch submission.
    # user_worked = db.BooleanField(default=False)
    # Whether it has been self-reviewed.
    # user_checked = db.BooleanField(default=False)
    # Whether it has been reviewed by an admin.
    # admin_id = db.StringField(required=False, max_length=200, default=None)
    # admin_checked = db.BooleanField(default=False)
    # Whether it has ever been sent back.
    hitbacked = db.BooleanField(default=False)
    # status = db.StringField(required=False, max_length=200, default=None)
    # Once a user picks the photo it is locked and the acquisition time is
    # recorded; after a long period of inactivity, skip to the next photo.
    # is_lock = db.BooleanField(default=False)
    # is_expired = db.BooleanField(default=False)
    reviews = db.StringField(max_length=10000, default=None)
    timestamp = db.FloatField(required=False, max_length=100, default=0)
    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    deleted_at = db.DateTimeField(default=None)
    meta = {
        'indexes': [
            {
                'fields': ['task_id', 'subtask_id', 'user_id', 'image_id'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])


class Dataset(db.Document):
    """Dataset table; `dataset_name` is unique."""
    dataset_name = db.StringField(required=True, max_length=200)
    file_name = db.StringField(required=True, max_length=50)
    file_size = db.FloatField(required=True, max_length=50)
    file_path = db.StringField(required=True, max_length=200)
    tasks = db.ListField(default=[])
    status = db.StringField(required=True, max_length=50, default="enabled")
    created_at = db.DateTimeField(required=True, default=datetime.datetime.utcnow)
    updated_at = db.DateTimeField(required=True)
    description = db.StringField(max_length=10000, default=None)
    meta = {
        'indexes': [
            {
                'fields': ['dataset_name'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])


class Image(db.Document):
    """Image table; `image_id` is unique."""
    # The field-level unique=True and the meta entry below both declare a
    # unique index on image_id.
    image_id = db.StringField(required=True, unique=True)
    url = db.StringField(required=True, max_length=500)
    width = db.FloatField(required=True, max_length=50)
    height = db.FloatField(required=True, max_length=50)
    datasets = db.ListField(default=[])
    description = db.StringField(max_length=10000, default=None)
    meta = {
        'indexes': [
            {
                'fields': ['image_id'],
                'unique': True
            }
        ]
    }

    def to_dict(self):
        """Return the result of helper.mongo_to_dict for this document."""
        return helper.mongo_to_dict(self, [])


def update_timestamp(sender, document, **kwargs):
    """pre_save handler: stamp `document.updated_at` with the current UTC time.

    `**kwargs` absorbs any extra keyword arguments passed along with the
    signal, so the handler does not fail when they are present.
    """
    document.updated_at = datetime.datetime.utcnow()
    # print("update_timestamp:", document.updated_at)


# Image is not connected: it declares no updated_at field.
signals.pre_save.connect(update_timestamp, sender=User)
signals.pre_save.connect(update_timestamp, sender=Task)
signals.pre_save.connect(update_timestamp, sender=UserTask)
signals.pre_save.connect(update_timestamp, sender=Annotation)
signals.pre_save.connect(update_timestamp, sender=ImageAnnotation)
signals.pre_save.connect(update_timestamp, sender=ImagePick)
signals.pre_save.connect(update_timestamp, sender=Dataset)
四:总结
(1)如果业务自始至终都是用mongoengine,索引都建立在models中即可
(2)如果mongoengine业务不够用,需要pymongo,但是只用mongoengine连接数据库。
比如mongoengine的批量添加业务不灵活,我需要插入一万条数据,此时如下:
这个flask项目默认使用mongoengine,同时也用到了pymongo的方法,但是没有用pymongo单独连接数据库,而是用mongoengine的connect建立连接、用get_db获取数据库对象,然后在该对象上调用pymongo的方法。这样的好处是不用pymongo重新连接数据库:mongoengine连接一次,mongoengine和pymongo的方法都可以使用。
# 第一步插入任务部分字段,得到任务ID,遇到重复NotUniqueError及时返回 try: task_obj = Task(task_title=task_title, task_type=task_type, coin=coin, anno_id=anno_id, creater=creater, dataset_id=dataset_id, description=description).save() except mongoengine.NotUniqueError as e: result = formalReturn(40001, 'task already exists', data=str(e)) return jsonify(result), 200 except Exception as e: result = formalReturn(40004, 'unexpected error', data=str(e)) return jsonify(result), 200 # 第二步,新任务插入图片表并且插入图片关联的数据集id s1 = time.time() try: image_batch = [] anno_batch = [] exist = os.access(dataset_path, os.F_OK) with open(dataset_path) as f: lines = f.readlines() for idx, line in enumerate(lines): line = json.loads(line) # print(line) try: image_batch.append(UpdateOne( {'image_id': line.get("id")}, { '$set': { 'url': line.get("url"), 'width': line.get("width"), 'height': line.get("height") }, '$addToSet': { 'datasets': dataset_id } }, upsert=True )) anno_batch.append(InsertOne( {'task_id': str(task_obj.id), 'image_id': line.get("id")} )) except Exception as e: print("插入图片 unexpected error", e) pass if idx % 2000 == 0: s3 = time.time() try: get_db()['image'].bulk_write(image_batch) image_batch = [] get_db()['image_annotation'].bulk_write(anno_batch) anno_batch = [] except BulkWriteError as bwe: print("mongo error", bwe.details) result = formalReturn(40004, 'unexpected error', data=str(bwe.details)) return jsonify(result), 400 print("s3:", time.time() - s3) s4 = time.time() get_db()['image'].bulk_write(image_batch) get_db()['image_annotation'].bulk_write(anno_batch) print("s4:", time.time() - s4) except Exception as e: result = formalReturn(40004, 'unexpected io error', data=str(e)) return jsonify(result), 200 print("s1:",time.time()-s1)
views.py from mongoengine.connection import get_db, connect from pymongo import MongoClient, ASCENDING, UpdateOne, InsertOne from pymongo.errors import BulkWriteError connect() with open(dataset_path) as f: lines = f.readlines() for idx, line in enumerate(lines): line = json.loads(line) # print(line) try: image_batch.append(UpdateOne( {'image_id': line.get("id")}, { '$set': { 'url': line.get("url"), 'width': line.get("width"), 'height': line.get("height") }, '$addToSet': { 'datasets': dataset_id } }, upsert=True )) except Exception as e: print("插入图片 unexpected error", e) pass if idx % 2000 == 0: try: get_db()['image'].bulk_write(image_batch) except BulkWriteError as bwe: print("mongo error", bwe.details) result = formalReturn(40004, 'unexpected error', data=str(bwe.details)) return jsonify(result), 400 get_db()['image'].bulk_write(image_batch)
(3)如果mongoengine业务不够用,需要pymongo,并且mongoengine和pymongo各自连接数据库。
如果没有docker隔离,以上设置就可以;如果mongodb运行在docker中,就需要按上文的方式配置networks。以上批量插入业务需要事先建立索引,但是mongoengine设置了'connect': False(懒连接):
mongoengine:
# flask_mongoengine connection settings for the local `annosys` database.
app.config['MONGODB_SETTINGS'] = {
    'host': 'localhost',
    'port': 27017,
    'db': 'annosys',
    # Lazy connection, per the surrounding text: the connection is not
    # opened at configuration time.
    'connect': False
}
pymongo:
# Separate pymongo client for the local server, independent of the
# mongoengine connection.
client = MongoClient('mongodb://localhost:27017/')
mongoengine的索引还没有构建,因此需要pymongo事先建立索引,然后pymongo批量插入,其他业务可以继续使用mongoengine。