ElasticSearch 数据增删改实现

前言

本文介绍 ElasticSearch 增加、删除、修改数据的使用示例。通过Restful 接口和 Python 实现。ES最新版本中有Delete By Query 和 Update By Query等功能，但是老版本是没有相关功能的，这里需要特别注意下。详细可参考官网资料：
5.4版本：https://www.elastic.co/guide/en/elasticsearch/reference/5.4/docs.html
2.4版本：https://www.elastic.co/guide/en/elasticsearch/reference/2.4/docs.html

事先需要安装好ElasticSearch和head插件。可参考：http://blog.csdn.net/xsdxs/article/details/52815270

Restful API 实现

创建索引

curl -XPOST 'localhost:9200/customer?pretty'

插入数据

单条插入-指定id

curl -XPOST 'localhost:9200/customer/external/1?pretty' -d' {"name": "John Doe" }'

单条插入-不指定id

curl -XPOST 'localhost:9200/customer/external?pretty' -d' {"name": "Jane Doe" }'

批量插入：

curl -XPOST 'localhost:9200/bank/account/_bulk?pretty' --data-binary "@accounts.json"

参考资料：http://blog.csdn.net/pilihaotian/article/details/52452014
数据下载：https://raw.githubusercontent.com/bly2k/files/master/accounts.zip

删除数据

根据id删除：下面的语句将删除 customer 索引中 external 类型下 id 为 2 的文档

curl -XDELETE 'localhost:9200/customer/external/2?pretty'

根据查询条件删除（PS:这条本人没试过，我用的还是2.4版本，这是参照官网资料的5.4版本写的）

curl -XPOST 'localhost:9200/customer/external/_delete_by_query?pretty' -d '{
    "query": {
        "match": {
            "name": "John"
        }
    }
}'

删除全部：请求方式与上面的"根据查询条件删除"相同，只需把请求体换成下面的 match_all 查询

{
    "query": {
        "match_all": {}
    }
}

更新数据

更新文档: 修改id=1的name属性，并直接增加属性和属性值

curl -XPOST 'localhost:9200/customer/external/1/_update?pretty' -d ' {
    "doc": {
        "name": "xyd",
        "age": 20
    }
}'

更新文档–脚本方式：通过脚本把 id=1 的文档的 age 属性加 5

curl -XPOST 'localhost:9200/customer/external/1/_update?pretty' -d' {
    "script": "ctx._source.age += 5"
}'

Python API 实现

说明

以下代码实现了单条增加、根据_id删除、根据_id更新、批量增加等接口。调试的时候建议每次只运行一个功能。

代码

# -*- coding: utf-8 -*-

from elasticsearch.helpers import bulk
import elasticsearch


class ElasticSearchClient(object):
    """Factory for the Elasticsearch client used by this example."""

    @staticmethod
    def get_es_servers():
        """Build a client for a single node at localhost, port "9200".

        :return: the ``elasticsearch.Elasticsearch`` instance created from
            that one-entry host list.
        """
        local_node = dict(host="localhost", port="9200")
        return elasticsearch.Elasticsearch(hosts=[local_node])


class LoadElasticSearch(object):
    """Write helper for a single Elasticsearch index / document type.

    The constructor fixes the index name ("hz") and the document type
    ("xyd"), obtains a client from ``ElasticSearchClient.get_es_servers()``
    and calls :meth:`set_mapping` so the index exists before any write.
    """

    def __init__(self):
        self.index = "hz"
        self.doc_type = "xyd"
        self.es_client = ElasticSearchClient.get_es_servers()
        self.set_mapping()

    def set_mapping(self):
        """Create the index and register the mapping if the index is missing.

        Nothing is sent to the cluster when the index already exists.
        """
        # NOTE(review): "string" is the field type used by ES 2.x; later
        # versions use "text"/"keyword" -- confirm against the target cluster.
        mapping = {
            self.doc_type: {
                "properties": {
                    "document_id": {
                        "type": "integer"
                    },
                    "title": {
                        "type": "string"
                    },
                    "content": {
                        "type": "string"
                    }
                }
            }
        }

        if self.es_client.indices.exists(index=self.index):
            return

        # Create the index, then register the mapping for the doc type.
        # ignore=400 asks the client not to raise on an HTTP 400 from create.
        # NOTE(review): the create-index API documents mappings under a
        # top-level "mappings" key; here the mapping dict is passed as the
        # body directly and applied again via put_mapping -- confirm this is
        # what the target ES version expects.
        self.es_client.indices.create(index=self.index, body=mapping, ignore=400)
        self.es_client.indices.put_mapping(index=self.index, doc_type=self.doc_type, body=mapping)

    @staticmethod
    def _split_id(row_obj):
        """Return ``(_id, document)`` for ``row_obj`` without mutating it.

        ``_id`` falls back to 1 when the row carries no "_id" key; the
        returned document is a shallow copy with "_id" removed.
        """
        document = dict(row_obj)
        _id = document.pop("_id", 1)
        return _id, document

    def add_date(self, row_obj):
        """Index a single document.

        :param row_obj: dict holding the document fields plus an optional
            "_id" key (defaults to 1). The dict is not modified.
        """
        _id, document = self._split_id(row_obj)
        self.es_client.index(index=self.index, doc_type=self.doc_type, body=document, id=_id)

    def add_date_bulk(self, row_obj_list, bulk_num=2000):
        """Index many documents through the bulk helper, in batches.

        :param row_obj_list: iterable of dicts; "document_id", "title" and
            "content" are copied into the document source (missing keys
            become None). A row's "_id" is used as the document id when
            present; rows without one are sent without an explicit id
            instead of all sharing the literal id "None".
        :param bulk_num: number of actions sent per bulk call.
        """
        pending = []
        batch_no = 0
        for row_obj in row_obj_list:
            action = {
                "_index": self.index,
                "_type": self.doc_type,
                "_source": {
                    'document_id': row_obj.get('document_id'),
                    'title': row_obj.get('title'),
                    'content': row_obj.get('content'),
                },
            }
            if row_obj.get('_id') is not None:
                action["_id"] = row_obj['_id']
            pending.append(action)

            # A full batch has accumulated: send it and start a new one.
            if len(pending) >= bulk_num:
                batch_no += 1
                print('插入 %d 批数据' % batch_no)
                print(len(pending))
                self._flush_bulk(pending)
                pending = []

        # Send whatever is left over after the last full batch.
        if pending:
            self._flush_bulk(pending)

    def _flush_bulk(self, actions):
        """Send one batch of bulk actions and print the helper's result."""
        success, failed = bulk(self.es_client, actions, index=self.index, raise_on_error=True)
        print('%s %s' % (success, failed))

    def update_by_id(self, row_obj):
        """Partially update the document identified by ``row_obj["_id"]``.

        :param row_obj: dict holding the fields to update plus an optional
            "_id" key (defaults to 1). The dict is not modified.
        """
        _id, document = self._split_id(row_obj)
        self.es_client.update(index=self.index, doc_type=self.doc_type, body={"doc": document}, id=_id)

    def delete_by_id(self, _id):
        """Delete the document with the given id.

        :param _id: id of the document to delete.
        """
        self.es_client.delete(index=self.index, doc_type=self.doc_type, id=_id)

if __name__ == '__main__':
    # Sample document reused by every example below.
    write_obj = {
        "_id": 1,
        "document_id": 1,
        "title": u"Hbase 测试数据",
        "content": u"Hbase 日常运维,这是个假数据监控Hbase运行状况。通常IO增加时io wait也会增加，现在FMS的机器正常情况......",
    }

    load_es = LoadElasticSearch()

    # Single-document insert example.
    # A copy is passed so write_obj keeps its "_id" for the examples below,
    # regardless of what the callee does with the dict it receives.
    load_es.add_date(dict(write_obj))

    # Update-by-id example
    # write_obj["title"] = u"更新标题"
    # load_es.update_by_id(dict(write_obj))

    # Delete-by-id example
    # load_es.delete_by_id(1)

    # Bulk insert example
    # row_obj_list = []
    # for i in range(2, 2200):
    #     temp_obj = write_obj.copy()
    #     temp_obj["_id"] = i
    #     temp_obj["document_id"] = i
    #     row_obj_list.append(temp_obj)
    # load_es.add_date_bulk(row_obj_list)