from elasticsearch import Elasticsearch

es = Elasticsearch()
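# With no arguments, Elasticsearch() connects to localhost:9200 by default;
# for a remote cluster, pass the hosts explicitly (the address below is a
# hypothetical example):
# es = Elasticsearch(["http://10.0.0.5:9200"])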

# Create the index
def deleteIndices(my_index, confirm=True):
    # Delete the index if it already exists; pass confirm=True only once you are sure
    if confirm and es.indices.exists(index=my_index):
        print('Deleting the existing index')
        es.indices.delete(index=my_index)

def createIndex(my_index,my_doc):
    # Index settings: the mapping defines the document type and its fields
    settings = {
        "mappings":{
            my_doc:{ # document type name (roughly analogous to a table name)
                "properties":{
                    "my_id":{"type":"integer"}, # field name and its type
                    "my_word":{
                        "type":"text",
                        "analyzer":"ik_smart", # IK smart analyzer; the default analyzer would split Chinese text into single characters
                        "search_analyzer":"ik_smart" # analyze search queries the same way
                    }
                }
            }
        }
    }
    # Create the index; ignore=400 suppresses the error if the index already exists
    es.indices.create(index=my_index,ignore=400,body=settings)
    print('Index created')

def mainCreateIndex():
    # Create the index
    my_index = "word2vec_index"
    my_doc = "my_doc"
    deleteIndices(my_index)
    createIndex(my_index,my_doc)

# mainCreateIndex()
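
# A quick way to confirm the IK analyzer is active is the _analyze API. This is
# a minimal sketch: it assumes the elasticsearch-analysis-ik plugin is installed
# on the cluster and that mainCreateIndex() has already run; the sample text is arbitrary.
def checkAnalyzer(my_index):
    res = es.indices.analyze(
        index=my_index,
        body={"analyzer":"ik_smart","text":"氨基酸的分词结果"}
    )
    # Print the tokens ik_smart produced for the sample text
    print([t["token"] for t in res["tokens"]])

# checkAnalyzer("word2vec_index")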


# Insert data

from elasticsearch import helpers

def getAllWords(path="vocab.txt"):
    # Read the vocabulary from a file, one word per line,
    # returning (line_number, word) tuples
    words = []
    with open(path,"r",encoding='utf-8') as f:
        for i,item in enumerate(f.readlines()):
            words.append((i,item.strip()))
    return words
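
# getAllWords assumes vocab.txt holds one token per line. A throwaway sketch to
# generate a sample file for testing; the three words are placeholders, not
# taken from the original data set:
def writeSampleVocab(path="vocab.txt"):
    with open(path,"w",encoding='utf-8') as f:
        f.write("氨基酸\n蛋白质\n维生素\n")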


def insertData(words,my_index,my_doc,one_bulk):
    # Insert the data in batches via the bulk helper
    # one_bulk is the number of documents per bulk request
    body = []
    body_count = 0 # number of documents currently in body
    # The final batch may hold fewer than one_bulk documents but still gets sent
    print("Need to insert %d records in total"%len(words))

    for id,word in words:
        data1 = {
            "my_id":id,
            "my_word":word
        }
        every_body = {
            "_index":my_index,
            "_type":my_doc,
            "_source":data1
        }
        if body_count<one_bulk:
            body.append(every_body)
            body_count += 1
        else:
            # Batch is full: send it, then start a new batch with the current document
            helpers.bulk(es,body)
            body_count = 0
            body.clear()
            body.append(every_body)
            body_count += 1

    if len(body)>0:
        # Flush the final, possibly partial batch
        helpers.bulk(es,body)
    print("Insert finished")

def mainInsert():
    # Insert the data
    my_index = "word2vec_index"
    my_doc = "my_doc"
    words = getAllWords()
    insertData(words,my_index,my_doc,one_bulk=5000)


# mainInsert()
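
# helpers.bulk can also do the batching itself: give it a generator of actions
# plus a chunk_size and it splits the stream into requests internally. A sketch
# under the same index/doc-type assumptions as insertData above:
def insertDataAuto(words,my_index,my_doc,one_bulk=5000):
    actions = (
        {"_index":my_index,"_type":my_doc,"_source":{"my_id":i,"my_word":w}}
        for i,w in words
    )
    helpers.bulk(es,actions,chunk_size=one_bulk)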

# ES queries

def keywordSearch(keywords1,my_index,my_doc):
    # Find documents matching keywords1
    my_search1 = {
        "query":{
            "match":{
                "my_word":keywords1
            }
        }
    }
    # Direct query
    # res = es.search(index=my_index,body=my_search1)
    # total = res["hits"]["total"] # total hit count (a dict in ES 7+, a plain int in 6.x and earlier)
    # print("Found %d records"%total.get('value'))

    # Query with the scan helper, which scrolls through every match
    es_result = helpers.scan(
        client=es,
        query=my_search1,
        scroll='10m',
        index=my_index,
        timeout = '10m'
    )
    es_result = [item for item in es_result] # scan returns a generator, e.g. <generator object scan at 0x0000021210>
    print(es_result) # materialized as a list, so it can be printed directly
    search_res = []
    for item in es_result:
        tmp = item['_source']
        search_res.append((tmp['my_id'],tmp['my_word']))
    print("共查询到%d条数据"%len(es_result))
    print(search_res)


def mainSearch():
    # Run a search
    my_index = "word2vec_index"
    my_doc = "my_doc"
    keywords1 = "氨基酸"
    keywordSearch(keywords1,my_index,my_doc)


mainSearch()
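
# helpers.scan returns every matching document but does not rank them. When
# relevance order matters, a plain es.search with a size limit returns the top
# hits sorted by _score. A minimal sketch against the same index (size=10 is an
# arbitrary choice):
def topSearch(keywords1,my_index,size=10):
    res = es.search(index=my_index,
                    body={"query":{"match":{"my_word":keywords1}},"size":size})
    for hit in res["hits"]["hits"]:
        print(hit["_score"],hit["_source"]["my_word"])

# topSearch("氨基酸","word2vec_index")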

 

posted on 2022-05-04 13:12 輪滑少年