Milvus调用阿里云大模型示例

环境:
OS:Windows
pycharm:2022.1
python:3.11.9

 

1.安装依赖模块
pip install pymilvus tqdm dashscope
或是分别单独安装
pip install dashscope --timeout=100
pip install tqdm --timeout=100
pip install pymilvus --timeout=100

 

2.导入文本报道内容
将如下压缩包解压到项目的当前目录(与embedding.py同级),解压后的目录名应为allSourceText
通过百度网盘分享的文件:allSourceText.rar
链接:https://pan.baidu.com/s/1HjMXJHrnvOWFN7za6moPBA
提取码:aqc3

如下:

 

 

 

3.文本内容向量化
embedding.py

#!/usr/bin/env python
#coding=utf-8

import os
import time
from tqdm import tqdm
import dashscope
from dashscope import TextEmbedding
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility

def prepareData(path, batch_size=25):
    """Yield the text files under *path* in batches of document contents.

    Directory entries are visited in sorted filename order so that the
    sequence of documents is the same on every run. Entries that are not
    regular files (e.g. sub-directories) are skipped. Each file is read as
    UTF-8 and closed before its batch is handed to the caller.

    Args:
        path: Directory containing the text files.
        batch_size: Maximum number of documents per yielded batch.

    Yields:
        Lists of file contents (str); every list holds ``batch_size`` items
        except possibly the last one, which holds the remainder.
    """
    batch_docs = []
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        if not os.path.isfile(file_path):
            continue

        # Read and close the file before yielding so no handle stays open
        # while the consumer processes the batch.
        with open(file_path, 'r', encoding='utf-8') as f:
            content = f.read()

        batch_docs.append(content)
        if len(batch_docs) == batch_size:
            yield batch_docs
            batch_docs = []

    # Emit the final, partially filled batch (if any).
    if batch_docs:
        yield batch_docs

def getEmbedding(news):
    """Embed one text or a list of texts with DashScope ``text_embedding_v1``.

    Args:
        news: A single string, or a list of strings to embed in one request.

    Returns:
        A list of embedding vectors (one per input, in response order) when
        *news* is a list; otherwise the single embedding vector. An empty
        list returns ``[]`` without calling the service.

    Raises:
        RuntimeError: If the DashScope response does not report HTTP 200.
    """
    from http import HTTPStatus

    # Nothing to embed: skip the remote call entirely.
    if isinstance(news, list) and not news:
        return []

    response = TextEmbedding.call(
        model=TextEmbedding.Models.text_embedding_v1,
        input=news
    )

    # On failure ``output`` is not usable; fail with the service's own
    # diagnostics instead of an opaque TypeError further down.
    if response.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'DashScope embedding request failed: status={response.status_code}, '
            f'code={response.code}, message={response.message}'
        )

    embeddings = [record['embedding'] for record in response.output['embeddings']]

    return embeddings if isinstance(news, list) else embeddings[0]


if __name__ == '__main__':
    # Directory that contains this script.
    current_path = os.path.abspath(os.path.dirname(__file__))
    print(current_path)

    # Corpus directory next to this script. The texts can also be obtained with
    # git clone https://github.com/shijiebei2009/CEC-Corpus.git
    data_path = f'{current_path}/allSourceText'
    print(data_path)

    # DashScope API key: requires an Alibaba Cloud account with DashScope
    # enabled in the product console. Replace the placeholder before running.
    dashscope.api_key = "XXXXXXXX"

    # Milvus connection parameters.
    COLLECTION_NAME = 'CEC_Corpus'
    DIMENSION = 1536
    MILVUS_HOST = '192.168.1.135'
    MILVUS_PORT = '19530'
    USER = 'root'
    PASSWORD = 'Milvus'

    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT, user=USER, password=PASSWORD)

    # Remove the collection if it already exists so the import starts clean.
    if utility.has_collection(COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME)

    # Create a collection holding the id, the (truncated) text and its embedding.
    # NOTE(review): the id field previously received its description through a
    # misspelled `descrition=` keyword, which presumably never reached the
    # stored schema; the keyword is dropped here so the schema stays as it was.
    fields = [
        FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=False),
        FieldSchema(name='text', dtype=DataType.VARCHAR, description='Text', max_length=4096),
        FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='Embedding vectors', dim=DIMENSION)
    ]
    schema = CollectionSchema(fields=fields, description='CEC Corpus Collection')
    collection = Collection(name=COLLECTION_NAME, schema=schema)

    # Create an index on the embedding field.
    index_params = {
        'index_type': 'IVF_FLAT',
        'metric_type': 'L2',
        'params': {'nlist': 1024}
    }
    collection.create_index(field_name="embedding", index_params=index_params)

    # Running primary-key counter. It is kept in its own variable (never
    # reused as a loop variable) so consecutive batches get disjoint ids.
    next_id = 0
    # The batches are materialised so tqdm knows the total for its progress bar.
    for news in tqdm(list(prepareData(data_path))):
        ids = list(range(next_id, next_id + len(news)))
        next_id += len(news)

        vectors = getEmbedding(news)

        # Store at most 500 characters of each document.
        texts = [(doc[:498] + '..') if len(doc) > 500 else doc for doc in news]

        # Column-ordered insert of the whole batch: ids, texts, vectors.
        collection.insert([ids, texts, vectors])

        # Pause between batches to throttle the embedding requests.
        time.sleep(2)

    # Seal the inserted data so it is persisted and searchable.
    collection.flush()

 

4.向AI提问

answer.py

#!/usr/bin/env python
#coding=utf-8

import os
import dashscope
from dashscope import Generation
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
from embedding import getEmbedding


def getAnswer(query, context):
    """Ask the ``qwen-turbo`` model to answer *query* based on *context*.

    Args:
        query: The user's question.
        context: The report text to ground the answer in. May be a single
            string or a list/tuple of passages; multiple passages are joined
            with newlines so the prompt contains plain text rather than a
            Python list representation.

    Returns:
        The text of the model's answer.

    Raises:
        RuntimeError: If the DashScope response does not report HTTP 200.
    """
    from http import HTTPStatus

    if isinstance(context, (list, tuple)):
        context = '\n'.join(str(part) for part in context)

    prompt = f'''请基于```内的报道内容,回答我的问题。
          ```
          {context}
          ```
          我的问题是:{query}。
       '''

    rsp = Generation.call(model='qwen-turbo', prompt=prompt)

    # A failed call has no usable output; report the service diagnostics.
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'DashScope generation request failed: status={rsp.status_code}, '
            f'code={rsp.code}, message={rsp.message}'
        )
    return rsp.output.text


def search(text, limit=1):
    """Return the stored texts closest to *text* in the Milvus collection.

    Relies on the module-level ``collection`` having been created and loaded
    before the call.

    Args:
        text: Query string; it is embedded with ``getEmbedding``.
        limit: Maximum number of hits to return (defaults to 1).

    Returns:
        A list with the ``text`` field of each hit, in the order returned by
        the search.
    """
    # Search parameters; L2 is the metric used for this search.
    search_params = {
        "metric_type": "L2"
    }

    results = collection.search(
        data=[getEmbedding(text)],  # single query vector
        anns_field="embedding",  # vector field to search against
        param=search_params,
        limit=limit,  # maximum number of hits for the query
        output_fields=['text']  # return the stored text with each hit
    )

    # One query vector was submitted, so results[0] holds its hits.
    return [hit.entity.get('text') for hit in results[0]]


if __name__ == '__main__':
    # DashScope API key. Replace the placeholder before running.
    dashscope.api_key = "XXXXXXXXXXXXXXXXXXXXXX"

    # Milvus connection parameters.
    COLLECTION_NAME = 'CEC_Corpus'
    MILVUS_HOST = '192.168.1.135'
    MILVUS_PORT = '19530'
    USER = 'root'
    PASSWORD = 'Milvus'

    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT, user=USER, password=PASSWORD)

    # Open the existing collection by name; without a schema argument the
    # schema is taken from the server, so it does not have to be redeclared
    # (and cannot drift from what was actually created).
    collection = Collection(name=COLLECTION_NAME)

    # Load the collection into memory for searching.
    collection.load()

    question = '北京中央电视台工地发生大火,发生在哪里?出动了多少辆消防车?人员伤亡情况如何?'
    context = search(question)
    answer = getAnswer(question, context)
    print(answer)

 

运行answer.py效果如下

 

posted @ 2024-08-14 16:42  slnngk  阅读(33)  评论(0)  编辑  收藏  举报