Milvus 调用阿里云大模型(DashScope)示例
环境:
OS:Windows
pycharm:2022.1
python:3.11.9
1.安装依赖模块
pip install pymilvus tqdm dashscope
或者逐个安装(网络较慢时可通过 --timeout 加大超时时间):
pip install dashscope --timeout=100
pip install tqdm --timeout=100
pip install pymilvus --timeout=100
2.导入文本报道内容
下载下面的压缩包并解压到项目当前目录(与 embedding.py 同级),解压后应得到 allSourceText 目录:
通过百度网盘分享的文件:allSourceText.rar
链接:https://pan.baidu.com/s/1HjMXJHrnvOWFN7za6moPBA
提取码:aqc3
如下:
3.文本内容向量化
embedding.py
#!/usr/bin/env python
#coding=utf-8
import os
import time
from tqdm import tqdm
import dashscope
from dashscope import TextEmbedding
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection, utility
def prepareData(path, batch_size=25):
    """Yield the contents of the files under *path* in fixed-size batches.

    Args:
        path: Directory whose regular files are read as UTF-8 text.
        batch_size: Maximum number of documents per yielded batch.

    Yields:
        Lists of file contents (str). Every batch holds ``batch_size``
        documents except possibly the last one, which holds the remainder.
        Nothing is yielded for an empty directory.
    """
    batch_docs = []
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # batches (and any ids derived from document position) reproducible.
    for file in sorted(os.listdir(path)):
        file_path = os.path.join(path, file)
        # Skip sub-directories and other non-regular entries; open() would
        # raise on them.
        if not os.path.isfile(file_path):
            continue
        with open(file_path, 'r', encoding='utf-8') as f:
            batch_docs.append(f.read())
        # Yield outside the `with` so no file handle stays open while the
        # generator is suspended.
        if len(batch_docs) == batch_size:
            yield batch_docs
            batch_docs = []
    if batch_docs:
        yield batch_docs
def getEmbedding(news):
    """Embed text with the DashScope ``text_embedding_v1`` model.

    Args:
        news: A single string, or a list of strings to embed in one request.

    Returns:
        A list of embedding vectors when *news* is a list, otherwise the
        single embedding vector for the given string.

    Raises:
        RuntimeError: If the DashScope call does not report HTTP 200.
    """
    from http import HTTPStatus

    rsp = TextEmbedding.call(
        model=TextEmbedding.Models.text_embedding_v1,
        input=news
    )
    # Fail with the service's own error details instead of an opaque
    # TypeError from subscripting a missing `output`.
    if rsp.status_code != HTTPStatus.OK:
        raise RuntimeError(
            f'DashScope embedding request failed: status={rsp.status_code}, '
            f'code={rsp.code}, message={rsp.message}'
        )
    embeddings = [record['embedding'] for record in rsp.output['embeddings']]
    return embeddings if isinstance(news, list) else embeddings[0]
if __name__ == '__main__':
    # Ingestion script: embeds every document under ./allSourceText and
    # stores id, (truncated) text and embedding in a fresh Milvus collection.
    current_path = os.path.abspath(os.path.dirname(__file__))  # directory of this script
    print(current_path)
    # Corpus directory; the data can also be obtained with
    # `git clone https://github.com/shijiebei2009/CEC-Corpus.git`.
    data_path = f'{current_path}/allSourceText'
    print(data_path)

    # DashScope API key: requires an Alibaba Cloud account with DashScope
    # enabled in the product console. Replace the placeholder with a real key.
    dashscope.api_key = "XXXXXXXX"

    # Milvus connection parameters
    COLLECTION_NAME = 'CEC_Corpus'
    DIMENSION = 1536
    MILVUS_HOST = '192.168.1.135'
    MILVUS_PORT = '19530'
    USER = 'root'
    PASSWORD = 'Milvus'
    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT, user=USER, password=PASSWORD)

    # Remove the collection if it already exists so each run starts clean.
    if utility.has_collection(COLLECTION_NAME):
        utility.drop_collection(COLLECTION_NAME)

    # Create a collection holding the id, the text, and the embedding.
    # NOTE(review): the original passed a misspelled `descrition='Ids'`
    # keyword on the id field, which is not a FieldSchema parameter. It is
    # omitted here so the effective schema stays the same as before.
    fields = [
        FieldSchema(name='id', dtype=DataType.INT64, is_primary=True, auto_id=False),
        FieldSchema(name='text', dtype=DataType.VARCHAR, description='Text', max_length=4096),
        FieldSchema(name='embedding', dtype=DataType.FLOAT_VECTOR, description='Embedding vectors', dim=DIMENSION)
    ]
    schema = CollectionSchema(fields=fields, description='CEC Corpus Collection')
    collection = Collection(name=COLLECTION_NAME, schema=schema)

    # Create an index on the vector field.
    index_params = {
        'index_type': 'IVF_FLAT',
        'metric_type': 'L2',
        'params': {'nlist': 1024}
    }
    collection.create_index(field_name="embedding", index_params=index_params)

    # Running primary-key counter. It must not share a name with any loop
    # variable: the original reused `id` in the inner loop, which reset the
    # counter to the last id of the batch and made the next batch start on a
    # duplicate primary key.
    next_id = 0
    # The batches are materialised so tqdm knows the total for its progress bar.
    for news in tqdm(list(prepareData(data_path))):
        ids = list(range(next_id, next_id + len(news)))
        next_id += len(news)
        vectors = getEmbedding(news)
        # Store at most 500 characters of each document (498 + '..').
        texts = [(doc[:498] + '..') if len(doc) > 500 else doc for doc in news]
        # Column-based insert of the whole batch: ids, texts, embeddings.
        collection.insert([ids, texts, vectors])
        # Pause between embedding requests (presumably to stay under the
        # DashScope rate limit -- confirm against your quota).
        time.sleep(2)
    # Seal the inserted data so it is persisted before the script exits.
    collection.flush()
4.提问ai
answer.py
#!/usr/bin/env python
#coding=utf-8
import os
import dashscope
from dashscope import Generation
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection
from embedding import getEmbedding
def getAnswer(query, context):
    """Ask the ``qwen-turbo`` model to answer *query* based on *context*.

    Args:
        query: The user's question.
        context: The report text to ground the answer in. May be a single
            string or a list/tuple of strings (as returned by ``search``);
            sequences are joined with newlines so the prompt contains the
            plain text rather than a Python list repr.

    Returns:
        The text of the model's answer.

    Raises:
        RuntimeError: If the DashScope call does not report HTTP 200.
    """
    if isinstance(context, (list, tuple)):
        context = '\n'.join(str(doc) for doc in context)
    prompt = f'''请基于```内的报道内容,回答我的问题。
```
{context}
```
我的问题是:{query}。
'''
    rsp = Generation.call(model='qwen-turbo', prompt=prompt)
    # Report the service's own error details instead of failing later with
    # an AttributeError on a missing `output`.
    if rsp.status_code != 200:
        raise RuntimeError(
            f'DashScope generation request failed: status={rsp.status_code}, '
            f'code={rsp.code}, message={rsp.message}'
        )
    return rsp.output.text
def search(text, limit=1):
    """Return the stored texts most similar to *text*.

    Uses the module-level ``collection`` (set up by the script entry point)
    and ``getEmbedding`` to embed the query.

    Args:
        text: Query string to embed and search for.
        limit: Maximum number of hits to return (default 1, matching the
            original hard-coded value).

    Returns:
        A list with the ``text`` field of each hit, in result order.
    """
    # L2 distance matches the metric the index was built with.
    search_params = {
        "metric_type": "L2"
    }
    results = collection.search(
        data=[getEmbedding(text)],  # embedded query vector
        anns_field="embedding",  # vector field to search
        param=search_params,
        limit=limit,  # number of hits per query
        output_fields=['text']  # return the stored text with each hit
    )
    # One query vector was sent, so only results[0] is populated.
    return [hit.entity.get('text') for hit in results[0]]
if __name__ == '__main__':
    # Question-answering script: retrieves the closest stored report from
    # Milvus and asks the LLM to answer the question based on it.

    # DashScope API key; replace the placeholder with a real key.
    dashscope.api_key = "XXXXXXXXXXXXXXXXXXXXXX"

    # Milvus connection parameters
    COLLECTION_NAME = 'CEC_Corpus'
    MILVUS_HOST = '192.168.1.135'
    MILVUS_PORT = '19530'
    USER = 'root'
    PASSWORD = 'Milvus'
    connections.connect(host=MILVUS_HOST, port=MILVUS_PORT, user=USER, password=PASSWORD)

    # Open the collection created by embedding.py. No schema is passed, so
    # the existing definition on the server is used; this avoids duplicating
    # the schema here and avoids silently creating an empty collection when
    # the ingestion step has not been run.
    collection = Collection(name=COLLECTION_NAME)
    # Load the collection into memory for searching.
    collection.load()

    question = '北京中央电视台工地发生大火,发生在哪里?出动了多少辆消防车?人员伤亡情况如何?'
    context = search(question)
    answer = getAnswer(question, context)
    print(answer)
运行answer.py效果如下