ragas测试
ragas可支持使用不同的 embedding 和 llm 进行计算,默认使用的是 OpenAIEmbeddings(model="text-embedding-ada-002")、ChatOpenAI(model="gpt-3.5-turbo")。
目前支持的国内大模型有:百度千帆、通义千问、百川、Yuan2、智谱AI。
下面以三种渠道的embedding 和 llm为例进行ragas测试。不同渠道的api key获取见 API KEY获取和测试
使用ragas+langchain进行rag评估
openai
安装三方库
pip install ragas
pip install datasets
pip install langchain_openai
测试代码
import os
from ragas.metrics import answer_relevancy,faithfulness, context_precision,context_relevancy,context_recall
from datasets import Dataset
from ragas import evaluate, RunConfig
from langchain_openai import ChatOpenAI
from langchain_openai import OpenAIEmbeddings
from ragas.llms.base import LangchainLLMWrapper
# Configure the OpenAI-compatible endpoint and API key through environment
# variables (same values and same position in the script as before).
os.environ.update({
    "OPENAI_API_BASE": "https://api.f2gpt.com",
    "OPENAI_API_KEY": "******",
})

# Judge LLM (wrapped for ragas) and the embedding model used by the metrics.
llm = LangchainLLMWrapper(
    langchain_llm=ChatOpenAI(model="gpt-4o", temperature=0.5, max_retries=2),
)
embeddings = OpenAIEmbeddings()

# A single evaluation sample: every column is a list with one entry per question.
data_samples = {
    'question': ['四川面积是多少'],
    'answer': ['四川省总面积48万平方千米'],
    'contexts': [
        ['四川省总面积48.6万平方千米,辖21个[地级行政区],其中18个[地级市]、3个[自治州]。共55个[市辖区]、19个[县级市],105个[县],4个[自治县],合计183个县级区划。街道459个、镇2016个、乡626个,合计3101个乡级区划。'],
    ],
    'ground_truth': ['四川省总面积48.6万平方千米'],
}
dataset = Dataset.from_dict(data_samples)

# Retry / timeout settings handed to evaluate().
run_config = RunConfig(
    max_retries=5,
    max_wait=120,
    thread_timeout=500,
    log_tenacity=True,
)

# Metrics to compute, in the order they are passed to evaluate().
metrics = [
    faithfulness,
    answer_relevancy,
    context_precision,
    context_relevancy,
    context_recall,
]

print("开始计算。。。。。")
# Observed result: 'faithfulness': 0.0000, 'answer_relevancy': nan -- the cause
# has not been identified yet.
# If llm and embeddings are not passed, the defaults are
# OpenAIEmbeddings(model="text-embedding-ada-002") and ChatOpenAI(model="gpt-3.5-turbo").
score = evaluate(
    dataset,
    metrics=metrics,
    llm=llm,
    embeddings=embeddings,
    raise_exceptions=False,
    run_config=run_config,
)
print("计算完成。。。。。。")
print(score)
百度千帆
安装三方库
pip install ragas
pip install datasets
pip install langchain_community
pip install qianfan
测试代码
import os
from ragas.metrics import answer_relevancy,faithfulness, context_precision
from datasets import Dataset
from ragas import evaluate
from langchain_community.llms.baidu_qianfan_endpoint import QianfanLLMEndpoint
from langchain_community.embeddings.baidu_qianfan_endpoint import QianfanEmbeddingsEndpoint
# Provide the Baidu Qianfan access key / secret key through environment
# variables (same values and same position in the script as before).
os.environ.update({
    "QIANFAN_AK": "******",
    "QIANFAN_SK": "******",
})

# Judge LLM and embedding model served by Baidu Qianfan.
llm = QianfanLLMEndpoint(model="ERNIE-Speed-128K", temperature=0.2, timeout=30)
embeddings = QianfanEmbeddingsEndpoint()

# Two evaluation samples: every column is a list with one entry per question,
# and each 'contexts' entry is itself a list of retrieved passages.
data_samples = {
    'question': [
        'When was the first super bowl?',
        'Who won the most super bowls?',
    ],
    'answer': [
        'The first superbowl was held on Jan 15, 1967',
        'The most super bowls have been won by The New England Patriots',
    ],
    'contexts': [
        ['The First AFL–NFL World Championship Game was an American football game played on January 15, 1967, at the Los Angeles Memorial Coliseum in Los Angeles,'],
        ['The Green Bay Packers...Green Bay, Wisconsin.', 'The Packers compete...Football Conference'],
    ],
    'ground_truth': [
        'The first superbowl was held on January 15, 1967',
        'The New England Patriots have won the Super Bowl a record six times',
    ],
}
dataset = Dataset.from_dict(data_samples)

# Metrics to compute, in the order they are passed to evaluate().
metrics = [faithfulness, answer_relevancy, context_precision]

print("开始计算。。。。。")
# TODO: this currently fails with a SecretStr / str validation error
# (type=type_error.str); expected to be fixed in the next langchain release:
# https://github.com/langchain-ai/langchain/issues/24126
score = evaluate(
    dataset,
    metrics=metrics,
    llm=llm,
    embeddings=embeddings,
    raise_exceptions=False,
)
print("计算完成。。。。。。")
print(score)
通义千问
安装三方库
pip install ragas
pip install datasets
pip install langchain_community
pip install dashscope
测试代码
import os
from ragas.metrics import faithfulness,answer_relevancy,context_relevancy,context_recall,context_precision,answer_similarity,answer_correctness
from datasets import Dataset
from ragas import evaluate, RunConfig
from langchain_community.llms.tongyi import Tongyi
from langchain_community.embeddings.dashscope import DashScopeEmbeddings
# Provide the DashScope API key through an environment variable
# (same value and same position in the script as before).
os.environ.update({"DASHSCOPE_API_KEY": "******"})

# Judge LLM and embedding model served by Tongyi Qianwen / DashScope.
llm = Tongyi(model_name="qwen-turbo")
embeddings = DashScopeEmbeddings()

# Two evaluation samples: every column is a list with one entry per question,
# and each 'contexts' entry is itself a list of retrieved passages.
data_samples = {
    'question': ['四川面积是多少', '四川有多少人'],
    'answer': ['四川省总面积48.6万平方千米', '四川有9071.4万人'],
    'contexts': [
        ['四川省总面积48.6万平方千米,辖21个[地级行政区],其中18个[地级市]、3个[自治州]。共55个[市辖区]、19个[县级市],105个[县],4个[自治县],合计183个县级区划。街道459个、镇2016个、乡626个,合计3101个乡级区划。'],
        ['截止2023年末,四川省户籍人口9071.4万人'],
    ],
    'ground_truth': ['四川省总面积48.6万平方千米', '四川省户籍人口9071.4万人'],
}
dataset = Dataset.from_dict(data_samples)

# Retry / timeout settings handed to evaluate().
run_config = RunConfig(
    max_retries=5,
    max_wait=120,
    thread_timeout=500,
    log_tenacity=True,
)

# Metrics to compute, in the order they are passed to evaluate().
metrics = [
    faithfulness,
    answer_relevancy,
    context_relevancy,
    context_recall,
    context_precision,
    answer_similarity,
    answer_correctness,
]

print("开始计算。。。。。")
score = evaluate(
    dataset,
    metrics=metrics,
    llm=llm,
    embeddings=embeddings,
    raise_exceptions=False,
    run_config=run_config,
)
print("计算完成。。。。。。")
print(score)
# 评估结果
{'faithfulness': 1.0000, 'answer_relevancy': 0.6457, 'context_relevancy': 0.6667, 'context_recall': 1.0000, 'context_precision': 1.0000, 'answer_similarity': 0.9262, 'answer_correctness': 0.9816}
踩坑记录
1、搭建环境需要 C++ 编译能力(安装 Microsoft Visual Studio 2015 或更高版本,即 Microsoft Visual C++ 14.0 及以上),python>3.9
2、网上很多都是安装langchain,但是运行代码都提示 Deprecated,直接安装新版的 langchain_community
3、网上都是建议安装 ragas==0.0.22 版本,原因都是 langchain 还没有对 ragas 0.1 及以后版本作兼容;实测发现不需要安装特定版本的 ragas,直接安装最新版就行
4、在使用openai渠道测试时,经常出现 faithfulness=0、faithfulness=nan、answer_relevancy=nan的情况,经过排查后仍未找出原因。疑似官方bug。
5、在使用百度千帆渠道测试时,报错 "type=type_error.str,SecretStr != str",经过排查后是 langchain 官方 bug,需等待修复并发布新版后才能使用:https://github.com/langchain-ai/langchain/issues/24126
6、需要测试多个问题时,data_samples的格式如下:
# Template for evaluating several questions at once: every column is a list
# with one entry per question, and entry i of each column belongs to the same
# sample.
data_samples = {
    'question': ['question 1', 'question 2', 'question 3'],
    'answer': ['answer to the question 1', 'answer to the question 2', 'answer to the question 3'],
    # One list of retrieved context passages per question.
    'contexts': [['context 1'], ['context 2'], ['context 3']],
    # The column is named 'ground_truth' (one plain string per question), the
    # same key the runnable scripts in this document use. The older
    # 'ground_truths' name (a list of strings per question) is deprecated in
    # ragas 0.1+ and does not match the string values given here.
    'ground_truth': ['ground_truth to the question 1', 'ground_truth to the question 2', 'ground_truth to the question 3'],
}