一个中转代码,底层调用openai,上层模拟openai
openai的调用api几乎成为了实质性的大模型社区的调用标准,你看不论是阿里的灵积,智谱,together,vllm,ollama,fastchat等都支持openai的调用方式,所以这也是大势所趋,
有时候我们想做个中间层,底层调用大模型,上层提供业务服务,特别是许多公司的多节点的agent,如果我们都保持一致那么统一性就很好了
代码如下,代码不是很精细,个别小细节需要改下,但是整体不影响使用和借鉴:
import time
import json
import asyncio
from typing import List, Optional
import uvicorn
from openai import OpenAI
from fastapi import FastAPI
from pydantic import BaseModel
from starlette.responses import StreamingResponse
# The FastAPI application that exposes the OpenAI-compatible surface.
app = FastAPI(title="OpenAI-compatible API")
class ChatMessage(BaseModel):
    """One chat turn in the OpenAI message format."""
    # Sender role, e.g. "system", "user" or "assistant".
    role: str
    # The message text.
    content: str
class ChatCompletionRequest(BaseModel):
    """Request body for /v1/chat/completions, mirroring the OpenAI schema."""
    # Requested model name; this layer only echoes it back in the response.
    model: str = "mock-gpt-model"
    # Conversation history; the last entry is treated as the user's prompt.
    messages: List[ChatMessage]
    # Generation knobs accepted for client compatibility (not all are honored here).
    max_tokens: Optional[int] = 512
    temperature: Optional[float] = 0.1
    # When true, the reply is streamed as "data: ..." chunks instead of one JSON body.
    stream: Optional[bool] = False
class CallOpenAI:
    """Thin wrapper around the downstream OpenAI-compatible model server.

    Holds one configured client and exposes :meth:`invoke`, which always
    requests a *streaming* chat completion from the backend.
    """

    def __init__(self,
                 api_key: str = "EMPTY",
                 base_url: str = "http://127.0.0.1:9001/v1",
                 model_name: str = "Qwen2-72-gptq-int4"):
        # Previously hard-coded values are now parameters; the defaults keep
        # the original behaviour, so existing callers are unaffected.
        self.model_name = model_name
        self.client = OpenAI(
            api_key=api_key,
            base_url=base_url,
        )

    def invoke(self, messages, max_tokens: int = 500,
               temperature: Optional[float] = None):
        """Send *messages* to the backend and return the streaming iterator.

        Args:
            messages: either a plain string (wrapped as a single user turn,
                matching the original behaviour) or an already-formed list of
                ``{"role": ..., "content": ...}`` dicts.
            max_tokens: generation cap forwarded to the backend (default 500,
                as before).
            temperature: optional sampling temperature; omitted from the
                request when ``None`` so the backend default applies.

        Returns:
            The streamed chat-completion iterator from the OpenAI client.
        """
        if isinstance(messages, str):
            messages = [{'role': 'user', 'content': messages}]
        extra = {}
        if temperature is not None:
            extra['temperature'] = temperature
        completion = self.client.chat.completions.create(
            model=self.model_name,
            messages=messages,
            max_tokens=max_tokens,
            stream=True,
            **extra,
        )
        return completion
# Module-level singleton backend client used by the request handlers below.
model = CallOpenAI()
async def _resp_async_generator1(text_resp: str):
    """Proxy the backend's streamed completion as OpenAI-style SSE frames.

    Forwards each delta from the downstream model as a
    ``chat.completion.chunk`` object in ``data: ...`` lines, terminated by
    the ``data: [DONE]`` sentinel that OpenAI streaming clients expect.

    Args:
        text_resp: prompt text handed to the backend model.
    """
    response = model.invoke(text_resp)
    # OpenAI's schema uses a string id and an integer unix timestamp;
    # compute the timestamp once for the whole stream.
    created = int(time.time())
    for piece in response:
        # delta.content can be None (e.g. the role-only or final chunk).
        content = piece.choices[0].delta.content or ''
        frame = {
            "id": "chatcmpl-1",
            "object": "chat.completion.chunk",
            "created": created,
            "model": "blah",
            "choices": [{"index": 0, "delta": {"content": content}}],
        }
        yield f"data: {json.dumps(frame)}\n\n"
        # Yield control so the event loop can flush each chunk to the client.
        await asyncio.sleep(0)
    yield "data: [DONE]\n\n"
@app.post("/v1/chat/completions")
async def chat_completions(request: ChatCompletionRequest):
    """OpenAI-compatible chat-completions endpoint.

    Builds an echo reply from the caller's last message (this layer is a
    mock); when ``stream`` is requested, the prompt is forwarded to the
    backend model and its reply is streamed back as server-sent events.
    """
    if request.messages:
        # BUG FIX: the original then unconditionally overwrote resp_content
        # with a hard-coded debug prompt; that leftover line is removed.
        resp_content = ("As a mock AI Assistant, I can only echo your last message:"
                        + request.messages[-1].content)
    else:
        resp_content = "As a mock AI Assistant, I can only echo your last message, but there wasn't one!"
    if request.stream:
        # The generator emits "data: ...\n\n" frames, i.e. server-sent
        # events, so label the response accordingly.
        return StreamingResponse(
            _resp_async_generator1(resp_content),
            media_type="text/event-stream",
        )
    return {
        "id": "1337",
        "object": "chat.completion",
        # OpenAI's schema uses an integer unix timestamp.
        "created": int(time.time()),
        "model": request.model,
        "choices": [{
            "message": ChatMessage(role="assistant", content=resp_content)
        }],
    }
if __name__ == '__main__':
    # Serve the OpenAI-compatible API on all interfaces, port 8000.
    uvicorn.run(app, host="0.0.0.0", port=8000)
按上面的方式把服务拉起之后,再写一段客户端代码测试一下:
from openai import OpenAI

# Point an ordinary OpenAI client at the local mock server; the key is
# unused by the mock but required by the client constructor.
client = OpenAI(
    api_key="fake-api-key",
    base_url="http://localhost:8000/v1/"  # change the default port if needed
)

# Ask for a streamed completion and print each delta as it arrives.
completion_stream = client.chat.completions.create(
    model="mock-gpt-model",
    messages=[{"role": "user", "content": "Say this is a test"}],
    stream=True,
)
for part in completion_stream:
    print(part.choices[0].delta.content or "", end='', flush=True)
结果如下图进行流式输出