Wrapping an HF transformers model in an OpenAI-style API (using chatglm4 as an example)
Server:
import uvicorn
from fastapi import FastAPI
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
import torch
from typing import List, Dict, Optional

app = FastAPI()
# Request body: the subset of the OpenAI chat-completions schema handled here.
class Query(BaseModel):
    messages: List[Dict[str, str]]
    model: Optional[str] = None
    request_id: Optional[str] = None
    do_sample: Optional[bool] = None
    stream: Optional[bool] = None
    temperature: Optional[float] = None
    top_p: Optional[float] = None
    max_tokens: Optional[int] = None
    stop: Optional[List[str]] = None
# 4-bit NF4 quantization with double quantization, so the 9B model fits in far less VRAM.
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
)
model_path = '/data/glm-4-9b-chat-1m'
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    device_map="auto",
    trust_remote_code=True,
    quantization_config=nf4_config,
)
tok = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
@app.post("/chat/completions")
async def generate_response(query: Query):
    # Tokenize the conversation using the model's chat template.
    iids = tok.apply_chat_template(
        query.messages,
        add_generation_prompt=True,
    )
    # Map OpenAI-style parameters onto generate() kwargs. Note that OpenAI's
    # max_tokens corresponds to max_new_tokens in transformers; passing
    # max_tokens straight to generate() raises an "unused model_kwargs" error.
    # Drop None values so unset client fields don't override the model defaults.
    overrides = dict(
        max_new_tokens=query.max_tokens,
        temperature=query.temperature,
        do_sample=query.do_sample,
        top_p=query.top_p,
    )
    gen_cfg = {k: v for k, v in overrides.items() if v is not None}
    oids = model.generate(
        inputs=torch.tensor([iids]).to(model.device),
        **gen_cfg,
    )
    # Keep only the newly generated tokens; skip_special_tokens drops the trailing EOS.
    oids = oids[0][len(iids):].tolist()
    output = tok.decode(oids, skip_special_tokens=True)
    return {
        "choices": [{
            "index": 0,
            "message": {"role": "assistant", "content": output},
            "finish_reason": "stop",
        }]
    }
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=6006)
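Before wiring up the OpenAI client, you can sanity-check the endpoint with a raw HTTP request. A minimal smoke test, assuming the server above is running locally on port 6006:

import requests

# Hypothetical quick test of the wire format; adjust host/port to your setup.
resp = requests.post(
    'http://127.0.0.1:6006/chat/completions',
    json={
        'messages': [{'role': 'user', 'content': '你好'}],
        'max_tokens': 128,
    },
)
print(resp.json()['choices'][0]['message']['content'])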
Client:
from openai import OpenAI

# The client insists on an api_key (or OPENAI_API_KEY in the environment) even
# though our server never checks it; any placeholder string works.
api = OpenAI(base_url='http://192.168.60.105:6006/', api_key='EMPTY')
r = api.chat.completions.create(messages=[{'role': 'user', 'content': '你好'}], model='')
r.choices[0].message.content
# '\n你好👋!很高兴遇见你,如果你想聊聊或者有需要帮助的地方,随时告诉我哦!'
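The Query model accepts a stream field, but the server above ignores it. A sketch of a streaming variant, using transformers' TextIteratorStreamer together with FastAPI's StreamingResponse to emit OpenAI-style SSE chunks. The route name is hypothetical (a drop-in version would branch on query.stream inside the original handler), and the chunk schema here is a minimal assumption rather than the full OpenAI format:

import json
from threading import Thread
from fastapi.responses import StreamingResponse
from transformers import TextIteratorStreamer

@app.post("/chat/completions/stream")  # hypothetical separate route for clarity
async def generate_stream(query: Query):
    iids = tok.apply_chat_template(query.messages, add_generation_prompt=True)
    streamer = TextIteratorStreamer(tok, skip_prompt=True, skip_special_tokens=True)
    # generate() blocks, so run it in a background thread and drain the streamer here.
    Thread(target=model.generate, kwargs=dict(
        inputs=torch.tensor([iids]).to(model.device),
        streamer=streamer,
        max_new_tokens=query.max_tokens or 512,
    )).start()

    def sse():
        for piece in streamer:
            chunk = {
                "object": "chat.completion.chunk",
                "choices": [{"index": 0, "delta": {"role": "assistant", "content": piece}}],
            }
            yield f"data: {json.dumps(chunk, ensure_ascii=False)}\n\n"
        yield "data: [DONE]\n\n"

    return StreamingResponse(sse(), media_type="text/event-stream")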