用fastapi和sse创建流式输出接口

示例为调用huggingface的大模型,使其流式输出

from fastapi import FastAPI, Request
import requests
import json
import os
from pydantic import BaseModel
from transformers import AutoTokenizer, AutoModelForCausalLM, TextStreamer, TextIteratorStreamer
from sse_starlette.sse import EventSourceResponse
from threading import Thread
import asyncio

# Silence the HuggingFace tokenizers fork/parallelism warning, since
# generation runs in a background Thread while the event loop streams.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# The ASGI application object that uvicorn serves (see launch command below).
app = FastAPI()

class Model:
    # Module-level holder for the singleton model and tokenizer.
    # Both are populated once by startup_event() so that request handlers
    # share the already-loaded instances instead of reloading per request.
    model = None
    tokenizer = None

class Message(BaseModel):
    # NOTE(review): declared as `str`, but the /chat handler reads the raw
    # request JSON and passes `messages` (a list of chat turns) straight to
    # apply_chat_template — this schema appears unused; confirm and either
    # wire it into the endpoint signature or remove it.
    messages: str

# 程序启动时 加载模型
# NOTE(review): @app.on_event is deprecated in recent FastAPI versions in
# favour of lifespan handlers; kept because the file uses this style.
@app.on_event("startup")
async def startup_event():
    """Load the tokenizer and model once at process startup.

    The loaded objects are stored on the module-level ``Model`` holder so
    request handlers can reuse them without reloading the checkpoint.
    """
    # Local path of the model checkpoint.
    model_id = "/data1/songxiaoyong/model_70b"
    # model_id = "/data1/songxiaoyong/model_8b_origin"
    Model.tokenizer = AutoTokenizer.from_pretrained(model_id)
    Model.model = AutoModelForCausalLM.from_pretrained(
        model_id,
        torch_dtype="auto",
        device_map="auto",
    )


@app.get("/")
async def index():
    """Root-path welcome endpoint (simple liveness check)."""
    return {"message": "Welcome"}


@app.post("/chat")
async def chat(request: Request):
    """Stream a chat completion to the client over Server-Sent Events.

    Expects a JSON body of the form ``{"messages": [...]}`` where the list
    follows the tokenizer's chat-template format, e.g.
    ``[{"role": "user", "content": "..."}]``.

    Returns an ``EventSourceResponse`` that yields generated text chunks.
    """
    model = Model.model
    tokenizer = Model.tokenizer

    payload = await request.json()
    messages = payload["messages"]
    print(messages)

    # Render the chat turns into a single prompt string, then tokenize it.
    # (apply_chat_template with tokenize=False returns text, not token ids.)
    prompt = tokenizer.apply_chat_template(
        messages, add_generation_prompt=True, tokenize=False
    )
    encoding = tokenizer(prompt, return_tensors="pt")

    # TextIteratorStreamer lets us iterate over decoded text chunks while
    # model.generate() runs in a background thread.
    streamer = TextIteratorStreamer(tokenizer)
    # NOTE(review): hard-coded 'cuda' may mismatch device_map="auto" on
    # multi-GPU or CPU-only setups — confirm the intended input device.
    encoding = encoding.to('cuda')
    generation_kwargs = dict(
        encoding,
        streamer=streamer,
        max_new_tokens=8192,
        do_sample=True,
        temperature=0.8,
        top_p=0.9,
    )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()

    async def event_generator(streamer):
        """Yield generated chunks, skipping the echoed prompt."""
        for index, new_text in enumerate(streamer):
            # The first chunk the streamer yields is the prompt itself; skip
            # it so only newly generated text reaches the client.
            if index == 0:
                continue
            print(new_text)
            # Strip the model's end-of-turn marker before sending.
            if "<|eot_id|>" in str(new_text):
                new_text = str(new_text).replace("<|eot_id|>", "")
            # SSE frames are newline-delimited, so encode embedded newlines
            # with a placeholder the client translates back.
            if "\n" in str(new_text):
                new_text = str(new_text).replace("\n", "@@@@")
            yield new_text
            # NOTE(review): this adds ~100 ms per chunk of artificial
            # latency; presumably pacing for the client — confirm it's wanted.
            await asyncio.sleep(0.1)

    event_source = EventSourceResponse(event_generator(streamer))
    # Lengthen the keep-alive ping interval; the token stream itself keeps
    # the connection active.
    event_source.ping_interval = 60000
    return event_source

nohup uvicorn server:app --host '0.0.0.0' --port 8000 --reload > server.logs 2>&1 &

posted @ 2024-06-13 09:49  Mrterrific  阅读(138)  评论(0编辑  收藏  举报