大模型负载均衡

使用nginx做大模型服务的负载均衡

  1. 前提:

  • 大模型服务端口:192.168.68.11:8000、192.168.68.11:9000
  • 使用openresty的容器
  1. openresty

    1. 镜像:

      docker pull openresty/openresty:alpine
      
    2. 容器:

      # 使用宿主机配置用原始镜像直接生成容器,注入的内容包括:
      ## 配置文件:/home/app/openresty/nginx/conf/nginx.conf:/usr/local/openresty/nginx/conf/nginx.conf
      ## 配置目录:/home/app/openresty/nginx/conf:/etc/nginx/conf.d
      ## 根路径:/home/app/openresty/nginx/html:/usr/local/openresty/nginx/html
      ## 日志路径:/home/app/openresty/nginx/logs:/usr/local/openresty/nginx/logs
      ## lua脚本路径:/home/app/openresty/nginx/lua:/usr/local/openresty/nginx/lua
      docker run -itd --name openresty -p 8000:80 --privileged=true --restart=always -v /home/app/openresty/nginx/conf/nginx.conf:/usr/local/openresty/nginx/conf/nginx.conf -v /home/app/openresty/nginx/conf:/etc/nginx/conf.d -v /home/app/openresty/nginx/html:/usr/local/openresty/nginx/html -v /home/app/openresty/nginx/logs:/usr/local/openresty/nginx/logs -v /home/app/openresty/nginx/lua:/usr/local/openresty/nginx/lua openresty/openresty:alpine
      
  2. nginx配置(支持流式)

    http {
    	# 后端服务地址
    	upstream qwen72b{
    		server 192.168.68.11:8000 weight=1;
    		server 192.168.68.11:9000 weight=1;
    	}
    	server {
    		access_log  logs/8000.access.log  main;
    		proxy_set_header Upgrade $http_upgrade;
    		proxy_set_header Connection "upgrade";
        proxy_buffering off;
        location / {
    			# 设置 Nginx 不对 SSE 响应进行缓冲,直接透传给客户端
    			proxy_buffering off;
    			# 设置代理读取服务器响应的超时时间
    			proxy_read_timeout 24h;
    			# 设置客户端连接的超时时间
    			proxy_connect_timeout 1h;
    			# 设置 HTTP 版本,SSE 需要 HTTP/1.1
    			proxy_http_version 1.1;
    			# 保持连接活性,不发送连接关闭的信号
    			proxy_set_header Connection '';
    			# 配置代理传递的头部,确保 Host 头部正确传递
    			proxy_set_header Host $host;
    			# 配置代理的后端服务器地址
    			proxy_pass http://qwen72b;
    			# 设置代理的响应头部,保持传输编码为 chunked
    			# 设置代理的响应头部,保持传输编码为 chunked
    			proxy_set_header X-Accel-Buffering no;
    			# 设置跨域资源共享 (CORS),如果你的客户端和服务器不在同一个域上
    			add_header 'Access-Control-Allow-Origin' '*' always;
    			add_header 'Access-Control-Allow-Credentials' 'true' always;
    			add_header 'Access-Control-Allow-Methods' 'GET, OPTIONS' always;
    			add_header 'Access-Control-Allow-Headers' 'Origin,Authorization,Accept,X-Requested-With' always;
    			if ($request_method = 'OPTIONS') {
    				# 如果请求方法为 OPTIONS,则返回 204 (无内容)
    				add_header 'Access-Control-Allow-Origin' '*';
    				add_header 'Access-Control-Allow-Methods' 'GET, OPTIONS';
    				add_header 'Access-Control-Allow-Headers' 'Origin,Authorization,Accept,X-Requested-With';
    				add_header 'Access-Control-Max-Age' 1728000;
    				add_header 'Content-Type' 'text/plain charset=UTF-8';
    				add_header 'Content-Length' 0;
    				return 204;
    			}
    		}
    		error_page 400 404  500 502 503 504  /50x.html;
    		location = /50x.html {
    			root   html;
    		}
    	}
    }
    
  3. 测试

    curl -H "Accept: application/json" -H "Content-type: application/json" -X POST -d "{\"model\": \"qwen72b\", \"messages\": [{\"role\": \"user\", \"content\": \"请扮演一个24点游戏的助手。你收到一组4个数字、你应该用加、减、乘、除这四种基本运算将这4个数字运算出24。如果你不能计算出24,你需要明确表示。请给出尽可能多的解法,并详细解释每一个解 法。根据用户提供的4个不大于13的数字作答,只需要回答数字的计算过程和结果即可。数字是:4,4,4,4\"}], \"max_tokens\": 512, \"presence_penalty\": 1.03, \"frequency_penalty\": 1.0, \"seed\": null, \"temperature\": 0.5, \"top_p\": 0.95, \"stream\": false}" http://192.168.136.40:8000/v1/chat/completions
    
posted @ 2024-12-20 10:37  badwood  阅读(8)  评论(0编辑  收藏  举报
Badwood's Blog