refactor: Switching from the httpx library to aiohttp (KenyonY#78)
* Replace all httpx code with aiohttp
* Adjust the response parsing accordingly
* Update the benchmark under Examples
* More robust error handling
KenyonY authored Sep 25, 2023
1 parent 3ce781f commit f97bd6f
Showing 40 changed files with 359 additions and 226 deletions.
8 changes: 4 additions & 4 deletions .env
@@ -24,7 +24,7 @@ EXTRA_ROUTE_PREFIX=
# `REQ_RATE_LIMIT`: request rate limit for the specified routes, applied per user
# format: {route: ratelimit-string}
# ratelimit-string format [count] [per|/] [n (optional)] [second|minute|hour|day|month|year] :ref:`ratelimit-string`: https://limits.readthedocs.io/en/stable/quickstart.html#rate-limit-string-notation
REQ_RATE_LIMIT={"/healthz":"100/2minutes","/v1/chat/completions":"60/minute;600/hour"}
REQ_RATE_LIMIT={"/v1/chat/completions":"60/minute;600/hour", "/v1/completions":"60/minute;600/hour"}

# `GLOBAL_RATE_LIMIT`: limits all routes not covered by `REQ_RATE_LIMIT`. Empty means no limit (default)
GLOBAL_RATE_LIMIT=
@@ -33,12 +33,12 @@ GLOBAL_RATE_LIMIT=
# `fixed-window`: most memory efficient strategy; `moving-window`:most effective for preventing bursts but higher memory cost.
RATE_LIMIT_STRATEGY=moving-window


# Rate limit on returned tokens
TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second"}
TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second","/v1/completions":"60/second"}


TIMEOUT=100
# TCP connection timeout (seconds)
TIMEOUT=10

IP_BLACKLIST=

1 change: 0 additions & 1 deletion .env.example
@@ -30,7 +30,6 @@ EXTRA_ROUTE_PREFIX='/tts, /translate'
# format: {route: ratelimit-string}
# ratelimit-string format [count] [per|/] [n (optional)] [second|minute|hour|day|month|year] :ref:`ratelimit-string`: https://limits.readthedocs.io/en/stable/quickstart.html#rate-limit-string-notation
REQ_RATE_LIMIT='{
"/healthz": "50/3minutes",
"/openai/v1/chat/completions": "1/10seconds",
"/localai/v1/chat/completions": "2/second"
}'
3 changes: 3 additions & 0 deletions .gitignore
@@ -15,6 +15,9 @@ chat_*.yaml
Log/
Log-caloi-top/
dist/

.run/

# Byte-compiled / optimized / DLL files
__pycache__/
*.py[cod]
4 changes: 2 additions & 2 deletions Examples/benchmark/.env
@@ -37,10 +37,10 @@ RATE_LIMIT_STRATEGY=moving-window


# Rate limit on returned tokens
TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second", "/benchmark/v1/chat/completions":"30/second"}
TOKEN_RATE_LIMIT={"/v1/chat/completions":"50/second", "/benchmark/v1/chat/completions":"5/second"}


TIMEOUT=5
TIMEOUT=10

IP_BLACKLIST=

53 changes: 37 additions & 16 deletions Examples/benchmark/README.md
@@ -4,7 +4,7 @@
> http://localhost:8080
> This part of the test measures the streaming and non-streaming performance of `fastapi` itself, with no forwarding involved
Launch command: `BENCHMARK_MODE=true aifd run --workers=n --port 8080` (n=1 or 4)
Launch command: `BENCHMARK_MODE=true aifd run --workers=n --port 8080` (n=1 or 16)



@@ -14,13 +14,14 @@ wrk -t8 -c400 -d10s -s post.lua http://localhost:8080/benchmark/v1/chat/completi
```
Single core:

![img_7.png](img_7.png)
![img_7.png](images/img_7.png)


4 cores:

![img_8.png](img_8.png)

16 cores:
```bash
wrk -t15 -c500 -d10s -s post.lua http://localhost:8080/benchmark/v1/chat/completions
```
![img_11.png](images/img_11.png)

### stream == true:
```bash
@@ -31,11 +32,13 @@ wrk -t8 -c100 -d10s -s post.lua http://localhost:8080/benchmark/v1/chat/completi

Single core:

![img_10.png](img_10.png)
![img_13.png](images/img_13.png)


16 cores:
![img_12.png](images/img_12.png)

4 cores:

![img.png](img.png)

## Forwarding benchmark endpoint
> http://localhost:8000
@@ -51,21 +54,39 @@ wrk -t8 -c100 -d10s -s post.lua http://localhost:8000/benchmark/v1/chat/completi

**Single core**

![img_5.png](img_5.png)
(httpx)
![img_5.png](images/img_5.png)

(aiohttp)
![img_14.png](images/img_14.png)

**4 cores**:

**4 cores**: (both the origin and the forwarder run on 4 cores)
(httpx)
![img_2.png](images/img_2.png)

![img_2.png](img_2.png)
(aiohttp)
![img_15.png](images/img_15.png)

### stream == true:

**Single core**: (yes, the forwarded results are even better than the original ones, which is puzzling~)
**Single core**:

(httpx)
![img_4.png](images/img_4.png)

(aiohttp)
![img_17.png](images/img_17.png)

![img_4.png](img_4.png)
**4 cores**:

**4 cores**: (both the origin and the forwarder run on 4 cores)
(httpx)
![img_3.png](images/img_3.png)

![img_3.png](img_3.png)
(aiohttp)
![img_16.png](images/img_16.png)


Note: although the results show aiohttp performing much better than httpx for streaming forwarding, this is mainly due to optimizations made in the aiohttp version of the code rather than an inherent performance difference between the two libraries.
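For context on the note above: the streaming path in an aiohttp-based forwarder boils down to iterating the upstream body and re-emitting chunks as they arrive. A minimal, self-contained sketch with a local stand-in server and hypothetical names, not the project's actual forwarding code:

```python
import asyncio

import aiohttp
from aiohttp import web

async def stream_handler(request):
    # Stand-in upstream that streams its body in several chunks
    resp = web.StreamResponse()
    await resp.prepare(request)
    for i in range(3):
        await resp.write(f"chunk{i}\n".encode())
    await resp.write_eof()
    return resp

async def main():
    app = web.Application()
    app.router.add_get("/stream", stream_handler)
    runner = web.AppRunner(app)
    await runner.setup()
    site = web.TCPSite(runner, "127.0.0.1", 0)  # port 0: pick any free port
    await site.start()
    port = runner.addresses[0][1]

    received = []
    async with aiohttp.ClientSession() as session:
        async with session.get(f"http://127.0.0.1:{port}/stream") as resp:
            # iter_any() yields data as soon as it arrives -- the pattern a
            # streaming forwarder uses to re-emit chunks without buffering
            async for chunk in resp.content.iter_any():
                received.append(chunk)
    await runner.cleanup()
    return b"".join(received)

body = asyncio.run(main())
print(body.decode())
```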

File renamed without changes
File renamed without changes
File renamed without changes
Binary file added Examples/benchmark/images/img_11.png
Binary file added Examples/benchmark/images/img_12.png
Binary file added Examples/benchmark/images/img_13.png
Binary file added Examples/benchmark/images/img_14.png
Binary file added Examples/benchmark/images/img_15.png
Binary file added Examples/benchmark/images/img_16.png
Binary file added Examples/benchmark/images/img_17.png
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
File renamed without changes
4 changes: 2 additions & 2 deletions Examples/benchmark/post.lua
@@ -1,7 +1,7 @@
wrk.method = "POST"
wrk.headers["Content-Type"] = "application/json"
wrk.body = '{"stream": false}'
-- wrk.body = '{"stream":true}'
-- wrk.body = '{"stream": false}'
wrk.body = '{"stream":true}'

-- wrk.timeout = 20000 -- in milliseconds

7 changes: 5 additions & 2 deletions Examples/benchmark/run.py
@@ -51,17 +51,20 @@ async def main():
mt = MeasureTime().start()
mean = 0
epochs = 5
concurrency = 100
for epoch in range(epochs):
tasks = []
for i in range(10): # create x concurrent tasks
for i in range(concurrency): # create `concurrency` concurrent tasks
task = asyncio.create_task(run(i))
tasks.append(task)

mt.start()
await asyncio.gather(*tasks)
cost = mt.show_interval(f"{epoch=}")
mean += cost
print(f"mean: {mean / epochs} s")
mean_cost = mean / epochs
print(f"mean: {mean_cost} s")
print(f"{concurrency/mean_cost}req/s")


asyncio.run(main())
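The measuring loop that run.py gains above reduces to this standalone pattern (`asyncio.sleep` stands in for the real requests, and plain `perf_counter` for sparrow's MeasureTime):

```python
import asyncio
import time

async def fake_request(i):
    # Stand-in for one benchmark request
    await asyncio.sleep(0.01)

async def main():
    epochs, concurrency = 3, 50
    total = 0.0
    for epoch in range(epochs):
        tasks = [asyncio.create_task(fake_request(i)) for i in range(concurrency)]
        start = time.perf_counter()
        await asyncio.gather(*tasks)  # run `concurrency` tasks concurrently
        total += time.perf_counter() - start
    mean_cost = total / epochs
    print(f"mean: {mean_cost:.4f} s")
    print(f"{concurrency / mean_cost:.1f} req/s")
    return mean_cost

mean_cost = asyncio.run(main())
```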
2 changes: 1 addition & 1 deletion Examples/chat.sh
@@ -6,4 +6,4 @@ curl http://localhost:8000/openai/v1/chat/completions \
-d '{
"model": "gpt-3.5-turbo",
"messages": [{"role": "user", "content": "Hello!"}]
}' &
}'
5 changes: 4 additions & 1 deletion Examples/chat.py → Examples/chat_completion.py
@@ -15,12 +15,14 @@
user_content = """
Implement the fastest currently known square-root algorithm in C
"""
from sparrow import MeasureTime

mt = MeasureTime().start()
resp = openai.ChatCompletion.create(
model="gpt-3.5-turbo",
# model="gpt-4",
messages=[
{"role": "user", "content": 'hi'},
{"role": "user", "content": user_content},
],
stream=stream,
request_timeout=30,
@@ -44,6 +46,7 @@
print(assistant_content)
print(resp.usage)

mt.show_interval("chat")
"""
gpt-4:
38 changes: 38 additions & 0 deletions Examples/completion.py
@@ -0,0 +1,38 @@
import openai
from rich import print
from rich.console import Console
from rich.markdown import Markdown
from sparrow import yaml_load # pip install sparrow-python

config = yaml_load("config.yaml", rel_path=True)
print(f"{config=}")
openai.api_base = config["api_base"]
openai.api_key = config["api_key"]


stream = True


user_content = "Now let's derive the Newton's method iteration formula using a Taylor expansion: \n"
from sparrow import MeasureTime

mt = MeasureTime().start()
resp = openai.Completion.create(
model="gpt-3.5-turbo-instruct",
prompt=user_content,
stream=stream,
max_tokens=500,
request_timeout=30,
)

console = Console()
sentences = ""
if stream:
for chunk in resp:
text = chunk['choices'][0]['text']
console.print(text, end="")
sentences += text
print()

# print(70*"-")
# console.print(Markdown(sentences))
1 change: 0 additions & 1 deletion README.md
@@ -241,7 +241,6 @@ curl --location 'https://api.openai-forward.com/v1/images/generations' \
|------------|------------|:-------:|
| --port | Service port | 8000 |
| --workers | Number of worker processes | 1 |
| --log_chat | Same as LOG_CHAT | `False` |

</details>

2 changes: 1 addition & 1 deletion openai_forward/__init__.py
@@ -1,4 +1,4 @@
__version__ = "0.5.3"
__version__ = "0.6.0"

from dotenv import load_dotenv

22 changes: 17 additions & 5 deletions openai_forward/app.py
@@ -4,7 +4,6 @@
from slowapi.errors import RateLimitExceeded

from . import custom_slowapi
from .cache.chat import chat_completions_benchmark
from .forward import create_generic_proxies, create_openai_proxies
from .helper import normalize_route as normalize_route_path
from .settings import (
@@ -21,6 +20,7 @@

app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
@@ -44,25 +44,37 @@ async def normalize_route(request: Request, call_next):
response_description="Return HTTP Status Code 200 (OK)",
status_code=status.HTTP_200_OK,
)
@limiter.limit(dynamic_request_rate_limit, exempt_when=lambda: True)
def healthz(request: Request):
return "OK"


if BENCHMARK_MODE:
from .cache.chat_completions import chat_completions_benchmark

app.add_route(
"/benchmark/v1/chat/completions",
route=limiter.limit(dynamic_request_rate_limit)(chat_completions_benchmark),
methods=["POST"],
)

openai_objs = create_openai_proxies()
generic_objs = create_generic_proxies()


@app.on_event("shutdown")
async def shutdown():
for obj in openai_objs:
await obj.client.close()
for obj in generic_objs:
await obj.client.close()


add_route = lambda obj: app.add_route(
obj.ROUTE_PREFIX + "{api_path:path}",
route=limiter.limit(dynamic_request_rate_limit)(obj.reverse_proxy),
methods=["GET", "POST", "PUT", "DELETE", "OPTIONS", "HEAD", "PATCH", "TRACE"],
)

[add_route(obj) for obj in create_openai_proxies()]
[add_route(obj) for obj in create_generic_proxies()]
[add_route(obj) for obj in openai_objs]
[add_route(obj) for obj in generic_objs]

show_startup()
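The shutdown hook added in the app.py diff above closes each forwarder's long-lived session. The pattern in isolation, with a hypothetical `Proxy` standing in for the objects returned by `create_openai_proxies()`/`create_generic_proxies()`:

```python
import asyncio

import aiohttp

class Proxy:
    """Hypothetical stand-in for a forwarder object that owns one
    long-lived aiohttp.ClientSession, as the commit's app.py does."""
    def __init__(self):
        self.client = aiohttp.ClientSession()

async def main():
    proxies = [Proxy() for _ in range(2)]
    # ... requests would be forwarded through proxy.client here ...
    for p in proxies:  # mirrors the @app.on_event("shutdown") handler
        await p.client.close()
    return all(p.client.closed for p in proxies)

all_closed = asyncio.run(main())
print(all_closed)
```

Without this hook, each unclosed `ClientSession` logs an "Unclosed client session" warning when the process exits.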