prepare fastapi root layout for server deployment

2026-06-04 10:55:23 +08:00
parent eb43573a44
commit b46e43aadc
103 changed files with 347 additions and 197 deletions
@@ -0,0 +1,108 @@
+import time
+
+from fastapi import APIRouter, Depends
+
+from app.agents.llm_adapter import OpenAICompatibleLLMClient
+from app.core.config import settings
+from app.core.exceptions import AppError
+from app.core.response import ApiResponse, ok
+from app.core.user_context import UserContext, get_user_context
+from app.schemas.llm import LLMTestRequest, LLMTestResponse
+
+# Router that the LLM test endpoints below register themselves on.
+router = APIRouter()
+
+
+@router.post("/deepseek-fast", response_model=ApiResponse[LLMTestResponse])
+async def test_deepseek_fast(
+    payload: LLMTestRequest,
+    _: UserContext = Depends(get_user_context),
+):
+    """Fast 模型测试：验证快速模型的非流式响应耗时。"""
+    client = OpenAICompatibleLLMClient()
+    response = await client.chat(
+        [{"role": "user", "content": payload.message}],
+        settings.llm_fast_model,
+        thinking_enabled=settings.llm_fast_thinking_enabled,
+        max_tokens=min(settings.llm_fast_max_tokens, 256),
+    )
+    return ok(
+        LLMTestResponse(
+            model=response.model,
+            total_latency_ms=response.latency_ms,
+            stream=False,
+            mock_mode=client.is_mock_mode,
+            fallback_used=response.model.startswith("mock-fallback"),
+            thinking_enabled=settings.llm_fast_thinking_enabled,
+        )
+    )
+
+
+@router.post("/deepseek-reason", response_model=ApiResponse[LLMTestResponse])
+async def test_deepseek_reason(
+    payload: LLMTestRequest,
+    _: UserContext = Depends(get_user_context),
+):
+    """Reason 模型测试：优先验证流式耗时，流式不兼容时降级为真实非流式测试。"""
+    client = OpenAICompatibleLLMClient()
+    messages = [{"role": "user", "content": payload.message}]
+    first_token_ms = None
+    start = time.perf_counter()
+
+    try:
+        async for chunk in client.stream_chat(
+            messages,
+            settings.llm_reason_model,
+            thinking_enabled=settings.llm_reason_thinking_enabled,
+            reasoning_effort=settings.llm_reasoning_effort if settings.llm_reason_thinking_enabled else None,
+            max_tokens=min(settings.llm_fast_max_tokens, 256),
+        ):
+            if first_token_ms is None and chunk.first_token_ms is not None:
+                first_token_ms = chunk.first_token_ms
+            if chunk.done:
+                return ok(
+                    LLMTestResponse(
+                        model=chunk.model or (settings.llm_reason_model if not client.is_mock_mode else f"mock-{settings.llm_reason_model}"),
+                        first_token_ms=first_token_ms,
+                        total_latency_ms=chunk.total_latency_ms or int((time.perf_counter() - start) * 1000),
+                        stream=True,
+                        mock_mode=client.is_mock_mode,
+                        fallback_used=chunk.fallback_used,
+                        thinking_enabled=settings.llm_reason_thinking_enabled,
+                        reasoning_effort=settings.llm_reasoning_effort if settings.llm_reason_thinking_enabled else None,
+                    )
+                )
+    except AppError as exc:
+        if exc.code != "LLM_STREAM_FAILED":
+            raise
+        response = await client.chat(
+            messages,
+            settings.llm_reason_model,
+            thinking_enabled=settings.llm_reason_thinking_enabled,
+            reasoning_effort=settings.llm_reasoning_effort if settings.llm_reason_thinking_enabled else None,
+            max_tokens=min(settings.llm_fast_max_tokens, 256),
+        )
+        return ok(
+            LLMTestResponse(
+                model=response.model,
+                first_token_ms=None,
+                total_latency_ms=response.latency_ms,
+                stream=False,
+                mock_mode=client.is_mock_mode,
+                fallback_used=response.model.startswith("mock-fallback"),
+                thinking_enabled=settings.llm_reason_thinking_enabled,
+                reasoning_effort=settings.llm_reasoning_effort if settings.llm_reason_thinking_enabled else None,
+            )
+        )
+
+    return ok(
+        LLMTestResponse(
+            model=settings.llm_reason_model,
+            first_token_ms=first_token_ms,
+            total_latency_ms=int((time.perf_counter() - start) * 1000),
+            stream=True,
+            mock_mode=client.is_mock_mode,
+            fallback_used=False,
+            thinking_enabled=settings.llm_reason_thinking_enabled,
+            reasoning_effort=settings.llm_reasoning_effort if settings.llm_reason_thinking_enabled else None,
+        )
+    )