prepare fastapi root layout for server deployment

This commit is contained in:
刘金宝
2026-06-04 10:55:23 +08:00
parent eb43573a44
commit b46e43aadc
103 changed files with 347 additions and 197 deletions
+108
View File
@@ -0,0 +1,108 @@
import time
from fastapi import APIRouter, Depends
from app.agents.llm_adapter import OpenAICompatibleLLMClient
from app.core.config import settings
from app.core.exceptions import AppError
from app.core.response import ApiResponse, ok
from app.core.user_context import UserContext, get_user_context
from app.schemas.llm import LLMTestRequest, LLMTestResponse
# Router on which the LLM test endpoints in this module are registered
# via the @router.post decorators.
router = APIRouter()
@router.post("/deepseek-fast", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_fast(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Fast model test: verify the non-streaming response latency of the fast model."""
    # `payload.message` is forwarded as a single user turn. The `_` parameter
    # only forces the `get_user_context` dependency to resolve; its value is
    # not used in the body.
    llm = OpenAICompatibleLLMClient()
    user_turn = {"role": "user", "content": payload.message}

    reply = await llm.chat(
        [user_turn],
        settings.llm_fast_model,
        thinking_enabled=settings.llm_fast_thinking_enabled,
        # The configured token limit is capped at 256 for this test call.
        max_tokens=min(settings.llm_fast_max_tokens, 256),
    )

    # Build the report first, then wrap it in the standard response envelope.
    report = LLMTestResponse(
        model=reply.model,
        total_latency_ms=reply.latency_ms,
        stream=False,
        mock_mode=llm.is_mock_mode,
        # A returned model name starting with "mock-fallback" is reported as
        # a fallback having been used.
        fallback_used=reply.model.startswith("mock-fallback"),
        thinking_enabled=settings.llm_fast_thinking_enabled,
    )
    return ok(report)
@router.post("/deepseek-reason", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_reason(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Reason model test: measure streaming latency first.

    When streaming is not compatible, degrade to a real non-streaming test.
    """
    # The `_` parameter only forces the `get_user_context` dependency to
    # resolve; its value is not used in the body.
    llm = OpenAICompatibleLLMClient()
    conversation = [{"role": "user", "content": payload.message}]
    ttft_ms = None
    started_at = time.perf_counter()

    def _effort():
        # Reasoning effort is only passed/reported while thinking is enabled.
        if settings.llm_reason_thinking_enabled:
            return settings.llm_reasoning_effort
        return None

    def _request_options():
        # Keyword options shared by the streaming and non-streaming calls.
        return {
            "thinking_enabled": settings.llm_reason_thinking_enabled,
            "reasoning_effort": _effort(),
            # NOTE(review): the cap is taken from the *fast* max-token
            # setting even though this is the reason model - confirm intended.
            "max_tokens": min(settings.llm_fast_max_tokens, 256),
        }

    def _report(*, model, first_token_ms, total_latency_ms, stream, fallback_used):
        # Assemble the test report and wrap it in the standard envelope.
        return ok(
            LLMTestResponse(
                model=model,
                first_token_ms=first_token_ms,
                total_latency_ms=total_latency_ms,
                stream=stream,
                mock_mode=llm.is_mock_mode,
                fallback_used=fallback_used,
                thinking_enabled=settings.llm_reason_thinking_enabled,
                reasoning_effort=_effort(),
            )
        )

    try:
        async for chunk in llm.stream_chat(
            conversation,
            settings.llm_reason_model,
            **_request_options(),
        ):
            # Keep the first non-None first-token timing reported by a chunk.
            if ttft_ms is None and chunk.first_token_ms is not None:
                ttft_ms = chunk.first_token_ms

            if not chunk.done:
                continue

            # Final chunk: prefer the model name it carries, otherwise fall
            # back to the configured name (prefixed with "mock-" in mock mode).
            model_name = chunk.model
            if not model_name:
                if llm.is_mock_mode:
                    model_name = f"mock-{settings.llm_reason_model}"
                else:
                    model_name = settings.llm_reason_model

            # Prefer the latency reported by the chunk; otherwise use the
            # locally measured wall-clock time.
            total_ms = chunk.total_latency_ms
            if not total_ms:
                total_ms = int((time.perf_counter() - started_at) * 1000)

            return _report(
                model=model_name,
                first_token_ms=ttft_ms,
                total_latency_ms=total_ms,
                stream=True,
                fallback_used=chunk.fallback_used,
            )
    except AppError as exc:
        # Only a stream failure triggers the non-streaming retry; any other
        # application error propagates unchanged.
        if exc.code != "LLM_STREAM_FAILED":
            raise
        reply = await llm.chat(
            conversation,
            settings.llm_reason_model,
            **_request_options(),
        )
        return _report(
            model=reply.model,
            first_token_ms=None,
            total_latency_ms=reply.latency_ms,
            stream=False,
            fallback_used=reply.model.startswith("mock-fallback"),
        )

    # The stream ended without any chunk flagged as done: report what was
    # measured locally.
    return _report(
        model=settings.llm_reason_model,
        first_token_ms=ttft_ms,
        total_latency_ms=int((time.perf_counter() - started_at) * 1000),
        stream=True,
        fallback_used=False,
    )