prepare fastapi root layout for server deployment
This commit is contained in:
@@ -0,0 +1,108 @@
|
||||
import time
|
||||
|
||||
from fastapi import APIRouter, Depends
|
||||
|
||||
from app.agents.llm_adapter import OpenAICompatibleLLMClient
|
||||
from app.core.config import settings
|
||||
from app.core.exceptions import AppError
|
||||
from app.core.response import ApiResponse, ok
|
||||
from app.core.user_context import UserContext, get_user_context
|
||||
from app.schemas.llm import LLMTestRequest, LLMTestResponse
|
||||
|
||||
# Router on which the LLM test endpoints in this module are registered.
router = APIRouter()
|
||||
|
||||
|
||||
@router.post("/deepseek-fast", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_fast(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Fast-model test: time a single non-streaming chat completion.

    Sends ``payload.message`` as one user message to ``settings.llm_fast_model``
    and reports the model name and latency returned by the client.

    Args:
        payload: Request body; only ``message`` is read.
        _: User context resolved by ``get_user_context``. The value itself is
            not used; the dependency just has to resolve.

    Returns:
        ``ok(...)`` wrapping an ``LLMTestResponse`` with ``stream=False``.
    """
    llm = OpenAICompatibleLLMClient()
    prompt = [{"role": "user", "content": payload.message}]

    reply = await llm.chat(
        prompt,
        settings.llm_fast_model,
        thinking_enabled=settings.llm_fast_thinking_enabled,
        # Never request more than 256 tokens, whatever the configured limit is.
        max_tokens=min(settings.llm_fast_max_tokens, 256),
    )

    reported_model = reply.model
    summary = LLMTestResponse(
        model=reported_model,
        total_latency_ms=reply.latency_ms,
        stream=False,
        mock_mode=llm.is_mock_mode,
        # A model name starting with "mock-fallback" is treated as a fallback reply.
        fallback_used=reported_model.startswith("mock-fallback"),
        thinking_enabled=settings.llm_fast_thinking_enabled,
    )
    return ok(summary)
|
||||
|
||||
|
||||
@router.post("/deepseek-reason", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_reason(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Reason-model test: measure streaming latency, with a non-streaming fallback.

    ``payload.message`` is streamed to ``settings.llm_reason_model``. Three
    outcomes are possible:

    * A chunk flagged ``done`` arrives: its model name and total latency are
      reported (falling back to a default label / locally measured time when
      the chunk leaves them empty), together with the first ``first_token_ms``
      seen on any chunk.
    * The stream raises ``AppError`` with code ``LLM_STREAM_FAILED``: the same
      prompt is retried through the non-streaming ``chat`` call and that
      result is reported with ``stream=False``.
    * The stream ends without a ``done`` chunk: locally measured elapsed time
      is reported.

    Args:
        payload: Request body; only ``message`` is read.
        _: User context resolved by ``get_user_context``. The value itself is
            not used; the dependency just has to resolve.

    Returns:
        ``ok(...)`` wrapping an ``LLMTestResponse``.

    Raises:
        AppError: Re-raised when its code is anything other than
            ``LLM_STREAM_FAILED``.
    """
    client = OpenAICompatibleLLMClient()
    messages = [{"role": "user", "content": payload.message}]

    # Read the settings once so the streaming call, the fallback call and every
    # response report exactly the same values that were sent to the model.
    model_name = settings.llm_reason_model
    thinking_enabled = settings.llm_reason_thinking_enabled
    reasoning_effort = settings.llm_reasoning_effort if thinking_enabled else None
    call_options = {
        "thinking_enabled": thinking_enabled,
        "reasoning_effort": reasoning_effort,
        # NOTE(review): the cap is derived from the *fast* model's token limit;
        # confirm whether a reason-specific setting was intended here.
        "max_tokens": min(settings.llm_fast_max_tokens, 256),
    }

    def _default_model_label():
        # Label used when the stream itself does not report a model name.
        return f"mock-{model_name}" if client.is_mock_mode else model_name

    def _elapsed_ms():
        # Wall-clock time since the streaming request was started.
        return int((time.perf_counter() - start) * 1000)

    def _report(*, model, first_token_ms, total_latency_ms, stream, fallback_used):
        # Single place where the response envelope is assembled.
        return ok(
            LLMTestResponse(
                model=model,
                first_token_ms=first_token_ms,
                total_latency_ms=total_latency_ms,
                stream=stream,
                mock_mode=client.is_mock_mode,
                fallback_used=fallback_used,
                thinking_enabled=thinking_enabled,
                reasoning_effort=reasoning_effort,
            )
        )

    first_token_ms = None
    start = time.perf_counter()

    try:
        async for chunk in client.stream_chat(messages, model_name, **call_options):
            # Keep only the first first-token timing that any chunk reports.
            if first_token_ms is None and chunk.first_token_ms is not None:
                first_token_ms = chunk.first_token_ms
            if chunk.done:
                return _report(
                    model=chunk.model or _default_model_label(),
                    first_token_ms=first_token_ms,
                    total_latency_ms=chunk.total_latency_ms or _elapsed_ms(),
                    stream=True,
                    fallback_used=chunk.fallback_used,
                )
    except AppError as exc:
        # Only a streaming failure triggers the non-streaming retry.
        if exc.code != "LLM_STREAM_FAILED":
            raise
        response = await client.chat(messages, model_name, **call_options)
        return _report(
            model=response.model,
            first_token_ms=None,
            total_latency_ms=response.latency_ms,
            stream=False,
            # A model name starting with "mock-fallback" is treated as a fallback reply.
            fallback_used=response.model.startswith("mock-fallback"),
        )

    # The stream finished without ever yielding a ``done`` chunk. Use the same
    # default model label as the ``done`` path so mock mode is reported
    # consistently.
    return _report(
        model=_default_model_label(),
        first_token_ms=first_token_ms,
        total_latency_ms=_elapsed_ms(),
        stream=True,
        fallback_used=False,
    )
|
||||
Reference in New Issue
Block a user