import time

from fastapi import APIRouter, Depends

from app.agents.llm_adapter import OpenAICompatibleLLMClient
from app.core.config import settings
from app.core.exceptions import AppError
from app.core.response import ApiResponse, ok
from app.core.user_context import UserContext, get_user_context
from app.schemas.llm import LLMTestRequest, LLMTestResponse

router = APIRouter()


@router.post("/deepseek-fast", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_fast(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Fast model test: check the non-streaming response latency of the fast model."""
    # `_` is only declared so that the `get_user_context` dependency is resolved;
    # the resulting UserContext is not read in the body.
    llm = OpenAICompatibleLLMClient()
    prompt = [{"role": "user", "content": payload.message}]
    # The completion length is capped at 256 tokens even if the configured
    # limit is higher (presumably to keep the probe quick -- confirm).
    token_cap = min(settings.llm_fast_max_tokens, 256)

    reply = await llm.chat(
        prompt,
        settings.llm_fast_model,
        thinking_enabled=settings.llm_fast_thinking_enabled,
        max_tokens=token_cap,
    )

    report = LLMTestResponse(
        model=reply.model,
        total_latency_ms=reply.latency_ms,
        stream=False,
        mock_mode=llm.is_mock_mode,
        # Fallback use is detected from the model name prefix in the reply.
        fallback_used=reply.model.startswith("mock-fallback"),
        thinking_enabled=settings.llm_fast_thinking_enabled,
    )
    return ok(report)


@router.post("/deepseek-reason", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_reason(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Reason model test: measure streaming latency first.

    If streaming is incompatible, fall back to a real non-streaming test.
    """
    llm = OpenAICompatibleLLMClient()
    prompt = [{"role": "user", "content": payload.message}]
    first_token_latency = None
    started_at = time.perf_counter()

    thinking_on = settings.llm_reason_thinking_enabled
    # A reasoning effort is only forwarded when thinking is enabled.
    effort = settings.llm_reasoning_effort if thinking_on else None
    # Options shared by the streaming attempt and the non-streaming fallback.
    # NOTE(review): the cap is derived from `llm_fast_max_tokens` even though
    # this endpoint targets the reason model -- confirm this is intended.
    request_options = {
        "thinking_enabled": thinking_on,
        "reasoning_effort": effort,
        "max_tokens": min(settings.llm_fast_max_tokens, 256),
    }

    def elapsed_ms():
        """Return whole milliseconds elapsed since the request was started."""
        return int((time.perf_counter() - started_at) * 1000)

    def report(**measured):
        """Wrap the measured fields plus the common fields in an ok() envelope."""
        return ok(
            LLMTestResponse(
                mock_mode=llm.is_mock_mode,
                thinking_enabled=thinking_on,
                reasoning_effort=effort,
                **measured,
            )
        )

    try:
        async for piece in llm.stream_chat(
            prompt,
            settings.llm_reason_model,
            **request_options,
        ):
            # Remember the first reported time-to-first-token only.
            if first_token_latency is None and piece.first_token_ms is not None:
                first_token_latency = piece.first_token_ms
            if not piece.done:
                continue

            # Prefer the model name carried by the final chunk; otherwise use
            # the configured name, prefixed with "mock-" in mock mode.
            model_name = piece.model
            if not model_name:
                if llm.is_mock_mode:
                    model_name = f"mock-{settings.llm_reason_model}"
                else:
                    model_name = settings.llm_reason_model

            return report(
                model=model_name,
                first_token_ms=first_token_latency,
                # Use the locally measured time when the chunk reports none.
                total_latency_ms=piece.total_latency_ms or elapsed_ms(),
                stream=True,
                fallback_used=piece.fallback_used,
            )
    except AppError as stream_error:
        # Only a streaming failure triggers the fallback; anything else propagates.
        if stream_error.code != "LLM_STREAM_FAILED":
            raise
        reply = await llm.chat(
            prompt,
            settings.llm_reason_model,
            **request_options,
        )
        return report(
            model=reply.model,
            first_token_ms=None,
            total_latency_ms=reply.latency_ms,
            stream=False,
            fallback_used=reply.model.startswith("mock-fallback"),
        )

    # Reached when the stream ends without any chunk flagged as done.
    # NOTE(review): unlike the done-chunk branch, the model name here is not
    # prefixed with "mock-" in mock mode -- confirm whether that is intended.
    return report(
        model=settings.llm_reason_model,
        first_token_ms=first_token_latency,
        total_latency_ms=elapsed_ms(),
        stream=True,
        fallback_used=False,
    )