# fastapi/app/api/llm_test.py

import time

from fastapi import APIRouter, Depends

from app.agents.llm_adapter import OpenAICompatibleLLMClient
from app.core.config import settings
from app.core.exceptions import AppError
from app.core.response import ApiResponse, ok
from app.core.user_context import UserContext, get_user_context
from app.schemas.llm import LLMTestRequest, LLMTestResponse

# Router on which the LLM test endpoints defined in this module are registered.
router = APIRouter()


@router.post("/deepseek-fast", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_fast(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Fast model test: measure the fast model's non-streaming response latency.

    Sends ``payload.message`` as a single user message to
    ``settings.llm_fast_model`` through a non-streaming ``chat`` call and
    reports the model name and latency carried by the client's response.

    Args:
        payload: Request body; only ``payload.message`` is read.
        _: Resolved user context. The value is unused; the dependency is
            declared so that ``get_user_context`` runs for this route.

    Returns:
        The ``LLMTestResponse`` wrapped by ``ok``, with ``stream`` set to
        ``False``.
    """
    llm = OpenAICompatibleLLMClient()
    thinking = settings.llm_fast_thinking_enabled
    # Never request more than 256 tokens, whatever the configured limit is.
    token_cap = min(settings.llm_fast_max_tokens, 256)
    user_messages = [{"role": "user", "content": payload.message}]

    reply = await llm.chat(
        user_messages,
        settings.llm_fast_model,
        thinking_enabled=thinking,
        max_tokens=token_cap,
    )

    # A model name starting with "mock-fallback" is what marks a fallback reply.
    used_fallback = reply.model.startswith("mock-fallback")
    result = LLMTestResponse(
        model=reply.model,
        total_latency_ms=reply.latency_ms,
        stream=False,
        mock_mode=llm.is_mock_mode,
        fallback_used=used_fallback,
        thinking_enabled=thinking,
    )
    return ok(result)


@router.post("/deepseek-reason", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_reason(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Reason model test: prefer streaming latency, degrade to non-streaming.

    The request is first sent through ``stream_chat``. The handler returns as
    soon as a chunk with ``done`` set arrives, reporting the first-token and
    total latency. If streaming raises an ``AppError`` whose code is
    ``"LLM_STREAM_FAILED"``, the same request is repeated with the
    non-streaming ``chat`` call and reported with ``stream=False``. If the
    stream ends without any ``done`` chunk, the result is built from the
    locally measured elapsed time.

    Args:
        payload: Request body; only ``payload.message`` is read.
        _: Resolved user context. The value is unused; the dependency is
            declared so that ``get_user_context`` runs for this route.

    Returns:
        The ``LLMTestResponse`` wrapped by ``ok``.

    Raises:
        AppError: Re-raised unchanged when streaming fails with any code
            other than ``"LLM_STREAM_FAILED"``.
    """
    client = OpenAICompatibleLLMClient()
    messages = [{"role": "user", "content": payload.message}]

    thinking_enabled = settings.llm_reason_thinking_enabled
    # Reasoning effort is only sent (and reported) when thinking is enabled.
    reasoning_effort = settings.llm_reasoning_effort if thinking_enabled else None
    # One set of options for both the streaming call and its non-streaming
    # fallback, so the two requests cannot drift apart.
    call_options = {
        "thinking_enabled": thinking_enabled,
        "reasoning_effort": reasoning_effort,
        # NOTE(review): the cap is derived from the *fast* model's token limit
        # even though this endpoint targets the reason model - confirm intended.
        "max_tokens": min(settings.llm_fast_max_tokens, 256),
    }

    first_token_ms = None
    start = time.perf_counter()

    def elapsed_ms():
        """Return whole milliseconds elapsed since the request was started."""
        return int((time.perf_counter() - start) * 1000)

    def default_model_label():
        """Return the model name to report when the client supplied none."""
        if client.is_mock_mode:
            return f"mock-{settings.llm_reason_model}"
        return settings.llm_reason_model

    def respond(**fields):
        """Build the wrapped response, filling in the fields every path shares."""
        return ok(
            LLMTestResponse(
                mock_mode=client.is_mock_mode,
                thinking_enabled=thinking_enabled,
                reasoning_effort=reasoning_effort,
                **fields,
            )
        )

    try:
        async for chunk in client.stream_chat(
            messages,
            settings.llm_reason_model,
            **call_options,
        ):
            # Keep the first first-token latency any chunk reports.
            if first_token_ms is None and chunk.first_token_ms is not None:
                first_token_ms = chunk.first_token_ms
            if chunk.done:
                return respond(
                    model=chunk.model or default_model_label(),
                    first_token_ms=first_token_ms,
                    # Fall back to the local clock when the chunk has no total.
                    total_latency_ms=chunk.total_latency_ms or elapsed_ms(),
                    stream=True,
                    fallback_used=chunk.fallback_used,
                )
    except AppError as exc:
        if exc.code != "LLM_STREAM_FAILED":
            raise
        # Streaming failed: repeat the request as a non-streaming call.
        response = await client.chat(
            messages,
            settings.llm_reason_model,
            **call_options,
        )
        return respond(
            model=response.model,
            first_token_ms=None,
            total_latency_ms=response.latency_ms,
            stream=False,
            fallback_used=response.model.startswith("mock-fallback"),
        )

    # The stream ended without a chunk marked done. Report the same default
    # model label the done path uses, so mock mode is labelled consistently.
    return respond(
        model=default_model_label(),
        first_token_ms=first_token_ms,
        total_latency_ms=elapsed_ms(),
        stream=True,
        fallback_used=False,
    )