109 lines
4.4 KiB
Python
109 lines
4.4 KiB
Python
import time
|
|
|
|
from fastapi import APIRouter, Depends
|
|
|
|
from app.agents.llm_adapter import OpenAICompatibleLLMClient
|
|
from app.core.config import settings
|
|
from app.core.exceptions import AppError
|
|
from app.core.response import ApiResponse, ok
|
|
from app.core.user_context import UserContext, get_user_context
|
|
from app.schemas.llm import LLMTestRequest, LLMTestResponse
|
|
|
|
# Router on which the LLM test endpoints defined in this module are registered.
router = APIRouter()
|
|
|
|
|
|
@router.post("/deepseek-fast", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_fast(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Fast model test: check the non-streaming response latency of the fast model.

    Sends ``payload.message`` as a single user message to the configured fast
    model and returns the model name and latency reported by the client,
    wrapped in the standard ``ok(...)`` envelope.
    """
    llm = OpenAICompatibleLLMClient()

    conversation = [{"role": "user", "content": payload.message}]
    thinking = settings.llm_fast_thinking_enabled
    # Cap the completion at 256 tokens regardless of the configured maximum.
    token_cap = min(settings.llm_fast_max_tokens, 256)

    reply = await llm.chat(
        conversation,
        settings.llm_fast_model,
        thinking_enabled=thinking,
        max_tokens=token_cap,
    )

    result = LLMTestResponse(
        model=reply.model,
        total_latency_ms=reply.latency_ms,
        stream=False,
        mock_mode=llm.is_mock_mode,
        # A model name starting with "mock-fallback" is treated as a fallback reply.
        fallback_used=reply.model.startswith("mock-fallback"),
        thinking_enabled=thinking,
    )
    return ok(result)
|
|
|
|
|
|
@router.post("/deepseek-reason", response_model=ApiResponse[LLMTestResponse])
async def test_deepseek_reason(
    payload: LLMTestRequest,
    _: UserContext = Depends(get_user_context),
):
    """Reason model test: prefer measuring streaming latency.

    When streaming fails with an ``AppError`` whose code is
    ``LLM_STREAM_FAILED``, degrade to a real non-streaming request against the
    same model. Any other ``AppError`` is re-raised unchanged.
    """
    llm = OpenAICompatibleLLMClient()
    conversation = [{"role": "user", "content": payload.message}]

    thinking = settings.llm_reason_thinking_enabled
    # Reasoning effort is only forwarded/reported when thinking is enabled.
    effort = settings.llm_reasoning_effort if thinking else None
    call_options = {
        "thinking_enabled": thinking,
        "reasoning_effort": effort,
        # NOTE(review): the cap is derived from llm_fast_max_tokens even though
        # this endpoint targets the reason model - confirm this is intentional.
        "max_tokens": min(settings.llm_fast_max_tokens, 256),
    }

    ttft_ms = None
    started = time.perf_counter()

    try:
        async for piece in llm.stream_chat(
            conversation,
            settings.llm_reason_model,
            **call_options,
        ):
            # Keep only the first first-token timing reported by the stream.
            if ttft_ms is None and piece.first_token_ms is not None:
                ttft_ms = piece.first_token_ms

            if not piece.done:
                continue

            # Use the chunk's own model name when present; otherwise fall back
            # to the configured name (prefixed with "mock-" in mock mode).
            model_name = piece.model
            if not model_name:
                if llm.is_mock_mode:
                    model_name = f"mock-{settings.llm_reason_model}"
                else:
                    model_name = settings.llm_reason_model

            # Use the chunk's total latency when truthy, else measure locally.
            elapsed_ms = piece.total_latency_ms
            if not elapsed_ms:
                elapsed_ms = int((time.perf_counter() - started) * 1000)

            streamed = LLMTestResponse(
                model=model_name,
                first_token_ms=ttft_ms,
                total_latency_ms=elapsed_ms,
                stream=True,
                mock_mode=llm.is_mock_mode,
                fallback_used=piece.fallback_used,
                thinking_enabled=thinking,
                reasoning_effort=effort,
            )
            return ok(streamed)
    except AppError as exc:
        if exc.code != "LLM_STREAM_FAILED":
            raise

        # Streaming failed: repeat the request without streaming.
        reply = await llm.chat(
            conversation,
            settings.llm_reason_model,
            **call_options,
        )
        degraded = LLMTestResponse(
            model=reply.model,
            first_token_ms=None,
            total_latency_ms=reply.latency_ms,
            stream=False,
            mock_mode=llm.is_mock_mode,
            fallback_used=reply.model.startswith("mock-fallback"),
            thinking_enabled=thinking,
            reasoning_effort=effort,
        )
        return ok(degraded)

    # The stream ended without ever yielding a chunk marked done.
    exhausted = LLMTestResponse(
        model=settings.llm_reason_model,
        first_token_ms=ttft_ms,
        total_latency_ms=int((time.perf_counter() - started) * 1000),
        stream=True,
        mock_mode=llm.is_mock_mode,
        fallback_used=False,
        thinking_enabled=thinking,
        reasoning_effort=effort,
    )
    return ok(exhausted)
|