"""Local debugging script for the Patient Agent streaming reply.

Calls the Patient Agent's streaming reply directly, bypassing the frontend
and FastAPI, and prints the LLM settings plus timing/model diagnostics for
one streamed answer.
"""

import asyncio
import sys
from pathlib import Path

# Make the project root (one directory above this script's folder) importable
# so the ``app`` package resolves when the script is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from app.agents.llm_adapter import OpenAICompatibleLLMClient  # noqa: E402
from app.agents.patient_agent import PatientAgent  # noqa: E402
from app.core.config import settings  # noqa: E402
from app.db.session import SessionLocal  # noqa: E402
from app.repositories.case_repository import CaseRepository  # noqa: E402

# Fixed prompt sent to the agent for every debug run.
_DEBUG_QUESTION = "孩子发热几天了?最高体温多少?"
# Number of leading characters of the reply echoed in the summary.
_PREVIEW_CHARS = 30


def _print_llm_settings(client: OpenAICompatibleLLMClient) -> None:
    """Print the client's mock-mode flag and the streaming-related settings."""
    print(f"mock_mode={client.is_mock_mode}")
    print(f"fast_model={settings.llm_fast_model}")
    print(f"fast_thinking={settings.llm_fast_thinking_enabled}")
    print(f"stream_first_token_timeout={settings.llm_stream_first_token_timeout_seconds}")
    print(f"stream_total_timeout={settings.llm_stream_total_timeout_seconds}")


def _print_text_summary(text: str) -> None:
    """Print the length of the accumulated reply and a short preview of it."""
    print(f"text_len={len(text)}")
    print(f"text_preview={text[:_PREVIEW_CHARS]}")


async def main() -> None:
    """Stream one Patient Agent reply and print diagnostics about it.

    Uses the first active case returned by ``CaseRepository`` and an empty
    history, accumulates the streamed deltas, and reports first-token time,
    total latency, model and fallback usage once a chunk flagged ``done``
    arrives.

    Raises:
        SystemExit: with status 1 when there is no active case to test with,
            or when the stream ends without a ``done`` chunk.
    """
    _print_llm_settings(OpenAICompatibleLLMClient())

    db = SessionLocal()
    try:
        active_cases = CaseRepository(db).list_active_cases()
        try:
            case = active_cases[0]
        except IndexError:
            # Without this guard an empty database surfaces as a bare
            # IndexError traceback, which hides the real cause.
            print("no active cases found; cannot run the stream check", file=sys.stderr)
            raise SystemExit(1) from None

        parts = []
        first_token_ms = None
        final_chunk = None
        async for chunk in PatientAgent().stream_reply(case, [], _DEBUG_QUESTION, "novice"):
            # Keep the first non-None first-token measurement that any chunk reports.
            if first_token_ms is None and chunk.first_token_ms is not None:
                first_token_ms = chunk.first_token_ms
            if chunk.done:
                # NOTE(review): the delta of the done chunk is not appended,
                # matching the previous behaviour; confirm that done chunks
                # never carry reply text.
                final_chunk = chunk
                break
            parts.append(chunk.delta)

        text = "".join(parts)
        if final_chunk is None:
            # The stream was exhausted without a terminating chunk: report
            # what was received and signal failure to the caller/shell.
            print("done_seen=False")
            _print_text_summary(text)
            raise SystemExit(1)

        print(f"done_seen={True}")
        print(f"first_token_ms={first_token_ms}")
        print(f"total_latency_ms={final_chunk.total_latency_ms}")
        print(f"model={final_chunk.model}")
        print(f"fallback_used={final_chunk.fallback_used}")
        _print_text_summary(text)
    finally:
        # Always release the DB session, including on SystemExit.
        db.close()


if __name__ == "__main__":
    asyncio.run(main())