# fastapi/scripts/debug_patient_stream.py

import asyncio
import sys
from pathlib import Path

# Put the directory one level above this script's folder at the front of the
# import path so the `app.*` imports below resolve when the script is run directly.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from app.agents.llm_adapter import OpenAICompatibleLLMClient
from app.agents.patient_agent import PatientAgent
from app.core.config import settings
from app.db.session import SessionLocal
from app.repositories.case_repository import CaseRepository


async def main() -> None:
    """Local debugging: stream a Patient Agent reply directly, bypassing the frontend and FastAPI.

    Prints the effective LLM settings, streams one reply for the first active
    case, and reports the timing/model metadata carried by the chunk flagged
    ``done`` together with the length and a 30-character preview of the text.

    Raises:
        SystemExit: If no active case is available, or if the stream ends
            without yielding a chunk flagged ``done`` (exit status 1).
    """
    client = OpenAICompatibleLLMClient()
    print(f"mock_mode={client.is_mock_mode}")
    print(f"fast_model={settings.llm_fast_model}")
    print(f"fast_thinking={settings.llm_fast_thinking_enabled}")
    print(f"stream_first_token_timeout={settings.llm_stream_first_token_timeout_seconds}")
    print(f"stream_total_timeout={settings.llm_stream_total_timeout_seconds}")

    db = SessionLocal()
    try:
        cases = CaseRepository(db).list_active_cases()
        try:
            case = cases[0]
        except IndexError:
            # Fail with a readable message instead of a bare IndexError traceback.
            raise SystemExit("no active cases found; cannot run the patient stream debug") from None

        # Collect deltas in a list and join once rather than growing a string with +=.
        parts = []
        first_token_ms = None
        final_chunk = None
        async for chunk in PatientAgent().stream_reply(case, [], "孩子发热几天了？最高体温多少？", "novice"):
            # Keep the first non-None first-token latency reported by any chunk.
            if first_token_ms is None and chunk.first_token_ms is not None:
                first_token_ms = chunk.first_token_ms
            if chunk.done:
                # NOTE(review): a delta carried on the done chunk is not appended
                # (same as before) — confirm the agent never sends text with it.
                final_chunk = chunk
                break
            parts.append(chunk.delta)

        text = "".join(parts)
        done_seen = final_chunk is not None
        print(f"done_seen={done_seen}")
        if final_chunk is not None:
            print(f"first_token_ms={first_token_ms}")
            print(f"total_latency_ms={final_chunk.total_latency_ms}")
            print(f"model={final_chunk.model}")
            print(f"fallback_used={final_chunk.fallback_used}")
        print(f"text_len={len(text)}")
        print(f"text_preview={text[:30]}")
        if not done_seen:
            # The stream ended without a terminal chunk: signal failure to the shell.
            raise SystemExit(1)
    finally:
        # Always release the DB session, including on the SystemExit paths.
        db.close()


# Script entry point: run the async debug routine only when executed directly,
# not when this module is imported.
if __name__ == "__main__":
    asyncio.run(main())