# scripts/debug_patient_stream.py

import asyncio
import sys
from pathlib import Path

# Prepend the directory one level above this script's folder to sys.path.
# NOTE(review): presumably this is the project root, so that the `app.*`
# imports below resolve when the script is run directly — confirm layout.
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))

from app.agents.llm_adapter import OpenAICompatibleLLMClient
from app.agents.patient_agent import PatientAgent
from app.core.config import settings
from app.db.session import SessionLocal
from app.repositories.case_repository import CaseRepository


async def main() -> None:
    """Local debugging: stream a Patient Agent reply directly.

    Bypasses the frontend and FastAPI. Prints the relevant LLM settings,
    takes the first active case from the database, streams one reply to a
    fixed question, and prints timing/model metadata plus a short preview
    of the accumulated text.

    Raises:
        SystemExit: if there is no active case to test with, or (status 1)
            if the stream ends without ever yielding a chunk flagged done.
    """
    client = OpenAICompatibleLLMClient()
    print(f"mock_mode={client.is_mock_mode}")
    print(f"fast_model={settings.llm_fast_model}")
    print(f"fast_thinking={settings.llm_fast_thinking_enabled}")
    print(f"stream_first_token_timeout={settings.llm_stream_first_token_timeout_seconds}")
    print(f"stream_total_timeout={settings.llm_stream_total_timeout_seconds}")

    db = SessionLocal()
    try:
        active_cases = CaseRepository(db).list_active_cases()
        if not active_cases:
            # Fail with an actionable message instead of a bare IndexError.
            raise SystemExit("no active cases found; add one before running this script")
        case = active_cases[0]

        parts = []
        first_token_ms = None
        final_chunk = None
        async for chunk in PatientAgent().stream_reply(case, [], "孩子发热几天了？最高体温多少？", "novice"):
            # Remember the first non-None first-token latency reported.
            if first_token_ms is None and chunk.first_token_ms is not None:
                first_token_ms = chunk.first_token_ms
            if chunk.done:
                # NOTE(review): the terminal chunk's delta is not appended;
                # confirm the done chunk never carries text.
                final_chunk = chunk
                break
            parts.append(chunk.delta)

        text = "".join(parts)
        if final_chunk is None:
            # The stream was exhausted without a done marker: report and fail.
            print("done_seen=False")
            print(f"text_len={len(text)}")
            print(f"text_preview={text[:30]}")
            raise SystemExit(1)

        print("done_seen=True")
        print(f"first_token_ms={first_token_ms}")
        print(f"total_latency_ms={final_chunk.total_latency_ms}")
        print(f"model={final_chunk.model}")
        print(f"fallback_used={final_chunk.fallback_used}")
        print(f"text_len={len(text)}")
        print(f"text_preview={text[:30]}")
    finally:
        # Always release the DB session, including on SystemExit.
        db.close()


# Run the async entry point only when this file is executed as a script.
if __name__ == "__main__":
    asyncio.run(main())
chore: initialize medical consultation agent demo 2026-06-01 09:25:26 +08:00			`import asyncio`
			`import sys`
			`from pathlib import Path`

			`sys.path.insert(0, str(Path(__file__).resolve().parents[1]))`

			`from app.agents.llm_adapter import OpenAICompatibleLLMClient`
			`from app.agents.patient_agent import PatientAgent`
			`from app.core.config import settings`
			`from app.db.session import SessionLocal`
			`from app.repositories.case_repository import CaseRepository`


			`async def main() -> None:`
			`"""本地调试：直接调用 Patient Agent 流式回复，绕过前端和 FastAPI。"""`
			`client = OpenAICompatibleLLMClient()`
			`print(f"mock_mode={client.is_mock_mode}")`
			`print(f"fast_model={settings.llm_fast_model}")`
			`print(f"fast_thinking={settings.llm_fast_thinking_enabled}")`
			`print(f"stream_first_token_timeout={settings.llm_stream_first_token_timeout_seconds}")`
			`print(f"stream_total_timeout={settings.llm_stream_total_timeout_seconds}")`

			`db = SessionLocal()`
			`try:`
			`case = CaseRepository(db).list_active_cases()[0]`
			`text = ""`
			`first_token_ms = None`
			`done_seen = False`
			`async for chunk in PatientAgent().stream_reply(case, [], "孩子发热几天了？最高体温多少？", "novice"):`
			`if first_token_ms is None and chunk.first_token_ms is not None:`
			`first_token_ms = chunk.first_token_ms`
			`if chunk.done:`
			`done_seen = True`
			`print(f"done_seen={done_seen}")`
			`print(f"first_token_ms={first_token_ms}")`
			`print(f"total_latency_ms={chunk.total_latency_ms}")`
			`print(f"model={chunk.model}")`
			`print(f"fallback_used={chunk.fallback_used}")`
			`print(f"text_len={len(text)}")`
			`print(f"text_preview={text[:30]}")`
			`break`
			`text += chunk.delta`
			`if not done_seen:`
			`print("done_seen=False")`
			`print(f"text_len={len(text)}")`
			`print(f"text_preview={text[:30]}")`
			`raise SystemExit(1)`
			`finally:`
			`db.close()`


			`if __name__ == "__main__":`
			`asyncio.run(main())`