# File-viewer metadata: 54 lines, 2.0 KiB, Python
import asyncio
|
|
import sys
|
|
from pathlib import Path
|
|
|
|
sys.path.insert(0, str(Path(__file__).resolve().parents[1]))
|
|
|
|
from app.agents.llm_adapter import OpenAICompatibleLLMClient
|
|
from app.agents.patient_agent import PatientAgent
|
|
from app.core.config import settings
|
|
from app.db.session import SessionLocal
|
|
from app.repositories.case_repository import CaseRepository
|
|
|
|
|
|
async def main() -> None:
    """Debug locally by streaming one Patient Agent reply, bypassing the frontend and FastAPI.

    Prints the LLM client mode and the streaming-related settings, then streams
    a reply to a fixed question for the first active case and reports the
    timing, model and text statistics carried by the chunk flagged ``done``.

    Raises:
        SystemExit: With an explanatory message if there is no active case to
            run against, or with status 1 if the stream ends without ever
            yielding a chunk flagged ``done``.
    """
    client = OpenAICompatibleLLMClient()
    print(f"mock_mode={client.is_mock_mode}")
    print(f"fast_model={settings.llm_fast_model}")
    print(f"fast_thinking={settings.llm_fast_thinking_enabled}")
    print(f"stream_first_token_timeout={settings.llm_stream_first_token_timeout_seconds}")
    print(f"stream_total_timeout={settings.llm_stream_total_timeout_seconds}")

    db = SessionLocal()
    try:
        active_cases = CaseRepository(db).list_active_cases()
        try:
            case = active_cases[0]
        except IndexError:
            # Fail with a readable message instead of a bare IndexError
            # traceback when there is nothing to run the agent against.
            raise SystemExit(
                "no active cases available; cannot run the Patient Agent debug stream"
            ) from None

        text = ""
        first_token_ms = None
        done_seen = False
        async for chunk in PatientAgent().stream_reply(case, [], "孩子发热几天了?最高体温多少?", "novice"):
            # Remember the first non-None first-token latency reported by any chunk.
            if first_token_ms is None and chunk.first_token_ms is not None:
                first_token_ms = chunk.first_token_ms
            if chunk.done:
                # NOTE(review): the delta of the done chunk (if it has one) is
                # not appended to text - confirm the final chunk carries no text.
                done_seen = True
                print(f"done_seen={done_seen}")
                print(f"first_token_ms={first_token_ms}")
                print(f"total_latency_ms={chunk.total_latency_ms}")
                print(f"model={chunk.model}")
                print(f"fallback_used={chunk.fallback_used}")
                print(f"text_len={len(text)}")
                print(f"text_preview={text[:30]}")
                break
            text += chunk.delta

        if not done_seen:
            # The stream ended without a terminal chunk: dump what was
            # received and signal failure through the exit status.
            print("done_seen=False")
            print(f"text_len={len(text)}")
            print(f"text_preview={text[:30]}")
            raise SystemExit(1)
    finally:
        # Always release the DB session, including on the SystemExit paths.
        db.close()
|
|
|
|
|
|
if __name__ == "__main__":
    # Only run the debug stream when executed as a script, not on import.
    debug_coro = main()
    asyncio.run(debug_coro)
|