init medical training project
This commit is contained in:
@@ -0,0 +1,95 @@
|
||||
import json
|
||||
import logging
|
||||
import time
|
||||
import uuid
|
||||
|
||||
import jsonschema
|
||||
from django.core.cache import cache
|
||||
from pathlib import Path
|
||||
|
||||
from config.exceptions import AppError
|
||||
from . import deepseek_client
|
||||
from .pdf_reader import extract_text_from_pdfs
|
||||
from prompts.loader import load_prompt
|
||||
|
||||
# Dedicated 'audit' logger plus the regular per-module logger.
audit = logging.getLogger('audit')
logger = logging.getLogger(__name__)

# Lifetime of a cached parse result: 5 minutes.
PARSE_RESULT_TTL = 5 * 60

# JSON Schema file, located in the 'schemas' directory two levels above
# this module's resolved location.
_SCHEMA_PATH = Path(__file__).resolve().parent.parent.joinpath('schemas', 'case_full.json')
|
||||
|
||||
|
||||
def parse_pdf(files, case_type: str, user) -> dict:
    """C1: parse PDFs through DeepSeek into structured case data.

    The parsed data is cached (not persisted to the database) and never
    contains scoring rules.

    Args:
        files: Sized, re-iterable collection of uploaded files. Each item
            must expose ``name`` and ``size``; the collection is also handed
            to ``extract_text_from_pdfs``.
        case_type: Either ``'traditional'`` or ``'teaching'``.
        user: Requesting user; only ``user.id`` is read, for the audit log.

    Returns:
        A dict with the keys ``parse_id``, ``case_type``, ``source``
        (file names and total byte size), ``ai_usage``, ``prompt_version``,
        ``parsing_seconds`` and ``data`` (the cleaned, validated case data).

    Raises:
        AppError: ``CASE_TYPE_NOT_SUPPORTED`` (400) for an unknown
            ``case_type``; ``AI_SCHEMA_VIOLATION`` (500) when the AI output
            is not a JSON object or fails schema validation.
    """
    if case_type not in ('traditional', 'teaching'):
        raise AppError('CASE_TYPE_NOT_SUPPORTED', f'case_type 不支持: {case_type}', status_code=400)

    # Monotonic clock: the elapsed time must not be skewed by wall-clock
    # adjustments that happen while the (slow) AI call is in flight.
    started = time.monotonic()

    text = extract_text_from_pdfs(files)

    prompt_name = f'case_{case_type}_full'
    system_prompt, prompt_version = load_prompt(prompt_name)

    result = deepseek_client.call_deepseek(system_prompt, text)
    data = result['data']
    usage = result.get('usage', {})

    # The AI reply is untrusted: anything but a JSON object would otherwise
    # fail below with a bare AttributeError instead of a typed AppError.
    if not isinstance(data, dict):
        logger.error(
            'AI parse output schema violation: %s',
            f'top-level value is {type(data).__name__}, expected object',
        )
        raise AppError(
            'AI_SCHEMA_VIOLATION',
            'AI 输出字段不合法: 顶层结构必须是对象',
            status_code=500,
        )

    # Scoring rules and stages are never part of the parse result.
    data.pop('scoring_rules', None)
    data.pop('stages', None)

    # The caller-supplied case type always wins over whatever the AI produced.
    data['case_type'] = case_type

    _strip_unknown_fields(data)
    _validate_schema(data)

    # Keep the validated result in the cache under a short random id.
    parse_id = uuid.uuid4().hex[:12]
    cache.set(f'parse_result:{parse_id}', json.dumps(data, ensure_ascii=False), PARSE_RESULT_TTL)

    source = {
        'files': [f.name for f in files],
        'total_bytes': sum(f.size for f in files),
    }

    audit.info(
        'CASE_PARSE user=%s files=%d parse_id=%s tokens=%s prompt_version=%s',
        user.id, len(files), parse_id,
        usage, prompt_version,
    )

    return {
        'parse_id': parse_id,
        'case_type': case_type,
        'source': source,
        'ai_usage': usage,
        'prompt_version': prompt_version,
        'parsing_seconds': round(time.monotonic() - started, 1),
        'data': data,
    }
|
||||
|
||||
|
||||
_SCHEMA_ALLOWED_KEYS = {
|
||||
'title', 'case_type', 'difficulty', 'chief_complaint', 'description',
|
||||
'patient_age', 'patient_gender', 'tags', 'symptom_tags', 'disease_tags',
|
||||
'competency_tags', 'guideline_tags', 'knowledge_points', 'icd_codes',
|
||||
'estimated_minutes', 'osce_enabled', 'department_name',
|
||||
'traditional', 'teaching',
|
||||
}
|
||||
|
||||
|
||||
def _strip_unknown_fields(data):
    """Remove, in place, every top-level key of ``data`` that is not whitelisted.

    Keys are kept only when they appear in ``_SCHEMA_ALLOWED_KEYS``. Nested
    values are not inspected. Nothing is returned.
    """
    # Collect the offenders first so the dict is never resized mid-iteration.
    unexpected = [name for name in data if name not in _SCHEMA_ALLOWED_KEYS]
    for name in unexpected:
        del data[name]
|
||||
|
||||
|
||||
def _validate_schema(data):
    """Validate parsed case data against the JSON Schema at ``_SCHEMA_PATH``.

    The schema file is read and decoded on every call.

    Args:
        data: The instance to validate.

    Raises:
        AppError: ``AI_SCHEMA_VIOLATION`` (500) when ``data`` does not
            conform to the schema. The underlying
            ``jsonschema.ValidationError`` is attached as ``__cause__``.
    """
    schema = json.loads(_SCHEMA_PATH.read_text(encoding='utf-8'))
    try:
        jsonschema.validate(instance=data, schema=schema)
    except jsonschema.ValidationError as e:
        logger.error('AI parse output schema violation: %s', e.message)
        # Chain explicitly so tracebacks show the validation failure as the
        # direct cause rather than as an incidental "during handling" context.
        raise AppError(
            'AI_SCHEMA_VIOLATION',
            f'AI 输出字段不合法: {e.message}',
            status_code=500,
        ) from e
|
||||
Reference in New Issue
Block a user