import functools
import json
import logging
import time
import uuid
from pathlib import Path

import jsonschema
from django.core.cache import cache

from config.exceptions import AppError
from prompts.loader import load_prompt

from . import deepseek_client
from .pdf_reader import extract_text_from_pdfs

logger = logging.getLogger(__name__)
audit = logging.getLogger('audit')

_SCHEMA_PATH = Path(__file__).resolve().parent.parent / 'schemas' / 'case_full.json'
PARSE_RESULT_TTL = 300  # 5 minutes


def parse_pdf(files, case_type: str, user) -> dict:
    """C1: parse PDFs with DeepSeek into structured case data.

    The extracted PDF text is sent to DeepSeek together with the
    ``case_<case_type>_full`` prompt. The top-level ``scoring_rules`` and
    ``stages`` keys are dropped from the answer, keys outside
    ``_SCHEMA_ALLOWED_KEYS`` are removed, and the remainder is validated
    against the JSON schema at ``_SCHEMA_PATH``. This function performs no
    database write itself; the result is only stored in the Django cache
    under ``parse_result:<parse_id>`` for ``PARSE_RESULT_TTL`` seconds.

    Args:
        files: Uploaded files. Each item must expose ``.name`` and ``.size``;
            the collection is passed to ``extract_text_from_pdfs``, iterated
            twice more and measured with ``len()``.
        case_type: Either ``'traditional'`` or ``'teaching'``.
        user: Requesting user; only ``user.id`` is read (for the audit log).

    Returns:
        A dict with the keys ``parse_id``, ``case_type``, ``source``,
        ``ai_usage``, ``prompt_version``, ``parsing_seconds`` and ``data``.

    Raises:
        AppError: ``CASE_TYPE_NOT_SUPPORTED`` (400) for an unknown
            ``case_type``; ``AI_SCHEMA_VIOLATION`` (500) when the AI output
            fails schema validation.
    """
    if case_type not in ('traditional', 'teaching'):
        raise AppError(
            'CASE_TYPE_NOT_SUPPORTED',
            f'case_type 不支持: {case_type}',
            status_code=400,
        )

    # Monotonic clock: the elapsed time must not be affected by wall-clock
    # adjustments that happen while the (slow) AI call is in flight.
    started = time.monotonic()

    text = extract_text_from_pdfs(files)
    system_prompt, prompt_version = load_prompt(f'case_{case_type}_full')
    result = deepseek_client.call_deepseek(system_prompt, text)

    data = result['data']
    # Dropped explicitly; neither key is in _SCHEMA_ALLOWED_KEYS, so the
    # whitelist pass below would remove them as well.
    data.pop('scoring_rules', None)
    data.pop('stages', None)
    # The caller-supplied case_type overrides whatever the model returned.
    data['case_type'] = case_type
    _strip_unknown_fields(data)
    _validate_schema(data)

    usage = result.get('usage', {})
    parse_id = uuid.uuid4().hex[:12]
    cache.set(
        f'parse_result:{parse_id}',
        json.dumps(data, ensure_ascii=False),
        PARSE_RESULT_TTL,
    )

    source = {
        'files': [f.name for f in files],
        'total_bytes': sum(f.size for f in files),
    }

    audit.info(
        'CASE_PARSE user=%s files=%d parse_id=%s tokens=%s prompt_version=%s',
        user.id, len(files), parse_id, usage, prompt_version,
    )

    return {
        'parse_id': parse_id,
        'case_type': case_type,
        'source': source,
        'ai_usage': usage,
        'prompt_version': prompt_version,
        'parsing_seconds': round(time.monotonic() - started, 1),
        'data': data,
    }


# Top-level keys kept in the parsed payload; everything else is discarded
# by _strip_unknown_fields before schema validation.
_SCHEMA_ALLOWED_KEYS = {
    'title', 'case_type', 'difficulty', 'chief_complaint', 'description',
    'patient_age', 'patient_gender', 'tags', 'symptom_tags', 'disease_tags',
    'competency_tags', 'guideline_tags', 'knowledge_points', 'icd_codes',
    'estimated_minutes', 'osce_enabled', 'department_name',
    'traditional', 'teaching',
}


def _strip_unknown_fields(data: dict) -> None:
    """Remove, in place, every top-level key not in ``_SCHEMA_ALLOWED_KEYS``."""
    # The set difference is materialised before the loop, so deleting from
    # ``data`` while looping is safe.
    for key in data.keys() - _SCHEMA_ALLOWED_KEYS:
        del data[key]


@functools.lru_cache(maxsize=1)
def _load_schema() -> dict:
    """Read and parse the JSON schema at ``_SCHEMA_PATH`` once per process.

    The returned dict is shared between calls and must be treated as
    read-only. A failed read is not cached, so it is retried on the next call.
    """
    return json.loads(_SCHEMA_PATH.read_text(encoding='utf-8'))


def _validate_schema(data: dict) -> None:
    """Validate ``data`` against the case schema.

    Raises:
        AppError: ``AI_SCHEMA_VIOLATION`` (500) when validation fails; the
            original ``jsonschema.ValidationError`` is kept as ``__cause__``.
    """
    schema = _load_schema()
    try:
        jsonschema.validate(instance=data, schema=schema)
    except jsonschema.ValidationError as e:
        logger.error('AI parse output schema violation: %s', e.message)
        raise AppError(
            'AI_SCHEMA_VIOLATION',
            f'AI 输出字段不合法: {e.message}',
            status_code=500,
        ) from e