Files
medical_training/apps/case/services/pdf_reader.py
T

51 lines
1.8 KiB
Python
Raw Normal View History

2026-05-29 15:58:00 +08:00
import logging
import pdfplumber
from config.exceptions import AppError
logger = logging.getLogger(__name__)
# Upload limits checked by extract_text_from_pdfs before any parsing happens.
MAX_FILES = 5  # maximum number of PDFs accepted in one call
MAX_FILE_SIZE = 20 * 1024 * 1024  # 20 MB limit for a single file
MAX_TOTAL_SIZE = 60 * 1024 * 1024  # 60 MB limit for all files combined
# Marker inserted before each file's text when more than one PDF is combined;
# {name} is filled with the uploaded file's name.
FILE_BREAK = '\n\n---FILE_BREAK: {name}---\n\n'
def extract_text_from_pdfs(files) -> str:
    """Extract the text of 1 to ``MAX_FILES`` PDFs and return it concatenated.

    Args:
        files: Uploaded-file objects, e.g. ``request.FILES.getlist('files')``
            or a similar list. Each item must expose ``name`` and ``size``
            and be accepted by ``pdfplumber.open``.

    Returns:
        The extracted text. For a single file this is that file's text
        unchanged; for several files each file's text is preceded by a
        ``FILE_BREAK`` marker carrying the file name.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` when ``files`` is empty or a file yields
            only whitespace; ``CASE_TOO_MANY_FILES`` when more than
            ``MAX_FILES`` files are given; ``CASE_FILE_TOO_LARGE`` when one
            file exceeds ``MAX_FILE_SIZE`` or the running total exceeds
            ``MAX_TOTAL_SIZE``. All are raised with ``status_code=400``.
    """
    if not files:
        # Explicit 400 so this matches every other validation error raised
        # here (including the other CASE_PDF_EMPTY cases).
        raise AppError('CASE_PDF_EMPTY', '未上传 PDF 文件', status_code=400)
    if len(files) > MAX_FILES:
        raise AppError(
            'CASE_TOO_MANY_FILES',
            f'最多上传 {MAX_FILES} 份 PDF',
            status_code=400,
        )

    # Validate all sizes up front so nothing is parsed when the upload is
    # going to be rejected anyway.
    total_size = 0
    for f in files:
        if f.size > MAX_FILE_SIZE:
            raise AppError(
                'CASE_FILE_TOO_LARGE',
                f'单份 PDF 不得超过 {MAX_FILE_SIZE // (1024*1024)} MB',
                status_code=400,
            )
        total_size += f.size
        if total_size > MAX_TOTAL_SIZE:
            raise AppError(
                'CASE_FILE_TOO_LARGE',
                f'PDF 总大小不得超过 {MAX_TOTAL_SIZE // (1024*1024)} MB',
                status_code=400,
            )

    # Separators are only useful when there is more than one document.
    multiple = len(files) > 1
    parts = []
    for f in files:
        text = _extract_single(f)
        if not text.strip():
            # No text layer at all, which typically means a scanned PDF.
            raise AppError(
                'CASE_PDF_EMPTY',
                f'PDF "{f.name}" 无法提取文本(可能为扫描版)',
                status_code=400,
            )
        if multiple:
            parts.append(FILE_BREAK.format(name=f.name))
        parts.append(text)
    return ''.join(parts)
def _extract_single(uploaded_file) -> str:
    """Return the text of one uploaded PDF, with pages joined by newlines.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string, so the result may be blank for PDFs without a text layer.

    Args:
        uploaded_file: File object accepted by ``pdfplumber.open``; its
            ``name`` attribute is used in log and error messages.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` with ``status_code=400`` when opening or
            reading the PDF fails. The original exception is chained as
            ``__cause__``.
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            pages = [page.extract_text() or '' for page in pdf.pages]
        return '\n'.join(pages)
    except Exception as e:
        # Boundary around a third-party parser: any failure becomes a
        # client-facing AppError. logger.exception keeps the traceback in
        # the log, and `from e` preserves the cause on the raised error.
        logger.exception(
            'pdfplumber extract failed for %s: %s', uploaded_file.name, e
        )
        raise AppError(
            'CASE_PDF_EMPTY',
            f'PDF "{uploaded_file.name}" 解析失败: {e}',
            status_code=400,
        ) from e