import logging

import pdfplumber

from config.exceptions import AppError

logger = logging.getLogger(__name__)

MAX_FILES = 5
MAX_FILE_SIZE = 20 * 1024 * 1024  # 20 MB
MAX_TOTAL_SIZE = 60 * 1024 * 1024  # 60 MB
# Separator inserted before each file's text when more than one PDF is given.
FILE_BREAK = '\n\n---FILE_BREAK: {name}---\n\n'


def extract_text_from_pdfs(files) -> str:
    """Extract text from 1 to ``MAX_FILES`` PDFs and return it concatenated.

    Args:
        files: ``request.FILES.getlist('files')`` or a similar list of
            UploadedFile-like objects. Each item must expose ``.size`` and
            ``.name`` and be readable by ``pdfplumber.open``.

    Returns:
        The extracted text. With a single file this is that file's text
        as-is; with several files each file's text is preceded by a
        ``FILE_BREAK`` marker carrying the file name.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` when no file is given, when a file
            yields no text (the message suggests it may be a scanned PDF),
            or when parsing fails; ``CASE_TOO_MANY_FILES`` when more than
            ``MAX_FILES`` files are given; ``CASE_FILE_TOO_LARGE`` when a
            single file exceeds ``MAX_FILE_SIZE`` or the combined size
            exceeds ``MAX_TOTAL_SIZE``.
    """
    if not files:
        raise AppError('CASE_PDF_EMPTY', '未上传 PDF 文件')
    if len(files) > MAX_FILES:
        raise AppError(
            'CASE_TOO_MANY_FILES',
            f'最多上传 {MAX_FILES} 份 PDF',
            status_code=400,
        )

    # Validate every size before parsing anything, so oversized uploads are
    # rejected without spending time in pdfplumber.
    total_size = 0
    for f in files:
        if f.size > MAX_FILE_SIZE:
            raise AppError(
                'CASE_FILE_TOO_LARGE',
                f'单份 PDF 不得超过 {MAX_FILE_SIZE // (1024*1024)} MB',
                status_code=400,
            )
        total_size += f.size
    if total_size > MAX_TOTAL_SIZE:
        raise AppError(
            'CASE_FILE_TOO_LARGE',
            f'PDF 总大小不得超过 {MAX_TOTAL_SIZE // (1024*1024)} MB',
            status_code=400,
        )

    # Markers are only added when there is more than one file.
    multiple = len(files) > 1
    parts = []
    for f in files:
        text = _extract_single(f)
        if not text.strip():
            raise AppError(
                'CASE_PDF_EMPTY',
                f'PDF "{f.name}" 无法提取文本(可能为扫描版)',
                status_code=400,
            )
        if multiple:
            text = FILE_BREAK.format(name=f.name) + text
        parts.append(text)
    return ''.join(parts)


def _extract_single(uploaded_file) -> str:
    """Return the text of one PDF, with pages joined by newlines.

    Pages for which ``extract_text()`` returns a falsy value contribute an
    empty string.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` (status 400) if opening or extracting
            fails for any reason; the original exception is attached as
            ``__cause__``.
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            pages = [page.extract_text() or '' for page in pdf.pages]
        return '\n'.join(pages)
    except Exception as e:
        # Deliberately broad: any failure while opening or reading the upload
        # is reported to the caller as one AppError. logger.exception keeps
        # the traceback, and ``from e`` keeps the explicit cause chain.
        logger.exception(
            'pdfplumber extract failed for %s: %s', uploaded_file.name, e
        )
        raise AppError(
            'CASE_PDF_EMPTY',
            f'PDF "{uploaded_file.name}" 解析失败: {e}',
            status_code=400,
        ) from e