51 lines
1.8 KiB
Python
51 lines
1.8 KiB
Python
|
|
import logging
|
||
|
|
|
||
|
|
import pdfplumber
|
||
|
|
|
||
|
|
from config.exceptions import AppError
|
||
|
|
|
||
|
|
# Module-level logger named after this module.
logger = logging.getLogger(__name__)

# Upload limits checked by extract_text_from_pdfs.
MAX_FILES = 5
MAX_FILE_SIZE = 20 * 1024 ** 2  # 20 MB per file, in bytes
MAX_TOTAL_SIZE = 60 * 1024 ** 2  # 60 MB across all files, in bytes

# Template for the marker placed before each file's text when several
# files are concatenated; ``{name}`` is filled with the file name.
FILE_BREAK = '\n\n---FILE_BREAK: {name}---\n\n'
|
||
|
|
|
||
|
|
|
||
|
|
def extract_text_from_pdfs(files) -> str:
    """Extract the text of 1 to ``MAX_FILES`` uploaded PDFs and concatenate it.

    Args:
        files: A list of uploaded-file objects exposing ``size`` and ``name``,
            e.g. ``request.FILES.getlist('files')`` or a similar list of
            UploadedFile objects.

    Returns:
        The extracted text. With a single file this is that file's text
        alone; with several files, each file's text is preceded by a
        ``FILE_BREAK`` marker formatted with the file's name.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` when no file was supplied or a file
            yields only whitespace; ``CASE_TOO_MANY_FILES`` when more than
            ``MAX_FILES`` files are supplied; ``CASE_FILE_TOO_LARGE`` when a
            single file exceeds ``MAX_FILE_SIZE`` or the combined size
            exceeds ``MAX_TOTAL_SIZE``. All are raised with
            ``status_code=400``.
    """
    if not files:
        # Pass status_code explicitly so this agrees with the other
        # CASE_PDF_EMPTY raise further down and with every other check here.
        raise AppError('CASE_PDF_EMPTY', '未上传 PDF 文件', status_code=400)
    if len(files) > MAX_FILES:
        raise AppError('CASE_TOO_MANY_FILES', f'最多上传 {MAX_FILES} 份 PDF', status_code=400)

    # Validate all sizes up front so nothing is parsed for a rejected upload.
    total_size = 0
    for f in files:
        if f.size > MAX_FILE_SIZE:
            raise AppError('CASE_FILE_TOO_LARGE', f'单份 PDF 不得超过 {MAX_FILE_SIZE // (1024*1024)} MB', status_code=400)
        total_size += f.size
    if total_size > MAX_TOTAL_SIZE:
        raise AppError('CASE_FILE_TOO_LARGE', f'PDF 总大小不得超过 {MAX_TOTAL_SIZE // (1024*1024)} MB', status_code=400)

    # Markers are only useful when there is more than one file to tell apart.
    multiple = len(files) > 1
    parts = []
    for f in files:
        text = _extract_single(f)
        if not text.strip():
            # No extractable text at all (the message suggests a scanned PDF).
            raise AppError('CASE_PDF_EMPTY', f'PDF "{f.name}" 无法提取文本(可能为扫描版)', status_code=400)
        parts.append(FILE_BREAK.format(name=f.name) + text if multiple else text)

    return ''.join(parts)
|
||
|
|
|
||
|
|
|
||
|
|
def _extract_single(uploaded_file) -> str:
    """Return the text of every page of one uploaded PDF, joined by newlines.

    Pages for which ``extract_text()`` returns nothing contribute an empty
    string, so the result can be blank.

    Args:
        uploaded_file: The file object handed to ``pdfplumber.open``; its
            ``name`` attribute is used in log and error messages.

    Returns:
        The page texts joined with ``'\\n'``.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` with ``status_code=400`` when opening or
            reading the PDF fails; the original exception is chained as the
            cause.
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            pages = [page.extract_text() or '' for page in pdf.pages]
    except Exception as e:
        # Deliberately broad: any parser failure is converted into the
        # application error. logger.exception keeps the traceback in the log.
        logger.exception('pdfplumber extract failed for %s: %s', uploaded_file.name, e)
        raise AppError('CASE_PDF_EMPTY', f'PDF "{uploaded_file.name}" 解析失败: {e}', status_code=400) from e
    return '\n'.join(pages)
|