# apps/case/services/pdf_reader.py

import logging

import pdfplumber

from config.exceptions import AppError

logger = logging.getLogger(__name__)

# Upload limits checked by extract_text_from_pdfs before any PDF is parsed.
MAX_FILES = 5  # maximum number of PDFs accepted in one call
MAX_FILE_SIZE = 20 * 1024 * 1024   # per-file size cap: 20 MB
MAX_TOTAL_SIZE = 60 * 1024 * 1024  # cap on the summed size of all files: 60 MB
# Separator template inserted before each file's text when more than one PDF
# is supplied; {name} is filled with that file's name.
FILE_BREAK = '\n\n---FILE_BREAK: {name}---\n\n'


def extract_text_from_pdfs(files) -> str:
    """Extract text from 1 to ``MAX_FILES`` uploaded PDFs and return it as one string.

    Args:
        files: A list of uploaded-file objects, e.g.
            ``request.FILES.getlist('files')``. Each item must expose
            ``.size`` and ``.name`` and be readable by ``_extract_single``.

    Returns:
        The extracted text. With a single file this is that file's text
        unchanged; with several files each file's text is preceded by a
        ``FILE_BREAK`` marker carrying the file name, and the pieces are
        concatenated in the order the files were given.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` when no file is supplied or when a file
            yields only whitespace; ``CASE_TOO_MANY_FILES`` when more than
            ``MAX_FILES`` files are supplied; ``CASE_FILE_TOO_LARGE`` when a
            single file exceeds ``MAX_FILE_SIZE`` or the files together
            exceed ``MAX_TOTAL_SIZE``. Errors raised by ``_extract_single``
            propagate unchanged.
    """
    if not files:
        # status_code is passed explicitly so this rejection is reported the
        # same way as every other validation error in this function, instead
        # of depending on whatever AppError's default happens to be.
        raise AppError('CASE_PDF_EMPTY', '未上传 PDF 文件', status_code=400)
    if len(files) > MAX_FILES:
        raise AppError('CASE_TOO_MANY_FILES', f'最多上传 {MAX_FILES} 份 PDF', status_code=400)

    # Validate every size before parsing anything, so an oversized upload is
    # rejected without spending time on extraction.
    total_size = 0
    for f in files:
        if f.size > MAX_FILE_SIZE:
            raise AppError('CASE_FILE_TOO_LARGE', f'单份 PDF 不得超过 {MAX_FILE_SIZE // (1024*1024)} MB', status_code=400)
        total_size += f.size
    if total_size > MAX_TOTAL_SIZE:
        raise AppError('CASE_FILE_TOO_LARGE', f'PDF 总大小不得超过 {MAX_TOTAL_SIZE // (1024*1024)} MB', status_code=400)

    # Separators are only emitted when there is more than one document.
    multiple_files = len(files) > 1
    parts = []
    for f in files:
        text = _extract_single(f)
        if not text.strip():
            # Whitespace-only output is treated as "no extractable text".
            raise AppError('CASE_PDF_EMPTY', f'PDF "{f.name}" 无法提取文本（可能为扫描版）', status_code=400)
        if multiple_files:
            parts.append(FILE_BREAK.format(name=f.name))
        parts.append(text)

    return ''.join(parts)


def _extract_single(uploaded_file) -> str:
    """Return the text of every page of one PDF, joined by newlines.

    Args:
        uploaded_file: An object that ``pdfplumber.open`` can read and that
            has a ``.name`` attribute (used in the log and error messages).

    Returns:
        The per-page texts joined with a newline character. A page whose
        ``extract_text()`` result is falsy contributes an empty string.

    Raises:
        AppError: ``CASE_PDF_EMPTY`` with ``status_code=400`` when opening or
            reading the PDF raises any exception. The original exception is
            attached as ``__cause__``.
    """
    try:
        with pdfplumber.open(uploaded_file) as pdf:
            pages = [page.extract_text() or '' for page in pdf.pages]
        return '\n'.join(pages)
    except Exception as e:
        # logger.exception logs at ERROR level like before, but also records
        # the traceback so the root cause can be diagnosed from the logs.
        logger.exception('pdfplumber extract failed for %s: %s', uploaded_file.name, e)
        # Chain the cause explicitly so the parser error stays attached to
        # the domain error instead of appearing as an incidental context.
        raise AppError('CASE_PDF_EMPTY', f'PDF "{uploaded_file.name}" 解析失败: {e}', status_code=400) from e
init medical training project 2026-05-29 15:58:00 +08:00			`import logging`

			`import pdfplumber`

			`from config.exceptions import AppError`

			`logger = logging.getLogger(__name__)`

			`MAX_FILES = 5`
			`MAX_FILE_SIZE = 20 * 1024 * 1024 # 20 MB`
			`MAX_TOTAL_SIZE = 60 * 1024 * 1024 # 60 MB`
			`FILE_BREAK = '\n\n---FILE_BREAK: {name}---\n\n'`


			`def extract_text_from_pdfs(files) -> str:`
			`"""从 1~5 份 PDF 中提取文本，拼接返回。`

			`files: request.FILES.getlist('files') 或类似 UploadedFile 列表。`
			`"""`
			`if not files:`
			`raise AppError('CASE_PDF_EMPTY', '未上传 PDF 文件')`
			`if len(files) > MAX_FILES:`
			`raise AppError('CASE_TOO_MANY_FILES', f'最多上传 {MAX_FILES} 份 PDF', status_code=400)`

			`total_size = 0`
			`for f in files:`
			`if f.size > MAX_FILE_SIZE:`
			`raise AppError('CASE_FILE_TOO_LARGE', f'单份 PDF 不得超过 {MAX_FILE_SIZE // (1024*1024)} MB', status_code=400)`
			`total_size += f.size`
			`if total_size > MAX_TOTAL_SIZE:`
			`raise AppError('CASE_FILE_TOO_LARGE', f'PDF 总大小不得超过 {MAX_TOTAL_SIZE // (1024*1024)} MB', status_code=400)`

			`parts = []`
			`for f in files:`
			`text = _extract_single(f)`
			`if not text.strip():`
			`raise AppError('CASE_PDF_EMPTY', f'PDF "{f.name}" 无法提取文本（可能为扫描版）', status_code=400)`
			`parts.append(FILE_BREAK.format(name=f.name) + text if len(files) > 1 else text)`

			`return ''.join(parts)`


			`def _extract_single(uploaded_file) -> str:`
			`try:`
			`with pdfplumber.open(uploaded_file) as pdf:`
			`pages = [page.extract_text() or '' for page in pdf.pages]`
			`return '\n'.join(pages)`
			`except Exception as e:`
			`logger.error('pdfplumber extract failed for %s: %s', uploaded_file.name, e)`
			`raise AppError('CASE_PDF_EMPTY', f'PDF "{uploaded_file.name}" 解析失败: {e}', status_code=400)`