from dataclasses import dataclass
from pathlib import Path

from app.core.exceptions import AppError


@dataclass(frozen=True)
class ParsedPdfPage:
    """Text of a single PDF page.

    The page number is kept alongside the text so that RAG answers can cite
    the source page.
    """

    # Page position in the document, counted from 1.
    page_number: int
    # Cleaned text extracted from that page.
    text: str


class PdfParser:
    """PDF parser that uses PyMuPDF to extract text page by page.

    Intended for PDFs such as textbooks and guidelines.
    """

    def parse(self, file_path: str | Path) -> list[ParsedPdfPage]:
        """Read a PDF page by page and return the non-empty pages.

        Args:
            file_path: Location of the PDF on disk.

        Returns:
            One ``ParsedPdfPage`` per page whose cleaned text is non-empty.
            Page numbers start at 1 and reflect the position in the document,
            so skipped empty pages leave gaps in the numbering.

        Raises:
            AppError: ``PDF_FILE_NOT_FOUND`` (404) when the path does not exist;
                ``PDF_PARSER_NOT_INSTALLED`` (500) when PyMuPDF cannot be
                imported; ``PDF_PARSE_FAILED`` (422) when opening or reading
                the document raises; ``PDF_PARSE_EMPTY`` (422) when no page
                yields any text.
        """
        pdf_path = Path(file_path)
        if not pdf_path.exists():
            raise AppError("PDF_FILE_NOT_FOUND", "uploaded pdf file not found", 404)

        # Imported lazily so a missing PyMuPDF surfaces as a business error
        # at parse time rather than as an import failure of this module.
        try:
            import fitz  # PyMuPDF
        except ImportError as exc:
            raise AppError(
                "PDF_PARSER_NOT_INSTALLED",
                "PyMuPDF is required for pdf parsing",
                500,
            ) from exc

        try:
            with fitz.open(pdf_path) as document:
                # The generator is consumed by the comprehension below while
                # the document is still open.
                numbered_text = (
                    (number, self._clean_text(page.get_text("text") or ""))
                    for number, page in enumerate(document, start=1)
                )
                extracted = [
                    ParsedPdfPage(page_number=number, text=body)
                    for number, body in numbered_text
                    if body
                ]
        except Exception as exc:  # pragma: no cover - PyMuPDF raises many exception types; convert them uniformly
            raise AppError("PDF_PARSE_FAILED", "pdf parse failed", 422) from exc

        if not extracted:
            raise AppError("PDF_PARSE_EMPTY", "pdf text content is empty", 422)
        return extracted

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace inside each line and drop blank lines.

        Line breaks between non-empty lines are kept so that later chunking
        can still rely on the natural line structure.

        Args:
            text: Raw text of one page.

        Returns:
            The cleaned text, with lines joined by ``"\\n"``; an empty string
            when every line is blank.
        """
        # split() with no argument discards leading/trailing whitespace and
        # treats any whitespace run as a single separator.
        collapsed = (" ".join(raw_line.split()) for raw_line in text.splitlines())
        return "\n".join(filter(None, collapsed))