45 lines
1.8 KiB
Python
45 lines
1.8 KiB
Python
|
|
from dataclasses import dataclass
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
from app.core.exceptions import AppError
|
||
|
|
|
||
|
|
|
||
|
|
@dataclass(frozen=True)
class ParsedPdfPage:
    """Text extracted from a single PDF page.

    The page number is kept next to the text so that RAG answers can cite
    the source page they were drawn from.
    """

    # Page number of this page within its PDF.
    page_number: int
    # Text content of the page.
    text: str
|
||
|
|
|
||
|
|
|
||
|
|
class PdfParser:
    """PDF parser that extracts text page by page using PyMuPDF.

    Intended for textbook- and guideline-style PDF documents.
    """

    def parse(self, file_path: str | Path) -> list[ParsedPdfPage]:
        """Read a PDF page by page and return the pages that contain text.

        Args:
            file_path: Location of the PDF on disk.

        Returns:
            One ``ParsedPdfPage`` per page that still has text after cleaning,
            in document order. Page numbers start at 1 and follow the page's
            position in the document, so skipped blank pages leave gaps.

        Raises:
            AppError: With code ``PDF_FILE_NOT_FOUND`` when ``file_path`` is
                not an existing regular file, ``PDF_PARSER_NOT_INSTALLED``
                when PyMuPDF cannot be imported, ``PDF_PARSE_FAILED`` when
                opening or reading the document raises, and
                ``PDF_PARSE_EMPTY`` when no page yields any text.
        """
        path = Path(file_path)
        # is_file() instead of exists(): a directory also "exists", but it is
        # not an uploaded PDF and would otherwise surface as a parse failure.
        if not path.is_file():
            raise AppError("PDF_FILE_NOT_FOUND", "uploaded pdf file not found", 404)

        # Imported at call time so that a missing PyMuPDF installation is
        # reported as a business error rather than breaking module import.
        try:
            import fitz  # PyMuPDF
        except ImportError as exc:
            raise AppError("PDF_PARSER_NOT_INSTALLED", "PyMuPDF is required for pdf parsing", 500) from exc

        pages: list[ParsedPdfPage] = []
        try:
            with fitz.open(path) as doc:
                for index, page in enumerate(doc, start=1):
                    text = self._clean_text(page.get_text("text") or "")
                    # Pages with no text left after cleaning are dropped.
                    if text:
                        pages.append(ParsedPdfPage(page_number=index, text=text))
        except Exception as exc:  # pragma: no cover - PyMuPDF raises many exception types; convert them uniformly
            raise AppError("PDF_PARSE_FAILED", "pdf parse failed", 422) from exc

        if not pages:
            raise AppError("PDF_PARSE_EMPTY", "pdf text content is empty", 422)
        return pages

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace inside each line and drop empty lines.

        Line breaks between non-empty lines are preserved (normalized to
        ``"\\n"``) so later chunking can still see the natural line structure.

        Args:
            text: Raw text of one page.

        Returns:
            The cleaned text; an empty string when nothing but whitespace
            was present.
        """
        # split() with no arguments already ignores leading/trailing
        # whitespace, so no separate strip() is needed.
        normalized = (" ".join(line.split()) for line in text.splitlines())
        return "\n".join(line for line in normalized if line)
|