# app/integrations/pdf_parser.py

from dataclasses import dataclass
from pathlib import Path

from app.core.exceptions import AppError


@dataclass(frozen=True)
class ParsedPdfPage:
    """Text of a single PDF page, paired with its page number.

    The page number is kept so that RAG answers can cite the source page.
    Instances are immutable (frozen dataclass).
    """

    # Page number of the page within the document.
    page_number: int
    # Text content of the page.
    text: str


class PdfParser:
    """PDF parser: uses PyMuPDF to extract text page by page from PDFs such as textbooks and guides."""

    def parse(self, file_path: str | Path) -> list[ParsedPdfPage]:
        """Read a PDF page by page and return every page that has text.

        Args:
            file_path: Location of the PDF file, as a string or ``Path``.

        Returns:
            One ``ParsedPdfPage`` per page whose cleaned text is non-empty,
            in document order, with page numbers counted from 1. Pages that
            are empty after cleaning are skipped but still consume their
            page number.

        Raises:
            AppError: ``PDF_FILE_NOT_FOUND`` (404) when the path does not exist;
                ``PDF_PARSER_NOT_INSTALLED`` (500) when PyMuPDF cannot be imported;
                ``PDF_PARSE_FAILED`` (422) when opening or reading the document fails;
                ``PDF_PARSE_EMPTY`` (422) when no page yields any text.
        """
        pdf_path = Path(file_path)
        if not pdf_path.exists():
            raise AppError("PDF_FILE_NOT_FOUND", "uploaded pdf file not found", 404)

        # Imported lazily so a missing PyMuPDF surfaces as a business error
        # at parse time instead of breaking module import.
        try:
            import fitz  # PyMuPDF
        except ImportError as import_error:
            raise AppError(
                "PDF_PARSER_NOT_INSTALLED",
                "PyMuPDF is required for pdf parsing",
                500,
            ) from import_error

        try:
            with fitz.open(pdf_path) as document:
                numbered_text = (
                    (number, self._clean_text(page.get_text("text") or ""))
                    for number, page in enumerate(document, start=1)
                )
                # Materialize inside the ``with`` so every page is read
                # while the document is still open.
                extracted = [
                    ParsedPdfPage(page_number=number, text=body)
                    for number, body in numbered_text
                    if body
                ]
        except Exception as parse_error:  # pragma: no cover - PyMuPDF raises many exception types; convert them uniformly
            raise AppError("PDF_PARSE_FAILED", "pdf parse failed", 422) from parse_error

        if extracted:
            return extracted
        raise AppError("PDF_PARSE_EMPTY", "pdf text content is empty", 422)

    def _clean_text(self, text: str) -> str:
        """Collapse whitespace while keeping natural line breaks.

        Each line has its runs of whitespace squeezed into single spaces and
        its leading/trailing whitespace removed; lines that end up empty are
        dropped. The surviving lines are joined with ``"\\n"``, which keeps
        the text convenient for later chunking.
        """
        kept_lines: list[str] = []
        for raw_line in text.splitlines():
            # ``split()`` with no arguments already ignores leading and
            # trailing whitespace, so no separate strip is required.
            collapsed = " ".join(raw_line.split())
            if collapsed:
                kept_lines.append(collapsed)
        return "\n".join(kept_lines)
feat: add streaming learning assistant and knowledge base scaffolding 2026-06-10 09:32:36 +08:00			`from dataclasses import dataclass`
			`from pathlib import Path`

			`from app.core.exceptions import AppError`


			`@dataclass(frozen=True)`
			`class ParsedPdfPage:`
			`"""PDF 页文本：保留页码，支撑 RAG 回答中的来源页码引用。"""`

			`page_number: int`
			`text: str`


			`class PdfParser:`
			`"""PDF 解析器：使用 PyMuPDF 逐页提取教材、指南等 PDF 文本。"""`

			`def parse(self, file_path: str \| Path) -> list[ParsedPdfPage]:`
			`"""PDF解析：逐页读取文本并过滤空页，失败时返回统一业务异常。"""`
			`path = Path(file_path)`
			`if not path.exists():`
			`raise AppError("PDF_FILE_NOT_FOUND", "uploaded pdf file not found", 404)`
			`try:`
			`import fitz # PyMuPDF`
			`except ImportError as exc:`
			`raise AppError("PDF_PARSER_NOT_INSTALLED", "PyMuPDF is required for pdf parsing", 500) from exc`

			`pages: list[ParsedPdfPage] = []`
			`try:`
			`with fitz.open(path) as doc:`
			`for index, page in enumerate(doc, start=1):`
			`text = self._clean_text(page.get_text("text") or "")`
			`if text:`
			`pages.append(ParsedPdfPage(page_number=index, text=text))`
			`except Exception as exc: # pragma: no cover - PyMuPDF 异常类型较多，统一转换即可`
			`raise AppError("PDF_PARSE_FAILED", "pdf parse failed", 422) from exc`
			`if not pages:`
			`raise AppError("PDF_PARSE_EMPTY", "pdf text content is empty", 422)`
			`return pages`

			`def _clean_text(self, text: str) -> str:`
			`"""文本清洗：压缩空白并保留自然换行，便于后续教材分片。"""`
			`lines = [" ".join(line.strip().split()) for line in text.splitlines()]`
			`return "\n".join(line for line in lines if line)`