feat: add streaming learning assistant and knowledge base scaffolding

2026-06-10 09:32:36 +08:00
parent f0cdc454b3
commit 89258ab448
31 changed files with 2021 additions and 330 deletions
@@ -0,0 +1,44 @@
+from dataclasses import dataclass
+from pathlib import Path
+
+from app.core.exceptions import AppError
+
+
+@dataclass(frozen=True)
+class ParsedPdfPage:
+    """Text of one PDF page together with its page number.
+
+    The page number is retained so that RAG answers can cite the source page.
+    """
+
+    # Page number of the page; PdfParser.parse numbers pages starting at 1.
+    page_number: int
+    # Text content of the page.
+    text: str
+
+
+class PdfParser:
+    """PDF parser that uses PyMuPDF to extract text page by page.
+
+    Intended for PDFs such as textbooks and guides.
+    """
+
+    def parse(self, file_path: str | Path) -> list[ParsedPdfPage]:
+        """Extract the cleaned text of every non-empty page of a PDF.
+
+        Args:
+            file_path: Location of the PDF on disk.
+
+        Returns:
+            One ``ParsedPdfPage`` per page that still has text after cleaning,
+            in document order, numbered from 1. Pages without text are
+            skipped, so page numbers may have gaps.
+
+        Raises:
+            AppError: ``PDF_FILE_NOT_FOUND`` (404) when the path is not an
+                existing regular file; ``PDF_PARSER_NOT_INSTALLED`` (500)
+                when PyMuPDF cannot be imported; ``PDF_PARSE_FAILED`` (422)
+                when opening or reading the document fails;
+                ``PDF_PARSE_EMPTY`` (422) when no page yields any text.
+        """
+        path = Path(file_path)
+        # is_file() rather than exists(): a directory (or other non-file) at
+        # this path is not a PDF and must be reported as missing instead of
+        # surfacing later as a generic parse failure.
+        if not path.is_file():
+            raise AppError("PDF_FILE_NOT_FOUND", "uploaded pdf file not found", 404)
+        # Imported lazily so a missing PyMuPDF install becomes an AppError at
+        # call time rather than an ImportError when this module is loaded.
+        try:
+            import fitz  # PyMuPDF
+        except ImportError as exc:
+            raise AppError("PDF_PARSER_NOT_INSTALLED", "PyMuPDF is required for pdf parsing", 500) from exc
+
+        pages: list[ParsedPdfPage] = []
+        try:
+            with fitz.open(path) as doc:
+                for index, page in enumerate(doc, start=1):
+                    text = self._clean_text(page.get_text("text") or "")
+                    # Drop pages that are empty after whitespace cleanup.
+                    if text:
+                        pages.append(ParsedPdfPage(page_number=index, text=text))
+        except Exception as exc:  # pragma: no cover - PyMuPDF raises many exception types; convert them uniformly
+            raise AppError("PDF_PARSE_FAILED", "pdf parse failed", 422) from exc
+        if not pages:
+            raise AppError("PDF_PARSE_EMPTY", "pdf text content is empty", 422)
+        return pages
+
+    def _clean_text(self, text: str) -> str:
+        """Collapse whitespace within each line and drop blank lines.
+
+        Line breaks between non-empty lines are preserved so that later
+        chunking can still rely on the natural line structure.
+
+        Args:
+            text: Raw text extracted from a page.
+
+        Returns:
+            The non-empty lines joined by ``"\\n"``, each with runs of
+            whitespace replaced by a single space and no leading or trailing
+            whitespace; an empty string when nothing remains.
+        """
+        # str.split() with no arguments already ignores leading and trailing
+        # whitespace, so no separate strip() is needed.
+        lines = [" ".join(line.split()) for line in text.splitlines()]
+        return "\n".join(line for line in lines if line)