Files
fastapi/app/models/knowledge_base.py
T

117 lines
8.3 KiB
Python
Raw Normal View History

from datetime import datetime, timezone

from sqlalchemy import DateTime, Float, Integer, JSON, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column

from app.db.base import Base
from app.models.mixins import TimestampMixin
class KbKnowledgeSpace(TimestampMixin, Base):
    """Knowledge space model.

    Records the mapping between an institution and its Milvus collection
    and embedding parameters.
    """

    __tablename__ = "kb_spaces"
    # At most one space per (institution_id, embedding_version) pair.
    __table_args__ = (
        UniqueConstraint(
            "institution_id",
            "embedding_version",
            name="uq_kb_space_institution_version",
        ),
    )

    # --- Identity ---------------------------------------------------------
    id: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    # Institution ID.
    institution_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="机构ID"
    )
    # Institution name (optional).
    institution_name: Mapped[str | None] = mapped_column(
        String(128), comment="机构名称"
    )
    # Knowledge space code; unique across the table.
    space_code: Mapped[str] = mapped_column(
        String(128), nullable=False, unique=True, comment="知识空间编码"
    )
    # Milvus collection name; unique across the table.
    collection_name: Mapped[str] = mapped_column(
        String(128), nullable=False, unique=True, comment="Milvus集合名"
    )

    # --- Embedding parameters ---------------------------------------------
    # Embedding model.
    embedding_model: Mapped[str] = mapped_column(
        String(128), nullable=False, comment="向量模型"
    )
    # Embedding vector dimension.
    embedding_dim: Mapped[int] = mapped_column(
        Integer, nullable=False, comment="向量维度"
    )
    # Embedding version; defaults to "v1".
    embedding_version: Mapped[str] = mapped_column(
        String(32), nullable=False, default="v1", comment="向量版本"
    )

    # --- Chunking parameters ----------------------------------------------
    # Chunk length.
    chunk_size: Mapped[int] = mapped_column(
        Integer, nullable=False, comment="分片长度"
    )
    # Chunk overlap length.
    chunk_overlap: Mapped[int] = mapped_column(
        Integer, nullable=False, comment="分片重叠长度"
    )

    # --- Retrieval defaults -----------------------------------------------
    # Default number of returned chunks.
    top_k_default: Mapped[int] = mapped_column(
        Integer, nullable=False, default=5, comment="默认返回片段数"
    )
    # Default similarity threshold.
    score_threshold: Mapped[float] = mapped_column(
        Float, nullable=False, default=0.35, comment="默认相似度阈值"
    )

    # --- Lifecycle --------------------------------------------------------
    # Status; defaults to "active".
    status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="active",
        index=True,
        comment="状态",
    )
class KbKnowledgeDocument(TimestampMixin, Base):
    """Knowledge document model.

    Records PDFs uploaded by content administrators together with their
    processing status.
    """

    __tablename__ = "kb_documents"
    # The same file hash may appear at most once per institution.
    __table_args__ = (
        UniqueConstraint(
            "institution_id",
            "file_sha256",
            name="uq_kb_document_institution_sha",
        ),
    )

    # --- Identity / ownership ---------------------------------------------
    id: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    # Institution ID.
    institution_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="机构ID"
    )
    # ID of the uploading user.
    uploaded_by: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="上传用户ID"
    )

    # --- File facts -------------------------------------------------------
    # Original file name.
    file_name: Mapped[str] = mapped_column(
        String(255), nullable=False, comment="原始文件名"
    )
    # SHA-256 of the file.
    file_sha256: Mapped[str] = mapped_column(
        String(64), nullable=False, index=True, comment="文件SHA256"
    )
    # File type; defaults to "pdf".
    file_type: Mapped[str] = mapped_column(
        String(32), nullable=False, default="pdf", comment="文件类型"
    )
    # File size.
    file_size: Mapped[int] = mapped_column(
        Integer, nullable=False, comment="文件大小"
    )
    # Path where the file is saved.
    file_path: Mapped[str] = mapped_column(
        String(512), nullable=False, comment="文件保存路径"
    )

    # --- Descriptive metadata ---------------------------------------------
    # Document title (optional).
    document_title: Mapped[str | None] = mapped_column(
        String(255), comment="文档标题"
    )
    # Document category; defaults to "other".
    document_category: Mapped[str] = mapped_column(
        String(64), nullable=False, default="other", comment="文档分类"
    )
    # Document version; defaults to "v1".
    version: Mapped[str] = mapped_column(
        String(32), nullable=False, default="v1", comment="文档版本"
    )

    # --- Processing state -------------------------------------------------
    # Overall processing status; defaults to "uploaded".
    status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="uploaded",
        index=True,
        comment="处理状态",
    )
    # Parse status; defaults to "pending".
    parse_status: Mapped[str] = mapped_column(
        String(32), nullable=False, default="pending", comment="解析状态"
    )
    # Embedding status; defaults to "pending".
    embedding_status: Mapped[str] = mapped_column(
        String(32), nullable=False, default="pending", comment="向量状态"
    )
    # Number of chunks; defaults to 0.
    chunk_count: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0, comment="分片数量"
    )
    # Error message (optional).
    error_message: Mapped[str | None] = mapped_column(
        Text, comment="错误信息"
    )
class KbKnowledgeChunk(Base):
    """Knowledge chunk model.

    Stores PDF chunk text, page numbers and Milvus vector ID metadata.

    Unlike the other models in this module it does not use
    ``TimestampMixin``; it declares its own ``created_at`` column only.
    """

    __tablename__ = "kb_chunks"
    __table_args__ = (UniqueConstraint("chunk_uid", name="uq_kb_chunk_uid"),)

    # --- Identity / ownership ---------------------------------------------
    id: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    # Institution ID.
    institution_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="机构ID"
    )
    # Document ID (plain integer column; no ForeignKey is declared here).
    document_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="文档ID"
    )
    # Unique chunk ID.
    # NOTE(review): this column has both a UniqueConstraint and index=True;
    # the separate index is likely redundant on backends that back unique
    # constraints with an index. Kept as-is to avoid schema drift -- confirm
    # before dropping it in a migration.
    chunk_uid: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="分片唯一ID"
    )

    # --- Position within the document -------------------------------------
    # Chunk sequence number.
    chunk_index: Mapped[int] = mapped_column(
        Integer, nullable=False, comment="分片序号"
    )
    # First page covered by the chunk.
    page_start: Mapped[int] = mapped_column(
        Integer, nullable=False, comment="起始页"
    )
    # Last page covered by the chunk.
    page_end: Mapped[int] = mapped_column(
        Integer, nullable=False, comment="结束页"
    )
    # Section title (optional).
    section_title: Mapped[str | None] = mapped_column(
        String(255), comment="章节标题"
    )

    # --- Content ----------------------------------------------------------
    # Chunk text.
    chunk_text: Mapped[str] = mapped_column(
        Text, nullable=False, comment="分片文本"
    )
    # Chunk hash.
    chunk_hash: Mapped[str] = mapped_column(
        String(64), nullable=False, index=True, comment="分片Hash"
    )
    # Estimated token count; defaults to 0.
    token_count: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0, comment="估算token数"
    )

    # --- Vector store linkage ---------------------------------------------
    # Milvus vector ID.
    vector_id: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="Milvus向量ID"
    )
    # Milvus collection name.
    collection_name: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="Milvus集合名"
    )
    # Embedding model.
    embedding_model: Mapped[str] = mapped_column(
        String(128), nullable=False, comment="向量模型"
    )
    # Extended metadata. The attribute is ``metadata_`` while the database
    # column is "metadata": ``metadata`` is a reserved attribute name on
    # SQLAlchemy declarative classes.
    metadata_: Mapped[dict | None] = mapped_column(
        "metadata", JSON, comment="扩展元数据"
    )

    # --- Audit ------------------------------------------------------------
    # ``datetime.utcnow`` is deprecated since Python 3.12. This callable
    # yields the same naive-UTC value, so what gets stored in the
    # timezone-less DateTime column is unchanged.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        index=True,
    )
class KbKnowledgeIngestionTask(TimestampMixin, Base):
    """Knowledge ingestion task model.

    Records the asynchronous progress of PDF parsing, chunking, embedding
    and insertion into Milvus.
    """

    __tablename__ = "kb_ingestion_tasks"

    # --- Identity / ownership ---------------------------------------------
    id: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    # Document ID.
    document_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="文档ID"
    )
    # Institution ID.
    institution_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="机构ID"
    )

    # --- Task state -------------------------------------------------------
    # Task type; defaults to "document_ingestion".
    task_type: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        default="document_ingestion",
        comment="任务类型",
    )
    # Task status; defaults to "queued".
    status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="queued",
        index=True,
        comment="任务状态",
    )
    # Progress percentage; defaults to 0.
    progress: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0, comment="进度百分比"
    )
    # Current step (optional).
    current_step: Mapped[str | None] = mapped_column(
        String(255), comment="当前步骤"
    )
    # Error message (optional).
    error_message: Mapped[str | None] = mapped_column(
        Text, comment="错误信息"
    )

    # --- Timing -----------------------------------------------------------
    # Both are nullable and have no default; they stay NULL until set.
    started_at: Mapped[datetime | None] = mapped_column(DateTime)
    finished_at: Mapped[datetime | None] = mapped_column(DateTime)
class KbKnowledgeQueryLog(Base):
    """Learning-assistant Q&A log.

    Records RAG retrieval hits, sources and latency, for auditing and
    effectiveness analysis.

    This model does not use ``TimestampMixin``; it declares its own
    ``created_at`` column only.
    """

    __tablename__ = "kb_query_logs"

    # --- Identity / ownership ---------------------------------------------
    id: Mapped[int] = mapped_column(
        Integer, primary_key=True, autoincrement=True
    )
    # User ID.
    user_id: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="用户ID"
    )
    # Institution ID.
    institution_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="机构ID"
    )

    # --- Question and retrieval result ------------------------------------
    # The user's question.
    question: Mapped[str] = mapped_column(
        Text, nullable=False, comment="用户问题"
    )
    # Whether the knowledge base was hit; defaults to 0.
    # NOTE(review): annotated as ``bool`` but stored in an Integer column, so
    # loaded values are presumably ints (0/1) rather than real booleans.
    # Switching to ``Boolean`` would change the column type and needs a
    # migration -- confirm before changing.
    retrieval_hit: Mapped[bool] = mapped_column(
        Integer, nullable=False, default=0, comment="是否命中知识库"
    )
    # IDs of the chunks that were hit (optional JSON list).
    retrieved_chunk_ids: Mapped[list | None] = mapped_column(
        JSON, comment="命中分片ID"
    )
    # Answer summary (optional).
    answer_summary: Mapped[str | None] = mapped_column(
        Text, comment="回答摘要"
    )

    # --- Models and retrieval parameters ----------------------------------
    # LLM model (optional).
    llm_model: Mapped[str | None] = mapped_column(
        String(128), comment="LLM模型"
    )
    # Embedding model (optional).
    embedding_model: Mapped[str | None] = mapped_column(
        String(128), comment="向量模型"
    )
    # Final number of returned chunks (optional).
    top_k: Mapped[int | None] = mapped_column(
        Integer, comment="最终返回片段数"
    )
    # Retrieval threshold (optional).
    score_threshold: Mapped[float | None] = mapped_column(
        Float, comment="检索阈值"
    )

    # --- Latency breakdown (all optional; ms going by the column names) ---
    embedding_latency_ms: Mapped[int | None] = mapped_column(Integer)
    search_latency_ms: Mapped[int | None] = mapped_column(Integer)
    llm_latency_ms: Mapped[int | None] = mapped_column(Integer)
    total_latency_ms: Mapped[int | None] = mapped_column(Integer)

    # --- Audit ------------------------------------------------------------
    # ``datetime.utcnow`` is deprecated since Python 3.12. This callable
    # yields the same naive-UTC value, so what gets stored in the
    # timezone-less DateTime column is unchanged.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        index=True,
    )