117 lines
8.3 KiB
Python
117 lines
8.3 KiB
Python
|
|
from datetime import datetime, timezone

from sqlalchemy import DateTime, Float, Integer, JSON, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column

from app.db.base import Base
from app.models.mixins import TimestampMixin
|
||
|
|
|
||
|
|
|
||
|
|
class KbKnowledgeSpace(TimestampMixin, Base):
    """Knowledge space model.

    Records the mapping between an institution and its Milvus collection
    and embedding parameters.
    """

    __tablename__ = "kb_spaces"

    # At most one row per (institution_id, embedding_version) pair.
    __table_args__ = (
        UniqueConstraint(
            "institution_id",
            "embedding_version",
            name="uq_kb_space_institution_version",
        ),
    )

    # --- Identity -----------------------------------------------------
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    institution_id: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        index=True,
        comment="机构ID",
    )
    institution_name: Mapped[str | None] = mapped_column(
        String(128),
        comment="机构名称",
    )
    # Both the space code and the collection name are unique table-wide.
    space_code: Mapped[str] = mapped_column(
        String(128),
        nullable=False,
        unique=True,
        comment="知识空间编码",
    )
    collection_name: Mapped[str] = mapped_column(
        String(128),
        nullable=False,
        unique=True,
        comment="Milvus集合名",
    )

    # --- Embedding parameters -----------------------------------------
    embedding_model: Mapped[str] = mapped_column(
        String(128),
        nullable=False,
        comment="向量模型",
    )
    embedding_dim: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        comment="向量维度",
    )
    embedding_version: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="v1",
        comment="向量版本",
    )

    # --- Chunking parameters ------------------------------------------
    chunk_size: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        comment="分片长度",
    )
    chunk_overlap: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        comment="分片重叠长度",
    )

    # --- Retrieval defaults -------------------------------------------
    top_k_default: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=5,
        comment="默认返回片段数",
    )
    score_threshold: Mapped[float] = mapped_column(
        Float,
        nullable=False,
        default=0.35,
        comment="默认相似度阈值",
    )

    # --- Lifecycle ----------------------------------------------------
    status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="active",
        index=True,
        comment="状态",
    )
|
||
|
|
|
||
|
|
|
||
|
|
class KbKnowledgeDocument(TimestampMixin, Base):
    """Knowledge document model.

    Records PDFs uploaded by content administrators together with their
    processing status.
    """

    __tablename__ = "kb_documents"

    # The same file hash may appear at most once per institution.
    __table_args__ = (
        UniqueConstraint(
            "institution_id",
            "file_sha256",
            name="uq_kb_document_institution_sha",
        ),
    )

    # --- Identity / ownership -----------------------------------------
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    institution_id: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        index=True,
        comment="机构ID",
    )
    uploaded_by: Mapped[str] = mapped_column(
        String(128),
        nullable=False,
        index=True,
        comment="上传用户ID",
    )

    # --- Stored file --------------------------------------------------
    file_name: Mapped[str] = mapped_column(
        String(255),
        nullable=False,
        comment="原始文件名",
    )
    file_sha256: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        index=True,
        comment="文件SHA256",
    )
    file_type: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="pdf",
        comment="文件类型",
    )
    # NOTE(review): plain Integer; if sizes are in bytes, files over ~2 GiB
    # may not fit on 32-bit INTEGER backends — confirm the unit and limits.
    file_size: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        comment="文件大小",
    )
    file_path: Mapped[str] = mapped_column(
        String(512),
        nullable=False,
        comment="文件保存路径",
    )

    # --- Descriptive metadata -----------------------------------------
    document_title: Mapped[str | None] = mapped_column(
        String(255),
        comment="文档标题",
    )
    document_category: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        default="other",
        comment="文档分类",
    )
    version: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="v1",
        comment="文档版本",
    )

    # --- Processing state ---------------------------------------------
    status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="uploaded",
        index=True,
        comment="处理状态",
    )
    parse_status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="pending",
        comment="解析状态",
    )
    embedding_status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="pending",
        comment="向量状态",
    )
    chunk_count: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
        comment="分片数量",
    )
    error_message: Mapped[str | None] = mapped_column(
        Text,
        comment="错误信息",
    )
|
||
|
|
|
||
|
|
|
||
|
|
class KbKnowledgeChunk(Base):
    """Knowledge chunk model.

    Stores PDF chunk text, page numbers and Milvus vector ID metadata.
    Unlike the other knowledge-base models in this module it does not use
    ``TimestampMixin``; it declares its own ``created_at`` column.
    """

    __tablename__ = "kb_chunks"

    # chunk_uid is unique across the whole table.
    __table_args__ = (UniqueConstraint("chunk_uid", name="uq_kb_chunk_uid"),)

    # --- Identity / ownership -----------------------------------------
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    institution_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="机构ID"
    )
    # Plain integer reference; no ForeignKey is declared on this column.
    document_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="文档ID"
    )
    # NOTE(review): index=True here is likely redundant with the unique
    # constraint uq_kb_chunk_uid above — confirm before dropping it via a
    # migration.
    chunk_uid: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="分片唯一ID"
    )

    # --- Position within the source document --------------------------
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False, comment="分片序号")
    page_start: Mapped[int] = mapped_column(Integer, nullable=False, comment="起始页")
    page_end: Mapped[int] = mapped_column(Integer, nullable=False, comment="结束页")
    section_title: Mapped[str | None] = mapped_column(String(255), comment="章节标题")

    # --- Content ------------------------------------------------------
    chunk_text: Mapped[str] = mapped_column(Text, nullable=False, comment="分片文本")
    chunk_hash: Mapped[str] = mapped_column(
        String(64), nullable=False, index=True, comment="分片Hash"
    )
    token_count: Mapped[int] = mapped_column(
        Integer, nullable=False, default=0, comment="估算token数"
    )

    # --- Vector store linkage -----------------------------------------
    vector_id: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="Milvus向量ID"
    )
    collection_name: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="Milvus集合名"
    )
    embedding_model: Mapped[str] = mapped_column(
        String(128), nullable=False, comment="向量模型"
    )

    # The Python attribute is ``metadata_`` because ``metadata`` is reserved
    # on SQLAlchemy declarative classes; the database column is "metadata".
    metadata_: Mapped[dict | None] = mapped_column("metadata", JSON, comment="扩展元数据")

    # Python-side default applied on INSERT when no value is supplied.
    # datetime.utcnow() is deprecated since Python 3.12; this callable yields
    # the same naive-UTC value, so stored timestamps keep their format.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        index=True,
    )
|
||
|
|
|
||
|
|
|
||
|
|
class KbKnowledgeIngestionTask(TimestampMixin, Base):
    """Knowledge ingestion task model.

    Tracks the asynchronous progress of PDF parsing, chunking, embedding
    and insertion into Milvus.
    """

    __tablename__ = "kb_ingestion_tasks"

    # --- Identity / ownership -----------------------------------------
    id: Mapped[int] = mapped_column(
        Integer,
        primary_key=True,
        autoincrement=True,
    )
    # Plain integer reference; no ForeignKey is declared on this column.
    document_id: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        index=True,
        comment="文档ID",
    )
    institution_id: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        index=True,
        comment="机构ID",
    )

    # --- Task state ---------------------------------------------------
    task_type: Mapped[str] = mapped_column(
        String(64),
        nullable=False,
        default="document_ingestion",
        comment="任务类型",
    )
    status: Mapped[str] = mapped_column(
        String(32),
        nullable=False,
        default="queued",
        index=True,
        comment="任务状态",
    )
    progress: Mapped[int] = mapped_column(
        Integer,
        nullable=False,
        default=0,
        comment="进度百分比",
    )
    current_step: Mapped[str | None] = mapped_column(
        String(255),
        comment="当前步骤",
    )
    error_message: Mapped[str | None] = mapped_column(
        Text,
        comment="错误信息",
    )

    # --- Timing -------------------------------------------------------
    # Both columns are nullable and have no default declared here.
    started_at: Mapped[datetime | None] = mapped_column(DateTime)
    finished_at: Mapped[datetime | None] = mapped_column(DateTime)
|
||
|
|
|
||
|
|
|
||
|
|
class KbKnowledgeQueryLog(Base):
    """Learning-assistant Q&A log.

    Records RAG retrieval hits, sources and latency for auditing and
    effectiveness analysis. It does not use ``TimestampMixin``; it declares
    its own ``created_at`` column.
    """

    __tablename__ = "kb_query_logs"

    # --- Identity / ownership -----------------------------------------
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    user_id: Mapped[str] = mapped_column(
        String(128), nullable=False, index=True, comment="用户ID"
    )
    institution_id: Mapped[int] = mapped_column(
        Integer, nullable=False, index=True, comment="机构ID"
    )

    # --- Question and retrieval result --------------------------------
    question: Mapped[str] = mapped_column(Text, nullable=False, comment="用户问题")
    # NOTE(review): annotated as bool but stored in an Integer column with
    # default 0, so values loaded from the database come back as ints (0/1),
    # not bools. Switching to Boolean needs a schema migration — confirm
    # before changing.
    retrieval_hit: Mapped[bool] = mapped_column(
        Integer, nullable=False, default=0, comment="是否命中知识库"
    )
    retrieved_chunk_ids: Mapped[list | None] = mapped_column(JSON, comment="命中分片ID")
    answer_summary: Mapped[str | None] = mapped_column(Text, comment="回答摘要")

    # --- Models and retrieval parameters used -------------------------
    llm_model: Mapped[str | None] = mapped_column(String(128), comment="LLM模型")
    embedding_model: Mapped[str | None] = mapped_column(String(128), comment="向量模型")
    top_k: Mapped[int | None] = mapped_column(Integer, comment="最终返回片段数")
    score_threshold: Mapped[float | None] = mapped_column(Float, comment="检索阈值")

    # --- Latency breakdown (milliseconds, per the column names) -------
    embedding_latency_ms: Mapped[int | None] = mapped_column(Integer)
    search_latency_ms: Mapped[int | None] = mapped_column(Integer)
    llm_latency_ms: Mapped[int | None] = mapped_column(Integer)
    total_latency_ms: Mapped[int | None] = mapped_column(Integer)

    # Python-side default applied on INSERT when no value is supplied.
    # datetime.utcnow() is deprecated since Python 3.12; this callable yields
    # the same naive-UTC value, so stored timestamps keep their format.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        index=True,
    )
|