# app/models/knowledge_base.py

from datetime import datetime, timezone

from sqlalchemy import DateTime, Float, Integer, JSON, String, Text, UniqueConstraint
from sqlalchemy.orm import Mapped, mapped_column

from app.db.base import Base
from app.models.mixins import TimestampMixin


class KbKnowledgeSpace(TimestampMixin, Base):
    """Knowledge space: maps an institution to its Milvus collection and embedding parameters."""

    __tablename__ = "kb_spaces"
    # An institution may hold at most one space per embedding version.
    __table_args__ = (
        UniqueConstraint(
            "institution_id",
            "embedding_version",
            name="uq_kb_space_institution_version",
        ),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Institution ID (indexed) and optional institution name.
    institution_id: Mapped[int] = mapped_column(
        Integer,
        comment="机构ID",
        index=True,
        nullable=False,
    )
    institution_name: Mapped[str | None] = mapped_column(
        String(128),
        comment="机构名称",
    )

    # Knowledge space code and Milvus collection name; each is unique table-wide.
    space_code: Mapped[str] = mapped_column(
        String(128),
        comment="知识空间编码",
        unique=True,
        nullable=False,
    )
    collection_name: Mapped[str] = mapped_column(
        String(128),
        comment="Milvus集合名",
        unique=True,
        nullable=False,
    )

    # Embedding configuration: model name, vector dimension, version label ("v1" by default).
    embedding_model: Mapped[str] = mapped_column(
        String(128),
        comment="向量模型",
        nullable=False,
    )
    embedding_dim: Mapped[int] = mapped_column(
        Integer,
        comment="向量维度",
        nullable=False,
    )
    embedding_version: Mapped[str] = mapped_column(
        String(32),
        comment="向量版本",
        default="v1",
        nullable=False,
    )

    # Chunking configuration: chunk length and overlap length.
    chunk_size: Mapped[int] = mapped_column(
        Integer,
        comment="分片长度",
        nullable=False,
    )
    chunk_overlap: Mapped[int] = mapped_column(
        Integer,
        comment="分片重叠长度",
        nullable=False,
    )

    # Retrieval defaults: number of chunks returned (5) and similarity threshold (0.35).
    top_k_default: Mapped[int] = mapped_column(
        Integer,
        comment="默认返回片段数",
        default=5,
        nullable=False,
    )
    score_threshold: Mapped[float] = mapped_column(
        Float,
        comment="默认相似度阈值",
        default=0.35,
        nullable=False,
    )

    # Status of the space; defaults to "active" and is indexed.
    status: Mapped[str] = mapped_column(
        String(32),
        comment="状态",
        default="active",
        index=True,
        nullable=False,
    )


class KbKnowledgeDocument(TimestampMixin, Base):
    """Knowledge document: records a PDF uploaded by a content administrator and its processing status."""

    __tablename__ = "kb_documents"
    # The same file hash may be stored only once per institution.
    __table_args__ = (
        UniqueConstraint(
            "institution_id",
            "file_sha256",
            name="uq_kb_document_institution_sha",
        ),
    )

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Institution ID and the ID of the uploading user (both indexed).
    institution_id: Mapped[int] = mapped_column(
        Integer,
        comment="机构ID",
        index=True,
        nullable=False,
    )
    uploaded_by: Mapped[str] = mapped_column(
        String(128),
        comment="上传用户ID",
        index=True,
        nullable=False,
    )

    # File facts: original name, SHA256, type ("pdf" by default), size, saved path.
    file_name: Mapped[str] = mapped_column(
        String(255),
        comment="原始文件名",
        nullable=False,
    )
    file_sha256: Mapped[str] = mapped_column(
        String(64),
        comment="文件SHA256",
        index=True,
        nullable=False,
    )
    file_type: Mapped[str] = mapped_column(
        String(32),
        comment="文件类型",
        default="pdf",
        nullable=False,
    )
    file_size: Mapped[int] = mapped_column(
        Integer,
        comment="文件大小",
        nullable=False,
    )
    file_path: Mapped[str] = mapped_column(
        String(512),
        comment="文件保存路径",
        nullable=False,
    )

    # Descriptive fields: optional title, category ("other" by default), version ("v1" by default).
    document_title: Mapped[str | None] = mapped_column(
        String(255),
        comment="文档标题",
    )
    document_category: Mapped[str] = mapped_column(
        String(64),
        comment="文档分类",
        default="other",
        nullable=False,
    )
    version: Mapped[str] = mapped_column(
        String(32),
        comment="文档版本",
        default="v1",
        nullable=False,
    )

    # Processing state: overall status plus separate parse and embedding statuses.
    status: Mapped[str] = mapped_column(
        String(32),
        comment="处理状态",
        default="uploaded",
        index=True,
        nullable=False,
    )
    parse_status: Mapped[str] = mapped_column(
        String(32),
        comment="解析状态",
        default="pending",
        nullable=False,
    )
    embedding_status: Mapped[str] = mapped_column(
        String(32),
        comment="向量状态",
        default="pending",
        nullable=False,
    )

    # Number of chunks (0 by default) and an optional error message.
    chunk_count: Mapped[int] = mapped_column(
        Integer,
        comment="分片数量",
        default=0,
        nullable=False,
    )
    error_message: Mapped[str | None] = mapped_column(
        Text,
        comment="错误信息",
    )


class KbKnowledgeChunk(Base):
    """Knowledge chunk: stores PDF chunk text, page numbers and Milvus vector ID metadata."""

    __tablename__ = "kb_chunks"
    # chunk_uid is unique across the whole table.
    __table_args__ = (UniqueConstraint("chunk_uid", name="uq_kb_chunk_uid"),)

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # Institution ID and document ID (plain integer columns; no ForeignKey is declared here).
    institution_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="机构ID")
    document_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="文档ID")
    # Unique chunk ID.
    # NOTE(review): index=True sits on top of the unique constraint above, which most
    # databases already back with an index. Dropping it is a schema change, so it is kept.
    chunk_uid: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="分片唯一ID")
    # Chunk sequence number, presumably its position within the document (TODO confirm).
    chunk_index: Mapped[int] = mapped_column(Integer, nullable=False, comment="分片序号")
    # Start and end page of the chunk.
    page_start: Mapped[int] = mapped_column(Integer, nullable=False, comment="起始页")
    page_end: Mapped[int] = mapped_column(Integer, nullable=False, comment="结束页")
    # Optional section title.
    section_title: Mapped[str | None] = mapped_column(String(255), comment="章节标题")
    # Chunk text and its hash.
    chunk_text: Mapped[str] = mapped_column(Text, nullable=False, comment="分片文本")
    chunk_hash: Mapped[str] = mapped_column(String(64), nullable=False, index=True, comment="分片Hash")
    # Estimated token count (0 by default).
    token_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0, comment="估算token数")
    # Milvus vector ID, Milvus collection name and embedding model.
    vector_id: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="Milvus向量ID")
    collection_name: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="Milvus集合名")
    embedding_model: Mapped[str] = mapped_column(String(128), nullable=False, comment="向量模型")
    # Extended metadata. The attribute is named metadata_ because `metadata` is reserved
    # on SQLAlchemy declarative classes; the database column is still called "metadata".
    metadata_: Mapped[dict | None] = mapped_column("metadata", JSON, comment="扩展元数据")
    # Creation time, filled in on INSERT when no value is supplied.
    # datetime.utcnow() is deprecated since Python 3.12, so the default builds an aware
    # UTC time and strips tzinfo: the stored value stays naive UTC, exactly as before.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        index=True,
    )


class KbKnowledgeIngestionTask(TimestampMixin, Base):
    """Knowledge ingestion task: tracks the async progress of PDF parsing, chunking, embedding and Milvus insertion."""

    __tablename__ = "kb_ingestion_tasks"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, autoincrement=True, primary_key=True)

    # Document ID and institution ID (both indexed).
    document_id: Mapped[int] = mapped_column(
        Integer,
        comment="文档ID",
        index=True,
        nullable=False,
    )
    institution_id: Mapped[int] = mapped_column(
        Integer,
        comment="机构ID",
        index=True,
        nullable=False,
    )

    # Task type ("document_ingestion" by default) and task status ("queued" by default).
    task_type: Mapped[str] = mapped_column(
        String(64),
        comment="任务类型",
        default="document_ingestion",
        nullable=False,
    )
    status: Mapped[str] = mapped_column(
        String(32),
        comment="任务状态",
        default="queued",
        index=True,
        nullable=False,
    )

    # Progress percentage (0 by default) and the optional current step.
    progress: Mapped[int] = mapped_column(
        Integer,
        comment="进度百分比",
        default=0,
        nullable=False,
    )
    current_step: Mapped[str | None] = mapped_column(
        String(255),
        comment="当前步骤",
    )

    # Optional error message.
    error_message: Mapped[str | None] = mapped_column(
        Text,
        comment="错误信息",
    )

    # Optional start and finish timestamps; no default is set on either.
    started_at: Mapped[datetime | None] = mapped_column(DateTime)
    finished_at: Mapped[datetime | None] = mapped_column(DateTime)


class KbKnowledgeQueryLog(Base):
    """Learning-assistant Q&A log: records RAG retrieval hits, sources and latency for auditing and effectiveness analysis."""

    __tablename__ = "kb_query_logs"

    # Surrogate primary key.
    id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)
    # User ID and institution ID (both indexed).
    user_id: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="用户ID")
    institution_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="机构ID")
    # The user's question.
    question: Mapped[str] = mapped_column(Text, nullable=False, comment="用户问题")
    # Whether the knowledge base was hit; stored as an integer with default 0.
    # NOTE(review): the annotation is bool but the column type is Integer, so loaded
    # values are ints. Switching to Boolean changes the DDL on some backends and
    # needs a migration; confirm before changing.
    retrieval_hit: Mapped[bool] = mapped_column(Integer, nullable=False, default=0, comment="是否命中知识库")
    # IDs of the chunks that were hit, stored as JSON.
    retrieved_chunk_ids: Mapped[list | None] = mapped_column(JSON, comment="命中分片ID")
    # Optional answer summary.
    answer_summary: Mapped[str | None] = mapped_column(Text, comment="回答摘要")
    # LLM model and embedding model used.
    llm_model: Mapped[str | None] = mapped_column(String(128), comment="LLM模型")
    embedding_model: Mapped[str | None] = mapped_column(String(128), comment="向量模型")
    # Final number of chunks returned and the retrieval threshold.
    top_k: Mapped[int | None] = mapped_column(Integer, comment="最终返回片段数")
    score_threshold: Mapped[float | None] = mapped_column(Float, comment="检索阈值")
    # Per-stage and total latency; the _ms suffix suggests milliseconds (TODO confirm).
    embedding_latency_ms: Mapped[int | None] = mapped_column(Integer)
    search_latency_ms: Mapped[int | None] = mapped_column(Integer)
    llm_latency_ms: Mapped[int | None] = mapped_column(Integer)
    total_latency_ms: Mapped[int | None] = mapped_column(Integer)
    # Creation time, filled in on INSERT when no value is supplied.
    # datetime.utcnow() is deprecated since Python 3.12, so the default builds an aware
    # UTC time and strips tzinfo: the stored value stays naive UTC, exactly as before.
    created_at: Mapped[datetime] = mapped_column(
        DateTime,
        default=lambda: datetime.now(timezone.utc).replace(tzinfo=None),
        index=True,
    )
feat: add streaming learning assistant and knowledge base scaffolding 2026-06-10 09:32:36 +08:00			`from datetime import datetime`

			`from sqlalchemy import DateTime, Float, Integer, JSON, String, Text, UniqueConstraint`
			`from sqlalchemy.orm import Mapped, mapped_column`

			`from app.db.base import Base`
			`from app.models.mixins import TimestampMixin`


			`class KbKnowledgeSpace(TimestampMixin, Base):`
			`"""知识空间模型：记录机构与 Milvus collection、embedding 参数之间的映射。"""`

			`__tablename__ = "kb_spaces"`
			`__table_args__ = (UniqueConstraint("institution_id", "embedding_version", name="uq_kb_space_institution_version"),)`

			`id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)`
			`institution_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="机构ID")`
			`institution_name: Mapped[str \| None] = mapped_column(String(128), comment="机构名称")`
			`space_code: Mapped[str] = mapped_column(String(128), nullable=False, unique=True, comment="知识空间编码")`
			`collection_name: Mapped[str] = mapped_column(String(128), nullable=False, unique=True, comment="Milvus集合名")`
			`embedding_model: Mapped[str] = mapped_column(String(128), nullable=False, comment="向量模型")`
			`embedding_dim: Mapped[int] = mapped_column(Integer, nullable=False, comment="向量维度")`
			`embedding_version: Mapped[str] = mapped_column(String(32), nullable=False, default="v1", comment="向量版本")`
			`chunk_size: Mapped[int] = mapped_column(Integer, nullable=False, comment="分片长度")`
			`chunk_overlap: Mapped[int] = mapped_column(Integer, nullable=False, comment="分片重叠长度")`
			`top_k_default: Mapped[int] = mapped_column(Integer, nullable=False, default=5, comment="默认返回片段数")`
			`score_threshold: Mapped[float] = mapped_column(Float, nullable=False, default=0.35, comment="默认相似度阈值")`
			`status: Mapped[str] = mapped_column(String(32), nullable=False, default="active", index=True, comment="状态")`


			`class KbKnowledgeDocument(TimestampMixin, Base):`
			`"""知识文档模型：记录内容管理员上传的 PDF 及其处理状态。"""`

			`__tablename__ = "kb_documents"`
			`__table_args__ = (UniqueConstraint("institution_id", "file_sha256", name="uq_kb_document_institution_sha"),)`

			`id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)`
			`institution_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="机构ID")`
			`uploaded_by: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="上传用户ID")`
			`file_name: Mapped[str] = mapped_column(String(255), nullable=False, comment="原始文件名")`
			`file_sha256: Mapped[str] = mapped_column(String(64), nullable=False, index=True, comment="文件SHA256")`
			`file_type: Mapped[str] = mapped_column(String(32), nullable=False, default="pdf", comment="文件类型")`
			`file_size: Mapped[int] = mapped_column(Integer, nullable=False, comment="文件大小")`
			`file_path: Mapped[str] = mapped_column(String(512), nullable=False, comment="文件保存路径")`
			`document_title: Mapped[str \| None] = mapped_column(String(255), comment="文档标题")`
			`document_category: Mapped[str] = mapped_column(String(64), nullable=False, default="other", comment="文档分类")`
			`version: Mapped[str] = mapped_column(String(32), nullable=False, default="v1", comment="文档版本")`
			`status: Mapped[str] = mapped_column(String(32), nullable=False, default="uploaded", index=True, comment="处理状态")`
			`parse_status: Mapped[str] = mapped_column(String(32), nullable=False, default="pending", comment="解析状态")`
			`embedding_status: Mapped[str] = mapped_column(String(32), nullable=False, default="pending", comment="向量状态")`
			`chunk_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0, comment="分片数量")`
			`error_message: Mapped[str \| None] = mapped_column(Text, comment="错误信息")`


			`class KbKnowledgeChunk(Base):`
			`"""知识分片模型：保存 PDF 分片文本、页码和 Milvus 向量 ID 元数据。"""`

			`__tablename__ = "kb_chunks"`
			`__table_args__ = (UniqueConstraint("chunk_uid", name="uq_kb_chunk_uid"),)`

			`id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)`
			`institution_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="机构ID")`
			`document_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="文档ID")`
			`chunk_uid: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="分片唯一ID")`
			`chunk_index: Mapped[int] = mapped_column(Integer, nullable=False, comment="分片序号")`
			`page_start: Mapped[int] = mapped_column(Integer, nullable=False, comment="起始页")`
			`page_end: Mapped[int] = mapped_column(Integer, nullable=False, comment="结束页")`
			`section_title: Mapped[str \| None] = mapped_column(String(255), comment="章节标题")`
			`chunk_text: Mapped[str] = mapped_column(Text, nullable=False, comment="分片文本")`
			`chunk_hash: Mapped[str] = mapped_column(String(64), nullable=False, index=True, comment="分片Hash")`
			`token_count: Mapped[int] = mapped_column(Integer, nullable=False, default=0, comment="估算token数")`
			`vector_id: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="Milvus向量ID")`
			`collection_name: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="Milvus集合名")`
			`embedding_model: Mapped[str] = mapped_column(String(128), nullable=False, comment="向量模型")`
			`metadata_: Mapped[dict \| None] = mapped_column("metadata", JSON, comment="扩展元数据")`
			`created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, index=True)`


			`class KbKnowledgeIngestionTask(TimestampMixin, Base):`
			`"""知识入库任务模型：记录 PDF 解析、分片、向量化和入 Milvus 的异步进度。"""`

			`__tablename__ = "kb_ingestion_tasks"`

			`id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)`
			`document_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="文档ID")`
			`institution_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="机构ID")`
			`task_type: Mapped[str] = mapped_column(String(64), nullable=False, default="document_ingestion", comment="任务类型")`
			`status: Mapped[str] = mapped_column(String(32), nullable=False, default="queued", index=True, comment="任务状态")`
			`progress: Mapped[int] = mapped_column(Integer, nullable=False, default=0, comment="进度百分比")`
			`current_step: Mapped[str \| None] = mapped_column(String(255), comment="当前步骤")`
			`error_message: Mapped[str \| None] = mapped_column(Text, comment="错误信息")`
			`started_at: Mapped[datetime \| None] = mapped_column(DateTime)`
			`finished_at: Mapped[datetime \| None] = mapped_column(DateTime)`


			`class KbKnowledgeQueryLog(Base):`
			`"""学习助手问答日志：记录 RAG 检索命中、来源和耗时，用于审计和效果分析。"""`

			`__tablename__ = "kb_query_logs"`

			`id: Mapped[int] = mapped_column(Integer, primary_key=True, autoincrement=True)`
			`user_id: Mapped[str] = mapped_column(String(128), nullable=False, index=True, comment="用户ID")`
			`institution_id: Mapped[int] = mapped_column(Integer, nullable=False, index=True, comment="机构ID")`
			`question: Mapped[str] = mapped_column(Text, nullable=False, comment="用户问题")`
			`retrieval_hit: Mapped[bool] = mapped_column(Integer, nullable=False, default=0, comment="是否命中知识库")`
			`retrieved_chunk_ids: Mapped[list \| None] = mapped_column(JSON, comment="命中分片ID")`
			`answer_summary: Mapped[str \| None] = mapped_column(Text, comment="回答摘要")`
			`llm_model: Mapped[str \| None] = mapped_column(String(128), comment="LLM模型")`
			`embedding_model: Mapped[str \| None] = mapped_column(String(128), comment="向量模型")`
			`top_k: Mapped[int \| None] = mapped_column(Integer, comment="最终返回片段数")`
			`score_threshold: Mapped[float \| None] = mapped_column(Float, comment="检索阈值")`
			`embedding_latency_ms: Mapped[int \| None] = mapped_column(Integer)`
			`search_latency_ms: Mapped[int \| None] = mapped_column(Integer)`
			`llm_latency_ms: Mapped[int \| None] = mapped_column(Integer)`
			`total_latency_ms: Mapped[int \| None] = mapped_column(Integer)`
			`created_at: Mapped[datetime] = mapped_column(DateTime, default=datetime.utcnow, index=True)`