Files
rag/update.sql
2026-04-03 09:16:53 +08:00

162 lines
10 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-----------张斌2025-06-16 15:00:00--------------
--------------------pgsql创建rag_knowledge_dataset表语句---------------------------
-- 数据集表RAG场景专用
CREATE TABLE IF NOT EXISTS rag_knowledge_dataset (
-- 基础字段(继承 SQLBaseCol 通用字段,与 SQLBaseDO 对齐)
id BIGINT PRIMARY KEY, -- 主键ID非自增
tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8类型
creator VARCHAR(64) NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updater VARCHAR(64) NOT NULL,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
deleted_at timestamp(6),
-- 数据集核心字段(调整为允许为空)
name VARCHAR(128) NOT NULL, -- 数据集名称(核心字段仍非空)
description TEXT DEFAULT '', -- 数据集描述(长文本,适配详细描述场景)
embedding VARCHAR(64), -- 向量模型名称(允许为空)
dimension INT, -- 向量维度(允许为空)
document_count BIGINT, -- 文件数量int64 映射为 BIGINT允许为空
document_size BIGINT -- 文件大小(字节)int64 映射为 BIGINT允许为空
);
-- 索引针对RAG数据集高频查询优化
CREATE INDEX idx_dataset_tenant_id ON rag_knowledge_dataset(tenant_id); -- 租户ID索引
CREATE INDEX idx_dataset_name ON rag_knowledge_dataset(name); -- 数据集名称模糊/精准查询
CREATE INDEX idx_dataset_embedding ON rag_knowledge_dataset(embedding); -- 按向量模型筛选允许空值索引自动忽略NULL
CREATE INDEX idx_dataset_deleted_at ON rag_knowledge_dataset(deleted_at); -- 软删字段索引
-- 唯一索引(保证数据集称唯一性,避免重复创建)
CREATE UNIQUE INDEX uk_dataset_name ON rag_knowledge_dataset(name) WHERE deleted_at IS NULL;
-- 表和字段注释
COMMENT ON TABLE rag_knowledge_dataset IS '数据集表RAG场景专用';
COMMENT ON COLUMN rag_knowledge_dataset.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_dataset.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_dataset.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_dataset.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_dataset.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_dataset.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_dataset.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_dataset.name IS '数据集名称';
COMMENT ON COLUMN rag_knowledge_dataset.description IS '数据集描述';
COMMENT ON COLUMN rag_knowledge_dataset.embedding IS '向量模型名称如text-embedding-ada-002允许为空';
COMMENT ON COLUMN rag_knowledge_dataset.dimension IS '向量维度对应embedding模型的输出维度允许为空';
COMMENT ON COLUMN rag_knowledge_dataset.document_count IS '数据集内文件数量(允许为空)';
COMMENT ON COLUMN rag_knowledge_dataset.document_size IS '数据集内文件总大小(字节,允许为空)';
--------------------pgsql创建rag_knowledge_dataset表语句---------------------------
--------------------pgsql创建rag_knowledge_document表语句---------------------------
-- RAG文件表存储原始文件及切分相关信息关联数据集
CREATE TABLE IF NOT EXISTS rag_knowledge_document (
-- 基础字段(继承 SQLBaseCol 通用字段)
id BIGINT PRIMARY KEY, -- 主键ID非自增
tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8类型
creator VARCHAR(64) NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updater VARCHAR(64) NOT NULL,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
deleted_at timestamp(6),
-- 核心关联字段
dataset_id BIGINT NOT NULL, -- 关联数据集ID新增非空
-- 文件核心字段
title VARCHAR(256) NOT NULL, -- 文件标题
content TEXT, -- 文件内容(长文本,允许为空:大文件内容可仅存路径,不存原文)
format VARCHAR(16) DEFAULT '', -- 文件格式: txt, md, pdf, docx, html
source VARCHAR(64) DEFAULT '', -- 来源(如:手动上传/爬虫/API导入
source_id VARCHAR(64) DEFAULT '', -- 来源ID爬虫任务ID/上传批次ID
status SMALLINT NOT NULL DEFAULT 1, -- 状态1启用/0停用
vector_status SMALLINT NOT NULL DEFAULT 1, -- 向量化状态: 1pending, 2processing, 3completed, 4failed,5partCompleted
chunk_count BIGINT, -- 切分后的块数量int64映射为BIGINT允许为空
file_size BIGINT, -- 文件大小(字节)int64映射为BIGINT允许为空
file_path VARCHAR(512) DEFAULT '', -- 文件存储路径如MinIO路径
metadata JSONB DEFAULT '{}'::JSONB -- 额外元数据嵌套Metadata结构体JSONB存储
);
-- 单独添加外键约束(避免表定义内写约束导致的语法兼容问题)
-- 注意:执行前确保 rag_knowledge_dataset 表已存在,否则注释此行
ALTER TABLE rag_knowledge_document ADD CONSTRAINT fk_document_dataset_id FOREIGN KEY (dataset_id) REFERENCES rag_knowledge_dataset(id) ON DELETE CASCADE;
-- 索引针对RAG文件高频查询+数据集关联优化)
CREATE INDEX idx_document_tenant_id ON rag_knowledge_document(tenant_id); -- 租户ID索引
CREATE INDEX idx_document_dataset_id ON rag_knowledge_document(dataset_id); -- 数据集关联查询(核心索引)
CREATE INDEX idx_document_title ON rag_knowledge_document(title); -- 标题模糊查询
CREATE INDEX idx_document_format ON rag_knowledge_document(format); -- 按文件格式筛选
CREATE INDEX idx_document_status ON rag_knowledge_document(status); -- 启用/停用筛选
CREATE INDEX idx_document_vector_status ON rag_knowledge_document(vector_status); -- 向量化状态筛选(核心:监控处理中/失败文件)
CREATE INDEX idx_document_source ON rag_knowledge_document(source, source_id); -- 来源+来源ID组合查询溯源场景
CREATE INDEX idx_document_deleted_at ON rag_knowledge_document(deleted_at); -- 软删字段索引
-- 表和字段注释
COMMENT ON TABLE rag_knowledge_document IS 'RAG文件表存储原始文件及切分、元数据相关信息关联数据集';
COMMENT ON COLUMN rag_knowledge_document.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_document.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_document.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_document.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_document.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_document.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_document.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_document.dataset_id IS '关联数据集ID';
COMMENT ON COLUMN rag_knowledge_document.title IS '文件标题';
COMMENT ON COLUMN rag_knowledge_document.content IS '文件内容(大文件建议仅存路径,不存储原文)';
COMMENT ON COLUMN rag_knowledge_document.format IS '文件格式txt/md/pdf/docx/html等';
COMMENT ON COLUMN rag_knowledge_document.source IS '文件来源(手动上传/爬虫/API导入等';
COMMENT ON COLUMN rag_knowledge_document.source_id IS '来源ID溯源标识';
COMMENT ON COLUMN rag_knowledge_document.status IS '文件状态1启用/0停用';
COMMENT ON COLUMN rag_knowledge_document.vector_status IS '向量化状状态1pending-待处理/2processing-处理中/3completed-完成/4failed-失败/5partCompleted';
COMMENT ON COLUMN rag_knowledge_document.chunk_count IS '文件切分后的块数量int64类型未切分时为空';
COMMENT ON COLUMN rag_knowledge_document.file_size IS '文件大小字节int64类型允许为空';
COMMENT ON COLUMN rag_knowledge_document.file_path IS '文件存储路径如MinIO对象存储路径';
COMMENT ON COLUMN rag_knowledge_document.metadata IS '文件元数据,结构:{"author":"作者","tags":["标签1","标签2"],"custom":{"key":"值"}}';
--------------------pgsql创建rag_knowledge_document表语句---------------------------
--------------------pgsql创建rag_knowledge_keyword表语句---------------------------
-- 关键词表(文档关键词+权重)
CREATE TABLE IF NOT EXISTS rag_knowledge_keyword (
-- 基础字段(完全对齐项目规范)
id BIGINT PRIMARY KEY, -- 主键ID非自增
tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8
creator VARCHAR(64) NOT NULL,
created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
updater VARCHAR(64) NOT NULL,
updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP,
deleted_at timestamp(6),
-- 业务字段
dataset_id BIGINT NOT NULL, -- 数据集ID
document_id BIGINT NOT NULL, -- 文件ID
word VARCHAR(255) NOT NULL, -- 关键词
weight SMALLINT NOT NULL DEFAULT 0 -- 权重
);
-- 唯一索引:保证 租户 + 数据集 + 文档 + 关键词 全局唯一
CREATE UNIQUE INDEX uk_rag_knowledge_keyword_tenant_dataset_doc_word
ON rag_knowledge_keyword(tenant_id, dataset_id, document_id, word)
WHERE deleted_at IS NULL;
-- 索引(按业务高频查询)
CREATE INDEX idx_keyword_tenant_id ON rag_knowledge_keyword(tenant_id);
CREATE INDEX idx_keyword_dataset_id ON rag_knowledge_keyword(dataset_id);
CREATE INDEX idx_keyword_document_id ON rag_knowledge_keyword(document_id);
CREATE INDEX idx_keyword_word ON rag_knowledge_keyword(word);
CREATE INDEX idx_keyword_deleted_at ON rag_knowledge_keyword(deleted_at);
-- 表和字段注释
COMMENT ON TABLE rag_knowledge_keyword IS 'RAG关键词表文档关键词+权重)';
COMMENT ON COLUMN rag_knowledge_keyword.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_keyword.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_keyword.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_keyword.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_keyword.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_keyword.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_keyword.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_keyword.dataset_id IS '数据集ID';
COMMENT ON COLUMN rag_knowledge_keyword.document_id IS '文档ID';
COMMENT ON COLUMN rag_knowledge_keyword.word IS '关键词';
COMMENT ON COLUMN rag_knowledge_keyword.weight IS '权重';
--------------------pgsql创建rag_knowledge_keyword表语句---------------------------