Files
rag/update.sql

364 lines
20 KiB
SQL
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
-----------张斌2025-06-16 15:00:00--------------
--------------------pgsql创建rag_knowledge_dataset表语句---------------------------
-- rag_knowledge_dataset: one row per RAG dataset.
-- Every CREATE statement in this section uses IF NOT EXISTS so the script can be
-- re-run against a database where these objects already exist.
CREATE TABLE IF NOT EXISTS rag_knowledge_dataset (
    -- Base audit columns (original note: inherited from SQLBaseCol, aligned with SQLBaseDO)
    id             BIGINT       PRIMARY KEY,                          -- primary key, not auto-incremented
    tenant_id      BIGINT       NOT NULL DEFAULT 0,                   -- tenant ID (int8)
    creator        VARCHAR(64)  NOT NULL,
    created_at     TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updater        VARCHAR(64)  NOT NULL,
    updated_at     TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at     TIMESTAMP(6),                                      -- soft-delete timestamp; NULL while the row is live
    -- Dataset columns (everything except name is nullable)
    name           VARCHAR(128) NOT NULL,                             -- dataset name
    description    TEXT         DEFAULT '',                           -- free-form description
    embedding      VARCHAR(64),                                       -- embedding model name
    dimension      INT,                                               -- embedding vector dimension
    document_count BIGINT,                                            -- number of files (int64 on the application side)
    document_size  BIGINT                                             -- total file size in bytes (int64 on the application side)
);

-- Secondary indexes for the common filters.
CREATE INDEX IF NOT EXISTS idx_dataset_tenant_id
    ON rag_knowledge_dataset (tenant_id);
-- Covers name lookups over all rows, including soft-deleted ones
-- (the unique index below only contains live rows).
CREATE INDEX IF NOT EXISTS idx_dataset_name
    ON rag_knowledge_dataset (name);
CREATE INDEX IF NOT EXISTS idx_dataset_embedding
    ON rag_knowledge_dataset (embedding);
CREATE INDEX IF NOT EXISTS idx_dataset_deleted_at
    ON rag_knowledge_dataset (deleted_at);

-- A dataset name must be unique among live (not soft-deleted) rows.
-- NOTE(review): uniqueness is on name alone, so two tenants cannot use the same
-- dataset name. Confirm this is intended rather than (tenant_id, name).
CREATE UNIQUE INDEX IF NOT EXISTS uk_dataset_name
    ON rag_knowledge_dataset (name)
    WHERE deleted_at IS NULL;

-- Table and column comments (stored in the catalog; COMMENT ON is re-runnable as is).
COMMENT ON TABLE rag_knowledge_dataset IS '数据集表RAG场景专用';
COMMENT ON COLUMN rag_knowledge_dataset.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_dataset.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_dataset.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_dataset.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_dataset.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_dataset.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_dataset.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_dataset.name IS '数据集名称';
COMMENT ON COLUMN rag_knowledge_dataset.description IS '数据集描述';
COMMENT ON COLUMN rag_knowledge_dataset.embedding IS '向量模型名称如text-embedding-ada-002允许为空';
COMMENT ON COLUMN rag_knowledge_dataset.dimension IS '向量维度对应embedding模型的输出维度允许为空';
COMMENT ON COLUMN rag_knowledge_dataset.document_count IS '数据集内文件数量(允许为空)';
COMMENT ON COLUMN rag_knowledge_dataset.document_size IS '数据集内文件总大小(字节,允许为空)';
--------------------pgsql创建rag_knowledge_dataset表语句---------------------------
--------------------pgsql创建rag_knowledge_document表语句---------------------------
-- rag_knowledge_document: source files of a dataset plus their chunking and
-- vectorisation bookkeeping. Each row belongs to one rag_knowledge_dataset row.
-- Every statement in this section is guarded so the script can be re-run.
CREATE TABLE IF NOT EXISTS rag_knowledge_document (
    -- Base audit columns (original note: inherited from SQLBaseCol)
    id            BIGINT       PRIMARY KEY,                          -- primary key, not auto-incremented
    tenant_id     BIGINT       NOT NULL DEFAULT 0,                   -- tenant ID (int8)
    creator       VARCHAR(64)  NOT NULL,
    created_at    TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updater       VARCHAR(64)  NOT NULL,
    updated_at    TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at    TIMESTAMP(6),                                      -- soft-delete timestamp
    -- Owning dataset
    dataset_id    BIGINT       NOT NULL,                             -- references rag_knowledge_dataset(id)
    -- File columns
    title         VARCHAR(256) NOT NULL,                             -- file title
    content       TEXT,                                              -- file body; nullable so large files can store only file_path
    format        VARCHAR(16)  DEFAULT '',                           -- file format: txt, md, pdf, docx, html
    source        VARCHAR(64)  DEFAULT '',                           -- origin, e.g. manual upload / crawler / API import
    source_id     VARCHAR(64)  DEFAULT '',                           -- origin ID, e.g. crawler task ID / upload batch ID
    status        SMALLINT     NOT NULL DEFAULT 1,                   -- 1 = enabled, 0 = disabled
    vector_status SMALLINT     NOT NULL DEFAULT 1,                   -- 1 pending, 2 processing, 3 completed, 4 failed, 5 partCompleted
    chunk_count   BIGINT,                                            -- number of chunks after splitting (nullable)
    file_size     BIGINT,                                            -- file size in bytes (nullable)
    file_path     VARCHAR(512) DEFAULT '',                           -- storage path, e.g. a MinIO object path
    metadata      JSONB        DEFAULT '{}'::JSONB                   -- extra metadata stored as a JSONB document
);

-- Foreign key to the owning dataset, added separately from the table definition.
-- rag_knowledge_dataset must already exist when this runs.
-- PostgreSQL has no ADD CONSTRAINT IF NOT EXISTS, so the catalog is checked first
-- to keep the script re-runnable.
-- ON DELETE CASCADE only fires on a physical DELETE of the dataset row; setting
-- deleted_at on the dataset does not touch its documents.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'fk_document_dataset_id'
          AND conrelid = 'rag_knowledge_document'::regclass
    ) THEN
        ALTER TABLE rag_knowledge_document
            ADD CONSTRAINT fk_document_dataset_id
            FOREIGN KEY (dataset_id)
            REFERENCES rag_knowledge_dataset (id)
            ON DELETE CASCADE;
    END IF;
END
$$;

-- Secondary indexes for the common filters and the dataset join.
CREATE INDEX IF NOT EXISTS idx_document_tenant_id
    ON rag_knowledge_document (tenant_id);
CREATE INDEX IF NOT EXISTS idx_document_dataset_id
    ON rag_knowledge_document (dataset_id);
CREATE INDEX IF NOT EXISTS idx_document_title
    ON rag_knowledge_document (title);
CREATE INDEX IF NOT EXISTS idx_document_format
    ON rag_knowledge_document (format);
CREATE INDEX IF NOT EXISTS idx_document_status
    ON rag_knowledge_document (status);
-- Used to find files that are still processing or have failed.
CREATE INDEX IF NOT EXISTS idx_document_vector_status
    ON rag_knowledge_document (vector_status);
-- Composite index for tracing a file back to its origin.
CREATE INDEX IF NOT EXISTS idx_document_source
    ON rag_knowledge_document (source, source_id);
CREATE INDEX IF NOT EXISTS idx_document_deleted_at
    ON rag_knowledge_document (deleted_at);

-- Table and column comments (stored in the catalog; COMMENT ON is re-runnable as is).
COMMENT ON TABLE rag_knowledge_document IS 'RAG文件表存储原始文件及切分、元数据相关信息关联数据集';
COMMENT ON COLUMN rag_knowledge_document.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_document.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_document.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_document.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_document.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_document.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_document.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_document.dataset_id IS '关联数据集ID';
COMMENT ON COLUMN rag_knowledge_document.title IS '文件标题';
COMMENT ON COLUMN rag_knowledge_document.content IS '文件内容(大文件建议仅存路径,不存储原文)';
COMMENT ON COLUMN rag_knowledge_document.format IS '文件格式txt/md/pdf/docx/html等';
COMMENT ON COLUMN rag_knowledge_document.source IS '文件来源(手动上传/爬虫/API导入等';
COMMENT ON COLUMN rag_knowledge_document.source_id IS '来源ID溯源标识';
COMMENT ON COLUMN rag_knowledge_document.status IS '文件状态1启用/0停用';
COMMENT ON COLUMN rag_knowledge_document.vector_status IS '向量化状状态1pending-待处理/2processing-处理中/3completed-完成/4failed-失败/5partCompleted';
COMMENT ON COLUMN rag_knowledge_document.chunk_count IS '文件切分后的块数量int64类型未切分时为空';
COMMENT ON COLUMN rag_knowledge_document.file_size IS '文件大小字节int64类型允许为空';
COMMENT ON COLUMN rag_knowledge_document.file_path IS '文件存储路径如MinIO对象存储路径';
COMMENT ON COLUMN rag_knowledge_document.metadata IS '文件元数据,结构:{"author":"作者","tags":["标签1","标签2"],"custom":{"key":"值"}}';
--------------------pgsql创建rag_knowledge_document表语句---------------------------
--------------------pgsql创建rag_knowledge_keyword表语句---------------------------
-- rag_knowledge_keyword: keywords extracted per document, each with a weight.
-- Every CREATE statement in this section uses IF NOT EXISTS so the script can be
-- re-run against a database where these objects already exist.
CREATE TABLE IF NOT EXISTS rag_knowledge_keyword (
    -- Base audit columns (same layout as the other tables in this script)
    id          BIGINT       PRIMARY KEY,                          -- primary key, not auto-incremented
    tenant_id   BIGINT       NOT NULL DEFAULT 0,                   -- tenant ID (int8)
    creator     VARCHAR(64)  NOT NULL,
    created_at  TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updater     VARCHAR(64)  NOT NULL,
    updated_at  TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at  TIMESTAMP(6),                                      -- soft-delete timestamp
    -- Business columns
    dataset_id  BIGINT       NOT NULL,                             -- dataset ID
    document_id BIGINT       NOT NULL,                             -- document (file) ID
    word        VARCHAR(255) NOT NULL,                             -- keyword
    weight      SMALLINT     NOT NULL DEFAULT 0                    -- weight
);

-- Secondary indexes for the common filters.
CREATE INDEX IF NOT EXISTS idx_keyword_tenant_id
    ON rag_knowledge_keyword (tenant_id);
CREATE INDEX IF NOT EXISTS idx_keyword_dataset_id
    ON rag_knowledge_keyword (dataset_id);
CREATE INDEX IF NOT EXISTS idx_keyword_document_id
    ON rag_knowledge_keyword (document_id);
CREATE INDEX IF NOT EXISTS idx_keyword_word
    ON rag_knowledge_keyword (word);
CREATE INDEX IF NOT EXISTS idx_keyword_deleted_at
    ON rag_knowledge_keyword (deleted_at);

-- One row per (tenant, dataset, document, keyword).
-- An earlier draft restricted this index to live rows (WHERE deleted_at IS NULL)
-- but was disabled; the active index covers soft-deleted rows as well.
-- NOTE(review): a soft-deleted keyword therefore still blocks re-inserting the
-- same keyword. Confirm this is intended (e.g. because the index backs an upsert).
CREATE UNIQUE INDEX IF NOT EXISTS uk_rag_knowledge_keyword_tenant_dataset_doc_word
    ON rag_knowledge_keyword (tenant_id, dataset_id, document_id, word);

-- Table and column comments (stored in the catalog; COMMENT ON is re-runnable as is).
COMMENT ON TABLE rag_knowledge_keyword IS 'RAG关键词表文档关键词+权重)';
COMMENT ON COLUMN rag_knowledge_keyword.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_keyword.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_keyword.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_keyword.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_keyword.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_keyword.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_keyword.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_keyword.dataset_id IS '数据集ID';
COMMENT ON COLUMN rag_knowledge_keyword.document_id IS '文档ID';
COMMENT ON COLUMN rag_knowledge_keyword.word IS '关键词';
COMMENT ON COLUMN rag_knowledge_keyword.weight IS '权重';
--------------------pgsql创建rag_knowledge_keyword表语句---------------------------
--------------------pgsql创建rag_knowledge_task表语句---------------------------
-- rag_knowledge_task: knowledge-base task records.
-- Every CREATE statement in this section uses IF NOT EXISTS so the script can be
-- re-run against a database where these objects already exist.
CREATE TABLE IF NOT EXISTS rag_knowledge_task (
    -- Base audit columns (same layout as the other tables in this script)
    id         BIGINT       PRIMARY KEY,                          -- primary key, not auto-incremented
    tenant_id  BIGINT       NOT NULL DEFAULT 0,                   -- tenant ID (int8)
    creator    VARCHAR(64)  NOT NULL,
    created_at TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updater    VARCHAR(64)  NOT NULL,
    updated_at TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at TIMESTAMP(6),                                      -- soft-delete timestamp
    -- Business columns
    task_id    BIGINT       NOT NULL,                             -- task ID (separate from the row's primary key)
    task_type  VARCHAR(32)  NOT NULL,                             -- task type
    status     VARCHAR(32)  NOT NULL,                             -- task status (stored as text in this table)
    executor   VARCHAR(128) DEFAULT '',                           -- executor
    remark     TEXT         DEFAULT ''                            -- remark
);

-- Secondary indexes for the common filters.
CREATE INDEX IF NOT EXISTS idx_rkt_tenant_id
    ON rag_knowledge_task (tenant_id);
CREATE INDEX IF NOT EXISTS idx_rkt_task_id
    ON rag_knowledge_task (task_id);
CREATE INDEX IF NOT EXISTS idx_rkt_task_type
    ON rag_knowledge_task (task_type);
CREATE INDEX IF NOT EXISTS idx_rkt_status
    ON rag_knowledge_task (status);
CREATE INDEX IF NOT EXISTS idx_rkt_deleted_at
    ON rag_knowledge_task (deleted_at);

-- Table and column comments (stored in the catalog; COMMENT ON is re-runnable as is).
COMMENT ON TABLE rag_knowledge_task IS '知识库任务表';
COMMENT ON COLUMN rag_knowledge_task.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_task.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_task.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_task.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_task.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_task.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_task.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_task.task_id IS '任务ID';
COMMENT ON COLUMN rag_knowledge_task.task_type IS '任务类型';
COMMENT ON COLUMN rag_knowledge_task.status IS '任务状态';
COMMENT ON COLUMN rag_knowledge_task.executor IS '执行器';
COMMENT ON COLUMN rag_knowledge_task.remark IS '备注';
--------------------pgsql创建rag_knowledge_task表语句---------------------------
--------------------pgsql创建rag_knowledge_model表语句---------------------------
-- rag_knowledge_model: model configuration attached to a dataset.
-- Every CREATE statement in this section uses IF NOT EXISTS so the script can be
-- re-run against a database where these objects already exist.
CREATE TABLE IF NOT EXISTS rag_knowledge_model (
    -- Base audit columns (same layout as the other tables in this script)
    id           BIGINT       PRIMARY KEY,                          -- primary key, not auto-incremented
    tenant_id    BIGINT       NOT NULL DEFAULT 0,                   -- tenant ID (int8)
    creator      VARCHAR(64)  NOT NULL,
    created_at   TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updater      VARCHAR(64)  NOT NULL,
    updated_at   TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at   TIMESTAMP(6),                                      -- soft-delete timestamp
    -- Business columns
    dataset_id   BIGINT       NOT NULL,                             -- dataset ID (no foreign key is declared here)
    model_type   VARCHAR(32)  NOT NULL,                             -- model type
    model_name   VARCHAR(128) NOT NULL,                             -- model name
    model_desc   TEXT         DEFAULT '',                           -- model description
    model_config JSONB        DEFAULT '{}'::JSONB                   -- model configuration as a JSONB document
);

-- Secondary indexes for the common filters.
CREATE INDEX IF NOT EXISTS idx_rkm_tenant_id
    ON rag_knowledge_model (tenant_id);
CREATE INDEX IF NOT EXISTS idx_rkm_dataset_id
    ON rag_knowledge_model (dataset_id);
CREATE INDEX IF NOT EXISTS idx_rkm_model_type
    ON rag_knowledge_model (model_type);
CREATE INDEX IF NOT EXISTS idx_rkm_deleted_at
    ON rag_knowledge_model (deleted_at);

-- Table and column comments (stored in the catalog; COMMENT ON is re-runnable as is).
COMMENT ON TABLE rag_knowledge_model IS '知识库模型配置表';
COMMENT ON COLUMN rag_knowledge_model.id IS '主键ID非自增';
COMMENT ON COLUMN rag_knowledge_model.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_knowledge_model.creator IS '创建人';
COMMENT ON COLUMN rag_knowledge_model.created_at IS '创建时间';
COMMENT ON COLUMN rag_knowledge_model.updater IS '更新人';
COMMENT ON COLUMN rag_knowledge_model.updated_at IS '更新时间';
COMMENT ON COLUMN rag_knowledge_model.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_knowledge_model.dataset_id IS '数据集ID';
COMMENT ON COLUMN rag_knowledge_model.model_type IS '模型类型';
COMMENT ON COLUMN rag_knowledge_model.model_name IS '模型名称';
COMMENT ON COLUMN rag_knowledge_model.model_desc IS '模型描述';
COMMENT ON COLUMN rag_knowledge_model.model_config IS '模型配置(JSONB)';
--------------------pgsql创建rag_knowledge_model表语句---------------------------
--------------------pgsql创建rag_vector_dataset_index表语句---------------------------
-- rag_vector_dataset_index: vector index definitions per dataset.
-- Every statement in this section is guarded so the script can be re-run against
-- a database where these objects already exist.
CREATE TABLE IF NOT EXISTS rag_vector_dataset_index (
    -- Base audit columns (same layout as the other tables in this script)
    id           BIGINT       PRIMARY KEY,                          -- primary key, not auto-incremented
    tenant_id    BIGINT       NOT NULL DEFAULT 0,                   -- tenant ID (int8)
    creator      VARCHAR(64)  NOT NULL,
    created_at   TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updater      VARCHAR(64)  NOT NULL,
    updated_at   TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at   TIMESTAMP(6),                                      -- soft-delete timestamp
    -- Core columns (BIGINT is the same PostgreSQL type as the original INT8 spelling)
    dataset_id   BIGINT       NOT NULL,                             -- dataset ID
    name         VARCHAR(255) NOT NULL,                             -- index name
    collection   VARCHAR(255) NOT NULL,                             -- vector collection name
    dimension    INT          NOT NULL,                             -- vector dimension
    field_type   VARCHAR(50)  NOT NULL,                             -- field type
    metric_type  VARCHAR(50)  NOT NULL,                             -- metric type
    status       SMALLINT     NOT NULL DEFAULT 1,                   -- 1 = enabled, 0 = disabled
    vector_count BIGINT       NOT NULL DEFAULT 0,                   -- number of vectors
    description  TEXT                                               -- description
);

-- An index name is unique within a dataset.
-- PostgreSQL has no ADD CONSTRAINT IF NOT EXISTS, so the catalog is checked first;
-- this keeps uk_dataset_id_name a named constraint while making the step re-runnable.
-- The constraint is not restricted to live rows, so soft-deleted rows still hold
-- their (dataset_id, name) pair.
DO $$
BEGIN
    IF NOT EXISTS (
        SELECT 1
        FROM pg_constraint
        WHERE conname = 'uk_dataset_id_name'
          AND conrelid = 'rag_vector_dataset_index'::regclass
    ) THEN
        ALTER TABLE rag_vector_dataset_index
            ADD CONSTRAINT uk_dataset_id_name UNIQUE (dataset_id, name);
    END IF;
END
$$;

-- Secondary indexes for the common filters.
CREATE INDEX IF NOT EXISTS idx_dataset_index_tenant_id
    ON rag_vector_dataset_index (tenant_id);
CREATE INDEX IF NOT EXISTS idx_dataset_index_dataset_id
    ON rag_vector_dataset_index (dataset_id);
CREATE INDEX IF NOT EXISTS idx_dataset_index_status
    ON rag_vector_dataset_index (status);

-- Table and column comments (stored in the catalog; COMMENT ON is re-runnable as is).
COMMENT ON TABLE rag_vector_dataset_index IS '向量数据集索引表';
COMMENT ON COLUMN rag_vector_dataset_index.id IS '主键ID非自增';
COMMENT ON COLUMN rag_vector_dataset_index.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_vector_dataset_index.creator IS '创建人';
COMMENT ON COLUMN rag_vector_dataset_index.created_at IS '创建时间';
COMMENT ON COLUMN rag_vector_dataset_index.updater IS '更新人';
COMMENT ON COLUMN rag_vector_dataset_index.updated_at IS '更新时间';
COMMENT ON COLUMN rag_vector_dataset_index.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_vector_dataset_index.dataset_id IS '数据集ID';
COMMENT ON COLUMN rag_vector_dataset_index.name IS '索引名称';
COMMENT ON COLUMN rag_vector_dataset_index.collection IS '向量集合名称';
COMMENT ON COLUMN rag_vector_dataset_index.dimension IS '向量维度';
COMMENT ON COLUMN rag_vector_dataset_index.field_type IS '字段类型';
COMMENT ON COLUMN rag_vector_dataset_index.metric_type IS '度量类型';
COMMENT ON COLUMN rag_vector_dataset_index.status IS '状态';
COMMENT ON COLUMN rag_vector_dataset_index.vector_count IS '向量数量';
COMMENT ON COLUMN rag_vector_dataset_index.description IS '描述';
--------------------pgsql创建rag_vector_dataset_index表语句---------------------------
--------------------pgsql创建rag_vector_document_vector表语句---------------------------
-- The vector column type below is provided by the pgvector extension.
CREATE EXTENSION IF NOT EXISTS vector;

-- rag_vector_document_vector: one row per document chunk together with its embedding.
-- Every CREATE statement in this section uses IF NOT EXISTS so the script can be
-- re-run against a database where these objects already exist.
CREATE TABLE IF NOT EXISTS rag_vector_document_vector (
    -- Base audit columns (same layout as the other tables in this script)
    id            BIGINT       PRIMARY KEY,                          -- primary key, not auto-incremented
    tenant_id     BIGINT       NOT NULL DEFAULT 0,                   -- tenant ID (int8)
    creator       VARCHAR(64)  NOT NULL,
    created_at    TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    updater       VARCHAR(64)  NOT NULL,
    updated_at    TIMESTAMP    NOT NULL DEFAULT CURRENT_TIMESTAMP,
    deleted_at    TIMESTAMP(6),                                      -- soft-delete timestamp
    -- Core columns (BIGINT is the same PostgreSQL type as the original INT8 spelling)
    status        SMALLINT     NOT NULL DEFAULT 1,                   -- 1 = enabled, 0 = disabled
    vector_status SMALLINT     NOT NULL DEFAULT 1,                   -- 1 pending, 2 processing, 3 completed, 4 failed, 5 partCompleted
    dataset_id    BIGINT       NOT NULL,                             -- dataset ID
    document_id   BIGINT       NOT NULL,                             -- document ID
    content       TEXT         NOT NULL,                             -- chunk text
    content_hash  VARCHAR(128) NOT NULL,                             -- hash of the chunk text
    chunk_index   BIGINT       NOT NULL,                             -- position of the chunk within its document
    -- Embedding (pgvector).
    -- NOTE(review): the dimension is fixed at 1024 here, while the dataset and
    -- dataset-index tables carry a per-row dimension column. Confirm every
    -- embedding model in use produces 1024-dimensional vectors.
    vector        vector(1024) NOT NULL,
    -- Extension data
    metadata      JSONB                                              -- extra metadata (nullable, no default)
);

-- Secondary indexes for the common filters.
-- NOTE(review): no ivfflat/hnsw index on the vector column is created in this
-- section. Confirm one is created elsewhere if similarity search is run here.
CREATE INDEX IF NOT EXISTS idx_vector_tenant_id
    ON rag_vector_document_vector (tenant_id);
CREATE INDEX IF NOT EXISTS idx_vector_dataset_id
    ON rag_vector_document_vector (dataset_id);
CREATE INDEX IF NOT EXISTS idx_vector_document_id
    ON rag_vector_document_vector (document_id);
CREATE INDEX IF NOT EXISTS idx_vector_content_hash
    ON rag_vector_document_vector (content_hash);
CREATE INDEX IF NOT EXISTS idx_vector_status
    ON rag_vector_document_vector (status);
CREATE INDEX IF NOT EXISTS idx_vector_vector_status
    ON rag_vector_document_vector (vector_status);

-- Table and column comments (stored in the catalog; COMMENT ON is re-runnable as is).
COMMENT ON TABLE rag_vector_document_vector IS '文档分块向量表';
COMMENT ON COLUMN rag_vector_document_vector.id IS '主键ID非自增';
COMMENT ON COLUMN rag_vector_document_vector.tenant_id IS '租户ID';
COMMENT ON COLUMN rag_vector_document_vector.creator IS '创建人';
COMMENT ON COLUMN rag_vector_document_vector.created_at IS '创建时间';
COMMENT ON COLUMN rag_vector_document_vector.updater IS '更新人';
COMMENT ON COLUMN rag_vector_document_vector.updated_at IS '更新时间';
COMMENT ON COLUMN rag_vector_document_vector.deleted_at IS '删除时间(软删)';
COMMENT ON COLUMN rag_vector_document_vector.status IS '状态';
COMMENT ON COLUMN rag_vector_document_vector.vector_status IS '向量生成状态';
COMMENT ON COLUMN rag_vector_document_vector.dataset_id IS '数据集ID';
COMMENT ON COLUMN rag_vector_document_vector.document_id IS '文档ID';
COMMENT ON COLUMN rag_vector_document_vector.content IS '分块内容';
COMMENT ON COLUMN rag_vector_document_vector.content_hash IS '内容哈希';
COMMENT ON COLUMN rag_vector_document_vector.chunk_index IS '分块序号';
COMMENT ON COLUMN rag_vector_document_vector.vector IS '向量数据';
COMMENT ON COLUMN rag_vector_document_vector.metadata IS '扩展元数据';
--------------------pgsql创建rag_vector_document_vector表语句---------------------------