-----------张斌2025-06-16 15:00:00-------------- --------------------pgsql创建rag_knowledge_dataset表语句--------------------------- -- 数据集表(RAG场景专用) CREATE TABLE IF NOT EXISTS rag_knowledge_dataset ( -- 基础字段(继承 SQLBaseCol 通用字段,与 SQLBaseDO 对齐) id BIGINT PRIMARY KEY, -- 主键ID(非自增) tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8类型 creator VARCHAR(64) NOT NULL, created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updater VARCHAR(64) NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, deleted_at timestamp(6), -- 数据集核心字段(调整为允许为空) name VARCHAR(128) NOT NULL, -- 数据集名称(核心字段仍非空) description TEXT DEFAULT '', -- 数据集描述(长文本,适配详细描述场景) embedding VARCHAR(64), -- 向量模型名称(允许为空) dimension INT, -- 向量维度(允许为空) document_count BIGINT, -- 文件数量(int64 映射为 BIGINT,允许为空) document_size BIGINT -- 文件大小(字节)(int64 映射为 BIGINT,允许为空) ); -- 索引(针对RAG数据集高频查询优化) CREATE INDEX idx_dataset_tenant_id ON rag_knowledge_dataset(tenant_id); -- 租户ID索引 CREATE INDEX idx_dataset_name ON rag_knowledge_dataset(name); -- 数据集名称模糊/精准查询 CREATE INDEX idx_dataset_embedding ON rag_knowledge_dataset(embedding); -- 按向量模型筛选(允许空值,索引自动忽略NULL) CREATE INDEX idx_dataset_deleted_at ON rag_knowledge_dataset(deleted_at); -- 软删字段索引 -- 唯一索引(保证数据集称唯一性,避免重复创建) CREATE UNIQUE INDEX uk_dataset_name ON rag_knowledge_dataset(name) WHERE deleted_at IS NULL; -- 表和字段注释 COMMENT ON TABLE rag_knowledge_dataset IS '数据集表(RAG场景专用)'; COMMENT ON COLUMN rag_knowledge_dataset.id IS '主键ID(非自增)'; COMMENT ON COLUMN rag_knowledge_dataset.tenant_id IS '租户ID'; COMMENT ON COLUMN rag_knowledge_dataset.creator IS '创建人'; COMMENT ON COLUMN rag_knowledge_dataset.created_at IS '创建时间'; COMMENT ON COLUMN rag_knowledge_dataset.updater IS '更新人'; COMMENT ON COLUMN rag_knowledge_dataset.updated_at IS '更新时间'; COMMENT ON COLUMN rag_knowledge_dataset.deleted_at IS '删除时间(软删)'; COMMENT ON COLUMN rag_knowledge_dataset.name IS '数据集名称'; COMMENT ON COLUMN rag_knowledge_dataset.description IS '数据集描述'; COMMENT ON COLUMN rag_knowledge_dataset.embedding IS '向量模型名称(如text-embedding-ada-002,允许为空)'; COMMENT ON COLUMN rag_knowledge_dataset.dimension IS '向量维度(对应embedding模型的输出维度,允许为空)'; COMMENT ON COLUMN rag_knowledge_dataset.document_count IS '数据集内文件数量(允许为空)'; COMMENT ON COLUMN rag_knowledge_dataset.document_size IS '数据集内文件总大小(字节,允许为空)'; --------------------pgsql创建rag_knowledge_dataset表语句--------------------------- --------------------pgsql创建rag_knowledge_document表语句--------------------------- -- RAG文件表(存储原始文件及切分相关信息,关联数据集) CREATE TABLE IF NOT EXISTS rag_knowledge_document ( -- 基础字段(继承 SQLBaseCol 通用字段) id BIGINT PRIMARY KEY, -- 主键ID(非自增) tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8类型 creator VARCHAR(64) NOT NULL, created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updater VARCHAR(64) NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, deleted_at timestamp(6), -- 核心关联字段 dataset_id BIGINT NOT NULL, -- 关联数据集ID(新增,非空) -- 文件核心字段 title VARCHAR(256) NOT NULL, -- 文件标题 content TEXT, -- 文件内容(长文本,允许为空:大文件内容可仅存路径,不存原文) format VARCHAR(16) DEFAULT '', -- 文件格式: txt, md, pdf, docx, html source VARCHAR(64) DEFAULT '', -- 来源(如:手动上传/爬虫/API导入) source_id VARCHAR(64) DEFAULT '', -- 来源ID(如:爬虫任务ID/上传批次ID) status SMALLINT NOT NULL DEFAULT 1, -- 状态:1启用/0停用 vector_status SMALLINT NOT NULL DEFAULT 1, -- 向量化状态: 1pending, 2processing, 3completed, 4failed,5partCompleted chunk_count BIGINT, -- 切分后的块数量(int64映射为BIGINT,允许为空) file_size BIGINT, -- 文件大小(字节)(int64映射为BIGINT,允许为空) file_path VARCHAR(512) DEFAULT '', -- 文件存储路径(如MinIO路径) metadata JSONB DEFAULT '{}'::JSONB -- 额外元数据(嵌套Metadata结构体,JSONB存储) ); -- 单独添加外键约束(避免表定义内写约束导致的语法兼容问题) -- 注意:执行前确保 rag_knowledge_dataset 表已存在,否则注释此行 ALTER TABLE rag_knowledge_document ADD CONSTRAINT fk_document_dataset_id FOREIGN KEY (dataset_id) REFERENCES rag_knowledge_dataset(id) ON DELETE CASCADE; -- 索引(针对RAG文件高频查询+数据集关联优化) CREATE INDEX idx_document_tenant_id ON rag_knowledge_document(tenant_id); -- 租户ID索引 CREATE INDEX idx_document_dataset_id ON rag_knowledge_document(dataset_id); -- 数据集关联查询(核心索引) CREATE INDEX idx_document_title ON rag_knowledge_document(title); -- 标题模糊查询 CREATE INDEX idx_document_format ON rag_knowledge_document(format); -- 按文件格式筛选 CREATE INDEX idx_document_status ON rag_knowledge_document(status); -- 启用/停用筛选 CREATE INDEX idx_document_vector_status ON rag_knowledge_document(vector_status); -- 向量化状态筛选(核心:监控处理中/失败文件) CREATE INDEX idx_document_source ON rag_knowledge_document(source, source_id); -- 来源+来源ID组合查询(溯源场景) CREATE INDEX idx_document_deleted_at ON rag_knowledge_document(deleted_at); -- 软删字段索引 -- 表和字段注释 COMMENT ON TABLE rag_knowledge_document IS 'RAG文件表(存储原始文件及切分、元数据相关信息,关联数据集)'; COMMENT ON COLUMN rag_knowledge_document.id IS '主键ID(非自增)'; COMMENT ON COLUMN rag_knowledge_document.tenant_id IS '租户ID'; COMMENT ON COLUMN rag_knowledge_document.creator IS '创建人'; COMMENT ON COLUMN rag_knowledge_document.created_at IS '创建时间'; COMMENT ON COLUMN rag_knowledge_document.updater IS '更新人'; COMMENT ON COLUMN rag_knowledge_document.updated_at IS '更新时间'; COMMENT ON COLUMN rag_knowledge_document.deleted_at IS '删除时间(软删)'; COMMENT ON COLUMN rag_knowledge_document.dataset_id IS '关联数据集ID'; COMMENT ON COLUMN rag_knowledge_document.title IS '文件标题'; COMMENT ON COLUMN rag_knowledge_document.content IS '文件内容(大文件建议仅存路径,不存储原文)'; COMMENT ON COLUMN rag_knowledge_document.format IS '文件格式:txt/md/pdf/docx/html等'; COMMENT ON COLUMN rag_knowledge_document.source IS '文件来源(手动上传/爬虫/API导入等)'; COMMENT ON COLUMN rag_knowledge_document.source_id IS '来源ID(溯源标识)'; COMMENT ON COLUMN rag_knowledge_document.status IS '文件状态:1启用/0停用'; COMMENT ON COLUMN rag_knowledge_document.vector_status IS '向量化状状态:1pending-待处理/2processing-处理中/3completed-完成/4failed-失败/5partCompleted'; COMMENT ON COLUMN rag_knowledge_document.chunk_count IS '文件切分后的块数量(int64类型,未切分时为空)'; COMMENT ON COLUMN rag_knowledge_document.file_size IS '文件大小(字节,int64类型,允许为空)'; COMMENT ON COLUMN rag_knowledge_document.file_path IS '文件存储路径(如MinIO对象存储路径)'; COMMENT ON COLUMN rag_knowledge_document.metadata IS '文件元数据,结构:{"author":"作者","tags":["标签1","标签2"],"custom":{"key":"值"}}'; --------------------pgsql创建rag_knowledge_document表语句--------------------------- --------------------pgsql创建rag_knowledge_keyword表语句--------------------------- -- 关键词表(文档关键词+权重) CREATE TABLE IF NOT EXISTS rag_knowledge_keyword ( -- 基础字段(完全对齐项目规范) id BIGINT PRIMARY KEY, -- 主键ID(非自增) tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8 creator VARCHAR(64) NOT NULL, created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updater VARCHAR(64) NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, deleted_at timestamp(6), -- 业务字段 dataset_id BIGINT NOT NULL, -- 数据集ID document_id BIGINT NOT NULL, -- 文件ID word VARCHAR(255) NOT NULL, -- 关键词 weight SMALLINT NOT NULL DEFAULT 0 -- 权重 ); -- 唯一索引:保证 租户 + 数据集 + 文档 + 关键词 全局唯一 -- CREATE UNIQUE INDEX uk_rag_knowledge_keyword_tenant_dataset_doc_word -- ON rag_knowledge_keyword(tenant_id, dataset_id, document_id, word) -- WHERE deleted_at IS NULL; -- 索引(按业务高频查询) CREATE INDEX idx_keyword_tenant_id ON rag_knowledge_keyword(tenant_id); CREATE INDEX idx_keyword_dataset_id ON rag_knowledge_keyword(dataset_id); CREATE INDEX idx_keyword_document_id ON rag_knowledge_keyword(document_id); CREATE INDEX idx_keyword_word ON rag_knowledge_keyword(word); CREATE INDEX idx_keyword_deleted_at ON rag_knowledge_keyword(deleted_at); -- 表和字段注释 COMMENT ON TABLE rag_knowledge_keyword IS 'RAG关键词表(文档关键词+权重)'; COMMENT ON COLUMN rag_knowledge_keyword.id IS '主键ID(非自增)'; COMMENT ON COLUMN rag_knowledge_keyword.tenant_id IS '租户ID'; COMMENT ON COLUMN rag_knowledge_keyword.creator IS '创建人'; COMMENT ON COLUMN rag_knowledge_keyword.created_at IS '创建时间'; COMMENT ON COLUMN rag_knowledge_keyword.updater IS '更新人'; COMMENT ON COLUMN rag_knowledge_keyword.updated_at IS '更新时间'; COMMENT ON COLUMN rag_knowledge_keyword.deleted_at IS '删除时间(软删)'; COMMENT ON COLUMN rag_knowledge_keyword.dataset_id IS '数据集ID'; COMMENT ON COLUMN rag_knowledge_keyword.document_id IS '文档ID'; COMMENT ON COLUMN rag_knowledge_keyword.word IS '关键词'; COMMENT ON COLUMN rag_knowledge_keyword.weight IS '权重'; --------------------pgsql创建rag_knowledge_keyword表语句--------------------------- --------------------pgsql创建rag_knowledge_task表语句--------------------------- -- 知识库任务表 CREATE TABLE IF NOT EXISTS rag_knowledge_task ( -- 基础字段(完全对齐项目规范) id BIGINT PRIMARY KEY, -- 主键ID(非自增) tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8 creator VARCHAR(64) NOT NULL, created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updater VARCHAR(64) NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, deleted_at timestamp(6), -- 业务字段 task_id BIGINT NOT NULL, -- 任务ID task_type VARCHAR(32) NOT NULL, -- 任务类型 status VARCHAR(32) NOT NULL, -- 任务状态 executor VARCHAR(128) DEFAULT '', -- 执行器 remark TEXT DEFAULT '' -- 备注 ); -- 索引(高频查询) CREATE INDEX idx_rkt_tenant_id ON rag_knowledge_task(tenant_id); CREATE INDEX idx_rkt_task_id ON rag_knowledge_task(task_id); CREATE INDEX idx_rkt_task_type ON rag_knowledge_task(task_type); CREATE INDEX idx_rkt_status ON rag_knowledge_task(status); CREATE INDEX idx_rkt_deleted_at ON rag_knowledge_task(deleted_at); -- 表和字段注释 COMMENT ON TABLE rag_knowledge_task IS '知识库任务表'; COMMENT ON COLUMN rag_knowledge_task.id IS '主键ID(非自增)'; COMMENT ON COLUMN rag_knowledge_task.tenant_id IS '租户ID'; COMMENT ON COLUMN rag_knowledge_task.creator IS '创建人'; COMMENT ON COLUMN rag_knowledge_task.created_at IS '创建时间'; COMMENT ON COLUMN rag_knowledge_task.updater IS '更新人'; COMMENT ON COLUMN rag_knowledge_task.updated_at IS '更新时间'; COMMENT ON COLUMN rag_knowledge_task.deleted_at IS '删除时间(软删)'; COMMENT ON COLUMN rag_knowledge_task.task_id IS '任务ID'; COMMENT ON COLUMN rag_knowledge_task.task_type IS '任务类型'; COMMENT ON COLUMN rag_knowledge_task.status IS '任务状态'; COMMENT ON COLUMN rag_knowledge_task.executor IS '执行器'; COMMENT ON COLUMN rag_knowledge_task.remark IS '备注'; --------------------pgsql创建rag_knowledge_task表语句--------------------------- --------------------pgsql创建rag_vector_dataset_index表语句--------------------------- -- 向量数据集索引表 CREATE TABLE IF NOT EXISTS rag_vector_dataset_index ( -- 基础字段 id BIGINT PRIMARY KEY, -- 主键ID(非自增) tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8 creator VARCHAR(64) NOT NULL, created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updater VARCHAR(64) NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, deleted_at timestamp(6), -- 核心字段 dataset_id INT8 NOT NULL, name VARCHAR(255) NOT NULL, collection VARCHAR(255) NOT NULL, dimension INT NOT NULL, field_type VARCHAR(50) NOT NULL, metric_type VARCHAR(50) NOT NULL, status SMALLINT NOT NULL DEFAULT 1, -- 状态:1启用/0停用 vector_count INT8 NOT NULL DEFAULT 0, description TEXT ); -- 唯一约束 ALTER TABLE rag_vector_dataset_index ADD CONSTRAINT uk_dataset_id_name UNIQUE (dataset_id, name); -- 索引 CREATE INDEX idx_dataset_index_tenant_id ON rag_vector_dataset_index(tenant_id); CREATE INDEX idx_dataset_index_dataset_id ON rag_vector_dataset_index(dataset_id); CREATE INDEX idx_dataset_index_status ON rag_vector_dataset_index(status); -- 注释 COMMENT ON TABLE rag_vector_dataset_index IS '向量数据集索引表'; COMMENT ON COLUMN rag_vector_dataset_index.id IS '主键ID(非自增)'; COMMENT ON COLUMN rag_vector_dataset_index.tenant_id IS '租户ID'; COMMENT ON COLUMN rag_vector_dataset_index.creator IS '创建人'; COMMENT ON COLUMN rag_vector_dataset_index.created_at IS '创建时间'; COMMENT ON COLUMN rag_vector_dataset_index.updater IS '更新人'; COMMENT ON COLUMN rag_vector_dataset_index.updated_at IS '更新时间'; COMMENT ON COLUMN rag_vector_dataset_index.deleted_at IS '删除时间(软删)'; COMMENT ON COLUMN rag_vector_dataset_index.dataset_id IS '数据集ID'; COMMENT ON COLUMN rag_vector_dataset_index.name IS '索引名称'; COMMENT ON COLUMN rag_vector_dataset_index.collection IS '向量集合名称'; COMMENT ON COLUMN rag_vector_dataset_index.dimension IS '向量维度'; COMMENT ON COLUMN rag_vector_dataset_index.field_type IS '字段类型'; COMMENT ON COLUMN rag_vector_dataset_index.metric_type IS '度量类型'; COMMENT ON COLUMN rag_vector_dataset_index.status IS '状态'; COMMENT ON COLUMN rag_vector_dataset_index.vector_count IS '向量数量'; COMMENT ON COLUMN rag_vector_dataset_index.description IS '描述'; --------------------pgsql创建rag_vector_dataset_index表语句--------------------------- --------------------pgsql创建rag_vector_document_chunk表语句--------------------------- CREATE EXTENSION IF NOT EXISTS vector; -- 文档分块向量表 CREATE TABLE IF NOT EXISTS rag_vector_document_chunk ( -- 基础字段 id BIGINT PRIMARY KEY, -- 主键ID(非自增) tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8 creator VARCHAR(64) NOT NULL, created_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, updater VARCHAR(64) NOT NULL, updated_at TIMESTAMP NOT NULL DEFAULT CURRENT_TIMESTAMP, deleted_at timestamp(6), -- 核心字段 status SMALLINT NOT NULL DEFAULT 1, -- 状态:1启用/0停用 vector_status SMALLINT NOT NULL DEFAULT 1, -- 向量化状态: 1pending, 2processing, 3completed, 4failed,5partCompleted dataset_id INT8 NOT NULL, document_id INT8 NOT NULL, content TEXT NOT NULL, content_hash VARCHAR(128) NOT NULL, chunk_index INT8 NOT NULL, -- 向量字段(pgvector) vector vector(1024) NOT NULL, -- 扩展信息 metadata JSONB ); -- 索引 CREATE INDEX idx_chunk_tenant_id ON rag_vector_document_chunk(tenant_id); CREATE INDEX idx_chunk_dataset_id ON rag_vector_document_chunk(dataset_id); CREATE INDEX idx_chunk_document_id ON rag_vector_document_chunk(document_id); CREATE INDEX idx_chunk_content_hash ON rag_vector_document_chunk(content_hash); CREATE INDEX idx_chunk_status ON rag_vector_document_chunk(status); CREATE INDEX idx_chunk_vector_status ON rag_vector_document_chunk(vector_status); -- 注释 COMMENT ON TABLE rag_vector_document_chunk IS '文档分块向量表'; COMMENT ON COLUMN rag_vector_document_chunk.id IS '主键ID(非自增)'; COMMENT ON COLUMN rag_vector_document_chunk.tenant_id IS '租户ID'; COMMENT ON COLUMN rag_vector_document_chunk.creator IS '创建人'; COMMENT ON COLUMN rag_vector_document_chunk.created_at IS '创建时间'; COMMENT ON COLUMN rag_vector_document_chunk.updater IS '更新人'; COMMENT ON COLUMN rag_vector_document_chunk.updated_at IS '更新时间'; COMMENT ON COLUMN rag_vector_document_chunk.deleted_at IS '删除时间(软删)'; COMMENT ON COLUMN rag_vector_document_chunk.status IS '状态'; COMMENT ON COLUMN rag_vector_document_chunk.vector_status IS '向量生成状态'; COMMENT ON COLUMN rag_vector_document_chunk.dataset_id IS '数据集ID'; COMMENT ON COLUMN rag_vector_document_chunk.document_id IS '文档ID'; COMMENT ON COLUMN rag_vector_document_chunk.content IS '分块内容'; COMMENT ON COLUMN rag_vector_document_chunk.content_hash IS '内容哈希'; COMMENT ON COLUMN rag_vector_document_chunk.chunk_index IS '分块序号'; COMMENT ON COLUMN rag_vector_document_chunk.vector IS '向量数据'; COMMENT ON COLUMN rag_vector_document_chunk.metadata IS '扩展元数据'; --------------------pgsql创建rag_vector_document_chunk表语句---------------------------