diff --git a/config.yml b/config.yml index 808c664..357dc0d 100644 --- a/config.yml +++ b/config.yml @@ -62,9 +62,9 @@ jaeger: eino: # 文件切分配置 splitter: - bufferSize: 1 - minChunkSize: 64 - percentile: 0.75 + bufferSize: 3 # 必须 >=3 才能识别上下文语义 + minChunkSize: 1 # 避免切碎 + percentile: 0.75 # 保持不变 # 向量化配置 embedding: provider: "dashscope" @@ -77,6 +77,10 @@ eino: provider: "dashscope" apiKey: "sk-4a8b82770bf74bc490eb3e4c5a8e2be9" model: "qwen-turbo" + rerank: + provider: "dashscope" + apiKey: "sk-4a8b82770bf74bc490eb3e4c5a8e2be9" + model: "qwen3-rerank" # 文件上传服务地址,与oss模块minio中的endpoint一致 filePrefix: "http://116.204.74.41:9000" diff --git a/controller/document.go b/controller/document.go index 49b49d6..7396bea 100644 --- a/controller/document.go +++ b/controller/document.go @@ -51,3 +51,18 @@ func (c *document) DocumentVector(ctx context.Context, req *dto.DocumentVectorRe err = service.Document.Vector(ctx, req) return } + +func (c *document) VectorSemanticSplit(ctx context.Context, req *dto.VectorSemanticSplitReq) (res *beans.ResponseEmpty, err error) { + err = service.Document.VectorSemanticSplit(ctx, req) + return +} + +func (c *document) SearchRecursiveSplit(ctx context.Context, req *dto.SearchRecursiveSplitReq) (res *beans.ResponseEmpty, err error) { + err = service.Document.SearchRecursiveSplit(ctx, req) + return +} + +func (c *document) KeywordExtract(ctx context.Context, req *dto.KeywordExtractReq) (res *beans.ResponseEmpty, err error) { + err = service.Document.KeywordExtract(ctx, req) + return +} diff --git a/go.mod b/go.mod index b8a7277..f1e4301 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module rag go 1.26.0 require ( - gitea.com/red-future/common v0.0.15 + gitea.com/red-future/common v0.0.18 github.com/bjang03/gmq v0.0.1 github.com/cloudwego/eino v0.8.6 github.com/cloudwego/eino-ext/components/document/loader/file v0.0.0-20260416081055-0ebab92e14f2 diff --git a/go.sum b/go.sum index 6dac976..f91c53a 100644 --- a/go.sum +++ b/go.sum @@ -9,6 +9,8 @@ entgo.io/ent v0.14.3 h1:wokAV/kIlH9TeklJWGGS7AYJdVckr0DloWjIcO9iIIQ= entgo.io/ent v0.14.3/go.mod h1:aDPE/OziPEu8+OWbzy4UlvWmD2/kbRuWfK2A40hcxJM= gitea.com/red-future/common v0.0.15 h1:PcjjS7TpQHSlyGmfgWquxCoSWh1KMCu3DyXIhAgvvfg= gitea.com/red-future/common v0.0.15/go.mod h1:+El06tJ0E4SkWuWLLtP7t94CjG7Vqi8k1ladjWUvQx8= +gitea.com/red-future/common v0.0.18 h1:RwpnnWmDTCnFtKfmlp9BOnDd4r9eUnx7YT6Zst3VJqY= +gitea.com/red-future/common v0.0.18/go.mod h1:6/nqIucVzmjOyqDTIq71feYBXXFNBy0rFwzaQ0/Ueoo= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/toml v1.6.0 h1:dRaEfpa2VI55EwlIW72hMRHdWouJeRF7TPYhI+AUQjk= github.com/BurntSushi/toml v1.6.0/go.mod h1:ukJfTF/6rtPPRCnwkur4qwRxa8vTRFBF0uk2lLoLwho= diff --git a/model/dto/document.go b/model/dto/document.go index a3b1ad0..78d186b 100644 --- a/model/dto/document.go +++ b/model/dto/document.go @@ -102,3 +102,21 @@ type DocumentVectorRPC struct { ContentHash string `json:"contentHash" dc:"内容hash"` Vector pgvector.Vector `json:"vector" dc:"向量"` } + +type VectorSemanticSplitReq struct { + g.Meta `path:"/vectorSemanticSplit" method:"post" tags:"文件管理" summary:"向量化生成" dc:"向量化生成"` + + Id int64 `json:"id" v:"required#ID不能为空"` +} + +type SearchRecursiveSplitReq struct { + g.Meta `path:"/searchRecursiveSplit" method:"post" tags:"文件管理" summary:"全文检索生成" dc:"全文检索生成"` + + Id int64 `json:"id" v:"required#ID不能为空"` +} + +type KeywordExtractReq struct { + g.Meta `path:"/keywordExtract" method:"post" tags:"文件管理" summary:"关键词提取" dc:"关键词提取"` + + Id int64 `json:"id" v:"required#ID不能为空"` +} diff --git a/service/document.go b/service/document.go index 50c672e..3659cf2 100644 --- a/service/document.go +++ b/service/document.go @@ -80,13 +80,16 @@ func (s *documentService) Create(ctx context.Context, req *dto.CreateDocumentReq return } res = &dto.CreateDocumentRes{Id: id} - // 写入任务进度待处理 任务类型为文档解析 + // 写入任务进度进行中 任务类型为文档解析 err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{ TaskId: id, TaskType: task.TaskTypeDocParse, - Status: task.TaskStatusPending, - Remark: "文档上传成功待解析: " + req.Title, + Status: task.TaskStatusCompleted, + Remark: "文档上传完成", }) + if err != nil { + return + } return }) @@ -171,8 +174,7 @@ func (s *documentService) List(ctx context.Context, req *dto.ListDocumentReq) (r return } -// Vector 处理文件(使用eino框架切分和向量化) -func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq) (err error) { +func (s *documentService) VectorSemanticSplit(ctx context.Context, req *dto.VectorSemanticSplitReq) (err error) { // 1. 查询文件信息 documentReq := dto.GetDocumentReq{Id: req.Id} doc, err := dao.Document.Get(ctx, &documentReq) @@ -182,8 +184,56 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq if g.IsEmpty(doc) { return errors.New("document not found") } + err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{ + TaskId: req.Id, + TaskType: task.TaskTypeGenerateVector, + Status: task.TaskStatusRunning, + Remark: "向量化执行中", + }) + return s.semanticSplitDocument(ctx, doc) +} - // 2. 更新文档状态为处理中 +func (s *documentService) SearchRecursiveSplit(ctx context.Context, req *dto.SearchRecursiveSplitReq) (err error) { + // 1. 查询文件信息 + documentReq := dto.GetDocumentReq{Id: req.Id} + doc, err := dao.Document.Get(ctx, &documentReq) + if err != nil { + return err + } + if g.IsEmpty(doc) { + return errors.New("document not found") + } + err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{ + TaskId: req.Id, + TaskType: task.TaskTypeFullTextSearch, + Status: task.TaskStatusRunning, + Remark: "全文检索执行中", + }) + return s.recursiveSplitDocument(ctx, doc) +} + +func (s *documentService) KeywordExtract(ctx context.Context, req *dto.KeywordExtractReq) (err error) { + // 1. 查询文件信息 + documentReq := dto.GetDocumentReq{Id: req.Id} + doc, err := dao.Document.Get(ctx, &documentReq) + if err != nil { + return err + } + if g.IsEmpty(doc) { + return errors.New("document not found") + } + err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{ + TaskId: req.Id, + TaskType: task.TaskTypeExtractKeywords, + Status: task.TaskStatusRunning, + Remark: "提取关键词执行中", + }) + return s.extractDocument(ctx, doc) +} + +// Vector 处理文件(使用eino框架切分和向量化) +func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq) (err error) { + // 更新文档状态为处理中 updateDocumentReq := new(dto.UpdateDocumentReq) updateDocumentReq.Id = req.Id updateDocumentReq.VectorStatus = document.VectorStatusProcessing.Code() @@ -197,16 +247,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq }) return } - // 写入任务进度进行中 任务类型为文档解析 - err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{ - TaskId: req.Id, - TaskType: task.TaskTypeDocParse, - Status: task.TaskStatusRunning, - Remark: "文档解析开始", - }) - if err != nil { - return - } + user, err := utils.GetUserInfo(ctx) if err != nil { return err @@ -217,7 +258,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq // 任务1: 语义 切分文档 grpool.Add(taskCtx, func(ctx context.Context) { g.TryCatch(ctx, func(ctx context.Context) { - if innerErr := s.semanticSplitDocument(ctx, doc); innerErr != nil { + if innerErr := s.VectorSemanticSplit(ctx, &dto.VectorSemanticSplitReq{Id: req.Id}); innerErr != nil { cancel() } }, func(ctx context.Context, err error) { @@ -228,7 +269,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq // 任务2: 递归 切分文档 grpool.Add(taskCtx, func(ctx context.Context) { g.TryCatch(ctx, func(ctx context.Context) { - if innerErr := s.recursiveSplitDocument(ctx, doc); innerErr != nil { + if innerErr := s.SearchRecursiveSplit(ctx, &dto.SearchRecursiveSplitReq{Id: req.Id}); innerErr != nil { cancel() } }, func(ctx context.Context, err error) { @@ -239,7 +280,7 @@ func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq // 任务3: 提取文档 grpool.Add(taskCtx, func(ctx context.Context) { g.TryCatch(ctx, func(ctx context.Context) { - if innerErr := s.extractDocument(ctx, doc); innerErr != nil { + if innerErr := s.KeywordExtract(ctx, &dto.KeywordExtractReq{Id: req.Id}); innerErr != nil { cancel() } }, func(ctx context.Context, err error) {