refactor: 重构文档向量相关代码结构
This commit is contained in:
@@ -100,9 +100,9 @@ func (i *PGVectorIndexer) doStore(ctx context.Context, docs []*schema.Document,
|
||||
}
|
||||
|
||||
// 转成业务实体
|
||||
var chunks []*dto.VectorDocumentChunkMsg
|
||||
var chunks []*dto.VectorDocumentVectorMsg
|
||||
for idx, doc := range docs {
|
||||
ck := new(dto.VectorDocumentChunkMsg)
|
||||
ck := new(dto.VectorDocumentVectorMsg)
|
||||
err = gconv.Struct(doc.MetaData, ck)
|
||||
if err != nil {
|
||||
glog.Errorf(ctx, "doStore err: %v", err)
|
||||
@@ -126,7 +126,7 @@ func (i *PGVectorIndexer) doStore(ctx context.Context, docs []*schema.Document,
|
||||
return
|
||||
}
|
||||
// 入库
|
||||
rows, err = dao.DocumentChunk.BatchInsert(ctx, chunks)
|
||||
rows, err = dao.DocumentVector.BatchInsert(ctx, chunks)
|
||||
|
||||
return
|
||||
}
|
||||
|
||||
@@ -210,7 +210,7 @@ func (r *PGVectorRetriever) doRetrieveVector(ctx context.Context, query string,
|
||||
}
|
||||
datasetIds := gconv.Int64s(opts.DSLInfo["dataset_ids"])
|
||||
|
||||
rows, err := dao.DocumentChunk.GetAllByVector(ctx, datasetIds, queryVec, topK)
|
||||
rows, err := dao.DocumentVector.GetAllByVector(ctx, datasetIds, queryVec, topK)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
@@ -239,7 +239,7 @@ func (r *PGVectorRetriever) doRetrieveMeilisearch(ctx context.Context, query str
|
||||
datasetIds := gconv.Int64s(opts.DSLInfo["dataset_ids"])
|
||||
|
||||
// 调用你已有的 Meilisearch DAO
|
||||
rows, err := dao.DocumentChunk.SearchByKeywords(ctx, query, datasetIds, topK)
|
||||
rows, err := dao.DocumentVector.SearchByKeywords(ctx, query, datasetIds, topK)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
package task
|
||||
|
||||
import (
|
||||
"time"
|
||||
|
||||
"gitea.com/red-future/common/beans"
|
||||
)
|
||||
|
||||
type baseTaskCol struct {
|
||||
beans.SQLBaseCol
|
||||
TaskType string
|
||||
Status string
|
||||
Priority string
|
||||
ParentTaskID string
|
||||
TotalItems string
|
||||
ProcessedItems string
|
||||
Progress string
|
||||
StartTime string
|
||||
EndTime string
|
||||
Duration string
|
||||
SuccessCount string
|
||||
FailCount string
|
||||
Executor string
|
||||
DocumentID string
|
||||
Remark string
|
||||
}
|
||||
|
||||
var BaseTaskCol = baseTaskCol{
|
||||
SQLBaseCol: beans.DefSQLBaseCol,
|
||||
TaskType: "task_type",
|
||||
Status: "status",
|
||||
Priority: "task_priority",
|
||||
ParentTaskID: "parent_task_id",
|
||||
TotalItems: "total_items",
|
||||
ProcessedItems: "processed_items",
|
||||
Progress: "progress",
|
||||
StartTime: "start_time",
|
||||
EndTime: "end_time",
|
||||
Duration: "duration",
|
||||
SuccessCount: "success_count",
|
||||
FailCount: "fail_count",
|
||||
Executor: "executor",
|
||||
DocumentID: "document_id",
|
||||
Remark: "remark",
|
||||
}
|
||||
|
||||
// SQLBaseTask 任务基类 - SQL版本
|
||||
type SQLBaseTask struct {
|
||||
beans.SQLBaseDO `orm:",inline"`
|
||||
// 任务核心信息
|
||||
TaskType TaskType `orm:"task_type" json:"taskType" dc:"任务类型"`
|
||||
Status TaskStatus `orm:"status" json:"status" dc:"任务状态"`
|
||||
Priority TaskPriority `orm:"task_priority" json:"priority,omitempty" dc:"任务优先级"`
|
||||
ParentTaskID int64 `orm:"parent_task_id" json:"parentTaskId,omitempty" dc:"父任务ID"`
|
||||
// 任务进度
|
||||
TotalItems int64 `orm:"total_items" json:"totalItems" dc:"总数"`
|
||||
ProcessedItems int64 `orm:"processed_items" json:"processedItems" dc:"已处理数"`
|
||||
Progress float64 `orm:"progress" json:"progress" dc:"进度"` // 0~100 百分比
|
||||
// 任务结果
|
||||
StartTime *time.Time `orm:"start_time" json:"startTime" dc:"开始时间"`
|
||||
EndTime *time.Time `orm:"end_time" json:"endTime,omitempty" dc:"结束时间"`
|
||||
Duration int64 `orm:"duration" json:"duration,omitempty" dc:"耗时(毫秒)"`
|
||||
SuccessCount int64 `orm:"success_count" json:"successCount" dc:"成功数"`
|
||||
FailCount int64 `orm:"fail_count" json:"failCount" dc:"失败数"`
|
||||
// 其他
|
||||
Executor string `orm:"executor" json:"executor,omitempty" dc:"执行器标识"`
|
||||
DocumentID int64 `orm:"document_id" json:"documentId,omitempty" dc:"文档ID"`
|
||||
Remark string `orm:"remark" json:"remark,omitempty" dc:"备注/错误信息"`
|
||||
}
|
||||
54
config.yml
54
config.yml
@@ -5,47 +5,6 @@ server:
|
||||
|
||||
# Database.
|
||||
database:
|
||||
default:
|
||||
- type: "pgsql"
|
||||
host: "116.204.74.41"
|
||||
port: "15432"
|
||||
user: "postgres"
|
||||
pass: "Bjang09@686^*^"
|
||||
name: "rag"
|
||||
prefix: "rag_knowledge_" # (可选)表名前缀
|
||||
role: "master" # (可选)数据库主从角色(master/slave),默认为master。如果不使用应用主从机制请不配置或留空即可。
|
||||
debug: true # (可选)开启调试模式
|
||||
dryRun: false # (可选)ORM空跑(只读不写)
|
||||
charset: "utf8" # (可选)数据库编码(如: utf8mb4/utf8/gbk/gb2312),一般设置为utf8mb4。默认为utf8。
|
||||
timezone: "Asia/Shanghai" # (可选)时区配置,例如:Local
|
||||
maxIdle: 5 # (可选)连接池最大闲置的连接数(默认10)
|
||||
maxOpen: 20 # (可选)连接池最大打开的连接数(默认无限制)
|
||||
maxLifetime: "30s" # (可选)连接对象可重复使用的时间长度(默认30秒)
|
||||
maxIdleConnTime: "30s" # (可选,v2.10新增)连接池中空闲连接的最大生存时间(默认30秒)。可以通过配置文件或SetConnMaxIdleTime方法设置,避免长时间空闲连接占用资源。
|
||||
createdAt: "created_at" # (可选)自动创建时间字段名称
|
||||
updatedAt: "updated_at" # (可选)自动更新时间字段名称
|
||||
deletedAt: "deleted_at" # (可选)软删除时间字段名称
|
||||
timeMaintainDisabled: false # (可选)是否完全关闭时间更新特性,为true时CreatedAt/UpdatedAt/DeletedAt都将失效
|
||||
- type: "pgsql"
|
||||
host: "116.204.74.41"
|
||||
port: "15432"
|
||||
user: "postgres"
|
||||
pass: "Bjang09@686^*^"
|
||||
name: "tenant-1"
|
||||
prefix: "rag_knowledge_" # (可选)表名前缀
|
||||
role: "slave" # (可选)数据库主从角色(master/slave),默认为master。如果不使用应用主从机制请不配置或留空即可。
|
||||
debug: false # (可选)开启调试模式
|
||||
dryRun: false # (可选)ORM空跑(只读不写)
|
||||
charset: "utf8" # (可选)数据库编码(如: utf8mb4/utf8/gbk/gb2312),一般设置为utf8mb4。默认为utf8。
|
||||
timezone: "Asia/Shanghai" # (可选)时区配置,例如:Local
|
||||
maxIdle: 5 # (可选)连接池最大闲置的连接数(默认10)
|
||||
maxOpen: 20 # (可选)连接池最大打开的连接数(默认无限制)
|
||||
maxLifetime: "30s" # (可选)连接对象可重复使用的时间长度(默认30秒)
|
||||
maxIdleConnTime: "30s" # (可选,v2.10新增)连接池中空闲连接的最大生存时间(默认30秒)。可以通过配置文件或SetConnMaxIdleTime方法设置,避免长时间空闲连接占用资源。
|
||||
createdAt: "created_at" # (可选)自动创建时间字段名称
|
||||
updatedAt: "updated_at" # (可选)自动更新时间字段名称
|
||||
deletedAt: "deleted_at" # (可选)软删除时间字段名称
|
||||
timeMaintainDisabled: false # (可选)是否完全关闭时间更新特性,为true时CreatedAt/UpdatedAt/DeletedAt都将失效
|
||||
rag_knowledge:
|
||||
- type: "pgsql"
|
||||
host: "116.204.74.41"
|
||||
@@ -123,19 +82,6 @@ eino:
|
||||
# 文件上传服务地址,与oss模块minio中的endpoint一致
|
||||
filePrefix: "http://116.204.74.41:9000"
|
||||
|
||||
gmq:
|
||||
redis:
|
||||
primary:
|
||||
addr: "116.204.74.41"
|
||||
port: "6379"
|
||||
db: 0
|
||||
username: ""
|
||||
password: ""
|
||||
poolSize: 10
|
||||
minIdleConn: 5
|
||||
maxActiveConn: 10
|
||||
maxRetries: 30
|
||||
|
||||
# Meilisearch 全文检索配置
|
||||
meilisearch:
|
||||
default:
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
package public
|
||||
|
||||
const GmqMsgPluginsName = "gmq_msg"
|
||||
|
||||
const KnowledgeLockEsKey = "rag_binary:knowledge:lock:knowledgeIdEs-%v"
|
||||
const KnowledgeLockSqlKey = "rag_binary:knowledge:lock:knowledgeIdSql-%v"
|
||||
const KnowledgeContentHashEsKey = "rag_binary:knowledge:knowledgeId:contentHashEs-%v"
|
||||
@@ -13,8 +15,8 @@ const (
|
||||
)
|
||||
|
||||
const (
|
||||
KnowledgeDocumentChunkTopic = "knowledge:document:chunk:stream" // 请求 Stream 键名(与发消息的key一致)
|
||||
KnowledgeDocumentChunkConsumer = "knowledge-document-chunk-consumer" // 消费者名称(唯一标识)
|
||||
KnowledgeDocumentChunkBatchSize = 1 // 批处理大小(每次读取1条)
|
||||
KnowledgeDocumentChunkAutoAck = false // ACK是否自动确认(true自动确认,false不确认)
|
||||
KnowledgeDocumentVectorTopic = "knowledge:document:vector:stream" // 请求 Stream 键名(与发消息的key一致)
|
||||
KnowledgeDocumentVectorConsumer = "knowledge-document-vector-consumer" // 消费者名称(唯一标识)
|
||||
KnowledgeDocumentVectorCount = 1 // 批处理大小(每次读取1条)
|
||||
KnowledgeDocumentVectorAutoAck = false // ACK是否自动确认(true自动确认,false不确认)
|
||||
)
|
||||
@@ -13,7 +13,7 @@ const (
|
||||
TableNameKeyword = "keyword"
|
||||
TableNameTask = "task"
|
||||
TableNameDatasetIndex = "dataset_index"
|
||||
TableNameDocumentChunk = "document_chunk"
|
||||
TableNameDocumentVector = "document_vector"
|
||||
)
|
||||
|
||||
// es 索引名称
|
||||
|
||||
@@ -40,9 +40,3 @@ func (c *dataset) List(ctx context.Context, req *dto.ListDatasetReq) (res *dto.L
|
||||
res, err = service.Dataset.List(ctx, req)
|
||||
return
|
||||
}
|
||||
|
||||
// Search 搜索
|
||||
//func (c *dataset) Search(ctx context.Context, req *dto.SearchReq) (res *dto.SearchRes, err error) {
|
||||
// res, err = service.Dataset.Search(ctx, req)
|
||||
// return
|
||||
//}
|
||||
|
||||
@@ -2,7 +2,6 @@ package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"rag/model/dto"
|
||||
"rag/service"
|
||||
|
||||
@@ -47,8 +46,8 @@ func (c *document) List(ctx context.Context, req *dto.ListDocumentReq) (res *dto
|
||||
return
|
||||
}
|
||||
|
||||
// Process 处理文件(向量化)
|
||||
func (c *document) Process(ctx context.Context, req *dto.ProcessDocumentReq) (res *beans.ResponseEmpty, err error) {
|
||||
err = service.Document.Process(ctx, req)
|
||||
// DocumentVector 处理文件(向量化)
|
||||
func (c *document) DocumentVector(ctx context.Context, req *dto.DocumentVectorReq) (res *beans.ResponseEmpty, err error) {
|
||||
err = service.Document.Vector(ctx, req)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1,29 +0,0 @@
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"rag/model/dto"
|
||||
"rag/service"
|
||||
|
||||
"gitea.com/red-future/common/beans"
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
)
|
||||
|
||||
type documentChunk struct{}
|
||||
|
||||
var DocumentChunk = new(documentChunk)
|
||||
|
||||
// Update 更新文件片段
|
||||
func (c *documentChunk) Update(ctx context.Context, req *dto.UpdateDocumentChunkReq) (res *beans.ResponseEmpty, err error) {
|
||||
err = service.DocumentChunk.Update(ctx, req)
|
||||
return
|
||||
}
|
||||
|
||||
// List 文件片段列表
|
||||
func (c *documentChunk) List(ctx context.Context, req *dto.ListDocumentChunkReq) (res *dto.ListDocumentChunkRes, err error) {
|
||||
if !g.IsEmpty(req.Page) {
|
||||
req.Page = &beans.Page{PageNum: 1, PageSize: 20}
|
||||
}
|
||||
res, err = service.DocumentChunk.List(ctx, req)
|
||||
return
|
||||
}
|
||||
35
controller/document_vector.go
Normal file
35
controller/document_vector.go
Normal file
@@ -0,0 +1,35 @@
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"rag/model/dto"
|
||||
"rag/service"
|
||||
|
||||
"gitea.com/red-future/common/beans"
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
)
|
||||
|
||||
type documentVector struct{}
|
||||
|
||||
var DocumentVector = new(documentVector)
|
||||
|
||||
// Query 执行RAG查询
|
||||
func (c *documentVector) Query(ctx context.Context, req *dto.RAGQueryReq) (res *dto.RAGQueryRes, err error) {
|
||||
res, err = service.DocumentVector.Query(ctx, req)
|
||||
return
|
||||
}
|
||||
|
||||
// Update 更新文件片段
|
||||
func (c *documentVector) Update(ctx context.Context, req *dto.UpdateDocumentVectorReq) (res *beans.ResponseEmpty, err error) {
|
||||
err = service.DocumentVector.Update(ctx, req)
|
||||
return
|
||||
}
|
||||
|
||||
// List 文件片段列表
|
||||
func (c *documentVector) List(ctx context.Context, req *dto.ListDocumentVectorReq) (res *dto.ListDocumentVectorRes, err error) {
|
||||
if !g.IsEmpty(req.Page) {
|
||||
req.Page = &beans.Page{PageNum: 1, PageSize: 20}
|
||||
}
|
||||
res, err = service.DocumentVector.List(ctx, req)
|
||||
return
|
||||
}
|
||||
@@ -2,7 +2,6 @@ package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"rag/model/dto"
|
||||
"rag/service"
|
||||
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
package controller
|
||||
|
||||
import (
|
||||
"context"
|
||||
"rag/model/dto"
|
||||
"rag/service"
|
||||
)
|
||||
|
||||
type ragQuery struct{}
|
||||
|
||||
var RAGQuery = new(ragQuery)
|
||||
|
||||
// Query 执行RAG查询
|
||||
func (c *ragQuery) Query(ctx context.Context, req *dto.RAGQueryReq) (res *dto.RAGQueryRes, err error) {
|
||||
res, err = service.RAGQuery.Query(ctx, req)
|
||||
return
|
||||
}
|
||||
@@ -51,7 +51,7 @@ func (d *datasetIndexDao) InsertIndex(ctx context.Context, indexName string) (er
|
||||
USING ivfflat (vector vector_cosine_ops)
|
||||
WITH (lists = 100)
|
||||
WHERE vector IS NOT NULL;
|
||||
`, indexName, gfdb.TablePrefix+public.TableNameDocumentChunk)
|
||||
`, indexName, gfdb.TablePrefix+public.TableNameDocumentVector)
|
||||
_, err = db.Exec(ctx, sqlStr)
|
||||
return
|
||||
}
|
||||
|
||||
@@ -15,17 +15,17 @@ import (
|
||||
"github.com/pgvector/pgvector-go"
|
||||
)
|
||||
|
||||
var DocumentChunk = new(documentChunkDao)
|
||||
var DocumentVector = new(documentVectorDao)
|
||||
|
||||
type documentChunkDao struct{}
|
||||
type documentVectorDao struct{}
|
||||
|
||||
// BatchInsert 批量插入文件块
|
||||
func (d *documentChunkDao) BatchInsert(ctx context.Context, req []*dto.VectorDocumentChunkMsg) (rows int64, err error) {
|
||||
var res []*entity.DocumentChunk
|
||||
func (d *documentVectorDao) BatchInsert(ctx context.Context, req []*dto.VectorDocumentVectorMsg) (rows int64, err error) {
|
||||
var res []*entity.DocumentVector
|
||||
if err = gconv.Structs(req, &res); err != nil {
|
||||
return
|
||||
}
|
||||
r, err := gfdb.DB(ctx, public.DbNameVector).Model(ctx, public.TableNameDocumentChunk).Data(&res).Insert()
|
||||
r, err := gfdb.DB(ctx, public.DbNameVector).Model(ctx, public.TableNameDocumentVector).Data(&res).Insert()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@@ -33,9 +33,9 @@ func (d *documentChunkDao) BatchInsert(ctx context.Context, req []*dto.VectorDoc
|
||||
}
|
||||
|
||||
// Update 更新文件块
|
||||
func (d *documentChunkDao) Update(ctx context.Context, req *dto.UpdateDocumentChunkReq) (rows int64, err error) {
|
||||
model := gfdb.DB(ctx, public.DbNameVector).Model(ctx, public.TableNameDocumentChunk)
|
||||
r, err := model.Data(&req).Where(entity.DocumentChunkCol.Id, req.Id).Update()
|
||||
func (d *documentVectorDao) Update(ctx context.Context, req *dto.UpdateDocumentVectorReq) (rows int64, err error) {
|
||||
model := gfdb.DB(ctx, public.DbNameVector).Model(ctx, public.TableNameDocumentVector)
|
||||
r, err := model.Data(&req).Where(entity.DocumentVectorCol.Id, req.Id).Update()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
@@ -43,13 +43,13 @@ func (d *documentChunkDao) Update(ctx context.Context, req *dto.UpdateDocumentCh
|
||||
}
|
||||
|
||||
// List 文件块列表
|
||||
func (d *documentChunkDao) List(ctx context.Context, req *dto.ListDocumentChunkReq, fields ...string) (res []*entity.DocumentChunk, total int, err error) {
|
||||
model := gfdb.DB(ctx, public.DbNameVector).Model(ctx, public.TableNameDocumentChunk).Fields(fields).OmitEmpty().
|
||||
Where(entity.DocumentChunkCol.DatasetId, req.DatasetId).
|
||||
Where(entity.DocumentChunkCol.DocumentId, req.DocumentId).
|
||||
Where(entity.DocumentChunkCol.Status, req.Status).
|
||||
Where(entity.DocumentChunkCol.VectorStatus, req.VectorStatus).
|
||||
OrderDesc(entity.DocumentChunkCol.CreatedAt)
|
||||
func (d *documentVectorDao) List(ctx context.Context, req *dto.ListDocumentVectorReq, fields ...string) (res []*entity.DocumentVector, total int, err error) {
|
||||
model := gfdb.DB(ctx, public.DbNameVector).Model(ctx, public.TableNameDocumentVector).Fields(fields).OmitEmpty().
|
||||
Where(entity.DocumentVectorCol.DatasetId, req.DatasetId).
|
||||
Where(entity.DocumentVectorCol.DocumentId, req.DocumentId).
|
||||
Where(entity.DocumentVectorCol.Status, req.Status).
|
||||
Where(entity.DocumentVectorCol.VectorStatus, req.VectorStatus).
|
||||
OrderDesc(entity.DocumentVectorCol.CreatedAt)
|
||||
if req.Page != nil {
|
||||
model.Page(int(req.Page.PageNum), int(req.Page.PageSize))
|
||||
}
|
||||
@@ -61,27 +61,22 @@ func (d *documentChunkDao) List(ctx context.Context, req *dto.ListDocumentChunkR
|
||||
return
|
||||
}
|
||||
|
||||
func (d *documentChunkDao) GetAllByVector(ctx context.Context, datasetId []int64, queryVec pgvector.Vector, topK int) (list gdb.List, err error) {
|
||||
sql := `
|
||||
SELECT id, content, dataset_id, document_id,
|
||||
vector <=> ? AS distance
|
||||
FROM rag_vector_document_chunk
|
||||
WHERE dataset_id IN (?)
|
||||
AND vector IS NOT NULL
|
||||
ORDER BY distance ASC
|
||||
LIMIT ?
|
||||
`
|
||||
// 顺序:vector, dataset_id, topK
|
||||
result, err := gfdb.DB(ctx, public.DbNameVector).GetAll(ctx, sql, queryVec, datasetId, topK)
|
||||
func (d *documentVectorDao) GetAllByVector(ctx context.Context, datasetIds []int64, vector pgvector.Vector, topK int) (list gdb.List, err error) {
|
||||
result, err := gfdb.DB(ctx, public.DbNameVector).Model(ctx, public.TableNameDocumentVector).
|
||||
Fields("id, content, dataset_id, document_id, vector <=> ? AS distance", vector).
|
||||
WhereIn(entity.DocumentVectorCol.DatasetId, datasetIds).
|
||||
WhereNotNull(entity.DocumentVectorCol.Vector).
|
||||
OrderAsc("distance").
|
||||
Limit(topK).
|
||||
All()
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
|
||||
return result.List(), nil
|
||||
}
|
||||
|
||||
// SearchByKeywords 通过关键词全文检索文档块
|
||||
func (d *documentChunkDao) SearchByKeywords(ctx context.Context, query string, datasetIds []int64, topK int) (list gdb.List, err error) {
|
||||
func (d *documentVectorDao) SearchByKeywords(ctx context.Context, query string, datasetIds []int64, topK int) (list gdb.List, err error) {
|
||||
// 构建 meilisearch 查询参数
|
||||
searchParams := &meilisearch.SearchParams{
|
||||
Query: query,
|
||||
26
main.go
26
main.go
@@ -7,6 +7,7 @@ import (
|
||||
"rag/consts/public"
|
||||
"rag/controller"
|
||||
"rag/service"
|
||||
"strings"
|
||||
"syscall"
|
||||
|
||||
_ "gitea.com/red-future/common/config"
|
||||
@@ -28,9 +29,8 @@ func main() {
|
||||
http.RouteRegister([]interface{}{
|
||||
controller.Dataset,
|
||||
controller.Document,
|
||||
controller.DocumentChunk,
|
||||
controller.DocumentVector,
|
||||
controller.Keyword,
|
||||
controller.RAGQuery,
|
||||
})
|
||||
|
||||
err := utils.InitGseTool(ctx)
|
||||
@@ -38,15 +38,21 @@ func main() {
|
||||
g.Log().Error(ctx, "gse 分词工具初始化失败:", err)
|
||||
}
|
||||
|
||||
gmq.Init("config.yml")
|
||||
|
||||
if err := gmq.GetGmq("primary").GmqSubscribe(ctx, &mq.RedisSubMessage{
|
||||
redisAddress := g.Cfg().MustGet(ctx, "redis.default.address").String()
|
||||
redisAddressList := strings.Split(redisAddress, ":")
|
||||
gmq.GmqRegister(public.GmqMsgPluginsName, &mq.RedisConn{
|
||||
RedisConfig: mq.RedisConfig{
|
||||
Addr: redisAddressList[0],
|
||||
Port: redisAddressList[1],
|
||||
},
|
||||
})
|
||||
if err = gmq.GetGmq(public.GmqMsgPluginsName).GmqSubscribe(ctx, &mq.RedisSubMessage{
|
||||
SubMessage: types.SubMessage{
|
||||
Topic: public.KnowledgeDocumentChunkTopic,
|
||||
ConsumerName: public.KnowledgeDocumentChunkConsumer,
|
||||
AutoAck: public.KnowledgeDocumentChunkAutoAck,
|
||||
FetchCount: public.KnowledgeDocumentChunkBatchSize,
|
||||
HandleFunc: service.DocumentChunk.DocsChunkMsg,
|
||||
Topic: public.KnowledgeDocumentVectorTopic,
|
||||
ConsumerName: public.KnowledgeDocumentVectorConsumer,
|
||||
AutoAck: public.KnowledgeDocumentVectorAutoAck,
|
||||
FetchCount: public.KnowledgeDocumentVectorCount,
|
||||
HandleFunc: service.DocumentVector.DocsChunkMsg,
|
||||
},
|
||||
}); err != nil {
|
||||
return
|
||||
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
|
||||
// CreateDatasetReq 创建数据集请求
|
||||
type CreateDatasetReq struct {
|
||||
g.Meta `path:"/createDataset" method:"post" tags:"知识库(数据集)管理" summary:"创建知识库(数据集)" dc:"创建知识库(数据集)"`
|
||||
g.Meta `path:"/create" method:"post" tags:"知识库(数据集)管理" summary:"创建知识库(数据集)" dc:"创建知识库(数据集)"`
|
||||
|
||||
Name string `json:"name" v:"required#名称不能为空"`
|
||||
Description string `json:"description"`
|
||||
@@ -21,7 +21,7 @@ type CreateDatasetRes struct {
|
||||
|
||||
// UpdateDatasetReq 更新数据集请求
|
||||
type UpdateDatasetReq struct {
|
||||
g.Meta `path:"/updateDataset" method:"put" tags:"知识库(数据集)管理" summary:"更新知识库(数据集)" dc:"更新知识库(数据集)"`
|
||||
g.Meta `path:"/update" method:"put" tags:"知识库(数据集)管理" summary:"更新知识库(数据集)" dc:"更新知识库(数据集)"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
Name string `json:"name"`
|
||||
@@ -32,21 +32,21 @@ type UpdateDatasetReq struct {
|
||||
|
||||
// DeleteDatasetReq 删除数据集请求
|
||||
type DeleteDatasetReq struct {
|
||||
g.Meta `path:"/deleteDataset" method:"delete" tags:"知识库(数据集)管理" summary:"删除知识库(数据集)" dc:"删除知识库(数据集)"`
|
||||
g.Meta `path:"/delete" method:"delete" tags:"知识库(数据集)管理" summary:"删除知识库(数据集)" dc:"删除知识库(数据集)"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
}
|
||||
|
||||
// GetDatasetReq 获取数据集请求
|
||||
type GetDatasetReq struct {
|
||||
g.Meta `path:"/getDataset" method:"get" tags:"知识库(数据集)管理" summary:"获取知识库(数据集)详情" dc:"获取知识库(数据集)详情"`
|
||||
g.Meta `path:"/get" method:"get" tags:"知识库(数据集)管理" summary:"获取知识库(数据集)详情" dc:"获取知识库(数据集)详情"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
}
|
||||
|
||||
// ListDatasetReq 数据集列表请求
|
||||
type ListDatasetReq struct {
|
||||
g.Meta `path:"/listDataset" method:"get" tags:"知识库(数据集)管理" summary:"获取知识库(数据集)列表" dc:"分页查询知识库(数据集)列表,支持多条件筛选"`
|
||||
g.Meta `path:"/list" method:"get" tags:"知识库(数据集)管理" summary:"获取知识库(数据集)列表" dc:"分页查询知识库(数据集)列表,支持多条件筛选"`
|
||||
|
||||
Page *beans.Page `json:"page"`
|
||||
Keyword string `json:"keyword" dc:"关键词搜索"`
|
||||
|
||||
@@ -10,7 +10,7 @@ import (
|
||||
|
||||
// CreateDocumentReq 创建文件请求
|
||||
type CreateDocumentReq struct {
|
||||
g.Meta `path:"/createDocument" method:"post" tags:"文件管理" summary:"创建文件" dc:"创建文件"`
|
||||
g.Meta `path:"/create" method:"post" tags:"文件管理" summary:"创建文件" dc:"创建文件"`
|
||||
|
||||
DatasetId int64 `json:"datasetId" v:"required#数据集ID不能为空"`
|
||||
Title string `json:"title" v:"required#标题不能为空"`
|
||||
@@ -26,7 +26,7 @@ type CreateDocumentRes struct {
|
||||
|
||||
// UpdateDocumentReq 更新文件请求
|
||||
type UpdateDocumentReq struct {
|
||||
g.Meta `path:"/updateDocument" method:"put" tags:"文件管理" summary:"更新文件" dc:"更新文件"`
|
||||
g.Meta `path:"/update" method:"put" tags:"文件管理" summary:"更新文件" dc:"更新文件"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
Status document.Status `json:"status"`
|
||||
@@ -36,21 +36,21 @@ type UpdateDocumentReq struct {
|
||||
|
||||
// DeleteDocumentReq 删除文件请求
|
||||
type DeleteDocumentReq struct {
|
||||
g.Meta `path:"/deleteDocument" method:"delete" tags:"文件管理" summary:"删除文件" dc:"删除文件"`
|
||||
g.Meta `path:"/delete" method:"delete" tags:"文件管理" summary:"删除文件" dc:"删除文件"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
}
|
||||
|
||||
// GetDocumentReq 获取文件请求
|
||||
type GetDocumentReq struct {
|
||||
g.Meta `path:"/getDocument" method:"get" tags:"文件管理" summary:"获取文件详情" dc:"获取文件详情"`
|
||||
g.Meta `path:"/get" method:"get" tags:"文件管理" summary:"获取文件详情" dc:"获取文件详情"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
}
|
||||
|
||||
// ListDocumentReq 文件列表请求
|
||||
type ListDocumentReq struct {
|
||||
g.Meta `path:"/listDocument" method:"get" tags:"文件管理" summary:"获取文件列表" dc:"分页查询文件列表,支持多条件筛选"`
|
||||
g.Meta `path:"/list" method:"get" tags:"文件管理" summary:"获取文件列表" dc:"分页查询文件列表,支持多条件筛选"`
|
||||
|
||||
Page *beans.Page `json:"page"`
|
||||
DatasetId int64 `json:"datasetId"`
|
||||
@@ -76,27 +76,16 @@ type DocumentVO struct {
|
||||
UpdatedAt *gtime.Time `json:"updatedAt" dc:"更新时间"`
|
||||
}
|
||||
|
||||
// ProcessDocumentReq 处理文件请求(向量化)
|
||||
type ProcessDocumentReq struct {
|
||||
g.Meta `path:"/getProcess" method:"get" tags:"文件管理" summary:"文件向量化处理" dc:"文件向量化处理"`
|
||||
// DocumentVectorReq 处理文件请求(向量化)
|
||||
type DocumentVectorReq struct {
|
||||
g.Meta `path:"/vectorization" method:"post" tags:"文件管理" summary:"文件向量化处理" dc:"文件向量化处理"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
DatasetId int64 `json:"datasetId" v:"required#数据集ID不能为空"`
|
||||
}
|
||||
|
||||
type ListDocumentChunkRPC struct {
|
||||
List []*DocumentChunkRPC `json:"list"`
|
||||
}
|
||||
|
||||
type DocumentChunkRPC struct {
|
||||
type DocumentVectorRPC struct {
|
||||
Id int64 `json:"id" dc:"id"`
|
||||
DatasetId int64 `json:"datasetId" dc:"所属数据集ID"`
|
||||
ContentHash string `json:"contentHash" dc:"内容hash"`
|
||||
}
|
||||
|
||||
type KnowledgeDocumentMsg struct {
|
||||
TenantId uint64 `json:"tenantId"`
|
||||
Creator string `json:"creator"`
|
||||
Id int64 `json:"id"`
|
||||
VectorStatus document.VectorStatus `json:"vectorStatus"`
|
||||
}
|
||||
|
||||
@@ -9,17 +9,37 @@ import (
|
||||
"github.com/pgvector/pgvector-go"
|
||||
)
|
||||
|
||||
// UpdateDocumentChunkReq 更新文件块向量请求
|
||||
type UpdateDocumentChunkReq struct {
|
||||
g.Meta `path:"/updateDocumentChunk" method:"put" tags:"文件块向量管理" summary:"更新文件块" dc:"更新文件块"`
|
||||
// RAGQueryReq RAG查询请求
|
||||
type RAGQueryReq struct {
|
||||
g.Meta `path:"/ragQuery" method:"post" tags:"RAG查询" summary:"执行RAG查询" dc:"执行RAG查询"`
|
||||
|
||||
Content string `json:"content" v:"required#查询内容不能为空" dc:"用户问题"`
|
||||
DatasetIds []int64 `json:"datasetIds" dc:"数据集ID"`
|
||||
History []*Message `json:"history" dc:"历史对话"`
|
||||
TopK int `json:"topK" d:"5" dc:"检索topK,默认5"`
|
||||
}
|
||||
|
||||
type Message struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
// RAGQueryRes RAG查询响应
|
||||
type RAGQueryRes struct {
|
||||
Answer string `json:"answer" dc:"生成的答案"`
|
||||
}
|
||||
|
||||
// UpdateDocumentVectorReq 更新文件块向量请求
|
||||
type UpdateDocumentVectorReq struct {
|
||||
g.Meta `path:"/update" method:"put" tags:"文件块向量管理" summary:"更新文件块" dc:"更新文件块"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
Status document.Status `json:"status"`
|
||||
}
|
||||
|
||||
// ListDocumentChunkReq 文件块向量列表请求
|
||||
type ListDocumentChunkReq struct {
|
||||
g.Meta `path:"/listDocumentChunk" method:"get" tags:"文件块向量管理" summary:"获取文件块向量列表" dc:"分页查询文件块向量列表,支持多条件筛选"`
|
||||
// ListDocumentVectorReq 文件块向量列表请求
|
||||
type ListDocumentVectorReq struct {
|
||||
g.Meta `path:"/list" method:"get" tags:"文件块向量管理" summary:"获取文件块向量列表" dc:"分页查询文件块向量列表,支持多条件筛选"`
|
||||
|
||||
Page *beans.Page `json:"page"`
|
||||
DatasetId int64 `json:"datasetId"`
|
||||
@@ -28,13 +48,13 @@ type ListDocumentChunkReq struct {
|
||||
VectorStatus document.VectorStatus `json:"vectorStatus"`
|
||||
}
|
||||
|
||||
// ListDocumentChunkRes 文件块向量列表响应
|
||||
type ListDocumentChunkRes struct {
|
||||
List []*DocumentChunkItem `json:"list"`
|
||||
// ListDocumentVectorRes 文件块向量列表响应
|
||||
type ListDocumentVectorRes struct {
|
||||
List []*DocumentVectorVO `json:"list"`
|
||||
Total int `json:"total"`
|
||||
}
|
||||
|
||||
type DocumentChunkItem struct {
|
||||
type DocumentVectorVO struct {
|
||||
Id int64 `json:"id,string" dc:"id"`
|
||||
Status document.Status `json:"status" dc:"状态"`
|
||||
VectorStatus document.VectorStatus `json:"vectorStatus" dc:"向量状态"`
|
||||
@@ -49,7 +69,7 @@ type DocumentChunkItem struct {
|
||||
UpdatedAt *gtime.Time `json:"updatedAt" dc:"更新时间"`
|
||||
}
|
||||
|
||||
type VectorDocumentChunkMsg struct {
|
||||
type VectorDocumentVectorMsg struct {
|
||||
TenantId uint64 `json:"tenantId"`
|
||||
Creator string `json:"creator"`
|
||||
DatasetId int64 `json:"datasetId"` // 数据集ID
|
||||
@@ -8,7 +8,7 @@ import (
|
||||
|
||||
// CreateKeywordReq 创建关键词请求
|
||||
type CreateKeywordReq struct {
|
||||
g.Meta `path:"/createKeyword" method:"post" tags:"关键词管理" summary:"创建关键词" dc:"创建关键词"`
|
||||
g.Meta `path:"/create" method:"post" tags:"关键词管理" summary:"创建关键词" dc:"创建关键词"`
|
||||
|
||||
DatasetId int64 `json:"datasetId" v:"required#数据集ID不能为空"`
|
||||
DocumentId int64 `json:"documentId" v:"required#文档ID不能为空"`
|
||||
@@ -23,7 +23,7 @@ type CreateKeywordRes struct {
|
||||
|
||||
// UpdateKeywordReq 更新关键词请求
|
||||
type UpdateKeywordReq struct {
|
||||
g.Meta `path:"/updateKeyword" method:"put" tags:"关键词管理" summary:"更新关键词" dc:"更新关键词"`
|
||||
g.Meta `path:"/update" method:"put" tags:"关键词管理" summary:"更新关键词" dc:"更新关键词"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
Word string `json:"word"`
|
||||
@@ -32,21 +32,21 @@ type UpdateKeywordReq struct {
|
||||
|
||||
// DeleteKeywordReq 删除关键词请求
|
||||
type DeleteKeywordReq struct {
|
||||
g.Meta `path:"/deleteKeyword" method:"delete" tags:"关键词管理" summary:"删除关键词" dc:"删除关键词"`
|
||||
g.Meta `path:"/delete" method:"delete" tags:"关键词管理" summary:"删除关键词" dc:"删除关键词"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
}
|
||||
|
||||
// GetKeywordReq 获取关键词请求
|
||||
type GetKeywordReq struct {
|
||||
g.Meta `path:"/getKeyword" method:"get" tags:"关键词管理" summary:"获取关键词详情" dc:"获取关键词详情"`
|
||||
g.Meta `path:"/get" method:"get" tags:"关键词管理" summary:"获取关键词详情" dc:"获取关键词详情"`
|
||||
|
||||
Id int64 `json:"id" v:"required#ID不能为空"`
|
||||
}
|
||||
|
||||
// ListKeywordReq 关键词列表请求
|
||||
type ListKeywordReq struct {
|
||||
g.Meta `path:"/listKeyword" method:"get" tags:"关键词管理" summary:"获取关键词列表" dc:"分页查询关键词列表,支持多条件筛选"`
|
||||
g.Meta `path:"/list" method:"get" tags:"关键词管理" summary:"获取关键词列表" dc:"分页查询关键词列表,支持多条件筛选"`
|
||||
|
||||
Page *beans.Page `json:"page"`
|
||||
DatasetId int64 `json:"datasetId"`
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
package dto
|
||||
|
||||
import (
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
)
|
||||
|
||||
// RAGQueryReq RAG查询请求
|
||||
type RAGQueryReq struct {
|
||||
g.Meta `path:"/ragQuery" method:"post" tags:"RAG查询" summary:"执行RAG查询" dc:"执行RAG查询"`
|
||||
|
||||
Content string `json:"content" v:"required#查询内容不能为空" dc:"用户问题"`
|
||||
DatasetIds []int64 `json:"datasetIds" dc:"数据集ID"`
|
||||
History []*Message `json:"history" dc:"历史对话"`
|
||||
TopK int `json:"topK" d:"5" dc:"检索topK,默认5"`
|
||||
}
|
||||
|
||||
type Message struct {
|
||||
Role string `json:"role"`
|
||||
Content string `json:"content"`
|
||||
}
|
||||
|
||||
// RAGQueryRes RAG查询响应
|
||||
type RAGQueryRes struct {
|
||||
Answer string `json:"answer" dc:"生成的答案"`
|
||||
}
|
||||
@@ -1,7 +1,7 @@
|
||||
package dto
|
||||
|
||||
import (
|
||||
"rag/common/task"
|
||||
"rag/consts/task"
|
||||
)
|
||||
|
||||
// WriteTaskProgressReq 写入任务进度请求
|
||||
|
||||
@@ -7,7 +7,7 @@ import (
|
||||
"github.com/pgvector/pgvector-go"
|
||||
)
|
||||
|
||||
type documentChunkCol struct {
|
||||
type documentVectorCol struct {
|
||||
beans.SQLBaseCol
|
||||
Status string
|
||||
VectorStatus string
|
||||
@@ -20,7 +20,7 @@ type documentChunkCol struct {
|
||||
Metadata string
|
||||
}
|
||||
|
||||
var DocumentChunkCol = documentChunkCol{
|
||||
var DocumentVectorCol = documentVectorCol{
|
||||
SQLBaseCol: beans.DefSQLBaseCol,
|
||||
Status: "status",
|
||||
VectorStatus: "vector_status",
|
||||
@@ -33,8 +33,8 @@ var DocumentChunkCol = documentChunkCol{
|
||||
Metadata: "metadata",
|
||||
}
|
||||
|
||||
// DocumentChunk 文档切分块实体
|
||||
type DocumentChunk struct {
|
||||
// DocumentVector 文档切分块实体
|
||||
type DocumentVector struct {
|
||||
beans.SQLBaseDO `orm:",inline"`
|
||||
|
||||
Status document.Status `orm:"status" json:"status" dc:"状态"`
|
||||
@@ -1,7 +1,7 @@
|
||||
package entity
|
||||
|
||||
import (
|
||||
"rag/common/task"
|
||||
"rag/consts/task"
|
||||
|
||||
"gitea.com/red-future/common/beans"
|
||||
)
|
||||
|
||||
@@ -5,9 +5,9 @@ import (
|
||||
"errors"
|
||||
"fmt"
|
||||
"rag/common/eino"
|
||||
"rag/common/task"
|
||||
"rag/consts/document"
|
||||
"rag/consts/public"
|
||||
"rag/consts/task"
|
||||
"rag/dao"
|
||||
"rag/model/dto"
|
||||
"rag/model/entity"
|
||||
@@ -123,8 +123,8 @@ func (s *documentService) List(ctx context.Context, req *dto.ListDocumentReq) (r
|
||||
return
|
||||
}
|
||||
|
||||
// Process 处理文件(使用eino框架切分和向量化)
|
||||
func (s *documentService) Process(ctx context.Context, req *dto.ProcessDocumentReq) (err error) {
|
||||
// Vector 处理文件(使用eino框架切分和向量化)
|
||||
func (s *documentService) Vector(ctx context.Context, req *dto.DocumentVectorReq) (err error) {
|
||||
// 1. 查询文件信息
|
||||
documentReq := dto.GetDocumentReq{Id: req.Id}
|
||||
doc, err := dao.Document.GetByID(ctx, &documentReq)
|
||||
@@ -403,9 +403,9 @@ func (s *documentService) sqlSplitDocument(ctx context.Context, doc *entity.Docu
|
||||
metaData[entity.DocumentCol.TenantId] = doc.TenantId
|
||||
metaData[entity.DocumentCol.Creator] = doc.Creator
|
||||
metaData[entity.DocumentCol.DatasetId] = doc.DatasetId
|
||||
metaData[entity.DocumentChunkCol.DocumentId] = doc.Id
|
||||
metaData[entity.DocumentChunkCol.ContentHash] = contentHash
|
||||
metaData[entity.DocumentChunkCol.ChunkIndex] = gconv.Int64(i)
|
||||
metaData[entity.DocumentVectorCol.DocumentId] = doc.Id
|
||||
metaData[entity.DocumentVectorCol.ContentHash] = contentHash
|
||||
metaData[entity.DocumentVectorCol.ChunkIndex] = gconv.Int64(i)
|
||||
t.MetaData = metaData
|
||||
docsChunk = append(docsChunk, t)
|
||||
}
|
||||
@@ -423,9 +423,9 @@ func (s *documentService) sqlSplitDocument(ctx context.Context, doc *entity.Docu
|
||||
|
||||
// 4. 发送消息到队列
|
||||
if len(docsChunk) > 0 {
|
||||
err = gmq.GetGmq("primary").GmqPublish(ctx, &mq.RedisPubMessage{
|
||||
err = gmq.GetGmq(public.GmqMsgPluginsName).GmqPublish(ctx, &mq.RedisPubMessage{
|
||||
PubMessage: types.PubMessage{
|
||||
Topic: public.KnowledgeDocumentChunkTopic,
|
||||
Topic: public.KnowledgeDocumentVectorTopic,
|
||||
Data: docsChunk,
|
||||
},
|
||||
})
|
||||
@@ -541,12 +541,12 @@ func (s *documentService) esSplitDocument(ctx context.Context, doc *entity.Docum
|
||||
continue
|
||||
}
|
||||
meiliDocs = append(meiliDocs, map[string]interface{}{
|
||||
entity.DocumentChunkCol.Id: contentHash,
|
||||
entity.DocumentChunkCol.DatasetId: doc.DatasetId,
|
||||
entity.DocumentChunkCol.DocumentId: doc.Id,
|
||||
entity.DocumentChunkCol.Content: t.Content,
|
||||
entity.DocumentChunkCol.ContentHash: contentHash,
|
||||
entity.DocumentChunkCol.ChunkIndex: i,
|
||||
entity.DocumentVectorCol.Id: contentHash,
|
||||
entity.DocumentVectorCol.DatasetId: doc.DatasetId,
|
||||
entity.DocumentVectorCol.DocumentId: doc.Id,
|
||||
entity.DocumentVectorCol.Content: t.Content,
|
||||
entity.DocumentVectorCol.ContentHash: contentHash,
|
||||
entity.DocumentVectorCol.ChunkIndex: i,
|
||||
})
|
||||
}
|
||||
|
||||
@@ -621,7 +621,7 @@ func (s *documentService) getHistoryData(ctx context.Context, doc *entity.Docume
|
||||
}
|
||||
|
||||
// 3. Redis 无数据:根据 contentKey 类型选择查询方式
|
||||
var dictData = make([]*dto.DocumentChunkRPC, 0)
|
||||
var dictData = make([]*dto.DocumentVectorRPC, 0)
|
||||
if public.KnowledgeContentHashSqlKey == contentKey {
|
||||
// SQL 方式:调用 HTTP 接口查询
|
||||
dictData, err = s.getHistoryDataFromHttp(ctx, doc)
|
||||
@@ -658,9 +658,9 @@ func (s *documentService) getHistoryData(ctx context.Context, doc *entity.Docume
|
||||
}
|
||||
|
||||
// getHistoryDataFromHttp 通过 HTTP 接口查询历史数据
|
||||
func (s *documentService) getHistoryDataFromHttp(ctx context.Context, doc *entity.Document) (dictData []*dto.DocumentChunkRPC, err error) {
|
||||
func (s *documentService) getHistoryDataFromHttp(ctx context.Context, doc *entity.Document) (dictData []*dto.DocumentVectorRPC, err error) {
|
||||
// 调用接口获取数据
|
||||
res, _, err := dao.DocumentChunk.List(ctx, &dto.ListDocumentChunkReq{
|
||||
res, _, err := dao.DocumentVector.List(ctx, &dto.ListDocumentVectorReq{
|
||||
DatasetId: doc.DatasetId,
|
||||
Status: gconv.PtrInt8(1),
|
||||
})
|
||||
@@ -669,7 +669,7 @@ func (s *documentService) getHistoryDataFromHttp(ctx context.Context, doc *entit
|
||||
}
|
||||
|
||||
// getHistoryDataFromMeilisearch 通过 meilisearch 查询历史数据
|
||||
func (s *documentService) getHistoryDataFromMeilisearch(ctx context.Context, doc *entity.Document) (dictData []*dto.DocumentChunkRPC, err error) {
|
||||
func (s *documentService) getHistoryDataFromMeilisearch(ctx context.Context, doc *entity.Document) (dictData []*dto.DocumentVectorRPC, err error) {
|
||||
// 构建 meilisearch 查询参数
|
||||
searchParams := &meilisearch.SearchParams{
|
||||
Filter: fmt.Sprintf("datasetId = %d", doc.DatasetId),
|
||||
@@ -684,9 +684,9 @@ func (s *documentService) getHistoryDataFromMeilisearch(ctx context.Context, doc
|
||||
}
|
||||
|
||||
// 转换查询结果
|
||||
dictData = make([]*dto.DocumentChunkRPC, 0)
|
||||
dictData = make([]*dto.DocumentVectorRPC, 0)
|
||||
for _, hit := range hits {
|
||||
item := &dto.DocumentChunkRPC{}
|
||||
item := &dto.DocumentVectorRPC{}
|
||||
if err = gconv.Struct(hit, item); err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
@@ -1,84 +0,0 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"rag/common/eino"
|
||||
"rag/common/task"
|
||||
"rag/dao"
|
||||
"rag/model/dto"
|
||||
"rag/model/entity"
|
||||
|
||||
"gitea.com/red-future/common/beans"
|
||||
"github.com/cloudwego/eino/components/indexer"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
"github.com/gogf/gf/v2/util/gconv"
|
||||
)
|
||||
|
||||
var DocumentChunk = new(documentChunkService)
|
||||
|
||||
type documentChunkService struct{}
|
||||
|
||||
// Update 更新文件块
|
||||
func (s *documentChunkService) Update(ctx context.Context, req *dto.UpdateDocumentChunkReq) (err error) {
|
||||
_, err = dao.DocumentChunk.Update(ctx, req)
|
||||
return
|
||||
}
|
||||
|
||||
// List 获取文件块列表
|
||||
func (s *documentChunkService) List(ctx context.Context, req *dto.ListDocumentChunkReq) (res *dto.ListDocumentChunkRes, err error) {
|
||||
list, total, err := dao.DocumentChunk.List(ctx, req)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
res = &dto.ListDocumentChunkRes{
|
||||
Total: total,
|
||||
}
|
||||
err = gconv.Struct(list, &res.List)
|
||||
return
|
||||
}
|
||||
|
||||
func (s *documentChunkService) DocsChunkMsg(ctx context.Context, msg any) (err error) {
|
||||
var docs = make([]*schema.Document, 0)
|
||||
msgMap := gconv.Map(msg)
|
||||
if err = gconv.Structs(msgMap["data"], &docs); err != nil {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", err)
|
||||
return
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", "msg is empty")
|
||||
return
|
||||
}
|
||||
ctx = context.WithValue(ctx, "user", &beans.User{
|
||||
TenantId: gconv.Uint64(docs[0].MetaData[entity.DocumentChunkCol.TenantId]),
|
||||
UserName: gconv.String(docs[0].MetaData[entity.DocumentChunkCol.Creator]),
|
||||
})
|
||||
idx := eino.NewPGVectorIndexer(&eino.PGVectorIndexerOptions{
|
||||
BatchSize: 10,
|
||||
})
|
||||
documentId := gconv.Int64(docs[0].MetaData[entity.DocumentChunkCol.DocumentId])
|
||||
rows, err := idx.Store(ctx, docs, indexer.WithEmbedding(eino.EmbedderDashscope))
|
||||
if err != nil || rows == 0 {
|
||||
g.Log().Error(ctx, "DocsChunkMsg rows: , err:", rows, err)
|
||||
// 写入任务进度失败 任务类型为sql存储
|
||||
remark := " 向量存储数量: " + gconv.String(rows)
|
||||
if err != nil {
|
||||
remark = "向量存储失败: " + err.Error()
|
||||
}
|
||||
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||
TaskId: documentId,
|
||||
TaskType: task.TaskTypeGenerateVector,
|
||||
Status: task.TaskStatusFailed,
|
||||
Remark: remark,
|
||||
})
|
||||
return
|
||||
}
|
||||
// 写入任务进度成功 任务类型为sql存储
|
||||
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||
TaskId: documentId,
|
||||
TaskType: task.TaskTypeGenerateVector,
|
||||
Status: task.TaskStatusCompleted,
|
||||
Remark: "向量生成完成",
|
||||
})
|
||||
return
|
||||
}
|
||||
129
service/document_vector.go
Normal file
129
service/document_vector.go
Normal file
@@ -0,0 +1,129 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"rag/common/eino"
|
||||
"rag/consts/task"
|
||||
"rag/dao"
|
||||
"rag/model/dto"
|
||||
"rag/model/entity"
|
||||
|
||||
"gitea.com/red-future/common/beans"
|
||||
"github.com/cloudwego/eino/components/indexer"
|
||||
"github.com/cloudwego/eino/components/retriever"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
"github.com/gogf/gf/v2/util/gconv"
|
||||
)
|
||||
|
||||
var DocumentVector = new(documentVectorService)
|
||||
|
||||
type documentVectorService struct{}
|
||||
|
||||
// Query 执行RAG查询
|
||||
func (s *documentVectorService) Query(ctx context.Context, req *dto.RAGQueryReq) (*dto.RAGQueryRes, error) {
|
||||
if req.TopK <= 0 {
|
||||
req.TopK = 5
|
||||
}
|
||||
|
||||
// 4. 使用向量检索器进行查询
|
||||
r, err := eino.NewPGVectorRetriever(&eino.PGVectorRetrieverConfig{
|
||||
Embedder: eino.EmbedderDashscope,
|
||||
DefaultTopK: req.TopK,
|
||||
})
|
||||
if err != nil {
|
||||
g.Log().Errorf(ctx, "初始化向量检索器失败: %v", err)
|
||||
return nil, fmt.Errorf("初始化向量检索器失败: %w", err)
|
||||
}
|
||||
|
||||
// 5. 执行向量检索
|
||||
docs, err := r.Retrieve(ctx, req.Content, retriever.WithEmbedding(eino.EmbedderDashscope), retriever.WithDSLInfo(map[string]any{
|
||||
"dataset_ids": req.DatasetIds,
|
||||
}))
|
||||
if err != nil {
|
||||
g.Log().Errorf(ctx, "向量检索失败: %v", err)
|
||||
return nil, fmt.Errorf("向量检索失败: %w", err)
|
||||
}
|
||||
|
||||
messages := make([]*schema.Message, 0)
|
||||
err = gconv.Struct(req.History, &messages)
|
||||
if err != nil {
|
||||
g.Log().Errorf(ctx, "转换历史消息失败: %v", err)
|
||||
return nil, fmt.Errorf("转换历史消息失败: %w", err)
|
||||
}
|
||||
|
||||
replyMsg, err := eino.NewChatModel(ctx, req.Content, docs, messages)
|
||||
if err != nil {
|
||||
g.Log().Errorf(ctx, "向量检索失败: %v", err)
|
||||
return nil, fmt.Errorf("向量检索失败: %w", err)
|
||||
}
|
||||
|
||||
return &dto.RAGQueryRes{
|
||||
Answer: replyMsg.Content,
|
||||
}, nil
|
||||
}
|
||||
|
||||
// Update 更新文件块
|
||||
func (s *documentVectorService) Update(ctx context.Context, req *dto.UpdateDocumentVectorReq) (err error) {
|
||||
_, err = dao.DocumentVector.Update(ctx, req)
|
||||
return
|
||||
}
|
||||
|
||||
// List 获取文件块列表
|
||||
func (s *documentVectorService) List(ctx context.Context, req *dto.ListDocumentVectorReq) (res *dto.ListDocumentVectorRes, err error) {
|
||||
list, total, err := dao.DocumentVector.List(ctx, req)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
res = &dto.ListDocumentVectorRes{
|
||||
Total: total,
|
||||
}
|
||||
err = gconv.Struct(list, &res.List)
|
||||
return
|
||||
}
|
||||
|
||||
func (s *documentVectorService) DocsChunkMsg(ctx context.Context, msg any) (err error) {
|
||||
var docs = make([]*schema.Document, 0)
|
||||
msgMap := gconv.Map(msg)
|
||||
if err = gconv.Structs(msgMap["data"], &docs); err != nil {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", err)
|
||||
return
|
||||
}
|
||||
if len(docs) == 0 {
|
||||
g.Log().Error(ctx, "DocsChunkMsg err:", "msg is empty")
|
||||
return
|
||||
}
|
||||
ctx = context.WithValue(ctx, "user", &beans.User{
|
||||
TenantId: gconv.Uint64(docs[0].MetaData[entity.DocumentVectorCol.TenantId]),
|
||||
UserName: gconv.String(docs[0].MetaData[entity.DocumentVectorCol.Creator]),
|
||||
})
|
||||
idx := eino.NewPGVectorIndexer(&eino.PGVectorIndexerOptions{
|
||||
BatchSize: 10,
|
||||
})
|
||||
documentId := gconv.Int64(docs[0].MetaData[entity.DocumentVectorCol.DocumentId])
|
||||
rows, err := idx.Store(ctx, docs, indexer.WithEmbedding(eino.EmbedderDashscope))
|
||||
if err != nil || rows == 0 {
|
||||
g.Log().Error(ctx, "DocsChunkMsg rows: , err:", rows, err)
|
||||
// 写入任务进度失败 任务类型为sql存储
|
||||
remark := " 向量存储数量: " + gconv.String(rows)
|
||||
if err != nil {
|
||||
remark = "向量存储失败: " + err.Error()
|
||||
}
|
||||
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||
TaskId: documentId,
|
||||
TaskType: task.TaskTypeGenerateVector,
|
||||
Status: task.TaskStatusFailed,
|
||||
Remark: remark,
|
||||
})
|
||||
return
|
||||
}
|
||||
// 写入任务进度成功 任务类型为sql存储
|
||||
err = Task.WriteTaskProgress(ctx, &dto.WriteTaskProgressReq{
|
||||
TaskId: documentId,
|
||||
TaskType: task.TaskTypeGenerateVector,
|
||||
Status: task.TaskStatusCompleted,
|
||||
Remark: "向量生成完成",
|
||||
})
|
||||
return
|
||||
}
|
||||
@@ -1,60 +0,0 @@
|
||||
package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
"rag/common/eino"
|
||||
"rag/model/dto"
|
||||
|
||||
"github.com/cloudwego/eino/components/retriever"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
"github.com/gogf/gf/v2/os/glog"
|
||||
"github.com/gogf/gf/v2/util/gconv"
|
||||
)
|
||||
|
||||
var RAGQuery = new(ragQueryService)
|
||||
|
||||
type ragQueryService struct{}
|
||||
|
||||
// Query 执行RAG查询
|
||||
func (s *ragQueryService) Query(ctx context.Context, req *dto.RAGQueryReq) (*dto.RAGQueryRes, error) {
|
||||
if req.TopK <= 0 {
|
||||
req.TopK = 5
|
||||
}
|
||||
|
||||
// 4. 使用向量检索器进行查询
|
||||
r, err := eino.NewPGVectorRetriever(&eino.PGVectorRetrieverConfig{
|
||||
Embedder: eino.EmbedderDashscope,
|
||||
DefaultTopK: req.TopK,
|
||||
})
|
||||
if err != nil {
|
||||
glog.Errorf(ctx, "初始化向量检索器失败: %v", err)
|
||||
return nil, fmt.Errorf("初始化向量检索器失败: %w", err)
|
||||
}
|
||||
|
||||
// 5. 执行向量检索
|
||||
docs, err := r.Retrieve(ctx, req.Content, retriever.WithEmbedding(eino.EmbedderDashscope), retriever.WithDSLInfo(map[string]any{
|
||||
"dataset_ids": req.DatasetIds,
|
||||
}))
|
||||
if err != nil {
|
||||
glog.Errorf(ctx, "向量检索失败: %v", err)
|
||||
return nil, fmt.Errorf("向量检索失败: %w", err)
|
||||
}
|
||||
|
||||
messages := make([]*schema.Message, 0)
|
||||
err = gconv.Struct(req.History, &messages)
|
||||
if err != nil {
|
||||
glog.Errorf(ctx, "转换历史消息失败: %v", err)
|
||||
return nil, fmt.Errorf("转换历史消息失败: %w", err)
|
||||
}
|
||||
|
||||
replyMsg, err := eino.NewChatModel(ctx, req.Content, docs, messages)
|
||||
if err != nil {
|
||||
glog.Errorf(ctx, "向量检索失败: %v", err)
|
||||
return nil, fmt.Errorf("向量检索失败: %w", err)
|
||||
}
|
||||
|
||||
return &dto.RAGQueryRes{
|
||||
Answer: replyMsg.Content,
|
||||
}, nil
|
||||
}
|
||||
@@ -2,11 +2,10 @@ package service
|
||||
|
||||
import (
|
||||
"context"
|
||||
"rag/consts/task"
|
||||
"rag/dao"
|
||||
"rag/model/dto"
|
||||
|
||||
"rag/common/task"
|
||||
|
||||
"gitea.com/red-future/common/db/gfdb"
|
||||
"github.com/gogf/gf/v2/database/gdb"
|
||||
"github.com/gogf/gf/v2/frame/g"
|
||||
|
||||
52
update.sql
52
update.sql
@@ -260,12 +260,12 @@ COMMENT ON COLUMN rag_vector_dataset_index.description IS '描述';
|
||||
|
||||
--------------------pgsql创建rag_vector_dataset_index表语句---------------------------
|
||||
|
||||
--------------------pgsql创建rag_vector_document_chunk表语句---------------------------
|
||||
--------------------pgsql创建rag_vector_document_vector表语句---------------------------
|
||||
|
||||
CREATE EXTENSION IF NOT EXISTS vector;
|
||||
|
||||
-- 文档分块向量表
|
||||
CREATE TABLE IF NOT EXISTS rag_vector_document_chunk (
|
||||
CREATE TABLE IF NOT EXISTS rag_vector_document_vector (
|
||||
-- 基础字段
|
||||
id BIGINT PRIMARY KEY, -- 主键ID(非自增)
|
||||
tenant_id BIGINT NOT NULL DEFAULT 0, -- 租户ID int8
|
||||
@@ -292,30 +292,30 @@ CREATE TABLE IF NOT EXISTS rag_vector_document_chunk (
|
||||
);
|
||||
|
||||
-- 索引
|
||||
CREATE INDEX idx_chunk_tenant_id ON rag_vector_document_chunk(tenant_id);
|
||||
CREATE INDEX idx_chunk_dataset_id ON rag_vector_document_chunk(dataset_id);
|
||||
CREATE INDEX idx_chunk_document_id ON rag_vector_document_chunk(document_id);
|
||||
CREATE INDEX idx_chunk_content_hash ON rag_vector_document_chunk(content_hash);
|
||||
CREATE INDEX idx_chunk_status ON rag_vector_document_chunk(status);
|
||||
CREATE INDEX idx_chunk_vector_status ON rag_vector_document_chunk(vector_status);
|
||||
CREATE INDEX idx_vector_tenant_id ON rag_vector_document_vector(tenant_id);
|
||||
CREATE INDEX idx_vector_dataset_id ON rag_vector_document_vector(dataset_id);
|
||||
CREATE INDEX idx_vector_document_id ON rag_vector_document_vector(document_id);
|
||||
CREATE INDEX idx_vector_content_hash ON rag_vector_document_vector(content_hash);
|
||||
CREATE INDEX idx_vector_status ON rag_vector_document_vector(status);
|
||||
CREATE INDEX idx_vector_vector_status ON rag_vector_document_vector(vector_status);
|
||||
|
||||
-- 注释
|
||||
COMMENT ON TABLE rag_vector_document_chunk IS '文档分块向量表';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.id IS '主键ID(非自增)';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.tenant_id IS '租户ID';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.creator IS '创建人';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.created_at IS '创建时间';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.updater IS '更新人';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.updated_at IS '更新时间';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.deleted_at IS '删除时间(软删)';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.status IS '状态';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.vector_status IS '向量生成状态';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.dataset_id IS '数据集ID';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.document_id IS '文档ID';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.content IS '分块内容';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.content_hash IS '内容哈希';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.chunk_index IS '分块序号';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.vector IS '向量数据';
|
||||
COMMENT ON COLUMN rag_vector_document_chunk.metadata IS '扩展元数据';
|
||||
COMMENT ON TABLE rag_vector_document_vector IS '文档分块向量表';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.id IS '主键ID(非自增)';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.tenant_id IS '租户ID';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.creator IS '创建人';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.created_at IS '创建时间';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.updater IS '更新人';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.updated_at IS '更新时间';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.deleted_at IS '删除时间(软删)';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.status IS '状态';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.vector_status IS '向量生成状态';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.dataset_id IS '数据集ID';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.document_id IS '文档ID';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.content IS '分块内容';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.content_hash IS '内容哈希';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.chunk_index IS '分块序号';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.vector IS '向量数据';
|
||||
COMMENT ON COLUMN rag_vector_document_vector.metadata IS '扩展元数据';
|
||||
|
||||
--------------------pgsql创建rag_vector_document_chunk表语句---------------------------
|
||||
--------------------pgsql创建rag_vector_document_vector表语句---------------------------
|
||||
Reference in New Issue
Block a user