feat: 集成Eino文档解析与嵌入功能
新增Eino相关依赖,支持docx、pdf、xlsx等格式的文档加载与解析,并集成了Dashscope嵌入模型。同时修复了部分DAO查询中的OmitEmpty配置。
This commit is contained in:
51
rag/eino/document_loader.go
Normal file
51
rag/eino/document_loader.go
Normal file
@@ -0,0 +1,51 @@
|
||||
package eino
|
||||
|
||||
import (
|
||||
"context"
|
||||
"fmt"
|
||||
|
||||
"gitea.com/red-future/common/utils"
|
||||
"github.com/cloudwego/eino-ext/components/document/loader/url"
|
||||
"github.com/cloudwego/eino-ext/components/document/parser/docx"
|
||||
"github.com/cloudwego/eino-ext/components/document/parser/pdf"
|
||||
"github.com/cloudwego/eino-ext/components/document/parser/xlsx"
|
||||
"github.com/cloudwego/eino/components/document"
|
||||
"github.com/cloudwego/eino/components/document/parser"
|
||||
"github.com/cloudwego/eino/schema"
|
||||
)
|
||||
|
||||
// LoadDocument 业务函数:加载文件
|
||||
func LoadDocument(ctx context.Context, filePath, fileFormat string) (docs []*schema.Document, err error) {
|
||||
p, err := docsParser(ctx, fileFormat)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
loader, err := url.NewLoader(ctx, &url.LoaderConfig{
|
||||
Parser: p,
|
||||
})
|
||||
imageUrl, err := utils.GetFileAddressPrefix(ctx)
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
docs, err = loader.Load(context.Background(), document.Source{
|
||||
URI: fmt.Sprintf("%s%s", imageUrl, filePath),
|
||||
})
|
||||
return
|
||||
}
|
||||
|
||||
func docsParser(ctx context.Context, fileFormat string) (p parser.Parser, err error) {
|
||||
switch fileFormat {
|
||||
case "docx":
|
||||
p, err = docx.NewDocxParser(ctx, &docx.Config{
|
||||
ToSections: true,
|
||||
IncludeHeaders: true,
|
||||
IncludeFooters: true,
|
||||
IncludeTables: true,
|
||||
})
|
||||
case "pdf":
|
||||
p, err = pdf.NewPDFParser(ctx, &pdf.Config{})
|
||||
case "xlsx":
|
||||
p, err = xlsx.NewXlsxParser(ctx, &xlsx.Config{})
|
||||
}
|
||||
return
|
||||
}
|
||||
46
rag/eino/embedding.go
Normal file
46
rag/eino/embedding.go
Normal file
@@ -0,0 +1,46 @@
|
||||
package eino
|
||||
|
||||
import (
	"context"
	"errors"

	"github.com/cloudwego/eino-ext/components/embedding/dashscope"
	"github.com/gogf/gf/v2/frame/g"
	"github.com/golang/glog"
)
|
||||
|
||||
// Package-level embedder, initialized only once.
// It is assigned in init, and only when the "eino.embedding" configuration
// entry is non-empty; otherwise it stays nil.
var (
	Embedder *dashscope.Embedder // exported for use by other modules
)
|
||||
|
||||
// init:程序启动时自动执行一次
|
||||
func init() {
|
||||
ctx := context.Background()
|
||||
if !g.Cfg().MustGet(ctx, "eino.embedding").IsEmpty() {
|
||||
var err error
|
||||
cfg := &dashscope.EmbeddingConfig{
|
||||
APIKey: g.Cfg().MustGet(ctx, "eino.embedding.apiKey").String(),
|
||||
Model: g.Cfg().MustGet(ctx, "eino.embedding.model").String(),
|
||||
}
|
||||
// 检查是否配置了 APIType,支持 "text_api" 和 "multi_modal_api"
|
||||
//if apiType := g.Cfg().MustGet(ctx, "eino.embedding.apiType").String(); apiType != "" {
|
||||
// apiTypeVal := dashscope.APIType(apiType)
|
||||
// cfg.APIType = &apiTypeVal
|
||||
//}
|
||||
Embedder, err = dashscope.NewEmbedder(ctx, cfg)
|
||||
if err != nil {
|
||||
glog.Fatalf("NewEmbedder of ark error: %v", err)
|
||||
}
|
||||
//embedding, err := embedder.EmbedStrings(ctx, []string{"hello world", "bye bye"})
|
||||
//if err != nil {
|
||||
// log.Printf("embedding error: %v\n", err)
|
||||
// return
|
||||
//}
|
||||
//
|
||||
//log.Printf("embedding: %v\n", embedding)
|
||||
}
|
||||
}
|
||||
|
||||
func EmbedStrings(ctx context.Context, texts []string) (embeddings [][]float64, err error) {
|
||||
return Embedder.EmbedStrings(ctx, texts)
|
||||
}
|
||||
47
rag/eino/embedding_batch.go
Normal file
47
rag/eino/embedding_batch.go
Normal file
@@ -0,0 +1,47 @@
|
||||
package eino
|
||||
|
||||
import (
|
||||
"context"
|
||||
|
||||
"github.com/cloudwego/eino/components/embedding"
|
||||
)
|
||||
|
||||
// BatchEmbedder is a wrapper around an embedder that enforces a batch-size limit.
type BatchEmbedder struct {
	// embedder is the wrapped embedder that EmbedStrings delegates to.
	embedder embedding.Embedder
	// batchSize is the maximum number of texts sent in a single call.
	batchSize int
}
|
||||
|
||||
// NewBatchEmbedder 创建支持批量限制的 embedding 包装器
|
||||
func NewBatchEmbedder(embedder embedding.Embedder, batchSize int) *BatchEmbedder {
|
||||
if batchSize <= 0 {
|
||||
batchSize = 10 // 默认每批 10 个
|
||||
}
|
||||
return &BatchEmbedder{
|
||||
embedder: embedder,
|
||||
batchSize: batchSize,
|
||||
}
|
||||
}
|
||||
|
||||
// EmbedStrings 分批调用 embedding
|
||||
func (b *BatchEmbedder) EmbedStrings(ctx context.Context, texts []string, opts ...embedding.Option) ([][]float64, error) {
|
||||
if len(texts) <= b.batchSize {
|
||||
return b.embedder.EmbedStrings(ctx, texts, opts...)
|
||||
}
|
||||
|
||||
var allEmbeddings [][]float64
|
||||
for i := 0; i < len(texts); i += b.batchSize {
|
||||
end := i + b.batchSize
|
||||
if end > len(texts) {
|
||||
end = len(texts)
|
||||
}
|
||||
|
||||
batch := texts[i:end]
|
||||
embeddings, err := b.embedder.EmbedStrings(ctx, batch, opts...)
|
||||
if err != nil {
|
||||
return nil, err
|
||||
}
|
||||
allEmbeddings = append(allEmbeddings, embeddings...)
|
||||
}
|
||||
return allEmbeddings, nil
|
||||
}
|
||||
Reference in New Issue
Block a user