Files
common/rag/eino/document_loader.go
qhd bcbe6eba78 feat: 集成Eino文档解析与嵌入功能
新增Eino相关依赖,支持docx、pdf、xlsx等格式的文档加载与解析,并集成了Dashscope嵌入模型。同时修复了部分DAO查询中的OmitEmpty配置。
2026-03-28 18:24:15 +08:00

52 lines
1.4 KiB
Go

package eino
import (
"context"
"fmt"
"gitea.com/red-future/common/utils"
"github.com/cloudwego/eino-ext/components/document/loader/url"
"github.com/cloudwego/eino-ext/components/document/parser/docx"
"github.com/cloudwego/eino-ext/components/document/parser/pdf"
"github.com/cloudwego/eino-ext/components/document/parser/xlsx"
"github.com/cloudwego/eino/components/document"
"github.com/cloudwego/eino/components/document/parser"
"github.com/cloudwego/eino/schema"
)
// LoadDocument fetches the file identified by filePath and parses it into
// eino documents.
//
// The parser is chosen from fileFormat by docsParser. The URI handed to the
// URL loader is the prefix returned by utils.GetFileAddressPrefix with
// filePath appended to it.
//
// Parameters:
//   - ctx: forwarded to every step, including the final load.
//   - filePath: path of the file, appended as-is to the address prefix.
//   - fileFormat: format key passed to docsParser (e.g. "docx", "pdf", "xlsx").
//
// It returns the documents produced by the loader. If building the parser,
// building the loader, or resolving the address prefix fails, it returns nil
// documents together with that error.
func LoadDocument(ctx context.Context, filePath, fileFormat string) (docs []*schema.Document, err error) {
	p, err := docsParser(ctx, fileFormat)
	if err != nil {
		return nil, err
	}

	loader, err := url.NewLoader(ctx, &url.LoaderConfig{
		Parser: p,
	})
	if err != nil {
		// Stop here: without this check the error would be overwritten by the
		// next assignment to err and Load would be called on an unusable loader.
		return nil, err
	}

	addrPrefix, err := utils.GetFileAddressPrefix(ctx)
	if err != nil {
		return nil, err
	}

	// Use the caller's ctx rather than a fresh background context so that
	// cancellation, deadlines and context values reach the load as well.
	return loader.Load(ctx, document.Source{
		URI: fmt.Sprintf("%s%s", addrPrefix, filePath),
	})
}
// docsParser constructs the document parser that corresponds to fileFormat.
//
// The match is exact and case-sensitive:
//   - "docx": docx parser with ToSections, IncludeHeaders, IncludeFooters and
//     IncludeTables all enabled.
//   - "pdf":  PDF parser with an empty config.
//   - "xlsx": XLSX parser with an empty config.
//
// For any other value no constructor is called and both results are nil.
// NOTE(review): callers therefore receive a nil parser without an error for
// unrecognized formats — confirm that this is the intended behavior.
func docsParser(ctx context.Context, fileFormat string) (p parser.Parser, err error) {
	switch fileFormat {
	case "xlsx":
		return xlsx.NewXlsxParser(ctx, &xlsx.Config{})
	case "pdf":
		return pdf.NewPDFParser(ctx, &pdf.Config{})
	case "docx":
		docxCfg := &docx.Config{
			ToSections:     true,
			IncludeHeaders: true,
			IncludeFooters: true,
			IncludeTables:  true,
		}
		return docx.NewDocxParser(ctx, docxCfg)
	default:
		// Unrecognized format: leave both results at their zero values.
		return nil, nil
	}
}