52 lines
1.4 KiB
Go
52 lines
1.4 KiB
Go
package eino
|
|
|
|
import (
|
|
"context"
|
|
"fmt"
|
|
|
|
"gitea.com/red-future/common/utils"
|
|
"github.com/cloudwego/eino-ext/components/document/loader/url"
|
|
"github.com/cloudwego/eino-ext/components/document/parser/docx"
|
|
"github.com/cloudwego/eino-ext/components/document/parser/pdf"
|
|
"github.com/cloudwego/eino-ext/components/document/parser/xlsx"
|
|
"github.com/cloudwego/eino/components/document"
|
|
"github.com/cloudwego/eino/components/document/parser"
|
|
"github.com/cloudwego/eino/schema"
|
|
)
|
|
|
|
// LoadDocument 业务函数:加载文件
|
|
func LoadDocument(ctx context.Context, filePath, fileFormat string) (docs []*schema.Document, err error) {
|
|
p, err := docsParser(ctx, fileFormat)
|
|
if err != nil {
|
|
return
|
|
}
|
|
loader, err := url.NewLoader(ctx, &url.LoaderConfig{
|
|
Parser: p,
|
|
})
|
|
imageUrl, err := utils.GetFileAddressPrefix(ctx)
|
|
if err != nil {
|
|
return
|
|
}
|
|
docs, err = loader.Load(context.Background(), document.Source{
|
|
URI: fmt.Sprintf("%s%s", imageUrl, filePath),
|
|
})
|
|
return
|
|
}
|
|
|
|
func docsParser(ctx context.Context, fileFormat string) (p parser.Parser, err error) {
|
|
switch fileFormat {
|
|
case "docx":
|
|
p, err = docx.NewDocxParser(ctx, &docx.Config{
|
|
ToSections: true,
|
|
IncludeHeaders: true,
|
|
IncludeFooters: true,
|
|
IncludeTables: true,
|
|
})
|
|
case "pdf":
|
|
p, err = pdf.NewPDFParser(ctx, &pdf.Config{})
|
|
case "xlsx":
|
|
p, err = xlsx.NewXlsxParser(ctx, &xlsx.Config{})
|
|
}
|
|
return
|
|
}
|