// Package eino provides document-splitting helpers built on the eino
// document transformer components.
package eino

import (
	"context"

	"github.com/cloudwego/eino-ext/components/document/transformer/splitter/recursive"
	"github.com/cloudwego/eino-ext/components/document/transformer/splitter/semantic"
	"github.com/cloudwego/eino/components/document"
	"github.com/cloudwego/eino/schema"
	"github.com/gogf/gf/v2/frame/g"
)

// Lazily built splitters, each created at most once per process.
//
// Every strategy has its own cache slot. A single shared variable would make
// whichever function runs first decide the strategy used by both of them.
//
// NOTE: the lazy initialization below is not guarded by a lock.
var (
	semanticSplitter  document.Transformer
	recursiveSplitter document.Transformer
)

// defaultSeparators returns the separator list used by both splitters,
// covering Chinese and English punctuation. A fresh slice is returned on each
// call so that no splitter config shares a backing array with another.
func defaultSeparators() []string {
	return []string{
		"\n\n", "\n",
		// Chinese: ideographic full stop, then full-width '!', '?' and ';'.
		"。", "\uff01", "\uff1f", "\uff1b",
		// English (ASCII).
		".", "!", "?", ";",
	}
}

// SemanticSplitDocument splits docs with the semantic splitter.
//
// The splitter is built on first use from the "eino.splitter.bufferSize",
// "eino.splitter.percentile" and "eino.splitter.batchSize" configuration keys
// and then reused. If construction fails, the error is returned, nothing is
// cached, and the next call tries again.
func SemanticSplitDocument(ctx context.Context, docs []*schema.Document) (res []*schema.Document, err error) {
	if semanticSplitter == nil {
		// NOTE(review): only batchSize gets a fallback below; bufferSize and
		// percentile are passed through exactly as configured. Confirm that
		// unset values are acceptable to semantic.NewSplitter.
		bufferSize := g.Cfg().MustGet(ctx, "eino.splitter.bufferSize").Int()
		percentile := g.Cfg().MustGet(ctx, "eino.splitter.percentile").Float64()
		batchSize := g.Cfg().MustGet(ctx, "eino.splitter.batchSize").Int()
		if batchSize <= 0 {
			// Per the original author's note: doubao-embedding-vision accepts
			// at most 10 items per batch.
			batchSize = 10
		}

		// Wrap the embedder so that embedding requests are sent in batches.
		created, createErr := semantic.NewSplitter(ctx, &semantic.Config{
			Embedding:  NewBatchEmbedder(Embedder, batchSize),
			BufferSize: bufferSize,
			Percentile: percentile,
			Separators: defaultSeparators(),
		})
		if createErr != nil {
			return nil, createErr
		}
		// Publish only after successful construction.
		semanticSplitter = created
	}
	return semanticSplitter.Transform(ctx, docs)
}

// RecursiveSplitDocument splits docs with the recursive splitter, using a
// chunk size of 1500, an overlap of 300 and recursive.KeepTypeNone.
//
// The splitter is built on first use and then reused. If construction fails,
// the error is returned, nothing is cached, and the next call tries again.
func RecursiveSplitDocument(ctx context.Context, docs []*schema.Document) (res []*schema.Document, err error) {
	if recursiveSplitter == nil {
		created, createErr := recursive.NewSplitter(ctx, &recursive.Config{
			ChunkSize:   1500,
			OverlapSize: 300,
			KeepType:    recursive.KeepTypeNone,
			Separators:  defaultSeparators(),
		})
		if createErr != nil {
			return nil, createErr
		}
		// Publish only after successful construction.
		recursiveSplitter = created
	}
	return recursiveSplitter.Transform(ctx, docs)
}