feat: 添加分词工具支持并更新字段命名
This commit is contained in:
@@ -414,6 +414,7 @@ var (
|
||||
|
||||
type Gfdb interface {
|
||||
Exec(ctx context.Context, sql string, args ...any) (sql.Result, error)
|
||||
GetAll(ctx context.Context, sql string, args ...any) (gdb.Result, error)
|
||||
Model(ctx context.Context, tableNameOrStruct ...any) *model
|
||||
Transaction(ctx context.Context, f func(ctx context.Context, tx gdb.TX) error) error
|
||||
}
|
||||
|
||||
@@ -118,7 +118,7 @@ func (m *meilisearchDB) ensureIndexExists(client ms.ServiceManager, indexName st
|
||||
return err
|
||||
}
|
||||
|
||||
requiredFilterable := []string{"tenantId", "isDeleted", "datasetId", "creator", "updater"}
|
||||
requiredFilterable := []string{"tenantId", "isDeleted", "dataset_id", "creator", "updater"}
|
||||
needUpdate := false
|
||||
|
||||
// 检查是否缺少必要的 filterable attributes
|
||||
|
||||
2
go.mod
2
go.mod
@@ -5,6 +5,7 @@ go 1.26.0
|
||||
require (
|
||||
github.com/alibaba/sentinel-golang v1.0.4
|
||||
github.com/bwmarrin/snowflake v0.3.0
|
||||
github.com/go-ego/gse v1.0.2
|
||||
github.com/gogf/gf/contrib/registry/consul/v2 v2.9.5
|
||||
github.com/gogf/gf/contrib/trace/otlphttp/v2 v2.9.5
|
||||
github.com/gogf/gf/v2 v2.9.5
|
||||
@@ -125,6 +126,7 @@ require (
|
||||
github.com/tklauser/go-sysconf v0.3.6 // indirect
|
||||
github.com/tklauser/numcpus v0.2.2 // indirect
|
||||
github.com/valyala/fastrand v1.1.0 // indirect
|
||||
github.com/vcaesar/cedar v0.30.0 // indirect
|
||||
github.com/vmihailenco/msgpack/v5 v5.4.1 // indirect
|
||||
github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect
|
||||
github.com/xdg-go/pbkdf2 v1.0.0 // indirect
|
||||
|
||||
6
go.sum
6
go.sum
@@ -125,6 +125,8 @@ github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4
|
||||
github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k=
|
||||
github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0=
|
||||
github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04=
|
||||
github.com/go-ego/gse v1.0.2 h1:+27lYFPhQEhA9igtdOsJPRKYL/k3TwYsxBF5jr6KFv4=
|
||||
github.com/go-ego/gse v1.0.2/go.mod h1:Fy35G+q7VV7Et1zIKO8o/sW1kkugV3znXap/lF/11zc=
|
||||
github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A=
|
||||
github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8=
|
||||
github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as=
|
||||
@@ -591,6 +593,10 @@ github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijb
|
||||
github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0=
|
||||
github.com/valyala/fastrand v1.1.0 h1:f+5HkLW4rsgzdNoleUOB69hyT9IlD2ZQh9GyDMfb5G8=
|
||||
github.com/valyala/fastrand v1.1.0/go.mod h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ=
|
||||
github.com/vcaesar/cedar v0.30.0 h1:9fSDpM7FTjjUdPiBUUa0MWYMRGSEcqgFXvppZcZ4d7Y=
|
||||
github.com/vcaesar/cedar v0.30.0/go.mod h1:lyuGvALuZZDPNXwpzv/9LyxW+8Y6faN7zauFezNsnik=
|
||||
github.com/vcaesar/tt v0.20.1 h1:D/jUeeVCNbq3ad8M7hhtB3J9x5RZ6I1n1eZ0BJp7M+4=
|
||||
github.com/vcaesar/tt v0.20.1/go.mod h1:cH2+AwGAJm19Wa6xvEa+0r+sXDJBT0QgNQey6mwqLeU=
|
||||
github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8=
|
||||
github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok=
|
||||
github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
|
||||
|
||||
122
utils/gse.go
Normal file
122
utils/gse.go
Normal file
@@ -0,0 +1,122 @@
|
||||
package utils
|
||||
|
||||
import (
|
||||
"context"
|
||||
"sort"
|
||||
"sync"
|
||||
|
||||
"github.com/go-ego/gse"
|
||||
"github.com/go-ego/gse/hmm/extracker"
|
||||
"github.com/go-ego/gse/hmm/segment"
|
||||
"github.com/gogf/gf/v2/os/glog"
|
||||
)
|
||||
|
||||
// 全局工具实例(不再自动初始化)
|
||||
var (
|
||||
GseTool *gseTool
|
||||
once sync.Once // 保证只初始化一次,线程安全
|
||||
)
|
||||
|
||||
func InitGseTool(ctx context.Context) error {
|
||||
var err error
|
||||
once.Do(func() {
|
||||
// 只执行一次初始化
|
||||
GseTool, err = newGseTool()
|
||||
})
|
||||
if err != nil {
|
||||
glog.Error(ctx, "gse 分词工具初始化失败:", err)
|
||||
}
|
||||
return err
|
||||
}
|
||||
|
||||
// gseTool 关键词提取工具(gse v1.0.2 标准)
|
||||
type gseTool struct {
|
||||
seg gse.Segmenter
|
||||
tfidf *extracker.TagExtracter
|
||||
tr *extracker.TextRanker
|
||||
}
|
||||
|
||||
// newGseTool 初始化工具(内置词典 + 停用词)
|
||||
func newGseTool() (tool *gseTool, err error) {
|
||||
// 1. 初始化分词器
|
||||
var seg gse.Segmenter
|
||||
// 内置词典(无外部文件)
|
||||
err = seg.LoadDictEmbed()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
// 内置停用词(v1.0.2 标准)
|
||||
err = seg.LoadStopEmbed()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// 2. 初始化 TF-IDF 提取器
|
||||
tfidf := &extracker.TagExtracter{}
|
||||
tfidf.WithGse(seg)
|
||||
err = tfidf.LoadIdf()
|
||||
if err != nil {
|
||||
return
|
||||
}
|
||||
|
||||
// 3. 初始化 TextRank 提取器
|
||||
tr := &extracker.TextRanker{}
|
||||
tr.WithGse(seg)
|
||||
|
||||
tool = &gseTool{
|
||||
seg: seg,
|
||||
tfidf: tfidf,
|
||||
tr: tr,
|
||||
}
|
||||
return
|
||||
}
|
||||
|
||||
// Cut 分词(关键词提取唯一正确模式:精确模式 + HMM)
|
||||
func (k *gseTool) Cut(text string) []string {
|
||||
return k.seg.Cut(text, true)
|
||||
}
|
||||
|
||||
// Keyword is one entry of the final extraction output: a word together with
// its weight.
type Keyword struct {
	// Word is the keyword text.
	Word string `json:"word"`
	// Score is the weight assigned to the keyword.
	Score float64 `json:"score"`
}
|
||||
|
||||
func (k *gseTool) Extract(text string, topN int) []Keyword {
|
||||
// 1. 提取 TF-IDF
|
||||
tfTags := k.extractTFIDF(text, topN)
|
||||
|
||||
// 2. 提取 TextRank
|
||||
trTags := k.extractTextRank(text, topN)
|
||||
|
||||
// 3. 合并成最终关键词(业务最常用)
|
||||
scoreMap := make(map[string]float64)
|
||||
for _, tag := range tfTags {
|
||||
scoreMap[tag.Text] = tag.Weight
|
||||
}
|
||||
for _, tag := range trTags {
|
||||
scoreMap[tag.Text] = tag.Weight
|
||||
}
|
||||
|
||||
// 转成切片并排序(高分在前)
|
||||
res := make([]Keyword, 0, len(scoreMap))
|
||||
for word, score := range scoreMap {
|
||||
res = append(res, Keyword{Word: word, Score: score})
|
||||
}
|
||||
|
||||
sort.Slice(res, func(i, j int) bool {
|
||||
return res[i].Score > res[j].Score
|
||||
})
|
||||
|
||||
return res
|
||||
}
|
||||
|
||||
// ExtractTFIDF TF-IDF 关键词(带权重)90% 业务:文章标签、搜索、关键词
|
||||
func (k *gseTool) extractTFIDF(text string, topN int) segment.Segments {
|
||||
return k.tfidf.ExtractTags(text, topN)
|
||||
}
|
||||
|
||||
// ExtractTextRank TextRank 关键词(带权重)长文本、摘要、语义理解
|
||||
func (k *gseTool) extractTextRank(text string, topN int) segment.Segments {
|
||||
return k.tr.TextRank(text, topN)
|
||||
}
|
||||
Reference in New Issue
Block a user