From 5022e9115d68daf26781e223e857d655b1be32a5 Mon Sep 17 00:00:00 2001 From: qhd <1766646056@qq.com> Date: Wed, 8 Apr 2026 14:21:12 +0800 Subject: [PATCH] =?UTF-8?q?feat:=20=E6=B7=BB=E5=8A=A0=E5=88=86=E8=AF=8D?= =?UTF-8?q?=E5=B7=A5=E5=85=B7=E6=94=AF=E6=8C=81=E5=B9=B6=E6=9B=B4=E6=96=B0?= =?UTF-8?q?=E5=AD=97=E6=AE=B5=E5=91=BD=E5=90=8D?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- db/gfdb/gfdb.go | 1 + full-text-search/meilisearch/meilisearch.go | 2 +- go.mod | 2 + go.sum | 6 + utils/gse.go | 122 ++++++++++++++++++++ 5 files changed, 132 insertions(+), 1 deletion(-) create mode 100644 utils/gse.go diff --git a/db/gfdb/gfdb.go b/db/gfdb/gfdb.go index 1d990dc..9706a1e 100644 --- a/db/gfdb/gfdb.go +++ b/db/gfdb/gfdb.go @@ -414,6 +414,7 @@ var ( type Gfdb interface { Exec(ctx context.Context, sql string, args ...any) (sql.Result, error) + GetAll(ctx context.Context, sql string, args ...any) (gdb.Result, error) Model(ctx context.Context, tableNameOrStruct ...any) *model Transaction(ctx context.Context, f func(ctx context.Context, tx gdb.TX) error) error } diff --git a/full-text-search/meilisearch/meilisearch.go b/full-text-search/meilisearch/meilisearch.go index 708f8a0..2bb81bf 100644 --- a/full-text-search/meilisearch/meilisearch.go +++ b/full-text-search/meilisearch/meilisearch.go @@ -118,7 +118,7 @@ func (m *meilisearchDB) ensureIndexExists(client ms.ServiceManager, indexName st return err } - requiredFilterable := []string{"tenantId", "isDeleted", "datasetId", "creator", "updater"} + requiredFilterable := []string{"tenantId", "isDeleted", "dataset_id", "creator", "updater"} needUpdate := false // 检查是否缺少必要的 filterable attributes diff --git a/go.mod b/go.mod index 49d50b6..b86c7e3 100644 --- a/go.mod +++ b/go.mod @@ -5,6 +5,7 @@ go 1.26.0 require ( github.com/alibaba/sentinel-golang v1.0.4 github.com/bwmarrin/snowflake v0.3.0 + github.com/go-ego/gse v1.0.2 github.com/gogf/gf/contrib/registry/consul/v2 v2.9.5 
github.com/gogf/gf/contrib/trace/otlphttp/v2 v2.9.5 github.com/gogf/gf/v2 v2.9.5 @@ -125,6 +126,7 @@ require ( github.com/tklauser/go-sysconf v0.3.6 // indirect github.com/tklauser/numcpus v0.2.2 // indirect github.com/valyala/fastrand v1.1.0 // indirect + github.com/vcaesar/cedar v0.30.0 // indirect github.com/vmihailenco/msgpack/v5 v5.4.1 // indirect github.com/vmihailenco/tagparser/v2 v2.0.0 // indirect github.com/xdg-go/pbkdf2 v1.0.0 // indirect diff --git a/go.sum b/go.sum index 27b7bff..7a8b7ff 100644 --- a/go.sum +++ b/go.sum @@ -125,6 +125,8 @@ github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4 github.com/fsnotify/fsnotify v1.9.0 h1:2Ml+OJNzbYCTzsxtv8vKSFD9PbJjmhYF14k/jKC7S9k= github.com/fsnotify/fsnotify v1.9.0/go.mod h1:8jBTzvmWwFyi3Pb8djgCCO5IBqzKJ/Jwo8TRcHyHii0= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= +github.com/go-ego/gse v1.0.2 h1:+27lYFPhQEhA9igtdOsJPRKYL/k3TwYsxBF5jr6KFv4= +github.com/go-ego/gse v1.0.2/go.mod h1:Fy35G+q7VV7Et1zIKO8o/sW1kkugV3znXap/lF/11zc= github.com/go-ini/ini v1.67.0 h1:z6ZrTEZqSWOTyH2FlglNbNgARyHG8oLW9gMELqKr06A= github.com/go-ini/ini v1.67.0/go.mod h1:ByCAeIL28uOIIG0E3PJtZPDL8WnHpFKFOtgjp+3Ies8= github.com/go-kit/kit v0.8.0/go.mod h1:xBxKIO96dXMWWy0MnWVtmwkA9/13aqxPnvrjFYMA2as= @@ -591,6 +593,10 @@ github.com/urfave/cli v1.20.0/go.mod h1:70zkFmudgCuE/ngEzBv17Jvp/497gISqfk5gWijb github.com/urfave/cli v1.22.1/go.mod h1:Gos4lmkARVdJ6EkW0WaNv/tZAAMe9V7XWyB60NtXRu0= github.com/valyala/fastrand v1.1.0 h1:f+5HkLW4rsgzdNoleUOB69hyT9IlD2ZQh9GyDMfb5G8= github.com/valyala/fastrand v1.1.0/go.mod h1:HWqCzkrkg6QXT8V2EXWvXCoow7vLwOFN002oeRzjapQ= +github.com/vcaesar/cedar v0.30.0 h1:9fSDpM7FTjjUdPiBUUa0MWYMRGSEcqgFXvppZcZ4d7Y= +github.com/vcaesar/cedar v0.30.0/go.mod h1:lyuGvALuZZDPNXwpzv/9LyxW+8Y6faN7zauFezNsnik= +github.com/vcaesar/tt v0.20.1 h1:D/jUeeVCNbq3ad8M7hhtB3J9x5RZ6I1n1eZ0BJp7M+4= +github.com/vcaesar/tt v0.20.1/go.mod 
h1:cH2+AwGAJm19Wa6xvEa+0r+sXDJBT0QgNQey6mwqLeU=
 github.com/vmihailenco/msgpack/v5 v5.4.1 h1:cQriyiUvjTwOHg8QZaPihLWeRAAVoCpE00IUPn0Bjt8=
 github.com/vmihailenco/msgpack/v5 v5.4.1/go.mod h1:GaZTsDaehaPpQVyxrf5mtQlH+pc21PIudVV/E3rRQok=
 github.com/vmihailenco/tagparser/v2 v2.0.0 h1:y09buUbR+b5aycVFQs/g70pqKVZNBmxwAhO7/IwNM9g=
diff --git a/utils/gse.go b/utils/gse.go
new file mode 100644
index 0000000..64539a6
--- /dev/null
+++ b/utils/gse.go
@@ -0,0 +1,140 @@
+package utils
+
+import (
+	"context"
+	"fmt"
+	"sort"
+	"sync"
+
+	"github.com/go-ego/gse"
+	"github.com/go-ego/gse/hmm/extracker"
+	"github.com/go-ego/gse/hmm/segment"
+	"github.com/gogf/gf/v2/os/glog"
+)
+
+// Shared segmentation tool state. Nothing here is initialized automatically;
+// call InitGseTool before using GseTool.
+var (
+	// GseTool is the shared tool instance; it stays nil until InitGseTool has
+	// succeeded.
+	GseTool *gseTool
+	// once guards the single initialization attempt.
+	once sync.Once
+	// gseInitErr records the outcome of that attempt so that every later
+	// InitGseTool call reports it instead of a misleading nil.
+	gseInitErr error
+)
+
+// InitGseTool builds the shared GseTool on its first call; sync.Once makes it
+// safe for concurrent use. Initialization is attempted exactly once: when it
+// fails, the failure is logged once, GseTool stays nil, and every call returns
+// the same error.
+func InitGseTool(ctx context.Context) error {
+	once.Do(func() {
+		GseTool, gseInitErr = newGseTool()
+		if gseInitErr != nil {
+			glog.Error(ctx, "gse 分词工具初始化失败:", gseInitErr)
+		}
+	})
+	return gseInitErr
+}
+
+// gseTool bundles a gse segmenter with the TF-IDF and TextRank extractors that
+// were configured from it.
+type gseTool struct {
+	seg   gse.Segmenter
+	tfidf *extracker.TagExtracter
+	tr    *extracker.TextRanker
+}
+
+// newGseTool loads gse's embedded dictionary and embedded stop words, then
+// builds the TF-IDF and TextRank extractors on top of that segmenter. It
+// returns a nil tool together with a wrapped error if any load step fails.
+func newGseTool() (*gseTool, error) {
+	// 1. Segmenter: embedded dictionary and stop words, no file arguments.
+	var seg gse.Segmenter
+	if err := seg.LoadDictEmbed(); err != nil {
+		return nil, fmt.Errorf("load embedded dictionary: %w", err)
+	}
+	if err := seg.LoadStopEmbed(); err != nil {
+		return nil, fmt.Errorf("load embedded stop words: %w", err)
+	}
+
+	// 2. TF-IDF extractor.
+	tfidf := &extracker.TagExtracter{}
+	tfidf.WithGse(seg)
+	if err := tfidf.LoadIdf(); err != nil {
+		return nil, fmt.Errorf("load idf data: %w", err)
+	}
+
+	// 3. TextRank extractor.
+	tr := &extracker.TextRanker{}
+	tr.WithGse(seg)
+
+	return &gseTool{seg: seg, tfidf: tfidf, tr: tr}, nil
+}
+
+// Cut splits text into words, passing true for gse's HMM option.
+func (k *gseTool) Cut(text string) []string {
+	return k.seg.Cut(text, true)
+}
+
+// Keyword is one extracted keyword together with its weight.
+type Keyword struct {
+	Word  string  `json:"word"`
+	Score float64 `json:"score"`
+}
+
+// Extract runs both the TF-IDF and the TextRank extractor with the same topN
+// and returns the union of their keywords, highest score first. Keywords with
+// equal scores are ordered by word so that the output is deterministic.
+// A non-positive topN yields an empty, non-nil slice.
+//
+// NOTE(review): a word found by both extractors keeps its TextRank weight, and
+// the result can hold up to 2*topN entries; confirm both are intended.
+func (k *gseTool) Extract(text string, topN int) []Keyword {
+	if topN <= 0 {
+		// Nothing was requested, so skip the extractors entirely.
+		return []Keyword{}
+	}
+
+	// 1. TF-IDF keywords.
+	tfTags := k.extractTFIDF(text, topN)
+
+	// 2. TextRank keywords.
+	trTags := k.extractTextRank(text, topN)
+
+	// 3. Merge by word; TextRank is written last and therefore wins on overlap.
+	scoreMap := make(map[string]float64, len(tfTags)+len(trTags))
+	for _, tag := range tfTags {
+		scoreMap[tag.Text] = tag.Weight
+	}
+	for _, tag := range trTags {
+		scoreMap[tag.Text] = tag.Weight
+	}
+
+	res := make([]Keyword, 0, len(scoreMap))
+	for word, score := range scoreMap {
+		res = append(res, Keyword{Word: word, Score: score})
+	}
+
+	// Map iteration order is random, so ties need an explicit secondary key.
+	sort.Slice(res, func(i, j int) bool {
+		if res[i].Score != res[j].Score {
+			return res[i].Score > res[j].Score
+		}
+		return res[i].Word < res[j].Word
+	})
+
+	return res
+}
+
+// extractTFIDF passes text and topN to the TF-IDF extractor's ExtractTags.
+func (k *gseTool) extractTFIDF(text string, topN int) segment.Segments {
+	return k.tfidf.ExtractTags(text, topN)
+}
+
+// extractTextRank passes text and topN to the TextRank extractor's TextRank.
+func (k *gseTool) extractTextRank(text string, topN int) segment.Segments {
+	return k.tr.TextRank(text, topN)
+}