fix: GSE数据文件从gse/dict目录加载

This commit is contained in:
2026-04-21 10:24:47 +08:00
parent f671096dbe
commit e051046f77
12 changed files with 1886750 additions and 47 deletions

15
gse/dict/README.md Normal file
View File

@@ -0,0 +1,15 @@
Some dict/zh data is from [github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)
Updated on 2023-11-16:
Added two new dict documents, which come from [github.com/GuocaiL/nlp_corpus](https://github.com/GuocaiL/nlp_corpus)
and were generated from `nlp_corpus/open_ner_data/boson/boson.txt`, `open_ner_data/people_daily/people_daily_ner.txt`, `open_ner_data/tianchi_yiyao/train.txt`, `open_ner_data/ResumeNER/dev.txt`
1. tf_idf.txt
The first column of this document is the term, the second column is the word frequency of the corresponding term, and the third column is the inverse document frequency of the corresponding term.
2. tf_idf_origin.txt
The original corpus text.

0
gse/dict/en/dict.txt Normal file
View File

1
gse/dict/jp/README.md Normal file
View File

@@ -0,0 +1 @@
dict.txt 通过内部工具生成, Copyright 2017 ego authors. 商用和拷贝请注明来源和版权

885298
gse/dict/jp/dict.txt Normal file

File diff suppressed because it is too large Load Diff

270132
gse/dict/zh/idf.txt Normal file

File diff suppressed because it is too large Load Diff

352279
gse/dict/zh/s_1.txt Normal file

File diff suppressed because it is too large Load Diff

1161
gse/dict/zh/stop_tokens.txt Normal file

File diff suppressed because it is too large Load Diff

88
gse/dict/zh/stop_word.txt Normal file
View File

@@ -0,0 +1,88 @@
,
.
?
!
"
@
 
~
*
<
>
/
\
|
-
_
+
=
&
^
%
#
`
;
$
︿
哎呀
哎哟
俺们
按照
吧哒
罢了
本着
比方
比如
鄙人
彼此
别的
别说

236754
gse/dict/zh/t_1.txt Normal file

File diff suppressed because it is too large Load Diff

107536
gse/dict/zh/tf_idf.txt Normal file

File diff suppressed because it is too large Load Diff

33450
gse/dict/zh/tf_idf_origin.txt Normal file

File diff suppressed because one or more lines are too long

View File

@@ -2,7 +2,6 @@ package utils
import ( import (
"context" "context"
"os"
"path/filepath" "path/filepath"
"sort" "sort"
"sync" "sync"
@@ -38,78 +37,68 @@ type gseTool struct {
tr *extracker.TextRanker tr *extracker.TextRanker
} }
// newGseTool 初始化工具(内置词典 + 停用词 // newGseTool 初始化工具(使用外部数据文件
func newGseTool() (tool *gseTool, err error) { func newGseTool() (tool *gseTool, err error) {
// 1. 初始化分词器 // 1. 初始化分词器
var seg gse.Segmenter var seg gse.Segmenter
// 获取GSE数据文件路径 // gse数据文件在可执行文件同级的gse/dict目录下
gseDataPath := os.Getenv("GSE_DATA_PATH") gseDataPath := "gse"
dictPath := filepath.Join(gseDataPath, "dict", "zh")
idfPath := filepath.Join(dictPath, "idf.txt")
stopPath := filepath.Join(dictPath, "stop.txt")
if gseDataPath != "" { // 加载词典
// 使用外部数据文件 err = seg.LoadDict(filepath.Join(dictPath, "dict.txt"))
dictPath := filepath.Join(gseDataPath, "dict", "zh") if err != nil {
idfPath := filepath.Join(gseDataPath, "dict", "zh", "idf.txt") glog.Warning(context.Background(), "加载gse词典失败尝试embed模式:", err)
stopPath := filepath.Join(gseDataPath, "dict", "zh", "stop.txt") // 回退到embed模式
// 加载词典
err = seg.LoadDict(filepath.Join(dictPath, "dict.txt"))
if err != nil {
return
}
// 加载停用词
err = seg.LoadStop(stopPath)
if err != nil {
glog.Warning(context.Background(), "加载停用词失败,继续:", err)
}
// 2. 初始化 TF-IDF 提取器
tfidf := &extracker.TagExtracter{}
tfidf.WithGse(seg)
err = tfidf.LoadIdf(idfPath)
if err != nil {
return
}
// 3. 初始化 TextRank 提取器
tr := &extracker.TextRanker{}
tr.WithGse(seg)
tool = &gseTool{
seg: seg,
tfidf: tfidf,
tr: tr,
}
} else {
// 使用内置embed数据
err = seg.LoadDictEmbed() err = seg.LoadDictEmbed()
if err != nil { if err != nil {
return return
} }
// 内置停用词v1.0.2 标准)
err = seg.LoadStopEmbed() err = seg.LoadStopEmbed()
if err != nil { if err != nil {
return return
} }
// 2. 初始化 TF-IDF 提取器
tfidf := &extracker.TagExtracter{} tfidf := &extracker.TagExtracter{}
tfidf.WithGse(seg) tfidf.WithGse(seg)
err = tfidf.LoadIdf() err = tfidf.LoadIdf()
if err != nil { if err != nil {
return return
} }
// 3. 初始化 TextRank 提取器
tr := &extracker.TextRanker{} tr := &extracker.TextRanker{}
tr.WithGse(seg) tr.WithGse(seg)
tool = &gseTool{ tool = &gseTool{
seg: seg, seg: seg,
tfidf: tfidf, tfidf: tfidf,
tr: tr, tr: tr,
} }
return
}
// 加载停用词
err = seg.LoadStop(stopPath)
if err != nil {
glog.Warning(context.Background(), "加载停用词失败,继续:", err)
}
// 2. 初始化 TF-IDF 提取器
tfidf := &extracker.TagExtracter{}
tfidf.WithGse(seg)
err = tfidf.LoadIdf(idfPath)
if err != nil {
return
}
// 3. 初始化 TextRank 提取器
tr := &extracker.TextRanker{}
tr.WithGse(seg)
tool = &gseTool{
seg: seg,
tfidf: tfidf,
tr: tr,
} }
return return
} }