fix: GSE数据文件从gse/dict目录加载
This commit is contained in:
15
gse/dict/README.md
Normal file
15
gse/dict/README.md
Normal file
@@ -0,0 +1,15 @@
|
|||||||
|
Some dict/zh data is from [github.com/fxsjy/jieba](https://github.com/fxsjy/jieba)
|
||||||
|
|
||||||
|
Updated on 2023-11-16:
|
||||||
|
|
||||||
|
Added two new dict documents, which come from [github.com/GuocaiL/nlp_corpus](https://github.com/GuocaiL/nlp_corpus)
|
||||||
|
|
||||||
|
Generated from `nlp_corpus/open_ner_data/boson/boson.txt`, `open_ner_data/people_daily/people_daily_ner.txt`, `open_ner_data/tianchi_yiyao/train.txt`, and `open_ner_data/ResumeNER/dev.txt`
|
||||||
|
|
||||||
|
1. tf_idf.txt
|
||||||
|
|
||||||
|
The first column of this document is the term, the second column is the word frequency of the corresponding term, and the third column is the inverse document frequency of the corresponding term.
|
||||||
|
|
||||||
|
2. tf_idf_origin.txt
|
||||||
|
|
||||||
|
The original corpus text.
|
||||||
0
gse/dict/en/dict.txt
Normal file
0
gse/dict/en/dict.txt
Normal file
1
gse/dict/jp/README.md
Normal file
1
gse/dict/jp/README.md
Normal file
@@ -0,0 +1 @@
|
|||||||
|
dict.txt 通过内部工具生成, Copyright 2017 ego authors. 商用和拷贝请注明来源和版权
|
||||||
885298
gse/dict/jp/dict.txt
Normal file
885298
gse/dict/jp/dict.txt
Normal file
File diff suppressed because it is too large
Load Diff
270132
gse/dict/zh/idf.txt
Normal file
270132
gse/dict/zh/idf.txt
Normal file
File diff suppressed because it is too large
Load Diff
352279
gse/dict/zh/s_1.txt
Normal file
352279
gse/dict/zh/s_1.txt
Normal file
File diff suppressed because it is too large
Load Diff
1161
gse/dict/zh/stop_tokens.txt
Normal file
1161
gse/dict/zh/stop_tokens.txt
Normal file
File diff suppressed because it is too large
Load Diff
88
gse/dict/zh/stop_word.txt
Normal file
88
gse/dict/zh/stop_word.txt
Normal file
@@ -0,0 +1,88 @@
|
|||||||
|
,
|
||||||
|
.
|
||||||
|
?
|
||||||
|
!
|
||||||
|
"
|
||||||
|
@
|
||||||
|
,
|
||||||
|
。
|
||||||
|
、
|
||||||
|
?
|
||||||
|
!
|
||||||
|
:
|
||||||
|
“
|
||||||
|
”
|
||||||
|
;
|
||||||
|
|
||||||
|
(
|
||||||
|
)
|
||||||
|
《
|
||||||
|
》
|
||||||
|
~
|
||||||
|
*
|
||||||
|
<
|
||||||
|
>
|
||||||
|
/
|
||||||
|
\
|
||||||
|
|
|
||||||
|
-
|
||||||
|
_
|
||||||
|
+
|
||||||
|
=
|
||||||
|
&
|
||||||
|
^
|
||||||
|
%
|
||||||
|
#
|
||||||
|
`
|
||||||
|
;
|
||||||
|
$
|
||||||
|
¥
|
||||||
|
‘
|
||||||
|
’
|
||||||
|
〉
|
||||||
|
〈
|
||||||
|
…
|
||||||
|
>
|
||||||
|
<
|
||||||
|
@
|
||||||
|
#
|
||||||
|
$
|
||||||
|
%
|
||||||
|
︿
|
||||||
|
&
|
||||||
|
*
|
||||||
|
+
|
||||||
|
~
|
||||||
|
|
|
||||||
|
[
|
||||||
|
]
|
||||||
|
{
|
||||||
|
}
|
||||||
|
啊
|
||||||
|
阿
|
||||||
|
哎
|
||||||
|
哎呀
|
||||||
|
哎哟
|
||||||
|
唉
|
||||||
|
俺
|
||||||
|
俺们
|
||||||
|
按
|
||||||
|
按照
|
||||||
|
吧
|
||||||
|
吧哒
|
||||||
|
把
|
||||||
|
罢了
|
||||||
|
被
|
||||||
|
本
|
||||||
|
本着
|
||||||
|
比
|
||||||
|
比方
|
||||||
|
比如
|
||||||
|
鄙人
|
||||||
|
彼
|
||||||
|
彼此
|
||||||
|
边
|
||||||
|
别
|
||||||
|
别的
|
||||||
|
别说
|
||||||
|
并
|
||||||
236754
gse/dict/zh/t_1.txt
Normal file
236754
gse/dict/zh/t_1.txt
Normal file
File diff suppressed because it is too large
Load Diff
107536
gse/dict/zh/tf_idf.txt
Normal file
107536
gse/dict/zh/tf_idf.txt
Normal file
File diff suppressed because it is too large
Load Diff
33450
gse/dict/zh/tf_idf_origin.txt
Normal file
33450
gse/dict/zh/tf_idf_origin.txt
Normal file
File diff suppressed because one or more lines are too long
@@ -2,7 +2,6 @@ package utils
|
|||||||
|
|
||||||
import (
|
import (
|
||||||
"context"
|
"context"
|
||||||
"os"
|
|
||||||
"path/filepath"
|
"path/filepath"
|
||||||
"sort"
|
"sort"
|
||||||
"sync"
|
"sync"
|
||||||
@@ -38,23 +37,43 @@ type gseTool struct {
|
|||||||
tr *extracker.TextRanker
|
tr *extracker.TextRanker
|
||||||
}
|
}
|
||||||
|
|
||||||
// newGseTool 初始化工具(内置词典 + 停用词)
|
// newGseTool 初始化工具(使用外部数据文件)
|
||||||
func newGseTool() (tool *gseTool, err error) {
|
func newGseTool() (tool *gseTool, err error) {
|
||||||
// 1. 初始化分词器
|
// 1. 初始化分词器
|
||||||
var seg gse.Segmenter
|
var seg gse.Segmenter
|
||||||
|
|
||||||
// 获取GSE数据文件路径
|
// gse数据文件在可执行文件同级的gse/dict目录下
|
||||||
gseDataPath := os.Getenv("GSE_DATA_PATH")
|
gseDataPath := "gse"
|
||||||
|
|
||||||
if gseDataPath != "" {
|
|
||||||
// 使用外部数据文件
|
|
||||||
dictPath := filepath.Join(gseDataPath, "dict", "zh")
|
dictPath := filepath.Join(gseDataPath, "dict", "zh")
|
||||||
idfPath := filepath.Join(gseDataPath, "dict", "zh", "idf.txt")
|
idfPath := filepath.Join(dictPath, "idf.txt")
|
||||||
stopPath := filepath.Join(gseDataPath, "dict", "zh", "stop.txt")
|
stopPath := filepath.Join(dictPath, "stop.txt")
|
||||||
|
|
||||||
// 加载词典
|
// 加载词典
|
||||||
err = seg.LoadDict(filepath.Join(dictPath, "dict.txt"))
|
err = seg.LoadDict(filepath.Join(dictPath, "dict.txt"))
|
||||||
if err != nil {
|
if err != nil {
|
||||||
|
glog.Warning(context.Background(), "加载gse词典失败,尝试embed模式:", err)
|
||||||
|
// 回退到embed模式
|
||||||
|
err = seg.LoadDictEmbed()
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
err = seg.LoadStopEmbed()
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
tfidf := &extracker.TagExtracter{}
|
||||||
|
tfidf.WithGse(seg)
|
||||||
|
err = tfidf.LoadIdf()
|
||||||
|
if err != nil {
|
||||||
|
return
|
||||||
|
}
|
||||||
|
tr := &extracker.TextRanker{}
|
||||||
|
tr.WithGse(seg)
|
||||||
|
tool = &gseTool{
|
||||||
|
seg: seg,
|
||||||
|
tfidf: tfidf,
|
||||||
|
tr: tr,
|
||||||
|
}
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -81,36 +100,6 @@ func newGseTool() (tool *gseTool, err error) {
|
|||||||
tfidf: tfidf,
|
tfidf: tfidf,
|
||||||
tr: tr,
|
tr: tr,
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// 使用内置embed数据
|
|
||||||
err = seg.LoadDictEmbed()
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
// 内置停用词(v1.0.2 标准)
|
|
||||||
err = seg.LoadStopEmbed()
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// 2. 初始化 TF-IDF 提取器
|
|
||||||
tfidf := &extracker.TagExtracter{}
|
|
||||||
tfidf.WithGse(seg)
|
|
||||||
err = tfidf.LoadIdf()
|
|
||||||
if err != nil {
|
|
||||||
return
|
|
||||||
}
|
|
||||||
|
|
||||||
// 3. 初始化 TextRank 提取器
|
|
||||||
tr := &extracker.TextRanker{}
|
|
||||||
tr.WithGse(seg)
|
|
||||||
|
|
||||||
tool = &gseTool{
|
|
||||||
seg: seg,
|
|
||||||
tfidf: tfidf,
|
|
||||||
tr: tr,
|
|
||||||
}
|
|
||||||
}
|
|
||||||
return
|
return
|
||||||
}
|
}
|
||||||
|
|
||||||
Reference in New Issue
Block a user