From 26fa21ddafe19b268fd5f8ca5e89e6b02195c233 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E5=87=8C=E6=B5=B7=E4=BB=B2=E5=AD=90?= Date: Fri, 28 Oct 2022 23:26:05 +0800 Subject: [PATCH] =?UTF-8?q?=E5=88=86=E8=AF=8D=E5=B7=A5=E5=85=B7=E8=B0=83?= =?UTF-8?q?=E6=95=B4=E4=B8=BAgse?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api-test.http | 6 ++--- config.yaml | 2 +- go.mod | 3 +-- main.go | 2 +- searcher/words/tokenizer.go | 51 +++++++++++++++++++------------------ 5 files changed, 32 insertions(+), 32 deletions(-) diff --git a/api-test.http b/api-test.http index 855dcfb..8bab4fa 100644 --- a/api-test.http +++ b/api-test.http @@ -33,7 +33,7 @@ POST {{url}}/index/batch?database=default HTTP/1.1 [{ "id": 88887, - "text": "深圳南站", + "text": "深圳南站 Systems Engineering Services", "document": { "title": "阿森松岛所445", "number": 223 @@ -58,7 +58,7 @@ POST {{url}}/index/remove?database=default HTTP/1.1 POST http://192.168.0.254:5678/ HTTP/1.1 { - "query":"融七牛", + "query":"Engineering", "page":1, "limit":10, "order": "DESC", @@ -72,7 +72,7 @@ POST http://192.168.0.254:5678/ HTTP/1.1 POST {{url}}/query HTTP/1.1 { - "query":"北京", + "query":"Engineering", "page":1, "limit":10, "order": "DESC", diff --git a/config.yaml b/config.yaml index 3cf33bb..13f25b4 100644 --- a/config.yaml +++ b/config.yaml @@ -5,7 +5,7 @@ port: 5678 #数据目录 data: ./data #词典目录 -dictionary: ./data/dictionary.txt +dictionary: # 最大线程数 gomaxprocs: 4 diff --git a/go.mod b/go.mod index ff2626d..1a3ff1a 100644 --- a/go.mod +++ b/go.mod @@ -5,8 +5,8 @@ go 1.18 require ( github.com/Knetic/govaluate v3.0.0+incompatible github.com/emirpasic/gods v1.12.0 - github.com/gin-contrib/gzip v0.0.5 github.com/gin-gonic/gin v1.7.7 + github.com/go-ego/gse v0.70.2 github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46 github.com/shirou/gopsutil/v3 v3.22.4 github.com/syndtr/goleveldb v1.0.0 @@ -17,7 +17,6 @@ require ( require ( github.com/gin-contrib/sse v0.1.0 // indirect - github.com/go-ego/gse v0.70.2 // indirect github.com/go-ole/go-ole v1.2.6 // indirect github.com/go-playground/locales v0.13.0 // indirect github.com/go-playground/universal-translator v0.17.0 // indirect diff --git a/main.go b/main.go index 1070a93..ccfd4c4 100644 --- a/main.go +++ b/main.go @@ -56,7 +56,7 @@ func parser() *global.Config { var debug = flag.Bool("debug", true, "设置是否开启调试模式") - var dictionaryPath = flag.String("dictionary", "./data/dictionary.txt", "设置词典路径") + var dictionaryPath = flag.String("dictionary", "", "设置词典路径") var enableAdmin = flag.Bool("enableAdmin", true, "设置是否开启后台管理") diff --git a/searcher/words/tokenizer.go b/searcher/words/tokenizer.go index 563ddb0..2b1cf5c 100644 --- a/searcher/words/tokenizer.go +++ b/searcher/words/tokenizer.go @@ -1,34 +1,41 @@ package words import ( - "embed" "strings" - "github.com/wangbin/jiebago" - "myschools.me/suguo/gofound/searcher/utils" + "github.com/go-ego/gse" ) -var ( - //go:embed data/*.txt - dictionaryFS embed.FS -) +// var ( +// //go:embed data/*.txt +// dictionaryFS embed.FS +// ) type Tokenizer struct { - seg jiebago.Segmenter + seg gse.Segmenter } func NewTokenizer(dictionaryPath string) *Tokenizer { - file, err := dictionaryFS.Open("data/dictionary.txt") + // file, err := dictionaryFS.Open("data/dictionary.txt") + // if err != nil { + // panic(err) + // } + // utils.ReleaseAssets(file, dictionaryPath) + + tokenizer := &Tokenizer{} + s, err := gse.New("zh", "alpha") if err != nil { panic(err) } - utils.ReleaseAssets(file, dictionaryPath) + s.LoadDict() + // s.LoadDict("zh_s") + tokenizer.seg = s - tokenizer := &Tokenizer{} - - err = tokenizer.seg.LoadDictionary(dictionaryPath) - if err != nil { - panic(err) + if dictionaryPath != "" { + err = tokenizer.seg.LoadDict(dictionaryPath) + if err != nil { + panic(err) + } } return tokenizer @@ -37,20 +44,14 @@ func NewTokenizer(dictionaryPath string) *Tokenizer { func (t *Tokenizer) Cut(text string) []string { //不区分大小写 text = strings.ToLower(text) - //移除所有的标点符号 - text = utils.RemovePunctuation(text) - //移除所有的空格 - text = utils.RemoveSpace(text) var wordMap = make(map[string]struct{}) - resultChan := t.seg.CutForSearch(text, true) + resultChan := t.seg.Cut(text, true) + resultChan = t.seg.Trim(resultChan) + var wordsSlice []string - for { - w, ok := <-resultChan - if !ok { - break - } + for _, w := range resultChan { _, found := wordMap[w] if !found { //去除重复的词