分词工具调整为gse

This commit is contained in:
suguo.yao 2022-10-28 23:26:05 +08:00
parent 623cac96df
commit 26fa21ddaf
5 changed files with 32 additions and 32 deletions

View File

@ -33,7 +33,7 @@ POST {{url}}/index/batch?database=default HTTP/1.1
[{
"id": 88887,
-"text": "深圳南站",
+"text": "深圳南站 Systems Engineering Services",
"document": {
"title": "阿森松岛所445",
"number": 223
@ -58,7 +58,7 @@ POST {{url}}/index/remove?database=default HTTP/1.1
POST http://192.168.0.254:5678/ HTTP/1.1
{
-"query":"融七牛",
+"query":"Engineering",
"page":1,
"limit":10,
"order": "DESC",
@ -72,7 +72,7 @@ POST http://192.168.0.254:5678/ HTTP/1.1
POST {{url}}/query HTTP/1.1
{
-"query":"北京",
+"query":"Engineering",
"page":1,
"limit":10,
"order": "DESC",

View File

@ -5,7 +5,7 @@ port: 5678
#数据目录
data: ./data
#词典目录
-dictionary: ./data/dictionary.txt
+dictionary:
# 最大线程数
gomaxprocs: 4

3
go.mod
View File

@ -5,8 +5,8 @@ go 1.18
require (
github.com/Knetic/govaluate v3.0.0+incompatible
github.com/emirpasic/gods v1.12.0
github.com/gin-contrib/gzip v0.0.5
github.com/gin-gonic/gin v1.7.7
github.com/go-ego/gse v0.70.2
github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46
github.com/shirou/gopsutil/v3 v3.22.4
github.com/syndtr/goleveldb v1.0.0
@ -17,7 +17,6 @@ require (
require (
github.com/gin-contrib/sse v0.1.0 // indirect
github.com/go-ego/gse v0.70.2 // indirect
github.com/go-ole/go-ole v1.2.6 // indirect
github.com/go-playground/locales v0.13.0 // indirect
github.com/go-playground/universal-translator v0.17.0 // indirect

View File

@ -56,7 +56,7 @@ func parser() *global.Config {
var debug = flag.Bool("debug", true, "设置是否开启调试模式")
-var dictionaryPath = flag.String("dictionary", "./data/dictionary.txt", "设置词典路径")
+var dictionaryPath = flag.String("dictionary", "", "设置词典路径")
var enableAdmin = flag.Bool("enableAdmin", true, "设置是否开启后台管理")

View File

@ -1,34 +1,41 @@
package words
import (
"embed"
"strings"
-"github.com/wangbin/jiebago"
"myschools.me/suguo/gofound/searcher/utils"
+"github.com/go-ego/gse"
)
-var (
-//go:embed data/*.txt
-dictionaryFS embed.FS
-)
+// var (
+// //go:embed data/*.txt
+// dictionaryFS embed.FS
+// )
type Tokenizer struct {
-seg jiebago.Segmenter
+seg gse.Segmenter
}
func NewTokenizer(dictionaryPath string) *Tokenizer {
file, err := dictionaryFS.Open("data/dictionary.txt")
// file, err := dictionaryFS.Open("data/dictionary.txt")
// if err != nil {
// panic(err)
// }
// utils.ReleaseAssets(file, dictionaryPath)
tokenizer := &Tokenizer{}
s, err := gse.New("zh", "alpha")
if err != nil {
panic(err)
}
utils.ReleaseAssets(file, dictionaryPath)
s.LoadDict()
// s.LoadDict("zh_s")
tokenizer.seg = s
tokenizer := &Tokenizer{}
err = tokenizer.seg.LoadDictionary(dictionaryPath)
if err != nil {
panic(err)
if dictionaryPath != "" {
err = tokenizer.seg.LoadDict(dictionaryPath)
if err != nil {
panic(err)
}
}
return tokenizer
@ -37,20 +44,14 @@ func NewTokenizer(dictionaryPath string) *Tokenizer {
func (t *Tokenizer) Cut(text string) []string {
//不区分大小写
text = strings.ToLower(text)
//移除所有的标点符号
text = utils.RemovePunctuation(text)
//移除所有的空格
text = utils.RemoveSpace(text)
var wordMap = make(map[string]struct{})
-resultChan := t.seg.CutForSearch(text, true)
+resultChan := t.seg.Cut(text, true)
resultChan = t.seg.Trim(resultChan)
var wordsSlice []string
for {
w, ok := <-resultChan
if !ok {
break
}
for _, w := range resultChan {
_, found := wordMap[w]
if !found {
//去除重复的词