分词工具由 jiebago 调整为 gse
This commit is contained in:
parent
623cac96df
commit
26fa21ddaf
|
|
@ -33,7 +33,7 @@ POST {{url}}/index/batch?database=default HTTP/1.1
|
|||
|
||||
[{
|
||||
"id": 88887,
|
||||
"text": "深圳南站",
|
||||
"text": "深圳南站 Systems Engineering Services",
|
||||
"document": {
|
||||
"title": "阿森松岛所445",
|
||||
"number": 223
|
||||
|
|
@ -58,7 +58,7 @@ POST {{url}}/index/remove?database=default HTTP/1.1
|
|||
POST http://192.168.0.254:5678/ HTTP/1.1
|
||||
|
||||
{
|
||||
"query":"融七牛",
|
||||
"query":"Engineering",
|
||||
"page":1,
|
||||
"limit":10,
|
||||
"order": "DESC",
|
||||
|
|
@ -72,7 +72,7 @@ POST http://192.168.0.254:5678/ HTTP/1.1
|
|||
POST {{url}}/query HTTP/1.1
|
||||
|
||||
{
|
||||
"query":"北京",
|
||||
"query":"Engineering",
|
||||
"page":1,
|
||||
"limit":10,
|
||||
"order": "DESC",
|
||||
|
|
|
|||
|
|
@ -5,7 +5,7 @@ port: 5678
|
|||
#数据目录
|
||||
data: ./data
|
||||
#词典路径(留空则不加载自定义词典)
|
||||
dictionary: ./data/dictionary.txt
|
||||
dictionary:
|
||||
|
||||
# 最大线程数
|
||||
gomaxprocs: 4
|
||||
|
|
|
|||
3
go.mod
3
go.mod
|
|
@ -5,8 +5,8 @@ go 1.18
|
|||
require (
|
||||
github.com/Knetic/govaluate v3.0.0+incompatible
|
||||
github.com/emirpasic/gods v1.12.0
|
||||
github.com/gin-contrib/gzip v0.0.5
|
||||
github.com/gin-gonic/gin v1.7.7
|
||||
github.com/go-ego/gse v0.70.2
|
||||
github.com/ryszard/goskiplist v0.0.0-20150312221310-2dfbae5fcf46
|
||||
github.com/shirou/gopsutil/v3 v3.22.4
|
||||
github.com/syndtr/goleveldb v1.0.0
|
||||
|
|
@ -17,7 +17,6 @@ require (
|
|||
|
||||
require (
|
||||
github.com/gin-contrib/sse v0.1.0 // indirect
|
||||
github.com/go-ego/gse v0.70.2 // indirect
|
||||
github.com/go-ole/go-ole v1.2.6 // indirect
|
||||
github.com/go-playground/locales v0.13.0 // indirect
|
||||
github.com/go-playground/universal-translator v0.17.0 // indirect
|
||||
|
|
|
|||
2
main.go
2
main.go
|
|
@ -56,7 +56,7 @@ func parser() *global.Config {
|
|||
|
||||
var debug = flag.Bool("debug", true, "设置是否开启调试模式")
|
||||
|
||||
var dictionaryPath = flag.String("dictionary", "./data/dictionary.txt", "设置词典路径")
|
||||
var dictionaryPath = flag.String("dictionary", "", "设置词典路径")
|
||||
|
||||
var enableAdmin = flag.Bool("enableAdmin", true, "设置是否开启后台管理")
|
||||
|
||||
|
|
|
|||
|
|
@ -1,34 +1,41 @@
|
|||
package words
|
||||
|
||||
import (
|
||||
"embed"
|
||||
"strings"
|
||||
|
||||
"github.com/wangbin/jiebago"
|
||||
"myschools.me/suguo/gofound/searcher/utils"
|
||||
"github.com/go-ego/gse"
|
||||
)
|
||||
|
||||
var (
|
||||
//go:embed data/*.txt
|
||||
dictionaryFS embed.FS
|
||||
)
|
||||
// var (
|
||||
// //go:embed data/*.txt
|
||||
// dictionaryFS embed.FS
|
||||
// )
|
||||
|
||||
type Tokenizer struct {
|
||||
seg jiebago.Segmenter
|
||||
seg gse.Segmenter
|
||||
}
|
||||
|
||||
func NewTokenizer(dictionaryPath string) *Tokenizer {
|
||||
file, err := dictionaryFS.Open("data/dictionary.txt")
|
||||
// file, err := dictionaryFS.Open("data/dictionary.txt")
|
||||
// if err != nil {
|
||||
// panic(err)
|
||||
// }
|
||||
// utils.ReleaseAssets(file, dictionaryPath)
|
||||
|
||||
tokenizer := &Tokenizer{}
|
||||
s, err := gse.New("zh", "alpha")
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
utils.ReleaseAssets(file, dictionaryPath)
|
||||
s.LoadDict()
|
||||
// s.LoadDict("zh_s")
|
||||
tokenizer.seg = s
|
||||
|
||||
tokenizer := &Tokenizer{}
|
||||
|
||||
err = tokenizer.seg.LoadDictionary(dictionaryPath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
if dictionaryPath != "" {
|
||||
err = tokenizer.seg.LoadDict(dictionaryPath)
|
||||
if err != nil {
|
||||
panic(err)
|
||||
}
|
||||
}
|
||||
|
||||
return tokenizer
|
||||
|
|
@ -37,20 +44,14 @@ func NewTokenizer(dictionaryPath string) *Tokenizer {
|
|||
func (t *Tokenizer) Cut(text string) []string {
|
||||
//不区分大小写
|
||||
text = strings.ToLower(text)
|
||||
//移除所有的标点符号
|
||||
text = utils.RemovePunctuation(text)
|
||||
//移除所有的空格
|
||||
text = utils.RemoveSpace(text)
|
||||
|
||||
var wordMap = make(map[string]struct{})
|
||||
|
||||
resultChan := t.seg.CutForSearch(text, true)
|
||||
resultChan := t.seg.Cut(text, true)
|
||||
resultChan = t.seg.Trim(resultChan)
|
||||
|
||||
var wordsSlice []string
|
||||
for {
|
||||
w, ok := <-resultChan
|
||||
if !ok {
|
||||
break
|
||||
}
|
||||
for _, w := range resultChan {
|
||||
_, found := wordMap[w]
|
||||
if !found {
|
||||
//去除重复的词
|
||||
|
|
|
|||
Loading…
Reference in New Issue