148 lines
3.3 KiB
Go
148 lines
3.3 KiB
Go
package service
|
|
|
|
import (
|
|
"errors"
|
|
"fmt"
|
|
"hash/crc32"
|
|
"io"
|
|
"net/http"
|
|
"net/url"
|
|
"strconv"
|
|
"strings"
|
|
"time"
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
"golang.org/x/net/html"
|
|
"gorm.io/gorm/clause"
|
|
found "myschools.me/suguo/gofound/searcher/model"
|
|
"myschools.me/suguo/search/exceptionless"
|
|
"myschools.me/suguo/search/model"
|
|
"myschools.me/suguo/search/mysql"
|
|
)
|
|
|
|
func siteAccess(uri *string) error {
|
|
client := http.Client{Timeout: 3 * time.Second}
|
|
req, err := http.NewRequest("GET", fmt.Sprintf("https://www.%s", *uri), nil)
|
|
if err != nil {
|
|
exceptionless.SubmitAppError("siteAccess", "http.NewRequest", nil, err)
|
|
return err
|
|
}
|
|
resp, err := client.Do(req)
|
|
if err != nil {
|
|
return err.(*url.Error).Unwrap()
|
|
}
|
|
|
|
defer resp.Body.Close()
|
|
if resp.StatusCode != http.StatusOK {
|
|
return errors.New("http status!=StatusOK")
|
|
}
|
|
|
|
doc, err := goquery.NewDocumentFromReader(resp.Body)
|
|
if err != nil {
|
|
exceptionless.SubmitAppError("siteAccess", "goquery.NewDocumentFromReader", nil, err)
|
|
return err
|
|
}
|
|
|
|
title := doc.Find("title").Text()
|
|
description := ""
|
|
doc.Find("meta").Each(func(i int, s *goquery.Selection) {
|
|
if name, _ := s.Attr("name"); name == "description" {
|
|
description, _ = s.Attr("content")
|
|
}
|
|
})
|
|
if title != "" {
|
|
id := uint32(crc32.ChecksumIEEE([]byte(*uri)))
|
|
go gofound.AddIndex("default", &found.IndexDoc{
|
|
Id: id,
|
|
Text: func() string {
|
|
return fmt.Sprintf("%s %s %s", *uri, title, description)
|
|
}(),
|
|
Document: map[string]interface{}{
|
|
"url": fmt.Sprintf("https://www.%s", *uri),
|
|
"description": description,
|
|
"title": title,
|
|
},
|
|
})
|
|
}
|
|
|
|
urls := urlAnalysis(resp.Body)
|
|
site := &model.Site{
|
|
URL: *uri,
|
|
Title: title,
|
|
Description: description,
|
|
LinkCount: uint(len(urls)),
|
|
UpdatedAt: time.Time{},
|
|
CreatedAt: time.Time{},
|
|
}
|
|
db, err := mysql.New()
|
|
if err != nil {
|
|
exceptionless.SubmitAppError("siteAccess", "mysql.New", nil, err)
|
|
return err
|
|
}
|
|
|
|
if err := db.Clauses(clause.OnConflict{UpdateAll: true}).Create(site).Error; err != nil {
|
|
exceptionless.SubmitAppError("siteAccess", "db.Create", nil, err)
|
|
}
|
|
return nil
|
|
}
|
|
|
|
// 从内容中分析返回所有URL链接
|
|
func urlAnalysis(httpBody io.Reader) []string {
|
|
links := []string{}
|
|
col := []string{}
|
|
page := html.NewTokenizer(httpBody)
|
|
for {
|
|
tokenType := page.Next()
|
|
if tokenType == html.ErrorToken {
|
|
return links
|
|
}
|
|
token := page.Token()
|
|
if tokenType == html.StartTagToken && token.DataAtom.String() == "a" {
|
|
for _, attr := range token.Attr {
|
|
if attr.Key == "href" {
|
|
tl := trimHash(attr.Val)
|
|
col = append(col, tl)
|
|
resolv(&links, col)
|
|
}
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
// trimHash returns l truncated just before its first '#'. A link without a
// '#' is returned unchanged.
func trimHash(l string) string {
	if !strings.Contains(l, "#") {
		return l
	}
	for offset, r := range l {
		// QuoteRune renders the rune '#' as the three-character text '#'.
		if strconv.QuoteRune(r) == "'#'" {
			return l[:offset]
		}
	}
	// Not reachable once Contains has reported a '#'; kept as the same
	// empty-prefix fallback the zero-valued index used to produce.
	return l[:0]
}
|
|
|
|
// check reports whether s is already present in sl.
func check(sl []string, s string) bool {
	for i := range sl {
		if sl[i] == s {
			return true
		}
	}
	return false
}
|
|
|
|
// resolv adds links to the link slice and insures that there is no repetition
|
|
// in our collection.
|
|
func resolv(sl *[]string, ml []string) {
|
|
for _, str := range ml {
|
|
if !check(*sl, str) {
|
|
*sl = append(*sl, str)
|
|
}
|
|
}
|
|
}
|