// search/service/site-service.go
package service
import (
	"bytes"
	"errors"
	"fmt"
	"hash/crc32"
	"io"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
	"gorm.io/gorm/clause"

	"myschools.me/suguo/search/exceptionless"
	"myschools.me/suguo/search/model"
	"myschools.me/suguo/search/mysql"
)
// siteAccess fetches https://www.<uri>, extracts the page title, the meta
// description and the outbound links, schedules an asynchronous index update
// when a title is present, and upserts the site record into MySQL.
//
// It returns a non-nil error when the request cannot be built, the fetch
// fails, the response status is not 200 OK, the body cannot be read or
// parsed, or the database write fails.
func siteAccess(uri *string) error {
	client := http.Client{Timeout: 3 * time.Second}
	requestURI := fmt.Sprintf("https://www.%s", *uri)
	req, err := http.NewRequest("GET", requestURI, nil)
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "http.NewRequest", nil, err)
		return err
	}
	resp, err := client.Do(req)
	if err != nil {
		// client.Do documents its errors as *url.Error; unwrap to surface
		// the underlying cause. errors.As avoids the panic the previous
		// bare type assertion would cause on any other error type.
		var uerr *url.Error
		if errors.As(err, &uerr) {
			return uerr.Unwrap()
		}
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return errors.New("http status!=StatusOK")
	}
	// Read the body exactly once: both goquery and urlAnalysis need to
	// consume it, and resp.Body is a one-shot stream. (Previously the
	// already-drained resp.Body was handed to urlAnalysis, so LinkCount
	// was always zero.)
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "io.ReadAll", nil, err)
		return err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "goquery.NewDocumentFromReader", nil, err)
		return err
	}
	title := doc.Find("title").Text()
	description := ""
	doc.Find("meta").Each(func(i int, s *goquery.Selection) {
		if name, _ := s.Attr("name"); name == "description" {
			description, _ = s.Attr("content")
		}
	})
	if title != "" {
		// ChecksumIEEE already returns uint32; no conversion needed.
		id := crc32.ChecksumIEEE([]byte(*uri))
		r := strings.Split(*uri, ".")
		go indexAdd(&id, &r[0], &title, &description)
	}
	urls := urlAnalysis(bytes.NewReader(body))
	site := &model.Site{
		URL:         requestURI,
		Title:       title,
		Description: description,
		LinkCount:   uint(len(urls)),
		UpdatedAt:   time.Time{},
		CreatedAt:   time.Time{},
	}
	db, err := mysql.New()
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "mysql.New", nil, err)
		return err
	}
	// Upsert: update all columns when the row already exists. Propagate the
	// failure instead of silently swallowing it as before, so callers can
	// distinguish a persisted record from a dropped one.
	if err := db.Clauses(clause.OnConflict{UpdateAll: true}).Create(site).Error; err != nil {
		exceptionless.SubmitAppError("siteAccess", "db.Create", nil, err)
		return err
	}
	return nil
}
// urlAnalysis scans an HTML document and returns every distinct href found
// on <a> start tags, with any "#fragment" suffix stripped. The result is
// deduplicated and preserves first-seen order.
func urlAnalysis(httpBody io.Reader) []string {
	links := []string{}
	page := html.NewTokenizer(httpBody)
	for {
		tokenType := page.Next()
		if tokenType == html.ErrorToken {
			// ErrorToken covers both io.EOF and malformed input; either
			// way we return what was collected so far.
			return links
		}
		token := page.Token()
		if tokenType != html.StartTagToken || token.DataAtom.String() != "a" {
			continue
		}
		for _, attr := range token.Attr {
			if attr.Key != "href" {
				continue
			}
			// Deduplicate directly against the result. The previous version
			// kept a second ever-growing slice and re-resolved the whole
			// accumulated history on every href, doing redundant quadratic
			// work for the same output.
			href := trimHash(attr.Val)
			if !check(links, href) {
				links = append(links, href)
			}
		}
	}
}
// trimHash returns l truncated at the first '#' (the fragment separator),
// or l unchanged when it contains none. A leading '#' yields "".
func trimHash(l string) string {
	// strings.Index replaces the previous Contains check plus a manual
	// rune loop that compared strconv.QuoteRune output against "'#'" —
	// two scans and an obscure equality for what is a single byte search.
	if i := strings.Index(l, "#"); i >= 0 {
		return l[:i]
	}
	return l
}
// check reports whether s already exists in the slice sl.
func check(sl []string, s string) bool {
	for _, candidate := range sl {
		if candidate == s {
			return true
		}
	}
	return false
}
// resolv appends each entry of ml to *sl, skipping entries already present,
// so the collection stays free of duplicates.
func resolv(sl *[]string, ml []string) {
	for _, link := range ml {
		if check(*sl, link) {
			continue
		}
		*sl = append(*sl, link)
	}
}