// search/service/site-service.go
package service
import (
	"bytes"
	"errors"
	"fmt"
	"hash/crc32"
	"io"
	"net/http"
	"net/url"
	"strconv"
	"strings"
	"time"

	"github.com/PuerkitoBio/goquery"
	"golang.org/x/net/html"
	"gorm.io/gorm/clause"

	found "myschools.me/suguo/gofound/searcher/model"
	"myschools.me/suguo/search/exceptionless"
	"myschools.me/suguo/search/model"
	"myschools.me/suguo/search/mysql"
)
// siteAccess fetches https://www.<uri>, extracts the page <title> and the
// meta description, pushes the page into the gofound search index, and
// upserts a model.Site row.
//
// It returns a non-nil error when the request cannot be built or sent, the
// response status is not 200, or the body cannot be read or parsed.
// Index and DB write failures are reported to exceptionless without
// failing the call, matching the original behavior.
func siteAccess(uri *string) error {
	client := http.Client{Timeout: 3 * time.Second}
	target := fmt.Sprintf("https://www.%s", *uri)
	req, err := http.NewRequest("GET", target, nil)
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "http.NewRequest", nil, err)
		return err
	}
	resp, err := client.Do(req)
	if err != nil {
		// client.Do errors are documented to be *url.Error, but use
		// errors.As instead of a bare type assertion so an unexpected
		// error type cannot panic this function.
		var ue *url.Error
		if errors.As(err, &ue) {
			return ue.Unwrap()
		}
		return err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return errors.New("http status!=StatusOK")
	}
	// Read the body exactly once and re-feed it from memory: goquery
	// consumes the reader, so the original urlAnalysis(resp.Body) call
	// always saw an exhausted stream and reported zero links.
	body, err := io.ReadAll(resp.Body)
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "io.ReadAll", nil, err)
		return err
	}
	doc, err := goquery.NewDocumentFromReader(bytes.NewReader(body))
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "goquery.NewDocumentFromReader", nil, err)
		return err
	}
	title := doc.Find("title").Text()
	description := ""
	doc.Find("meta").Each(func(i int, s *goquery.Selection) {
		if name, _ := s.Attr("name"); name == "description" {
			description, _ = s.Attr("content")
		}
	})
	if title != "" {
		// Stable document id derived from the URL so re-crawls update the
		// same index entry. ChecksumIEEE already returns uint32.
		id := crc32.ChecksumIEEE([]byte(*uri))
		// NOTE(review): fire-and-forget goroutine — nothing waits for or
		// cancels it; confirm AddIndex is safe to abandon on shutdown.
		go gofound.AddIndex("default", &found.IndexDoc{
			Id:   id,
			Text: fmt.Sprintf("%s %s %s", *uri, title, description),
			Document: map[string]interface{}{
				"url":         target,
				"description": description,
				"title":       title,
			},
		})
	}
	urls := urlAnalysis(bytes.NewReader(body))
	site := &model.Site{
		URL:         *uri,
		Title:       title,
		Description: description,
		LinkCount:   uint(len(urls)),
		UpdatedAt:   time.Time{},
		CreatedAt:   time.Time{},
	}
	db, err := mysql.New()
	if err != nil {
		exceptionless.SubmitAppError("siteAccess", "mysql.New", nil, err)
		return err
	}
	// Upsert: update every column when a row with the same key exists.
	if err := db.Clauses(clause.OnConflict{UpdateAll: true}).Create(site).Error; err != nil {
		exceptionless.SubmitAppError("siteAccess", "db.Create", nil, err)
	}
	return nil
}
// urlAnalysis scans an HTML stream and returns every distinct href value
// found on <a> start tags, with any "#fragment" suffix removed, in order
// of first appearance. It returns an empty (non-nil) slice when the
// document has no links.
func urlAnalysis(httpBody io.Reader) []string {
	links := []string{}
	page := html.NewTokenizer(httpBody)
	for {
		tokenType := page.Next()
		if tokenType == html.ErrorToken {
			// ErrorToken covers both io.EOF (normal end) and malformed
			// input; in either case we return what we have collected.
			return links
		}
		token := page.Token()
		if tokenType != html.StartTagToken || token.DataAtom.String() != "a" {
			continue
		}
		for _, attr := range token.Attr {
			if attr.Key != "href" {
				continue
			}
			// Dedupe directly against the result. The original appended
			// every href into a second ever-growing slice and re-ran
			// resolv over all of it on each match — redundant quadratic
			// work for an identical final result.
			if tl := trimHash(attr.Val); !check(links, tl) {
				links = append(links, tl)
			}
		}
	}
}
// trimHash returns l truncated at the first '#', dropping the URL
// fragment; l is returned unchanged when it contains no '#'.
func trimHash(l string) string {
	// strings.Index yields the byte offset of the first '#', exactly what
	// the original rune-scan computed ('#' is a single ASCII byte in
	// UTF-8) — without allocating strconv.QuoteRune(r) per rune just to
	// compare it against the literal string "'#'".
	// NOTE(review): the file-level "strconv" import becomes unused once
	// this lands; let goimports drop it.
	if i := strings.Index(l, "#"); i >= 0 {
		return l[:i]
	}
	return l
}
// check reports whether s is already present in sl. A nil slice is a
// valid empty set.
func check(sl []string, s string) bool {
	for _, candidate := range sl {
		if candidate == s {
			// Early return replaces the original flag-variable-plus-break
			// pattern; behavior is identical.
			return true
		}
	}
	return false
}
// resolv appends to *sl every element of ml that is not yet in *sl,
// preserving first-occurrence order so the collection stays free of
// duplicates.
func resolv(sl *[]string, ml []string) {
	for _, candidate := range ml {
		if check(*sl, candidate) {
			continue
		}
		*sl = append(*sl, candidate)
	}
}