package service import ( "io" "net/http" "strconv" "strings" "time" "golang.org/x/net/html" "gorm.io/gorm/clause" "myschools.me/suguo/search/exceptionless" "myschools.me/suguo/search/model" "myschools.me/suguo/search/mysql" ) func siteAccess(url *string) { client := http.Client{Timeout: 3 * time.Second} req, err := http.NewRequest("GET", *url, nil) if err != nil { exceptionless.SubmitAppError("siteAccess", "http.NewRequest", nil, err) return } resp, err := client.Do(req) if err != nil { exceptionless.SubmitAppError("siteAccess", "client.Do", nil, err) return } defer resp.Body.Close() title := resp.Header.Get("title") if title == "" { return } description := resp.Header.Get("description") if description == "" { return } urls := urlAnalysis(resp.Body) site := &model.Site{ URL: *url, Title: title, Description: description, LinkCount: uint(len(urls)), UpdatedAt: time.Time{}, CreatedAt: time.Time{}, } db, err := mysql.New() if err != nil { exceptionless.SubmitAppError("siteAccess", "mysql.New", nil, err) return } if err := db.Clauses(clause.OnConflict{UpdateAll: true}).Create(site).Error; err != nil { exceptionless.SubmitAppError("siteAccess", "db.Create", nil, err) } } // 从内容中分析返回所有URL链接 func urlAnalysis(httpBody io.Reader) []string { links := []string{} col := []string{} page := html.NewTokenizer(httpBody) for { tokenType := page.Next() if tokenType == html.ErrorToken { return links } token := page.Token() if tokenType == html.StartTagToken && token.DataAtom.String() == "a" { for _, attr := range token.Attr { if attr.Key == "href" { tl := trimHash(attr.Val) col = append(col, tl) resolv(&links, col) } } } } } // trimHash slices a hash # from the link func trimHash(l string) string { if strings.Contains(l, "#") { var index int for n, str := range l { if strconv.QuoteRune(str) == "'#'" { index = n break } } return l[:index] } return l } // check looks to see if a url exits in the slice. 
func check(sl []string, s string) bool {
	// Linear scan that stops at the first element equal to s.
	for i := range sl {
		if sl[i] == s {
			return true
		}
	}
	return false
}

// resolv appends to *sl, in order, every string from ml that *sl does not
// already contain. Because membership is tested against the slice as it
// grows, repeated values inside ml are added only once.
func resolv(sl *[]string, ml []string) {
	for i := 0; i < len(ml); i++ {
		candidate := ml[i]
		if check(*sl, candidate) {
			continue
		}
		*sl = append(*sl, candidate)
	}
}