go 爬虫 colly 使用xpath解析

package main

import (
	"fmt"
	"github.com/antchfx/htmlquery"
	"github.com/gocolly/colly"
	"log"
	"strings"
	"time"
)

// main crawls the yeves.cn home page with colly and prints, for every <li>
// in the list of the second <section> under the element with id "secondary",
// the href and text of its direct <a> child. Nodes are located with XPath via
// htmlquery rather than colly's built-in CSS selectors.
func main() {
	c := colly.NewCollector(
		colly.AllowedDomains("yeves.cn"),
	)

	// Apply an extra randomized delay (up to one second) before each request,
	// for every domain.
	if err := c.Limit(&colly.LimitRule{
		DomainGlob:  "*",
		RandomDelay: 1 * time.Second,
	}); err != nil {
		log.Fatal(err)
	}

	c.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})

	// After a response is received, parse the body and query it with XPath.
	c.OnResponse(func(r *colly.Response) {
		doc, err := htmlquery.Parse(strings.NewReader(string(r.Body)))
		if err != nil {
			log.Fatal(err)
		}
		nodes := htmlquery.Find(doc, `//*[@id="secondary"]/section[2]/ul//li`)
		for _, node := range nodes {
			a := htmlquery.FindOne(node, "./a[@href]")
			if a == nil {
				// FindOne returns nil when the <li> has no direct <a href>
				// child; InnerText would dereference it, so skip the item.
				continue
			}
			fmt.Println(htmlquery.SelectAttr(a, "href"), htmlquery.InnerText(a))
		}
	})

	// Visit reports request failures (network errors, disallowed domains,
	// ...); surface them instead of exiting silently with no output.
	if err := c.Visit("https://yeves.cn/"); err != nil {
		log.Fatal(err)
	}
}
原文地址:https://www.cnblogs.com/brady-wang/p/14004597.html