go-百度贴吧-纵向爬取

百度贴吧纵向爬取

上一篇是横向爬取,这一篇是纵向爬取,具体做法请看下面的代码。

package main

import (
	"fmt"
	"io"
	"net/http"
	"os"
	"regexp"
	"strconv"
)

// HttpGetDB issues an HTTP GET for url and returns the complete response body
// as a string.
//
// If the request fails, or reading the body fails with anything other than
// io.EOF, it returns an empty result together with that error.
func HttpGetDB(url string) (result string, err error) {
	resp, err := http.Get(url)
	if err != nil {
		return "", err
	}
	defer resp.Body.Close()

	// Accumulate raw bytes and convert once at the end; growing a string with
	// += would copy everything read so far on every iteration.
	var body []byte
	buf := make([]byte, 4096)
	for {
		n, readErr := resp.Body.Read(buf)
		// A Read may hand back data together with an error, so keep the bytes
		// before looking at the error.
		body = append(body, buf[:n]...)
		if readErr == io.EOF {
			break
		}
		if readErr != nil {
			// Checked regardless of n: a failure that delivers zero bytes must
			// not be mistaken for a clean end of stream.
			return "", readErr
		}
	}

	return string(body), nil
}

// Save2file writes the captured names for page idx to the file
// "第<idx>页.txt" in the current working directory, creating or truncating it.
//
// fileName is expected to hold one row per match, with the captured name at
// index 1. The output is a header line "名称" followed by one name per line.
// Rows with fewer than two elements are skipped. Failures are reported on
// standard output and end the write early; nothing is returned to the caller.
func Save2file(idx int, fileName [][]string) {
	path := "第" + strconv.Itoa(idx) + "页" + ".txt"
	f, err := os.Create(path)
	if err != nil {
		fmt.Println("os.Create err", err)
		return
	}
	defer func() {
		// A failed Close can mean buffered data never reached the disk.
		if cerr := f.Close(); cerr != nil {
			fmt.Println("f.Close err", cerr)
		}
	}()

	if _, err := f.WriteString("名称\n"); err != nil {
		fmt.Println("f.WriteString err", err)
		return
	}
	for _, match := range fileName {
		if len(match) < 2 {
			// No capture group present; indexing [1] would panic.
			continue
		}
		if _, err := f.WriteString(match[1] + "\n"); err != nil {
			fmt.Println("f.WriteString err", err)
			return
		}
	}
}

// replyerRe matches the "last replier" span in the fetched HTML and captures
// the text between `title="最后回复人:` and the next double quote. It is
// compiled once at package level instead of on every page.
var replyerRe = regexp.MustCompile(`<span class="tb_icon_author_rely j_replyer" title="最后回复人:(?s:(.*?))"`)

// SpiderPageDB fetches list page idx of the "vue" tieba, extracts every
// last-replier name from it and hands the matches to Save2file.
//
// The request offset pn is (idx-1)*50. When the function is done, whether the
// fetch succeeded or not, it sends idx on page. A fetch error is reported on
// standard output and nothing is saved for that page.
func SpiderPageDB(idx int, page chan int) {
	// Always signal completion: toWork receives exactly one value per goroutine
	// it starts, so skipping the send on an error path would block it forever.
	defer func() { page <- idx }()

	url := "https://tieba.baidu.com/f?kw=vue&ie=utf-8&pn=" + strconv.Itoa((idx-1)*50)

	result, err := HttpGetDB(url)
	if err != nil {
		fmt.Println("HttpGet2 err", err)
		return
	}

	Save2file(idx, replyerRe.FindAllStringSubmatch(result, -1))
}

func toWork(start, end int) {
	fmt.Printf("正在爬取%d到%d页。。。
", start, end)

	page := make(chan int)
	for i := start; i <= end; i++ {
		go SpiderPageDB(i, page)
	}

	for i := start; i <= end; i++ {
		fmt.Print("第%d页爬取完成
", <-page)
	}
}

// main prompts for a start page and an end page on standard input and crawls
// that inclusive range with toWork.
//
// It exits without crawling when a value cannot be read, or when the range
// does not satisfy 1 <= start <= end as the prompts ask for.
func main() {
	var start, end int

	fmt.Print("请输入起始页(>=1):")
	if _, err := fmt.Scan(&start); err != nil {
		fmt.Println("fmt.Scan err", err)
		return
	}

	fmt.Print("请输入终止页(>=start):")
	if _, err := fmt.Scan(&end); err != nil {
		fmt.Println("fmt.Scan err", err)
		return
	}

	// A start below 1 would make the page offset (idx-1)*50 zero or negative.
	if start < 1 || end < start {
		fmt.Println("页码范围无效: 需要 1 <= 起始页 <= 终止页")
		return
	}

	toWork(start, end)
}

原文地址:https://www.cnblogs.com/ygjzs/p/12001364.html