Go 多协程爬取图片

package main

import (
	"fmt"
	"github.com/antchfx/htmlquery"
	"golang.org/x/net/html"
	"io/ioutil"
	"net/http"
	"strconv"
	"strings"
	"sync"
	"time"
)

// url is the listing base address; main appends "index_<n>.html" to it to
// build each page URL.
var url = "https://www.woyaogexing.com/shouji/"

// referUrl is the value handed to downloadUrl as the Referer header for both
// page requests and image requests.
var referUrl = "https://www.woyaogexing.com/shouji/"

// referImg is the value crawl passes to downloadImgs as its refer argument.
var referImg = "img2.woyaogexing.com"

// httpClient is shared by every request so connections can be reused, and it
// carries a timeout so a stalled server cannot block a goroutine forever.
var httpClient = &http.Client{Timeout: 30 * time.Second}

// downloadUrl issues a GET request for url with a browser-like User-Agent and
// the given Referer header, and returns the response body.
//
// It returns nil when the request cannot be built, the request fails, the
// server answers with a non-2xx status, or the body cannot be read; the
// cause is printed to standard output in each case.
func downloadUrl(url string, refer string) []byte {
	req, err := http.NewRequest(http.MethodGet, url, nil)
	if err != nil {
		// req is nil here; touching its headers would panic.
		fmt.Println(err)
		return nil
	}

	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.149 Safari/537.36")
	req.Header.Set("Referer", refer)

	resp, err := httpClient.Do(req)
	if err != nil {
		// resp is nil on error, so there is no body to close or read.
		fmt.Println(err)
		return nil
	}
	defer resp.Body.Close()

	// An error page must not be mistaken for page or image content.
	if resp.StatusCode < 200 || resp.StatusCode > 299 {
		fmt.Printf("GET %s: unexpected status %s\n", url, resp.Status)
		return nil
	}

	body, err := ioutil.ReadAll(resp.Body)
	if err != nil {
		fmt.Println(err)
		return nil
	}
	return body
}

// parseContent parses content as an HTML document and returns the src
// attribute value of every node matched by the XPath "//img/@src", in
// document order.
//
// If the document cannot be parsed or the query fails, the error is reported
// through handError and nil is returned.
func parseContent(content []byte) []string {
	doc, err := html.Parse(strings.NewReader(string(content)))
	if err != nil {
		// Do not query a document that failed to parse.
		handError(err)
		return nil
	}

	nodes, err := htmlquery.QueryAll(doc, "//img/@src")
	if err != nil {
		handError(err)
		return nil
	}

	urls := make([]string, 0, len(nodes))
	for _, n := range nodes {
		urls = append(urls, htmlquery.SelectAttr(n, "src"))
	}
	return urls
}

// downloadImgs downloads one image and stores it under ./imgs/, named after
// the last path segment of its URL.
//
// url is an image source as found in a page. Only protocol-relative sources
// starting with "//img2" are downloaded; anything else is skipped. wg.Done is
// called exactly once on every path, including the skip path, so a caller
// that did wg.Add(1) per invocation can always finish its Wait.
//
// NOTE(review): refer is accepted but not used; the request is sent with the
// package-level referUrl as Referer, as before. Confirm which one is intended.
func downloadImgs(url string, refer string, wg *sync.WaitGroup) {
	// Registered before any return so skipped URLs are counted as done too.
	defer wg.Done()

	if !strings.HasPrefix(url, "//img2") {
		return
	}

	// Turn the protocol-relative source ("//host/path") into an absolute URL.
	url = "http:" + url
	fmt.Println("下载图片", url)

	content := downloadUrl(url, referUrl)
	if len(content) == 0 {
		// Nothing was downloaded; do not leave an empty file behind.
		fmt.Printf("下载图片%s 失败: empty response\n", url)
		return
	}

	fileName := url[strings.LastIndex(url, "/")+1:]
	if fileName == "" {
		fmt.Printf("下载图片%s 失败: no file name in URL\n", url)
		return
	}

	// Images are plain data files, so they are written without execute bits.
	if err := ioutil.WriteFile("./imgs/"+fileName, content, 0644); err != nil {
		fmt.Printf("下载图片%s 失败: %v\n", fileName, err)
		return
	}
	fmt.Printf("下载图片%s 成功\n", fileName)
}

// handError prints err to standard output when it is non-nil and does
// nothing otherwise. It never stops the program.
func handError(err error) {
	if err == nil {
		return
	}
	fmt.Println(err)
}

// main crawls the listing pages index_0.html through index_10.html
// concurrently, one goroutine per page, and exits once every crawl has
// returned or after 100 seconds, whichever comes first.
func main() {
	const totalPage = 10

	var wg sync.WaitGroup
	for j := 0; j <= totalPage; j++ {
		pageURL := url + "index_" + strconv.Itoa(j) + ".html"
		wg.Add(1)
		go func(u string) {
			// Done is tied to the end of the crawl, so Wait really waits
			// for the work instead of returning immediately.
			defer wg.Done()
			crawl(u)
		}(pageURL)
	}

	// Signal completion through a channel so the wait can be bounded.
	done := make(chan struct{})
	go func() {
		wg.Wait()
		close(done)
	}()

	// The previous fixed 100-second sleep is kept only as an upper bound:
	// the program now exits as soon as all pages are processed.
	select {
	case <-done:
	case <-time.After(100 * time.Second):
		fmt.Println("等待超时, 仍有任务未完成")
	}
}

// crawl fetches the page at url, extracts its image sources with
// parseContent, prints them, and then runs one downloadImgs goroutine per
// source, returning only after the wait group shared with those goroutines
// has been released.
func crawl(url string) {
	page := downloadUrl(url, referUrl)
	sources := parseContent(page)
	fmt.Println(sources)

	// Nothing to download for a page without image sources.
	if len(sources) == 0 {
		return
	}

	var pending sync.WaitGroup
	pending.Add(len(sources))
	for i := range sources {
		go downloadImgs(sources[i], referImg, &pending)
	}
	pending.Wait()
}

  

原文地址:https://www.cnblogs.com/brady-wang/p/13098687.html