golang抓取soyoung新氧案例图

时隔多年，感慨良多。废话不多说，直接上代码。

package main

import (
	"crypto/md5"
	"encoding/hex"
	"encoding/json"
	"io/ioutil"
	"log"
	"net/http"
	"os"
	"runtime"
	"strings"

	"github.com/PuerkitoBio/goquery"
	"github.com/axgle/mahonia"
)

// ConvertToString converts the character encoding of src.
//
// src is first decoded with the mahonia decoder registered for srcCode, and
// the decoded text is then passed through the decoder registered for tagCode.
// The bytes produced by that second pass are returned as a string.
//
// If either charset name is unknown to mahonia, NewDecoder returns nil and
// calling through it would panic; in that case the problem is logged and src
// is returned unchanged.
func ConvertToString(src string, srcCode string, tagCode string) string {
	srcCoder := mahonia.NewDecoder(srcCode)
	tagCoder := mahonia.NewDecoder(tagCode)
	if srcCoder == nil || tagCoder == nil {
		log.Printf("unsupported charset conversion %q -> %q", srcCode, tagCode)
		return src
	}

	decoded := srcCoder.ConvertString(src)
	_, cdata, err := tagCoder.Translate([]byte(decoded), true)
	if err != nil {
		// Fall back to the first-pass result rather than returning
		// whatever partial output the failed translation produced.
		log.Printf("translate to %q failed: %v", tagCode, err)
		return decoded
	}
	return string(cdata)
}

// GbkToUtf8 is a convenience wrapper around ConvertToString that treats src
// as GBK-encoded text and asks for UTF-8 output.
func GbkToUtf8(src string) string {
	const (
		fromCharset = "gbk"
		toCharset   = "utf-8"
	)
	converted := ConvertToString(src, fromCharset, toCharset)
	return converted
}

// downloadImg fetches img_url with a browser-like User-Agent and the given
// Referer header, then writes the response body to filename.
//
// The function is best-effort: every failure is logged and the function
// returns without writing anything. Only a 200 OK response is saved, so an
// error page is never stored under an image file name.
func downloadImg(img_url string, filename string, Referer string) {
	req, err := http.NewRequest("GET", img_url, nil)
	if err != nil {
		// A malformed URL yields a nil request; bail out before touching it.
		log.Println("build request failed:", img_url, err)
		return
	}
	req.Header.Add("User-Agent", "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/68.0.3440.75 Safari/537.36")
	req.Header.Add("Referer", Referer)

	client := &http.Client{}
	response, err := client.Do(req)
	if err != nil {
		log.Println("get img_url failed:", err)
		return
	}
	defer response.Body.Close()

	if response.StatusCode != http.StatusOK {
		log.Println("get img_url failed:", img_url, response.Status)
		return
	}

	// Read the whole body before creating the file so that a failed
	// transfer does not leave an empty file behind.
	data, err := ioutil.ReadAll(response.Body)
	if err != nil {
		log.Println("read data failed:", img_url, err)
		return
	}

	// WriteFile creates/truncates the file and reports write and close errors.
	if err := ioutil.WriteFile(filename, data, 0666); err != nil {
		log.Println("write file failed:", filename, err)
	}
}

// GetJokes scrapes the case page at url (and its second page, url + "/p2/")
// and downloads the images it references into a directory named after id.
//
// Layout on disk, relative to the working directory:
//
//	./<id>/before/<md5(url)>.jpg   images linked from ".big-photo" elements (pre-op photos)
//	./<id>/<day>/<md5(url)>.jpg    images of each ".diary-item", grouped by its ".day" text
//
// The returned map always contains "ID" (id passed through GbkToUtf8) and
// "picUrl", which is "success" when both pages were fetched and parsed and
// "failed" otherwise. Individual image download failures are only logged.
//
// id is used as a directory name, so values that are empty, "." / ".." or
// contain a path separator are rejected before anything touches the disk.
func GetJokes(url string, id string) map[string]string {
	info := make(map[string]string)
	// NOTE(review): id normally arrives as a Go (UTF-8) string from the HTTP
	// query; GBK-decoding it is only an identity for ASCII ids — confirm intent.
	info["ID"] = GbkToUtf8(id)
	info["picUrl"] = "failed"

	if !isSafePathElement(id) {
		log.Printf("refusing unsafe id %q", id)
		return info
	}

	client := &http.Client{}
	root := "./" + id

	// page one
	pageOne, err := fetchCasePage(client, url, "https://www.soyoung.com/")
	if err != nil {
		log.Println("fetch page one failed:", url, err)
		return info
	}

	// Pre-op photos only appear on page one.
	beforeDir := root + "/before"
	if err := os.MkdirAll(beforeDir, os.ModePerm); err != nil {
		log.Println(err)
	} else {
		pageOne.Find(".big-photo").Each(func(_ int, s *goquery.Selection) {
			href, ok := s.Attr("href")
			if !ok || href == "" {
				return
			}
			downloadImg(href, beforeDir+"/"+imgFileName(href), url)
		})
	}
	saveDiaryImages(pageOne, root, url)

	// page two
	pageTwoURL := url + "/p2/"
	pageTwo, err := fetchCasePage(client, pageTwoURL, url)
	if err != nil {
		log.Println("fetch page two failed:", pageTwoURL, err)
		return info
	}
	saveDiaryImages(pageTwo, root, url)

	info["picUrl"] = "success"
	return info
}

// fetchCasePage GETs pageURL with the scraper's fixed User-Agent and Cookie
// plus the given Referer, and parses the response body as HTML.
func fetchCasePage(client *http.Client, pageURL string, referer string) (*goquery.Document, error) {
	const (
		userAgent = "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3865.90 Safari/537.36"
		cookie    = "__order_time__=undefined; msg_time=undefined; back_order_time=undefined; complain_time=undefined; __usersign__=1570614910876417305; _ga=GA1.2.2061581476.1570614904; _gid=GA1.2.1666843180.1570614904; PHPSESSID=5001a7796cc83a8255b33284a3a30dd7; cityId=1; Hm_lvt_b366fbb5465f5a86e1cc2871552e1fdb=1570614904,1570693381; __p_t__=15706935958294; __postion__=a%3A4%3A%7Bs%3A6%3A%22cityId%22%3Bs%3A3%3A%22207%22%3Bs%3A8%3A%22cityName%22%3Bs%3A9%3A%22%E6%B3%89%E5%B7%9E%E5%B8%82%22%3Bs%3A8%3A%22cityCode%22%3Bs%3A3%3A%22134%22%3Bs%3A3%3A%22jwd%22%3Bi%3A0%3B%7D; _gat=1; Hm_lpvt_b366fbb5465f5a86e1cc2871552e1fdb=1570694344"
	)

	req, err := http.NewRequest("GET", pageURL, nil)
	if err != nil {
		return nil, err
	}
	req.Header.Add("User-Agent", userAgent)
	req.Header.Add("Referer", referer)
	req.Header.Add("Cookie", cookie)

	res, err := client.Do(req)
	if err != nil {
		return nil, err
	}
	defer res.Body.Close()

	// A non-200 page is still parsed (it simply yields no matches, e.g. when
	// there is no second page), but the status is surfaced in the log.
	if res.StatusCode != http.StatusOK {
		log.Println("unexpected status:", pageURL, res.Status)
	}
	return goquery.NewDocumentFromReader(res.Body)
}

// saveDiaryImages downloads the photos of every ".diary-item" in doc into
// root/<day text>/, requesting the full-size variant of each thumbnail.
func saveDiaryImages(doc *goquery.Document, root string, referer string) {
	doc.Find(".diary-item").Each(func(_ int, item *goquery.Selection) {
		dir := root + "/" + safeDirName(item.Find(".day").Text())
		// MkdirAll tolerates an already existing directory, which happens
		// when the same day shows up on both pages or on a re-run.
		if err := os.MkdirAll(dir, os.ModePerm); err != nil {
			log.Println("create dir failed:", dir, err)
			return
		}

		item.Find(".photo-list li img").Each(func(_ int, img *goquery.Selection) {
			src, ok := img.Attr("data-img")
			if !ok || src == "" {
				return
			}
			// Strip the thumbnail size suffix and the "face/" path segment
			// from the data-img URL before downloading.
			src = strings.Replace(src, "_301_301", "", -1)
			src = strings.Replace(src, "face/", "", -1)
			downloadImg(src, dir+"/"+imgFileName(src), referer)
		})
	})
}

// imgFileName derives a stable file name from an image URL: hex MD5 + ".jpg".
func imgFileName(imgURL string) string {
	sum := md5.Sum([]byte(imgURL))
	return hex.EncodeToString(sum[:]) + ".jpg"
}

// isSafePathElement reports whether name can be used as a single directory
// name without escaping the directory it is joined to.
func isSafePathElement(name string) bool {
	if name == "" || name == "." || name == ".." {
		return false
	}
	return !strings.ContainsAny(name, `/\`)
}

// safeDirName neutralises path separators and dot-directories in text scraped
// from the remote page so it cannot redirect writes outside the target folder.
func safeDirName(title string) string {
	title = strings.Replace(title, "/", "_", -1)
	title = strings.Replace(title, `\`, "_", -1)
	if title == "." || title == ".." {
		return "_"
	}
	return title
}

// main starts the scraper's HTTP front end on port 1024.
//
// Example request: http://127.0.0.1:1024/?id=dpg8426968
func main() {
	runtime.GOMAXPROCS(runtime.NumCPU())

	http.HandleFunc("/", indexHandler)
	// ListenAndServe only returns on failure (e.g. the port is already in
	// use); report it instead of exiting silently.
	if err := http.ListenAndServe(":1024", nil); err != nil {
		log.Fatal(err)
	}
}

// indexHandler serves "/": it reads the "id" form/query parameter, scrapes
// https://www.soyoung.com/<id> via GetJokes and writes the resulting map back
// as JSON.
//
// Requests without a usable id (empty, "." / "..", or containing a path
// separator) are answered with 400 and never reach the scraper, because id
// is used both as a URL path and as a local directory name.
func indexHandler(w http.ResponseWriter, r *http.Request) {
	// The server closes the request body itself, so no explicit Close here.
	if err := r.ParseForm(); err != nil {
		http.Error(w, "invalid request", http.StatusBadRequest)
		return
	}

	id := r.FormValue("id")
	if id == "" || id == "." || id == ".." || strings.ContainsAny(id, `/\`) {
		http.Error(w, "invalid id", http.StatusBadRequest)
		return
	}

	info := GetJokes("https://www.soyoung.com/"+id, id)
	body, err := json.Marshal(info)
	if err != nil {
		log.Println("encode response failed:", err)
		http.Error(w, "internal error", http.StatusInternalServerError)
		return
	}

	w.Header().Set("Content-Type", "application/json; charset=utf-8")
	if _, err := w.Write(body); err != nil {
		log.Println("write response failed:", err)
	}
}

  

原文地址:https://www.cnblogs.com/hcjs/p/11653051.html