如何用golang搜索抓取淘宝商品

package main

import (
    "fmt"
    "log"
    "os"
    "strings"
    "sync"

    "./php"
    "github.com/tealeg/xlsx"
)

var wg sync.WaitGroup //定义一个同步等待的组
func main() {
    fileName := "xxx_debug.log"
    logFile, err := os.Create(fileName)
    defer logFile.Close()
    log.SetOutput(logFile)
    arg_num := len(os.Args)
    fmt.Printf("the num of input is %d
", arg_num)

    if arg_num == 1 || !strings.Contains(os.Args[1], ".xlsx") {
        fmt.Println("请输入****.xlsx文件作为参数")
        return
    }

    fmt.Printf("they are :
")
    for i := 0; i < arg_num; i++ {
        fmt.Println(os.Args[i])
    }

    var (
        excel_file_path string                         = os.Args[1]
        file_result     map[int]map[int]map[int]string = make(map[int]map[int]map[int]string)
        sheet_result    map[int]map[int]string         = make(map[int]map[int]string)
    )
    //打开一个excel文件资源
    f, err := xlsx.OpenFile(excel_file_path)
    if err != nil {
        log.Println(err.Error())
    }

    //循环文件中所有工作表
    for sheet_key, sheet := range f.Sheets {
        //循环对应工作表中行数
        for key, row := range sheet.Rows {
            row_result := make(map[int]string)
            //循环工作表行数的每一列
            for k, cell := range row.Cells {
                row_result[k] = cell.Value
            }
            //如果为空不添加对应值到 数组
            if !php.Empty(row_result) {
                sheet_result[key] = row_result
            }
        }
        //如果为空不添加对应值到 数组
        if !php.Empty(sheet_result) {
            file_result[sheet_key] = sheet_result
        }

    }
    //输出表格的结果
    for _, sheet := range file_result {
        for k, _ := range sheet {
            if k != 0 || !strings.Contains(sheet[k][1], "商品名称") {
                log.Printf("%d=%v
", k, sheet[k][1])
                wg.Add(1) //为同步等待组增加一个成员
                go Spy(sheet[k][1])
            }
        }
    }
    wg.Wait() //阻塞等待所有组内成员都执行完毕退栈
    fmt.Println("WE DONE!!!")
}




func Spy(urls string) {
    defer func() {
        wg.Done()

        if r := recover(); r != nil {
            log.Println("[E]", r)
        }
    }()
    urls = url.QueryEscape(urls)
    urlpath := tburl + urls + tburlpara

    log.Println(urlpath)
    req, err := http.NewRequest("GET", urlpath, nil)
    if err != nil {
        log.Printf("Get请求%s返回错误:%s", urlpath, err)
        return
    }
    req.Header.Set("User-Agent", GetRandomUserAgent())
    client := http.DefaultClient
    res, e := client.Do(req)
    if e != nil {
        log.Printf("Get请求%s返回错误:%s", urlpath, e)
        return
    }

    if res.StatusCode == 200 {
        body := res.Body
        defer body.Close()
        bodyByte, _ := ioutil.ReadAll(body)
        resStr := string(bodyByte)

        ajson := atagRegExp.FindAllString(resStr, -1)
        nlen := len(ajson[0])
        if nlen > 16 {
            jsons := ajson[0][16 : len(ajson[0])-2]
            var v interface{}
            json.Unmarshal([]byte(jsons), &v)
            i := 0
            minprice := 9999999.00
            words, _ := dproxy.New(v).P("/mods/itemlist/data").M("query").String()
            m := make(map[string][]string)
            for {
                set := dproxy.New(v).P("/mods/itemlist/data/auctions").A(i)

                var u = make([]string, 0)
                sales, err := set.M("view_sales").String()
                if err != nil {
                    log.Printf("/mods/itemlist/data/auctions path error %v 
", err.Error())
                    break
                }
                sales = strings.Replace(sales, "人付款", "", 1)
                price, err := set.M("view_price").String()
                title, err := set.M("raw_title").String()
                url, err := set.M("detail_url").String()

                inprice, err := strconv.ParseFloat(price, 32)
                if err != nil {
                    log.Println("转换有错")
                    panic(fmt.Sprintf("%v 转换有错", price))

                }
                insales, err := strconv.Atoi(sales)
                if err != nil {
                    log.Println("转换有错")
                    panic(fmt.Sprintf("%v 转换有错", price))

                }
                if minprice > inprice && inprice > 1 && insales >= 1 {
                    minprice = inprice
                }

                u = append(u, sales)
                u = append(u, price)
                u = append(u, title)
                u = append(u, url)

                fmt.Printf("%v===%v===%v
", title, sales, price)
                log.Printf("%v===%v===%v
", title, sales, price)
                i = i + 1
                m[url] = u
            }
            fmt.Printf("%v
", minprice)
            buildxlsx(words, m, minprice)
        }

    } else {
        log.Printf("返回网页错误 %v", res.StatusCode)

    }
}



// patherrch lists the characters / \ : * ? " < > | as one-character strings.
var patherrch = [...]string{"/", "\\", ":", "*", "?", "\"", "<", ">", "|"}

// userAgent is the pool of User-Agent header values GetRandomUserAgent picks from.
var userAgent = [...]string{"Mozilla/5.0 (compatible, MSIE 10.0, Windows NT, DigExt)",
    "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, 360SE)",
    "Mozilla/4.0 (compatible, MSIE 8.0, Windows NT 6.0, Trident/4.0)",
    "Mozilla/5.0 (compatible, MSIE 9.0, Windows NT 6.1, Trident/5.0,",
    "Opera/9.80 (Windows NT 6.1, U, en) Presto/2.8.131 Version/11.11",
    "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, TencentTraveler 4.0)",
    "Mozilla/5.0 (Windows, U, Windows NT 6.1, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Macintosh, Intel Mac OS X 10_7_0) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11",
    "Mozilla/5.0 (Macintosh, U, Intel Mac OS X 10_6_8, en-us) AppleWebKit/534.50 (KHTML, like Gecko) Version/5.1 Safari/534.50",
    "Mozilla/5.0 (Linux, U, Android 3.0, en-us, Xoom Build/HRI39) AppleWebKit/534.13 (KHTML, like Gecko) Version/4.0 Safari/534.13",
    "Mozilla/5.0 (iPad, U, CPU OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "Mozilla/4.0 (compatible, MSIE 7.0, Windows NT 5.1, Trident/4.0, SE 2.X MetaSr 1.0, SE 2.X MetaSr 1.0, .NET CLR 2.0.50727, SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (iPhone, U, CPU iPhone OS 4_3_3 like Mac OS X, en-us) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8J2 Safari/6533.18.5",
    "MQQBrowser/26 Mozilla/5.0 (Linux, U, Android 2.3.7, zh-cn, MB200 Build/GRJ22, CyanogenMod-7) AppleWebKit/533.1 (KHTML, like Gecko) Version/4.0 Mobile Safari/533.1"}

// r is the random source used to pick a User-Agent, seeded from the clock at start-up.
var r = rand.New(rand.NewSource(time.Now().UnixNano()))

// tburl and tburlpara are the prefix and suffix that Spy wraps around the
// escaped keyword to form the search URL.
var tburl = "https://s.taobao.com/search?q="
var tburlpara = "&imgfile=&commend=all&ssid=s5-e&search_type=item&sourceId=tb.index&spm=a21bo.50862.201857-taobao-item.1&ie=utf8&initiative_id=tbindexz_20170306&sort=price-asc"

// urlChannel is a channel of href strings with a buffer of 200.
var urlChannel = make(chan string, 200)

// atagRegExp captures the text assigned to g_page_config, up to the ";" that
// ends the line. MustCompile panics if the pattern does not compile.
var atagRegExp = regexp.MustCompile(`g_page_config = (.*?);\n`)

// chineseRegExp matches a string made of exactly one character in the range
// U+4E00..U+9FA5.
var chineseRegExp = regexp.MustCompile("^[\u4e00-\u9fa5]$")

func GetRandomUserAgent() string {
    return userAgent[r.Intn(len(userAgent))]
}




原文地址:https://www.cnblogs.com/ipub520/p/ipub520.html