golang爬虫

网络爬虫（又被称为网页蜘蛛，网络机器人，在FOAF社区中间，更经常的称为网页追逐者），是一种按照一定的规则，自动地抓取万维网信息的程序或者脚本。

通俗地讲，就是通过程序获取Web页面上自己想要的数据，也就是自动抓取数据。

爬虫的基本流程

发起请求
通过HTTP库向目标站点发起请求，也就是发送一个Request，请求可以包含额外的header等信息，等待服务器响应

获取响应内容
如果服务器能正常响应，会得到一个Response，Response的内容便是所要获取的页面内容，类型可能是HTML、JSON字符串、二进制数据（图片或者视频）等。

解析内容
得到的内容可能是HTML，可以用正则表达式或页面解析库进行解析；可能是JSON，可以直接转换为JSON对象解析；也可能是二进制数据，可以保存或者做进一步的处理。

保存数据
保存形式多样，可以存为文本，也可以保存到数据库，或者保存特定格式的文件

示例

简单爬取博文

package main

import (
    "bufio"
    "errors"
    "fmt"
    "io"
    "log"
    "net/http"
    "os"
    "path"
    "strconv"
    "strings"
    "time"

    "github.com/PuerkitoBio/goquery"
)

func ExampleScrape(url, name string) error {
    // Request the HTML page.
    log.Println(url)
    res, err := http.Get(url)
    if err != nil {
        log.Fatal(err)
    }
    defer res.Body.Close()
    if res.StatusCode != 200 {
        log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
    }

    // Load the HTML document
    doc, err := goquery.NewDocumentFromReader(res.Body)
    if err != nil {
        log.Fatal(err)
    }

    // Find the review items
    doc.Find("#mainContent .postTitle a").Each(func(i int, s *goquery.Selection) {
        var fileName string
        // For each item found, get the band and title
        title := s.Text()
        urlp, ok := s.Attr("href")
        if ok {
            _, fileName = path.Split(urlp)

            fmt.Printf("Review %d: %s - %s - %s
", i, title, urlp, fileName)
            storeData(urlp, fileName)

            s.SetAttr("href", fileName)
        }
    })

    doc.Find("#homepage_bottom_pager .pager a").Each(func(i int, s *goquery.Selection) {
        strHref, ok := s.Attr("href")
        if ok {
            sliceHref := strings.Split(strHref, "=")
            if len(sliceHref) == 2 {
                s.SetAttr("href", "index"+sliceHref[1]+".html")
            }
        }
    })

    docStr, err := goquery.OuterHtml(doc.Selection)
    if err != nil {
        log.Fatal(err)
    }
    writeFileString(name, docStr)
    return nil
}

// writeFileString creates (or truncates) fileName and writes info to it,
// syncing the file to stable storage before returning.
//
// It returns the first error encountered while creating, writing, syncing or
// closing the file.
func writeFileString(fileName, info string) (err error) {
    file, err := os.Create(fileName)
    if err != nil {
        return err
    }
    // Report a Close failure unless an earlier error is already being returned.
    defer func() {
        if cerr := file.Close(); err == nil {
            err = cerr
        }
    }()

    if _, err = file.WriteString(info); err != nil {
        return err
    }
    return file.Sync()
}

// writeFile creates (or truncates) fileName and copies everything read from
// info into it through a buffered writer.
//
// It returns the first error encountered while creating the file, copying the
// data, flushing the buffer or closing the file, so a partially written file
// is never reported as success.
func writeFile(fileName string, info io.Reader) (err error) {
    file, err := os.Create(fileName)
    if err != nil {
        return err
    }
    // Report a Close failure unless an earlier error is already being returned.
    defer func() {
        if cerr := file.Close(); err == nil {
            err = cerr
        }
    }()

    wfile := bufio.NewWriter(file)
    if _, err = wfile.ReadFrom(info); err != nil {
        return err
    }
    // Flush explicitly rather than in a defer so its error reaches the caller.
    return wfile.Flush()
}

func storeData(url, fileName string) error {
    resp, err := http.Get(url)
    if err != nil {
        return err
    }
    defer resp.Body.Close()
    if resp.StatusCode != 200 {
        return errors.New(fmt.Sprintf("Get %s error and error code is %d
", url, resp.StatusCode))
    }
    return writeFile(fileName, resp.Body)
}

// main crawls index pages 1 through 79 of the blog, saving each one as
// indexN.html via ExampleScrape and pausing one second between pages. It stops
// at the first page that fails and logs the reason.
func main() {
    for i := 1; i < 80; i++ {
        page := strconv.Itoa(i)
        url := "https://www.cnblogs.com/embedded-linux/default.html?page=" + page
        indexUrl := "index" + page + ".html"
        if err := ExampleScrape(url, indexUrl); err != nil {
            // Report why the crawl stopped instead of exiting silently.
            log.Printf("stopping at page %d: %v", i, err)
            break
        }
        // Pause between requests to the site.
        time.Sleep(1 * time.Second)
    }
}

反爬虫

有爬取网页的方法，自然有反爬虫，阻止爬取网页。

1. HTTP返回418，如爬取豆瓣电影Top250

触发了反爬，返回一个彩蛋，因为你和 618 只差一个 HTTP 200 ……

参考：打开网站URL遇到“HTTP Error 418：”问题

在使用浏览器访问网站时，访问请求中包含请求头。检测请求头是常见的反爬虫策略。

服务器通过检测请求头判断这次请求是不是人为的。

在程序上加入请求头，这样服务器就会认为这是一个从浏览器发出的人为请求：

  req.Header.Add("User-Agent", `Mozilla/5.0 (Windows NT 6.1;WOW64) AppleWebKit/537.36 (KHTML,like GeCKO) Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3`)

package main

import (
  "fmt"
  "log"
  "net/http"

  "github.com/PuerkitoBio/goquery"
)

func ExampleScrape() {
  // Request the HTML page.
  client := &http.Client{}
  req, err := http.NewRequest("GET", "http://movie.douban.com/top250", nil)
  if err != nil {
    log.Fatal(err)
  }
  req.Header.Add("User-Agent", `Mozilla/5.0 (Windows NT 6.1;WOW64) AppleWebKit/537.36 (KHTML,like GeCKO) Chrome/45.0.2454.85 Safari/537.36 115Broswer/6.0.3`)
  req.Header.Add("Referer", "https://movie.douban.com/top250")
  req.Header.Add("connection", "keep-alive")
  res, err := client.Do(req) 
  if err != nil {
    log.Fatal(err)
  }
  defer res.Body.Close()
  log.Println(res)
  if res.StatusCode != 200 {
    log.Fatalf("status code error: %d %s", res.StatusCode, res.Status)
  }

  // Load the HTML document
  doc, err := goquery.NewDocumentFromReader(res.Body)
  if err != nil {
    log.Fatal(err)
  }

  // Find the review items
  doc.Find(".sidebar-reviews article .content-block").Each(func(i int, s *goquery.Selection) {
    // For each item found, get the band and title
    band := s.Find("a").Text()
    title := s.Find("i").Text()
    fmt.Printf("Review %d: %s - %s
", i, band, title)
  })
}

// main runs the Douban Top 250 scraping example once.
func main() {
  ExampleScrape()
}

http://movie.douban.com/top250

参考：

1. 用Golang写爬虫（一）

2. python爬虫

3. 深入浅出爬虫之道： Python、Golang与GraphQuery的对比

4. 数据分析爬虫；了解一下Golang的市场行情；爬取汽车之家二手车产品库；爬取豆瓣电影Top250

5. Golang爬虫全攻略