Go Crawler Examples

 

1. Scraping phone numbers with a regex

Result: (output screenshot omitted)

Code:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    // Mobile numbers: leading 1, second digit 3-9, then 4 + 4 digits (three capture groups)
    rePhone = `(1[3456789]\d)(\d{4})(\d{4})`
)

// HandleError prints the error with context and exits the program.
func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {
    // Request the page with HTTP GET
    resp, err := http.Get("https://www.haomagujia.com/")

    // Handle any request error
    HandleError(err, "http.Get")
    defer resp.Body.Close()

    // Read the whole page body
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    //fmt.Println(html)

    // Use the compiled regular expression to filter the phone numbers out of the page
    re := regexp.MustCompile(rePhone)
    // -1 means return all matches
    allString := re.FindAllStringSubmatch(html, -1)
    for _, x := range allString {
        fmt.Println(x)
    }
}
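
Each element returned by FindAllStringSubmatch is a slice holding the full match followed by the three capture groups. As a quick self-contained illustration of that shape (the sample text below is made up, not taken from the crawled page):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Same phone pattern as above; the sample text is made up for illustration.
    rePhone := regexp.MustCompile(`(1[3456789]\d)(\d{4})(\d{4})`)
    sample := "call 13812345678 or 15987654321"
    for _, m := range rePhone.FindAllStringSubmatch(sample, -1) {
        // m[0] is the full number, m[1]-m[3] are the three capture groups
        fmt.Println(m[0], m[1], m[2], m[3])
    }
}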

2. Scraping email addresses with a regex

Result: (output screenshot omitted)

Code:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    // Email addresses: local part, @, domain, TLD, plus an optional second-level TLD
    reEmail = `[\w.]+@\w+\.[a-z]{2,3}(\.[a-z]{2,3})?`
)

// HandleError prints the error with context and exits the program.
func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {

    html := GetHtml("https://www.douban.com/group/topic/41562980/")

    // Append some mock email data
    html += "xiaoming@163.com\n"
    html += "aaa@126.com\n"
    html += "22223@qq.com\n"
    html += "x.badt@gmail.com\n"

    re := regexp.MustCompile(reEmail)
    AallString := re.FindAllStringSubmatch(html, -1)
    for _, x := range AallString {
        fmt.Println(x)
    }
}

// GetHtml downloads the page at url and returns its body as a string.
func GetHtml(url string) string {
    resp, err := http.Get(url)
    HandleError(err, "http.Get")
    defer resp.Body.Close()
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    return html
}
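
A page like this often repeats the same address. If deduplication is wanted, a minimal sketch using a map as a set (same reEmail pattern as above; the input string is made up for illustration) could look like this:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Same email pattern as above; the input string is made up for illustration.
    reEmail := regexp.MustCompile(`[\w.]+@\w+\.[a-z]{2,3}(\.[a-z]{2,3})?`)
    html := "aaa@126.com xiaoming@163.com aaa@126.com"
    seen := make(map[string]bool)
    for _, m := range reEmail.FindAllStringSubmatch(html, -1) {
        if !seen[m[0]] {
            seen[m[0]] = true
            fmt.Println(m[0])
        }
    }
}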

3. Scraping hyperlinks with a regex

Code:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    // <a ...> tags whose href starts with http; adjust the rule to the actual page
    reLink = `<a[\s\S]+?href="(http[\s\S]+?)"`
)

// HandleError prints the error with context and exits the program.
func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {

    html := GetHtml("https://www.hao123.com")

    // Scrape the hyperlinks
    re := regexp.MustCompile(reLink)
    AallString := re.FindAllStringSubmatch(html, -1)
    for _, x := range AallString {
        fmt.Println(x[0])
    }
}

// GetHtml downloads the page at url and returns its body as a string.
func GetHtml(url string) string {
    resp, err := http.Get(url)
    HandleError(err, "http.Get")
    defer resp.Body.Close()
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    return html
}
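
Because the link pattern contains one capture group, index 1 of each submatch slice holds just the href value rather than the whole "<a ..." prefix printed above. A minimal self-contained sketch (the HTML snippet is made up for illustration):

package main

import (
    "fmt"
    "regexp"
)

func main() {
    // Same link pattern as above; the HTML snippet is made up for illustration.
    reLink := regexp.MustCompile(`<a[\s\S]+?href="(http[\s\S]+?)"`)
    html := `<a class="site" href="https://example.com/a">A</a> <a href="http://example.com/b">B</a>`
    for _, m := range reLink.FindAllStringSubmatch(html, -1) {
        fmt.Println(m[1]) // just the href value, without the surrounding tag text
    }
}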

4. Scraping ID card numbers with a regex

Code:

package main

import (
    "fmt"
    "io/ioutil"
    "net/http"
    "os"
    "regexp"
)

var (
    // Example of the dashed format: 330625-1970-04-26-0474
    //reLink = `[1-6]\d{5}-((19\d{2})|(20((0\d)|(1[0-8]))))-((0[1-9])|(1[012]))-((0[1-9])|([12]\d)|(3[01]))-\d{3}[\dx]` // adjust the rule to the actual page
    // 18-digit ID numbers: region code, birth date (years 1900-2018), sequence, check digit
    reLink = `[1-6]\d{5}((19\d{2})|(20((0\d)|(1[0-8]))))((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))\d{3}[\dx]` // adjust the rule to the actual page
)

// HandleError prints the error with context and exits the program.
func HandleError(err error, when string) {
    if err != nil {
        fmt.Println(when, err)
        os.Exit(1)
    }
}

func main() {

    html := GetHtml("http://www.shaoxing.com.cn/p/2771751.html")

    re := regexp.MustCompile(reLink)
    AallString := re.FindAllStringSubmatch(html, -1)
    for _, x := range AallString {
        fmt.Println(x[0])
    }
}

// GetHtml downloads the page at url and returns its body as a string.
func GetHtml(url string) string {
    resp, err := http.Get(url)
    HandleError(err, "http.Get")
    defer resp.Body.Close()
    bytes, _ := ioutil.ReadAll(resp.Body)
    html := string(bytes)
    return html
}
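
A character-class pattern like this cannot reject impossible dates such as February 29 in a non-leap year. One optional way to tighten the results is to re-parse the birth-date portion with time.Parse, as in this sketch (the sample IDs are fabricated for illustration):

package main

import (
    "fmt"
    "regexp"
    "time"
)

func main() {
    // Same ID pattern as above; the sample IDs are fabricated for illustration.
    reID := regexp.MustCompile(`[1-6]\d{5}((19\d{2})|(20((0\d)|(1[0-8]))))((0[1-9])|(1[012]))((0[1-9])|([12]\d)|(3[01]))\d{3}[\dx]`)
    text := "330102199902291234 330102199903011234"
    for _, m := range reID.FindAllStringSubmatch(text, -1) {
        id := m[0]
        // Characters 7-14 of an 18-digit ID encode the YYYYMMDD birth date;
        // time.Parse rejects impossible dates such as 1999-02-29.
        if _, err := time.Parse("20060102", id[6:14]); err == nil {
            fmt.Println(id)
        }
    }
}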

Done.

Original post: https://www.cnblogs.com/chaoyangxu/p/12342678.html