比较爬虫用的语言Python与Go

Python是我比较喜欢的语言,而且是莫名的喜欢。说起来,我学Python起初可能只是因为敲错了一个网址,哈哈哈~

工作中的任务是:登录一个网站后台,爬取数据,再写入服务器上的Redis。同事认为我会用PHP来写,哼!让你猜到那该多没意思,于是乎有了如下Python的代码,你看,50多行就搞定了。

 1 #!/usr/bin/python3
 2 import requests
 3 import re
 4 import redis
 5 from pyquery import PyQuery as pq
 6 
 7 loginUrl = 'https://manage.xxx.com.cn/home/login'
 8 userName = 'xxx'
 9 passWord = 'xxx'
10 
11 redisServer = '192.168.0.2'
12 redisPort = 6379
13 redisPass = ''
14 
15 productList = {'椰油':'CL_Spot','咖啡':'COFFEE','工业铜':'COPPER'}
16 volumeList = {'CL_Spot':[0, 0], 'COFFEE':[0, 0], 'COPPER':[0, 0]}
17 
18 def main():
19     jsessionid = getCookie()
20     doLogin(jsessionid)
21     dataUrl = 'https://manage.xxx.cn/?pageNo=1&pageSize=100'
22     cookies = {'JSESSIONID': jsessionid}
23     r = requests.get(dataUrl, cookies = cookies)
24     dom = pq(r.text)
25     lines = dom('table').eq(1).find('tr').items()
26     for line in lines:
27         line = re.sub(r'<!--.*-->', '', str(line))
28         pattern = re.compile(r'<td>(.*?)</td>')
29         group = pattern.findall(line)
30         if not group:
31             continue
32         productCode = productList[group[3]]
33         if group[6] == '':
34             volumeList[productCode][0]+= int(group[7]) * int(group[8])
35         if group[6] == '':
36             volumeList[productCode][1]+= int(group[7]) * int(group[8])
37 
38     redisClient = redis.Redis(host=redisServer, port=redisPort, password=redisPass)
39     for x in volumeList:
40         keyUp = 'redis_order_count_u_%s' % x
41         keyDown = 'redis_order_count_d_%s' % x
42         redisClient.set(keyUp, int(volumeList[x][0]))
43         redisClient.set(keyDown, int(volumeList[x][1]))
44 
45 def getCookie():
46     ua = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
47     r = requests.get(loginUrl, headers = ua)
48     return r.cookies['JSESSIONID']
49 
50 def doLogin(jsessionid):
51     param = {'userName': userName, 'password': passWord}
52     cookies = {'JSESSIONID': jsessionid}
53     requests.post(loginUrl, data = param, cookies = cookies)
54     
55 
56 if __name__ == '__main__':
57     main()

另一个服务也有同样的需求,于是用最近在看的Golang又实现了一遍,瞧,写了100多行。

  1 package main
  2 
  3 import (
  4     "fmt"
  5     "net/http"
  6     "net/url"
  7     "os"
  8     "strings"
  9     "strconv"
 10     "gopkg.in/redis.v4"
 11     "github.com/PuerkitoBio/goquery"
 12 )
 13 
 14 var loginUrl string = "https://manage.xxx.com.cn/home/login"
 15 var dataUrl string = "https://manage.xxx.com.cn/?pageNo=1&pageSize=100"
 16 var userName string = "xxx"
 17 var passWord string = "xxx"
 18 var redisServer string = "192.168.1.2"
 19 var redisPort string = "6379"
 20 var redisPass string = ""
 21 var redisDB   int = 0
 22 
 23 func main() {
 24     productList := make(map[string] string)
 25     productList["椰油"] = "CL_Spot"
 26     productList["咖啡"] = "COFFEE"
 27     productList["工业铜"] = "COPPER"
 28     volumeList := make(map[string] int)
 29     volumeList["u_CL_Spot"] = 0
 30     volumeList["d_CL_Spot"] = 0
 31     volumeList["u_COFFEE"] = 0
 32     volumeList["d_COFFEE"] = 0
 33     volumeList["u_COPPER"] = 0
 34     volumeList["d_COPPER"] = 0
 35     jsessionid := getCookie()
 36     doLogin(jsessionid)
 37 
 38     request, err := http.NewRequest("GET", dataUrl, nil)
 39     request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid})
 40     client := &http.Client{}
 41     response, err := client.Do(request)
 42     if err != nil {
 43         fmt.Println(err.Error())
 44         os.Exit(0)
 45     }
 46     defer response.Body.Close()
 47     doc, err := goquery.NewDocumentFromReader(response.Body)
 48     doc.Find("table").Eq(1).Find("tr").Each(func(i int, tr *goquery.Selection) {
 49         td := tr.Find("td")
 50         name := td.Eq(3).Text()
 51         dir := td.Eq(6).Text()
 52         if val, ok := productList[name]; ok {
 53             buyNum, _ := strconv.Atoi(td.Eq(7).Text())
 54             buyUnit, _ := strconv.Atoi(td.Eq(8).Text())
 55             num :=  buyNum * buyUnit
 56             cacheKey := ""
 57             if dir == "" {
 58                 cacheKey = fmt.Sprintf("u_%s", val)
 59             } else if dir == "" {
 60                 cacheKey = fmt.Sprintf("d_%s", val)
 61             }
 62             volumeList[cacheKey] += num
 63         }
 64     })
 65     redisClient := redis.NewClient(&redis.Options{
 66         Addr:     fmt.Sprintf("%s:%s", redisServer, redisPort),
 67         Password: redisPass,
 68         DB:       redisDB,
 69     })
 70     for k, v := range volumeList {
 71         strKey := fmt.Sprintf("redis_order_count_%s", k)
 72         redisClient.Set(strKey, int(v), 0)
 73     }
 74     fmt.Println("puti volume get success")
 75 }
 76 
 77 func getCookie() string {
 78     jsessionid := ""
 79     response, err := http.Get(loginUrl)
 80     if err != nil {
 81         fmt.Println(err.Error())
 82         os.Exit(0)
 83     }
 84     defer response.Body.Close()
 85     for _, val := range response.Cookies() {
 86         if val.Name == "JSESSIONID" {
 87             jsessionid = val.Value
 88         }
 89     }
 90     return jsessionid
 91 }
 92 
 93 func doLogin(jsessionid string) bool {
 94     data := url.Values{}
 95     data.Set("userName", userName)
 96     data.Add("password", passWord)
 97     request, _ := http.NewRequest("POST", loginUrl, strings.NewReader(data.Encode()))
 98     request.Header.Add("Content-Type", "application/x-www-form-urlencoded")
 99     request.Header.Add("Content-Length", strconv.Itoa(len(data.Encode())))
100     request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid})
101     client := &http.Client{}
102     response, err := client.Do(request)
103     if err != nil {
104         fmt.Println(err.Error())
105         os.Exit(0)
106     }
107     defer response.Body.Close()
108     return true
109 }

Python版从实现到上线,半天的功夫就搞定了;Go版足足搞了一整天,陌生又别扭的语法让我学到了很多知识点,最后在Mac上交叉编译、再放到Linux上执行,也给我上了一课。

我觉得入门时学习这两门语言挺好:一个是脚本语言,另一个是编译型语言,用处都很广泛。轩轩,你准备好了吗?

原文地址:https://www.cnblogs.com/aboys/p/10025409.html