比较爬虫用的语言Python与Go
Python是我比较喜欢的语言,莫名的喜欢,对Python的学习可能起初是敲错了网址开始的,哈哈哈~
工作的任务从一个网站后台做登录、爬取数据,写入服务器Redis中,同事认为我会用PHP来写,哼!让你猜到那该多没意思,于是乎有了如下Python的代码,你看50多行搞定了。
#!/usr/bin/python3
"""Log in to the management back end, total the buy/sell order volume per
product from the order table, and write the totals to Redis."""
import re

import redis
import requests
from pyquery import PyQuery as pq

loginUrl = 'https://manage.xxx.com.cn/home/login'
userName = 'xxx'
passWord = 'xxx'

redisServer = '192.168.0.2'
redisPort = 6379
redisPass = ''

# Product name as shown in the order table -> product code used in Redis keys.
productList = {'椰油':'CL_Spot','咖啡':'COFFEE','工业铜':'COPPER'}
# Product code -> [buy volume, sell volume].
volumeList = {'CL_Spot':[0, 0], 'COFFEE':[0, 0], 'COPPER':[0, 0]}

# Seconds before an HTTP request is abandoned, so a stalled server cannot hang the job.
httpTimeout = 10

# Compiled once at import time rather than once per table row.
commentPattern = re.compile(r'<!--.*-->')
cellPattern = re.compile(r'<td>(.*?)</td>')


def main():
    """Fetch the order page, accumulate volumes into volumeList and store them in Redis."""
    jsessionid = getCookie()
    doLogin(jsessionid)
    # NOTE(review): this host (xxx.cn) differs from loginUrl's (xxx.com.cn) - confirm.
    dataUrl = 'https://manage.xxx.cn/?pageNo=1&pageSize=100'
    cookies = {'JSESSIONID': jsessionid}
    r = requests.get(dataUrl, cookies=cookies, timeout=httpTimeout)
    # Fail loudly on an HTTP error instead of overwriting Redis with zeros.
    r.raise_for_status()
    dom = pq(r.text)
    for line in dom('table').eq(1).find('tr').items():
        # Drop HTML comments before extracting the <td> cell contents.
        group = cellPattern.findall(commentPattern.sub('', str(line)))
        # Cells 3, 6, 7 and 8 are read below; rows with fewer cells are skipped.
        if len(group) < 9:
            continue
        productCode = productList.get(group[3])
        if productCode is None:
            # Untracked product: skip it rather than raising KeyError.
            continue
        if group[6] == '买':
            side = 0
        elif group[6] == '卖':
            side = 1
        else:
            continue
        volumeList[productCode][side] += int(group[7]) * int(group[8])

    redisClient = redis.Redis(host=redisServer, port=redisPort, password=redisPass)
    for code, (bought, sold) in volumeList.items():
        redisClient.set('redis_order_count_u_%s' % code, int(bought))
        redisClient.set('redis_order_count_d_%s' % code, int(sold))


def getCookie():
    """Request the login page and return the JSESSIONID cookie value it sets.

    Raises KeyError when the response carries no JSESSIONID cookie.
    """
    ua = {'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_14_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.110 Safari/537.36'}
    r = requests.get(loginUrl, headers=ua, timeout=httpTimeout)
    return r.cookies['JSESSIONID']


def doLogin(jsessionid):
    """POST the credentials to loginUrl using the given session cookie.

    Raises requests.HTTPError when the server answers with an error status.
    """
    param = {'userName': userName, 'password': passWord}
    cookies = {'JSESSIONID': jsessionid}
    r = requests.post(loginUrl, data=param, cookies=cookies, timeout=httpTimeout)
    r.raise_for_status()


if __name__ == '__main__':
    main()
另一个服务也有同样的需求，于是用最近在看的Golang又实现了一次，瞧，写了100多行。
1 package main 2 3 import ( 4 "fmt" 5 "net/http" 6 "net/url" 7 "os" 8 "strings" 9 "strconv" 10 "gopkg.in/redis.v4" 11 "github.com/PuerkitoBio/goquery" 12 ) 13 14 var loginUrl string = "https://manage.xxx.com.cn/home/login" 15 var dataUrl string = "https://manage.xxx.com.cn/?pageNo=1&pageSize=100" 16 var userName string = "xxx" 17 var passWord string = "xxx" 18 var redisServer string = "192.168.1.2" 19 var redisPort string = "6379" 20 var redisPass string = "" 21 var redisDB int = 0 22 23 func main() { 24 productList := make(map[string] string) 25 productList["椰油"] = "CL_Spot" 26 productList["咖啡"] = "COFFEE" 27 productList["工业铜"] = "COPPER" 28 volumeList := make(map[string] int) 29 volumeList["u_CL_Spot"] = 0 30 volumeList["d_CL_Spot"] = 0 31 volumeList["u_COFFEE"] = 0 32 volumeList["d_COFFEE"] = 0 33 volumeList["u_COPPER"] = 0 34 volumeList["d_COPPER"] = 0 35 jsessionid := getCookie() 36 doLogin(jsessionid) 37 38 request, err := http.NewRequest("GET", dataUrl, nil) 39 request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid}) 40 client := &http.Client{} 41 response, err := client.Do(request) 42 if err != nil { 43 fmt.Println(err.Error()) 44 os.Exit(0) 45 } 46 defer response.Body.Close() 47 doc, err := goquery.NewDocumentFromReader(response.Body) 48 doc.Find("table").Eq(1).Find("tr").Each(func(i int, tr *goquery.Selection) { 49 td := tr.Find("td") 50 name := td.Eq(3).Text() 51 dir := td.Eq(6).Text() 52 if val, ok := productList[name]; ok { 53 buyNum, _ := strconv.Atoi(td.Eq(7).Text()) 54 buyUnit, _ := strconv.Atoi(td.Eq(8).Text()) 55 num := buyNum * buyUnit 56 cacheKey := "" 57 if dir == "买" { 58 cacheKey = fmt.Sprintf("u_%s", val) 59 } else if dir == "卖" { 60 cacheKey = fmt.Sprintf("d_%s", val) 61 } 62 volumeList[cacheKey] += num 63 } 64 }) 65 redisClient := redis.NewClient(&redis.Options{ 66 Addr: fmt.Sprintf("%s:%s", redisServer, redisPort), 67 Password: redisPass, 68 DB: redisDB, 69 }) 70 for k, v := range volumeList { 71 strKey := 
fmt.Sprintf("redis_order_count_%s", k) 72 redisClient.Set(strKey, int(v), 0) 73 } 74 fmt.Println("puti volume get success") 75 } 76 77 func getCookie() string { 78 jsessionid := "" 79 response, err := http.Get(loginUrl) 80 if err != nil { 81 fmt.Println(err.Error()) 82 os.Exit(0) 83 } 84 defer response.Body.Close() 85 for _, val := range response.Cookies() { 86 if val.Name == "JSESSIONID" { 87 jsessionid = val.Value 88 } 89 } 90 return jsessionid 91 } 92 93 func doLogin(jsessionid string) bool { 94 data := url.Values{} 95 data.Set("userName", userName) 96 data.Add("password", passWord) 97 request, _ := http.NewRequest("POST", loginUrl, strings.NewReader(data.Encode())) 98 request.Header.Add("Content-Type", "application/x-www-form-urlencoded") 99 request.Header.Add("Content-Length", strconv.Itoa(len(data.Encode()))) 100 request.AddCookie(&http.Cookie{Name: "JSESSIONID", Value: jsessionid}) 101 client := &http.Client{} 102 response, err := client.Do(request) 103 if err != nil { 104 fmt.Println(err.Error()) 105 os.Exit(0) 106 } 107 defer response.Body.Close() 108 return true 109 }
Python的实现从开发到上线，半天的功夫就搞定了；Go足足搞了1整天。蹩脚又不熟悉的语法让我学习了很多知识点，最后在Mac上编译、放到Linux上执行也给我上了一课。
觉得入门学习这两门语言挺好,一个是脚本语言另一个是编译语言,用处都很广泛。轩轩你准备好了吗?