纯golang爬虫实战(一)
纯golang爬取内网网站数据
参考https://blog.csdn.net/CrazyJavaPerson/article/details/81871649
难点一:需要登录,参考我的上一篇文章https://www.cnblogs.com/pu369/p/12201707.html,找到POST登录的url和Form Data,并且经测试可改用GET方式登录。
难点二:登录后访问时需携带cookie。参考https://blog.csdn.net/liu_rong_fei/article/details/51820793,用resp.Cookies()可得到Cookies;另参考https://www.cnblogs.com/cnsanshao/p/7084808.html;最后参考https://segmentfault.com/q/1010000005889328,决定还是用cookiejar和http.Client,这样后续请求时client会自动带上cookie。
难点三:我只想保留文字,参考https://studygolang.com/articles/9360去掉html标签
上代码(密码用XX代替了,根据网站特点,主要是URL参数变化且是连续数字,只需写个循环访问即可)
// Pure-Go crawler: logs into an intranet site using a GET-based login,
// fetches a data page, strips the HTML down to plain text, saves it to
// a file, and finally returns the index page body.
package main

import (
	"fmt"
	"io/ioutil"
	"net/http"
	"net/http/cookiejar"
	"regexp"
	"strings"
)

// Tag-stripping patterns, compiled once at package init instead of on
// every trimHtml call (regexp.Compile errors were previously ignored).
var (
	reTag    = regexp.MustCompile(`\<[\S\s]+?\>`)
	reStyle  = regexp.MustCompile(`\<style[\S\s]+?\</style\>`)
	reScript = regexp.MustCompile(`\<script[\S\s]+?\</script\>`)
	reBlank  = regexp.MustCompile(`\s{2,}`)
)

// MySpider crawls the intranet site rooted at indexUrl. It reuses a
// single cookie-aware HTTP client so the login session cookie is sent
// automatically on every subsequent request.
type MySpider struct {
	indexUrl string
	client   *http.Client // shared client with cookie jar (was misspelled "cleint")
}

// login authenticates via GET (the site accepts GET in place of POST),
// fetches a target page, strips its HTML and writes the text to
// test.txt, then returns the index page body. Every response body is
// closed and every error is checked — the original leaked all but the
// first body and dereferenced resp before checking err.
func (s MySpider) login() (string, error) {
	// Initial visit so the server issues a session cookie.
	resp, err := s.client.Get("http://192.168.13.1:8080/")
	if err != nil {
		return "err", err
	}
	resp.Body.Close()

	// GET-based login; the cookie jar records the authenticated session.
	resp, err = s.client.Get("http://192.168.13.1:8080/login/auth?name=XX&password=XX&scurity=s&type=0&typeField=0")
	if err != nil {
		return "err", err
	}
	body, err := ioutil.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		return "err", err
	}
	fmt.Print(string(body))

	// Fetch the target page (URL parameters are sequential numbers).
	resp, err = s.client.Get("http://192.168.13.1:8080/browse/basicinfo_p.jsp?rtpage=psnfrm&pid=00000164&func=0297&userbase=Usr")
	if err != nil {
		return "err", err
	}
	body, err = ioutil.ReadAll(resp.Body)
	resp.Body.Close()
	if err != nil {
		return "err", err
	}
	fmt.Print(string(body))

	// Keep only the text content and save it (write error was ignored).
	trimbody := []byte(trimHtml(string(body)))
	if err = ioutil.WriteFile("test.txt", trimbody, 0644); err != nil {
		return "err", err
	}

	// Use the shared client here — the original called http.Get, which
	// bypassed the cookie jar and dropped the login session.
	resp, err = s.client.Get(s.indexUrl)
	if err != nil {
		return "err", err
	}
	defer resp.Body.Close()
	body, err = ioutil.ReadAll(resp.Body)
	if err != nil {
		return "err", err
	}
	return string(body), nil
}

// run builds the reusable cookie-aware client, then logs in and crawls.
func (s MySpider) run() string {
	jar, err := cookiejar.New(nil)
	if err != nil {
		panic(err)
	}
	s.client = &http.Client{Jar: jar}
	// Login via GET instead of POST; login reports its own errors.
	s.login()
	return ""
}

// trimHtml strips HTML from src: tags are lowercased, <style>/<script>
// blocks are removed, remaining tags become newlines, and runs of
// whitespace collapse to a single newline.
func trimHtml(src string) string {
	// Lowercase every tag so the style/script patterns match any casing.
	src = reTag.ReplaceAllStringFunc(src, strings.ToLower)
	// Drop STYLE and SCRIPT blocks entirely.
	src = reStyle.ReplaceAllString(src, "")
	src = reScript.ReplaceAllString(src, "")
	// Replace all remaining tags with newlines.
	src = reTag.ReplaceAllString(src, "\n")
	// Collapse consecutive whitespace into one newline.
	src = reBlank.ReplaceAllString(src, "\n")
	return strings.TrimSpace(src)
}

func main() {
	// Crawler instance; entry URL of the intranet site.
	ms := new(MySpider)
	ms.indexUrl = "http://192.168.13.1:8080"
	ms.run()
}