之前写的关于chromedp的文章被别人转到CSDN,很受鼓励,再来一篇golang爬虫实例
示例说明:用chromedp操作Chrome,打开百度首页,输入关键词“美女”并搜索,然后再向后翻2页;在此过程中保存cookie和每一页所有img标签的内容,并把第一页的百度logo截图保存为png。
注释已经比较详细了,上代码:
// Command main drives Chrome through chromedp: it opens baidu.com, submits a
// search, records the cookies and the <img> nodes of the first result page,
// saves a screenshot of the "#result_logo" element, and then follows the
// last link of the pager twice, recording the <img> nodes of each page.
package main

import (
	"bufio"
	"context"
	"fmt"
	"io/ioutil"
	"log"
	"os"
	"time"

	"github.com/chromedp/cdproto/cdp"
	"github.com/chromedp/cdproto/network"
	"github.com/chromedp/cdproto/page"
	"github.com/chromedp/chromedp"
)

var (
	// res would receive the page HTML; it is only used by the
	// commented-out OuterHTML task below.
	res string
	// nodes receives the <img> nodes of the page currently being scraped.
	nodes []*cdp.Node
	// buf receives the PNG bytes produced by the Screenshot task.
	buf []byte
)

func main() {
	// create context
	ctxt, cancel := context.WithCancel(context.Background())
	defer cancel()

	// create chrome instance
	c, err := chromedp.New(ctxt, chromedp.WithLog(log.Printf))
	if err != nil {
		log.Fatal(err)
	}

	// The working directory is used as the browser's download path.
	wd, err := os.Getwd()
	if err != nil {
		log.Fatal(err)
	}

	// run task list
	err = c.Run(ctxt, chromedp.Tasks{
		page.SetDownloadBehavior(page.SetDownloadBehaviorBehaviorAllow).WithDownloadPath(wd),
		chromedp.Navigate(`https://www.baidu.com/`),     // open the Baidu home page
		chromedp.WaitVisible(`#kw`, chromedp.ByQuery),   // the search box being visible means the page has loaded
		chromedp.SendKeys(`#kw`, `美女`, chromedp.ByID),   // type the search keyword
		chromedp.Click("#su", chromedp.ByID),            // click the search button
		chromedp.Sleep(1 * time.Second),                 // give the result page a moment to render
		//chromedp.OuterHTML("html", &res, chromedp.ByQuery), // fetch the HTML source
		chromedp.Nodes("img", &nodes, chromedp.ByQueryAll),       // collect the <img> nodes of the current page
		chromedp.Screenshot("#result_logo", &buf, chromedp.ByID), // capture the logo element as PNG

		// Persist what was collected on the first result page.
		chromedp.ActionFunc(func(ctx context.Context, h cdp.Executor) error {
			cookies, err := network.GetAllCookies().Do(ctx, h)
			if err != nil {
				// Check before touching cookies: the result is not
				// meaningful when the call failed.
				return err
			}

			// Join the cookies in the format of a Cookie request header.
			var coo string
			for _, v := range cookies {
				coo = coo + v.Name + "=" + v.Value + ";"
			}

			WirteTXT(coo)                              // save the cookies
			WirteTXT(fmt.Sprintf("\r\n\r\n%s", nodes)) // save the <img> nodes

			// Save the screenshot taken by the preceding task.
			return ioutil.WriteFile("contact-form.png", buf, 0644)
		}),

		// Turn the page twice, saving the <img> nodes of every page.
		chromedp.ActionFunc(func(ctx context.Context, h cdp.Executor) error {
			for i := 1; i < 3; i++ {
				err := c.Run(ctx, chromedp.Tasks{
					// This is a compound CSS selector rather than a bare
					// element id, so it is matched with ByQuery.
					chromedp.Click(`#page a:nth-last-child(1)`, chromedp.ByQuery),
					chromedp.Sleep(1 * time.Second), // give the next page a moment to render
					chromedp.Nodes("img", &nodes, chromedp.ByQueryAll),
				})
				if err != nil {
					// Stop paging instead of silently continuing with
					// nodes left over from the previous page.
					return err
				}
				WirteTXT(fmt.Sprintf("\r\n\r\n%s", nodes)) // save the <img> nodes
			}
			return nil
		}),
	})
	if err != nil {
		log.Fatal(err)
	}
}

// WirteTXT appends txt followed by a newline to the file "1.txt" in the
// current directory, creating the file if it does not exist. Failures are
// reported on standard output and otherwise ignored, so a failed write does
// not abort the crawl.
func WirteTXT(txt string) {
	// The file is only ever appended to, so it is opened write-only and
	// created without execute permission bits.
	f, err := os.OpenFile("1.txt", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
	if err != nil {
		fmt.Println("os Create error: ", err)
		return
	}
	defer f.Close()

	bw := bufio.NewWriter(f)
	if _, err := bw.WriteString(txt + "\n"); err != nil {
		fmt.Println("write error: ", err)
		return
	}
	// Flush pushes the buffered text to the file; without it nothing is written.
	if err := bw.Flush(); err != nil {
		fmt.Println("write error: ", err)
	}
}
参考:
https://godoc.org/github.com/chromedp/chromedp#Selector.Do
https://www.jianshu.com/p/d282b4a57596
https://juejin.im/entry/5aac8374518825556a722de3
https://blog.csdn.net/yang731227/article/details/89202458
https://www.cnblogs.com/midnight/p/10384627.html
https://www.cnblogs.com/midnight/p/10384699.html
https://crieit.net/posts/chromedp-Node-HTML
https://qiita.com/yoheimuta/items/bbbe84d2a7fe673720b3
https://segmentfault.com/a/1190000019705499?utm_source=tag-newest
https://stackoverflow.com/search?q=chromedp
https://cloud.tencent.com/developer/ask/173850
https://www.ribice.ba/golang-chrome-automation/
https://gitee.com/-/ide/project/kwff/chromedp/edit/master/-/errors.go
https://www.cnblogs.com/apocelipes/archive/2018/07/04/9264673.html
如果要在Windows上安装chromedp,还可以参考我之前写的两篇文章:
https://www.cnblogs.com/pu369/p/10315988.html
https://www.cnblogs.com/pu369/p/10345483.html