pu369com

之前写的关于chromedp的文章被别人转到CSDN,很受鼓励,再来一篇golang爬虫实例

示例说明:用 chromedp 操作 Chrome,先打开百度首页,在搜索框中输入“美女”并点击搜索,然后再向后翻 2 页;在此过程中保存 cookie 和每一页所有 img 标签的内容,并把第一页结果页上的百度 logo 截图保存为 png 文件。

注释已经比较详细了,上代码:

package main

import (
    "bufio"
    "context"
    "fmt"

    "io/ioutil"
    "log"
    "os"

    "github.com/chromedp/cdproto/page"

    "time"

    "github.com/chromedp/cdproto/network"

    "github.com/chromedp/cdproto/cdp"

    "github.com/chromedp/chromedp"
)

// Package-level variables that the chromedp actions in main write into.
var (
    // res is meant to hold scraped data; per the original note, its use is
    // currently commented out.
    res string
    // nodes holds the DOM nodes collected by the crawler.
    nodes []*cdp.Node
    // buf holds the bytes produced by the Screenshot action.
    buf []byte
)

// main drives a Chrome instance through a Baidu search: it opens the home
// page, types a keyword, submits the search, records the cookies, the img
// nodes and a screenshot of the result-page logo, and then follows the
// "next page" link twice, recording the img nodes of each page.
//
// Text output is appended to 1.txt through WirteTXT and the screenshot is
// written to contact-form.png, both relative to the working directory.
func main() {
    // Resolve the download directory before Chrome is started, so a failure
    // here cannot leave a browser process behind.
    wd, err := os.Getwd()
    if err != nil {
        log.Fatal(err)
    }

    // create context
    ctxt, cancel := context.WithCancel(context.Background())
    defer cancel()

    // create chrome instance
    c, err := chromedp.New(ctxt, chromedp.WithLog(log.Printf))
    if err != nil {
        log.Fatal(err)
    }

    // run task list
    runErr := c.Run(ctxt, chromedp.Tasks{
        page.SetDownloadBehavior(page.SetDownloadBehaviorBehaviorAllow).WithDownloadPath(wd),
        chromedp.Navigate(`https://www.baidu.com/`),   // open the Baidu home page
        chromedp.WaitVisible(`#kw`, chromedp.ByQuery), // the search box being visible means the right page has loaded
        chromedp.SendKeys(`#kw`, `美女`, chromedp.ByID), // type the keyword
        chromedp.Click("#su", chromedp.ByID),          // click the search button
        chromedp.Sleep(1 * time.Second),               // give the result page time to render
        //chromedp.OuterHTML("html", &res, chromedp.ByQuery), // fetch the page's HTML source
        chromedp.Nodes("img", &nodes, chromedp.ByQueryAll), // collect the img nodes of the current page
        chromedp.Screenshot("#result_logo", &buf, chromedp.ByID),
        // Fetch the cookies and persist everything gathered so far.
        chromedp.ActionFunc(func(ctx context.Context, h cdp.Executor) error {
            cookies, err := network.GetAllCookies().Do(ctx, h)
            if err != nil {
                // Fail before anything is written: cookies is not usable here.
                return err
            }
            // Join the cookies in the form used by the Cookie request header.
            var coo string
            for _, v := range cookies {
                coo = coo + v.Name + "=" + v.Value + ";"
            }
            WirteTXT(coo)                              // save the cookies
            WirteTXT(fmt.Sprintf("\r\n\r\n%s", nodes)) // save the img nodes
            // Surface a failed screenshot write instead of dropping it.
            return ioutil.WriteFile("contact-form.png", buf, 0644)
        }),
        chromedp.ActionFunc(func(ctx context.Context, h cdp.Executor) error {
            // Turn the page twice.
            for i := 1; i < 3; i++ {
                // The error stays local to the closure and aborts the loop,
                // so a failed page turn is reported rather than followed by
                // writing the previous page's nodes again.
                err := c.Run(ctx, chromedp.Tasks{
                    // The selector has a descendant part, so it is a CSS
                    // query rather than a bare element id.
                    chromedp.Click(`#page a:nth-last-child(1)`, chromedp.ByQuery), // next page
                    chromedp.Sleep(1 * time.Second),                               // give the page time to render
                    chromedp.Nodes("img", &nodes, chromedp.ByQueryAll),            // collect the img nodes
                })
                if err != nil {
                    return err
                }
                WirteTXT(fmt.Sprintf("\r\n\r\n%s", nodes)) // save the img nodes
            }
            return nil
        }),
    })

    // Shut Chrome down before reporting the task result: log.Fatal exits
    // without running deferred calls, so the browser would otherwise be left
    // to the cancelled context at best. Wait is skipped when Shutdown fails
    // because the browser may then never exit.
    if err := c.Shutdown(ctxt); err != nil {
        log.Printf("shutting down chrome: %v", err)
    } else if err := c.Wait(); err != nil {
        log.Printf("waiting for chrome to exit: %v", err)
    }

    if runErr != nil {
        log.Fatal(runErr)
    }
}

// WirteTXT appends txt followed by a newline to the file "1.txt" in the
// current working directory, creating the file if it does not exist yet.
//
// The function returns nothing, so every failure (open, write, flush, close)
// is reported on standard output and the call then returns.
func WirteTXT(txt string) {
    // The file is only ever appended to, so it is opened write-only, and a
    // plain text file has no reason to be created executable.
    f, err := os.OpenFile("1.txt", os.O_WRONLY|os.O_CREATE|os.O_APPEND, 0644)
    if err != nil {
        fmt.Println("os Create error: ", err)
        return
    }
    defer func() {
        // A failed Close on a written file can mean lost data.
        if err := f.Close(); err != nil {
            fmt.Println("os Close error: ", err)
        }
    }()

    bw := bufio.NewWriter(f)
    if _, err := bw.WriteString(txt + "\n"); err != nil {
        fmt.Println("os Write error: ", err)
        return
    }
    // Nothing reaches the file until the buffer is flushed.
    if err := bw.Flush(); err != nil {
        fmt.Println("os Write error: ", err)
    }
}

 

参考:

https://godoc.org/github.com/chromedp/chromedp#Selector.Do

https://www.jianshu.com/p/d282b4a57596

https://juejin.im/entry/5aac8374518825556a722de3

https://blog.csdn.net/yang731227/article/details/89202458

https://www.cnblogs.com/midnight/p/10384627.html

https://www.cnblogs.com/midnight/p/10384699.html

https://crieit.net/posts/chromedp-Node-HTML

https://qiita.com/yoheimuta/items/bbbe84d2a7fe673720b3

https://segmentfault.com/a/1190000019705499?utm_source=tag-newest

https://stackoverflow.com/search?q=chromedp

https://cloud.tencent.com/developer/ask/173850

https://www.ribice.ba/golang-chrome-automation/

https://gitee.com/-/ide/project/kwff/chromedp/edit/master/-/errors.go

https://www.cnblogs.com/apocelipes/archive/2018/07/04/9264673.html

如果在windows安装chromedp,还可参考我之前写的

https://www.cnblogs.com/pu369/p/10315988.html

https://www.cnblogs.com/pu369/p/10345483.html

 

posted on 2019-10-07 23:37  pu369com  阅读(1984)  评论(0)  编辑  收藏  举报

导航