学习Go语言爬虫类--爬虫项目总结
目录
1、基于net/http&&正则 爬虫
1.1、总代码:
package main
import (
"fmt"
"io/ioutil"
"net/http"
"os"
"regexp"
"strconv"
"strings"
"sync"
"time"

"xorm.io/xorm"
)
func fetch(url string) string {
//创建请求
client := &http.Client{}
req, _ := http.NewRequest("Get", url, nil)
//设置header和cookie
req.Header.Set("User-Agen", "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:105.0) Gecko/20100101 Firefox/105.0")
req.Header.Set("Cookie", "Hm_lvt_866c9be12d4a814454792b1fd0fed295=1663940215,1663972835,1664128274,1664293414; _ga=GA1.2.1806306827.1644455607; __gads=ID=20a4fda84b63bc54-2213d85cb0d60049:T=1644455608:S=ALNI_Mb8f7rEEP0_mZP_quJb9qXnJLOADg; __gpi=UID=00000496dc5f943b:T=1649322529:RT=1664293415:S=ALNI_MYbhii7b2DD8jtu5A7PCfBIiA-aTA; _ga_3Q0DVSGN10=GS1.1.1663653152.2.1.1663653160.52.0.0; .Cnblogs.AspNetCore.Cookies=CfDJ8NfDHj8mnYFAmPyhfXwJojeexzMHHoPzoYmVgsUJQ21R-E9eJqLSZQqt6m-OFdnLndy_W5qgQ8HYjjRO5HSsRfrbow8Ce7zF6MDxpOCoBajPr4TNQADVGkN2fZlWIRDorCklCCW-DzlpWHCta3sBCkq0GAxB20n8vKLTzZ9N_c6809XwrVxfvQfy4Nh5oXSx_33IjQtLAhi_hkEQ5kD12qTudzvIKpy7P_ZiGSgSmF5yc8aGjQ6ZdGm0xDwxHPj7g7yxGn6Xw-m7MOQfeIMzsJdoqA1c8IZfTZgLDNGtmi6oDupzMAtiv6JDN8_9Dkiyieg2d4JGFfao5ynOLPXGHuQ7mmEO69F4dSDauwLqok1RrWkv77oGvNDwQ-FPZU_Dfz133E5K3cKiqFFHHZM8lQlDIwtbbUC6DehiUC9B6xGMpc-ptroNjHRCwlz1PNbismVw3liVk4gc0MXnmqp_aNpev-dKeJ6moZLLMa4FVWl3ry7sbtPPB6jEPZi8n7iY96nsa9Z3ZPaSscW-VUHBH6AVvDjVsY8a2H7oBytGxhnUwJYmE-CLPRTchLtFCDb-ew; .CNBlogsCookie=C3CCD6AC43ED752609CBAECD01E9E10F3C3B526D81D4B575A88735351E8E12C2504FA4AD354EDEBC80D8E2B94E21866E623E0DC782DDDA2E00F7C6F92C935C8DB6409307CCC14A45387A0E2B468CE89D4515AA11; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1664294882; _gid=GA1.2.361416983.1664293415; affinity=1664295230.358.309.504809|a6728cc07008ec0fd0d6b7ff6028a867; __utma=59123430.1806306827.1644455607.1664295230.1664295230.1; __utmb=59123430.2.10.1664295230; __utmc=59123430; __utmz=59123430.1664295230.1.1.utmcsr=(direct)|utmccn=(direct)|utmcmd=(none); __utmt=1")
//发出请求
resp, err := client.Do(req)
if err != nil {
fmt.Printf("err: %v\n", err)
}
if resp.StatusCode != 200 {
fmt.Printf("resp.StatusCode: %v\n", resp.StatusCode)
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Printf("err: %v\n", err)
return ""
}
return string(body)
}
// parse extracts every sidebar link from the docs index page, fetches
// each linked page, and processes each body in its own goroutine.
// A WaitGroup makes parse block until every page has been processed;
// the original launched the goroutines and returned immediately, so
// main could exit before any of them ran.
func parse(html string) {
	// Flatten the document so the (.*?) patterns can match across lines.
	html = strings.Replace(html, "\n", "", -1)
	// Sidebar content block.
	reSidebar := regexp.MustCompile(`<aside id="sidebar" role="navigation">(.*?)</aside>`)
	sidebar := reSidebar.FindString(html)
	// Every href inside the sidebar.
	reLink := regexp.MustCompile(`href="(.*?)"`)
	links := reLink.FindAllString(sidebar, -1)
	baseURL := "https://gorm.io/zh_CN/docs/"
	var wg sync.WaitGroup
	for _, v := range links {
		// v looks like `href="xxx.html"`; strip the `href="` prefix and the
		// trailing quote to get the relative path.
		s := v[6 : len(v)-1]
		url := baseURL + s
		fmt.Printf("url: %v\n", url)
		body := fetch(url)
		wg.Add(1)
		go func(b string) {
			defer wg.Done()
			parse2(b)
		}(body)
	}
	wg.Wait()
}
// parse2 extracts the article HTML and its title from one docs page and
// persists them to disk and to the database.  It uses a regexp submatch
// instead of fixed-offset slicing: the original did
// title[42:len(title)-5], which panics with an out-of-range index
// whenever the title pattern does not match.
func parse2(body string) {
	// Flatten the document so (.*?) can match across lines.
	body = strings.Replace(body, "\n", "", -1)
	// NOTE(review): the non-greedy (.*?) stops at the FIRST </div>, so
	// nested divs truncate the captured article — confirm against the
	// actual page structure.
	reContent := regexp.MustCompile(`<div class="article">(.*?)</div>`)
	content := reContent.FindString(body)
	reTitle := regexp.MustCompile(`<h1 class="article-title" itemprop="name">(.*?)</h1>`)
	m := reTitle.FindStringSubmatch(content)
	if m == nil {
		// No title on this page; nothing sensible to save.
		fmt.Println("title not found, skipping page")
		return
	}
	// m[1] is the capture group: the bare title text without the h1 tags.
	title := m[1]
	fmt.Printf("title: %v\n", title)
	save(title, content)
	saveToDB(title, content)
}
//保存到本地文件
func save(title string, content string) {
err := os.WriteFile("./"+title+".html", []byte(content), 0644)
if err != nil {
panic(err)
}
}
// engine is the shared xorm engine, initialised once in init below and
// used by saveToDB.
var engine *xorm.Engine
// err is package-level so init can assign to engine with plain `=`
// without `:=` shadowing the engine variable.
var err error
// init connects to the local MySQL test database through xorm as soon
// as the package loads, then pings it to verify the connection.
// NOTE(review): xorm's "mysql" driver name requires the
// github.com/go-sql-driver/mysql driver to be registered via a blank
// import somewhere in the build — confirm, otherwise NewEngine/Ping
// fail at runtime.
func init() {
engine, err = xorm.NewEngine("mysql", "root:123456@/test_xorm?charset=utf8")
if err != nil {
fmt.Printf("err: %v\n", err)
} else {
err2 := engine.Ping()
if err2 != nil {
fmt.Printf("err2: %v\n", err2)
} else {
// "连接成功!" = "connected successfully" (runtime string, kept as-is).
print("连接成功!")
}
}
}
// GormPage is the xorm model for one crawled documentation page.
type GormPage struct {
Id int64 // primary key (xorm auto-increment convention for Id int64)
Title string // page title taken from the article's <h1>
Content string `xorm:"text"` // article HTML, stored in a TEXT column
Created time.Time `xorm:"created"` // set by xorm when the row is inserted
Updated time.Time `xorm:"updated"` // set by xorm when the row is updated
}
// saveToDB makes sure the gorm_page table matches the GormPage struct,
// then inserts one row with the given title and content.
func saveToDB(title string, content string) {
	// The original ignored Sync's error; a failed schema sync makes the
	// following Insert meaningless, so bail out instead.
	if err := engine.Sync(new(GormPage)); err != nil {
		fmt.Printf("sync err: %v\n", err)
		return
	}
	page := GormPage{
		Title:   title,
		Content: content,
	}
	affected, err := engine.Insert(&page)
	if err != nil {
		fmt.Printf("err: %v\n", err)
	}
	// string(affected) would interpret the int64 row count as a Unicode
	// code point (e.g. 1 -> "\x01"), not the text "1"; format it properly.
	fmt.Println("save:" + strconv.FormatInt(affected, 10))
}
// main fetches the GORM documentation index page and hands its body to
// parse, which walks every sidebar link.
func main() {
	const indexURL = "https://gorm.io/zh_CN/docs/"
	page := fetch(indexURL)
	parse(page)
}
1.2、创建请求
client := &http.Client{}
req, _ := http.NewRequest("GET", url, nil)
1.3、设置请求header和cookie
req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36")
req.Header.Add("Cookie", "__gads=ID=a03841e8dc5e3a01:T=1614315354:S=ALNI_MbPtvxPEZ3zQwJR9ZymNtNXqk5T2w; Hm_lvt_39b794a97f47c65b6b2e4e1741dcba38=1617680418,1617680467,1617691441,1617698379; _ga_R55C9JJH2H=GS1.1.1625526333.1.0.1625526335.0; __utmz=226521935.1627719195.1.1.utmcsr=zzk.cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; Hm_lvt_1408eee61ba676e12f93290ce7650e83=1629337233; sc_is_visitor_unique=rx11857110.1636164233.13F2C484526C4FA2010957F320FEEA18.1.1.1.1.1.1.1.1.1-9614694.1623747052.1.1.1.1.1.1.1.1.1; __utma=226521935.142504143.1614315356.1627719195.1636780252.2; UM_distinctid=17d4af395843ea-0b45d9f907fcac-978183a-144000-17d4af39585871; _ga_3Q0DVSGN10=GS1.1.1638963943.3.1.1638963943.60; _ga_4CQQXWHK3C=GS1.1.1641964040.3.0.1641964040.0; _ga=GA1.2.142504143.1614315356; Hm_lvt_d8d668bc92ee885787caab7ba4aa77ec=1640569355,1642119762; gr_user_id=3bf443cf-dc53-4d5f-ae5c-5bbb3ca819c3; Hm_lvt_7a41cb9dd1d636656563500edd2ddba8=1646182206; _gid=GA1.2.1450982996.1646736611; .Cnblogs.AspNetCore.Cookies=CfDJ8GsLOKiGtk1Au0UP1SouGdVYsphv8fTJFxTIvJxScUQCqJc5Ugl21LPkwOqhwGAvgS5GW7vDZEpxDA7VMMVyvZdtskQrPLqPj8aNRhFU7bN1vaTnWjRCgmVBKWnkfSOvS71t8xcJFwfWROB6_UEPt9uMWrWdRYlvvInER3kWX2s1rsrDUpUA9HoJ6BaIsnxBv10Xvhixq7gF4187lbmr1ODbLLo8VMRKOUWMrUC3GZHBBRRNP9qLoGvOYLLCwbGfoPEQvbCzXjJTfjM1cLCC0Ajnf4MT3Q-BpwoSmxFKarrunefNYaiVPwGYpJjsxfXFvEQN8rXVlr9MSCcicJepFRs5aQfZZ7z8o2PQomfcn2TZGG8pvdSrCqIESt0fpd9FN3cwwPdqs9aj6MiBEAk4GUeI0_TvTczhW11QHDxyRlFQUtaWaR6JJIcv9xCIC4cMjfOc592R9VjEpdRCqnK0d4NdHsFaDC3UE2SDDjkEmF5qmx7RHdJkPljghmzXC4TdtAX5WhiZMqcV2FJgiH3DjmtPZG0iuSgx9m4qNxYBY7rQpT6JK6MonuNJjOL5LUzbvA; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1646742153,1646796896,1646800170,1646879057; __utmc=59123430; __utmz=59123430.1646882996.44.25.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=59123430.142504143.1614315356.1646796901.1646882995.44; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1646883549; __utmb=59123430.3.10.1646882996")
1.4、发出请求
resp, err := client.Do(req)
if err != nil {
fmt.Println("Http get err:", err)
return ""
}
if resp.StatusCode != 200 {
fmt.Println("Http status code:", resp.StatusCode)
return ""
}
defer resp.Body.Close()
body, err := ioutil.ReadAll(resp.Body)
if err != nil {
fmt.Println("Read error", err)
return ""
}
1.5、解析链接页面
// parse (tutorial excerpt): pulls all sidebar links out of the index
// page and fetches each linked page.
func parse(html string) {
// Flatten the document so the (.*?) patterns can match across lines.
html = strings.Replace(html, "\n", "", -1)
// Regexp for the sidebar content block.
re_sidebar := regexp.MustCompile(`<aside id="sidebar" role="navigation">(.*?)</aside>`)
// Extract the sidebar block.
sidebar := re_sidebar.FindString(html)
// Regexp for href attributes.
re_link := regexp.MustCompile(`href="(.*?)"`)
// Collect every link in the sidebar.
links := re_link.FindAllString(sidebar, -1)
base_url := "https://gorm.io/zh_CN/docs/"
for _, v := range links {
// v looks like `href="xxx.html"`; strip the prefix and trailing quote.
s := v[6 : len(v)-1]
url := base_url + s
fmt.Printf("url: %v\n", url)
body := fetch(url)
// Process the page in a separate goroutine.
// NOTE(review): nothing waits for these goroutines; if main returns
// first they are silently abandoned — a WaitGroup would fix this.
go parse2(body)
}
}
1.6、解析内容页面
// parse2 (tutorial excerpt): extracts the article HTML and title from
// one docs page.
func parse2(body string) {
// Flatten the document so (.*?) can match across lines.
body = strings.Replace(body, "\n", "", -1)
// Regexp for the article content block.
re_content := regexp.MustCompile(`<div class="article">(.*?)</div>`)
// Extract the article block.
content := re_content.FindString(body)
// fmt.Printf("content: %v\n", content)
// Regexp for the article title.
re_title := regexp.MustCompile(`<h1 class="article-title" itemprop="name">(.*?)</h1>`)
// Extract the whole <h1>...</h1> match (tags included).
title := re_title.FindString(content)
fmt.Printf("title: %v\n", title)
// Slice off the 42-byte opening tag and the 5-byte closing tag.
// NOTE(review): this panics with an index-out-of-range when the title
// regexp does not match (title == ""); FindStringSubmatch with a guard
// would be safe.
title = title[42 : len(title)-5]
fmt.Printf("title: %v\n", title)
}
1.7、保存到本地
// save (tutorial excerpt): writes content to ./<title>.html in the
// current directory; panics if the write fails.
func save(title string, content string) {
err := os.WriteFile("./"+title+".html", []byte(content), 0644)
if err != nil {
panic(err)
}
}
1.8、保存到数据库
1.8.1、安装库
go get xorm.io/xorm
go get github.com/go-sql-driver/mysql
1.8.2、连接数据库
// engine is the shared xorm engine used by saveToDB; err is package-level
// so init can assign engine without := shadowing it.
var engine *xorm.Engine
var err error
// init connects to the MySQL test database at package load time and
// pings it to verify the connection.
// NOTE(review): requires the go-sql-driver/mysql blank import to
// register the "mysql" driver — confirm it is present in the build.
func init() {
engine, err = xorm.NewEngine("mysql", "root:123456@/test_xorm?charset=utf8")
if err != nil {
fmt.Printf("err: %v\n", err)
} else {
err2 := engine.Ping()
if err2 != nil {
fmt.Printf("err2: %v\n", err2)
} else {
// "连接成功!" = "connected successfully" (runtime string, kept as-is).
print("连接成功!")
}
}
}
1.8.3、创建结构体
// GormPage is the xorm model for one crawled documentation page.
type GormPage struct {
Id int64 // primary key (xorm auto-increment convention)
Title string // page title from the article <h1>
Content string `xorm:"text"` // article HTML, stored as TEXT
Created time.Time `xorm:"created"` // set by xorm on insert
Updated time.Time `xorm:"updated"` // set by xorm on update
}
1.8.4、保存数据到数据库
// saveToDB (tutorial excerpt): ensures the table exists, then inserts
// one row with the given title and content.
func saveToDB(title string, content string) {
// Sync creates/updates the table schema to match GormPage.
engine.Sync(new(GormPage))
page := GormPage{
Title: title,
Content: content,
}
affected, err := engine.Insert(&page)
if err != nil {
fmt.Printf("err: %v\n", err)
}
// NOTE(review): string(affected) converts the int64 row count to a rune
// (Unicode code point), not the text "1"; strconv.FormatInt(affected, 10)
// would print the intended number.
fmt.Println("save:" + string(affected))
}
2、goquery
2.1、简介
goquery是一个爬虫库,可以非常方便的进行html页面分析,元素提取,类似jQuery。它基于 HTML 解析库net/html和 CSS 库cascadia,提供与 jQuery 相近的接口。Go 著名的爬虫框架colly就是基于 goquery 的。
2.2、官方文档
https://github.com/PuerkitoBio/goquery
2.3、安装goquery库
go get -u github.com/PuerkitoBio/goquery
2.4、第一个goquery应用
2.4.1、代码
package main
import (
"fmt"
"log"
"github.com/PuerkitoBio/goquery"
)
// main lists every sidebar link on the GORM docs index: its position,
// its href target, and its visible text.
func main() {
	docsURL := "https://gorm.io/zh_CN/docs/"
	doc, err := goquery.NewDocument(docsURL)
	if err != nil {
		log.Fatalln(err)
	}
	// Walk every ".sidebar-link" element and print index, href and label.
	doc.Find(".sidebar-link").Each(func(i int, sel *goquery.Selection) {
		href, _ := sel.Attr("href")
		fmt.Println(i, href, sel.Text())
	})
}
2.4.2、运行结果
0 index.html 概述
1 models.html 声明模型
2 connecting_to_the_database.html 连接到数据库
3 create.html 创建
4 query.html 查询
5 advanced_query.html 高级查询
6 update.html 更新
7 delete.html 删除
8 sql_builder.html 原生 SQL 和 SQL 生成器
9 belongs_to.html Belongs To
10 has_one.html Has One
11 has_many.html Has Many
12 many_to_many.html Many To Many
13 associations.html 关联模式
14 preload.html 预加载
15 context.html Context
16 error_handling.html 错误处理
17 method_chaining.html 链式操作
18 session.html Session
19 hooks.html 钩子
20 transactions.html 事务
21 migration.html 迁移
22 logger.html Logger
23 generic_interface.html 通用数据库接口
24 performance.html 性能
25 data_types.html 自定义数据类型
26 scopes.html Scope
27 conventions.html 约定
28 settings.html 设置
29 dbresolver.html Database Resolver
30 sharding.html Sharding
31 prometheus.html Prometheus
32 hints.html 提示
33 indexes.html 索引
34 constraints.html 约束
35 composite_primary_key.html 复合主键
36 security.html 安全
37 gorm_config.html GORM 配置
38 write_plugins.html 编写插件
39 write_driver.html 编写驱动
40 changelog.html 更新日志
41 /community.html 社区
42 /contribute.html 贡献
43 /contribute.html#Translate-this-site 翻译当前页面
2.5、goquery api Document
2.5.1、Document定义
Document表示要爬取的文档,创建方法有如下几种
2.5.2、goquery.NewDocument(url)
url := "https://gorm.io/zh_CN/docs/"
dom, err := goquery.NewDocument(url)
if err != nil {
log.Fatalln(err)
}
2.5.3、goquery.NewDocumentFromResponse(resp)
从响应包中获取
client := &http.Client{}
url := "https://gorm.io/zh_CN/docs/"
req, _ := http.NewRequest("GET", url, nil)
resp, err := client.Do(req)
dom, err := goquery.NewDocumentFromResponse(resp)
if err != nil {
log.Fatalln(err)
}
2.5.4、goquery.NewDocumentFromReader(strings.NewReader(html))
html := `<body>
<div>DIV1</div>
<div>DIV2</div>
<span>SPAN</span>
</body>
`
dom, err := goquery.NewDocumentFromReader(strings.NewReader(html))
if err != nil {
log.Fatalln(err)
}
2.6、goquery api 选择器-Goquery
类似jQuery 或者css选择器,常用的有元素名称选择器、ID选择器、class选择器
2.6.1、元素名称选择器
package main
import (
"fmt"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
)
// main demonstrates an element-name selector: parse a small inline HTML
// document and print the text of every <div>.
func main() {
	html := `<body>
<div>DIV1</div>
<div>DIV2</div>
<span>SPAN</span>
</body>
`
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatalln(err)
	}
	doc.Find("div").Each(func(idx int, sel *goquery.Selection) {
		fmt.Println("i", idx, "select text", sel.Text())
	})
}
2.6.2、ID选择器
package main
import (
"fmt"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
)
// main demonstrates an ID selector: parse a small inline HTML document
// and print the text of the element whose id is "div1".
func main() {
	html := `<body>
<div id="div1">DIV1</div>
<div>DIV2</div>
<span>SPAN</span>
</body>
`
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatalln(err)
	}
	doc.Find("#div1").Each(func(idx int, sel *goquery.Selection) {
		fmt.Println(sel.Text())
	})
}
2.6.3、class类选择器
package main
import (
"fmt"
"log"
"strings"
"github.com/PuerkitoBio/goquery"
)
// main demonstrates a class selector: parse a small inline HTML document
// and print the text of every element carrying class "name".
func main() {
	html := `<body>
<div id="div1">DIV1</div>
<div class="name">DIV2</div>
<span>SPAN</span>
</body>
`
	doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
	if err != nil {
		log.Fatalln(err)
	}
	doc.Find(".name").Each(func(idx int, sel *goquery.Selection) {
		fmt.Println(sel.Text())
	})
}
2.7、goquery api Selection
2.7.1、内置函数
2.7.1.1、类似函数的位置操作
Eq(index int) *Selection //根据索引获取某个节点集
First() *Selection //获取第一个子节点集
Last() *Selection //获取最后一个子节点集
Next() *Selection //获取下一个兄弟节点集
NextAll() *Selection //获取后面所有兄弟节点集
Prev() *Selection //前一个兄弟节点集
Get(index int) *html.Node //根据索引获取一个节点
Index() int //返回选择对象中第一个元素的位置
Slice(start, end int) *Selection //根据起始位置获取子节点集
2.7.1.2、循环遍历选择的节点
Each(f func(int, *Selection)) *Selection //遍历
EachWithBreak(f func(int, *Selection) bool) *Selection //可中断遍历
Map(f func(int, *Selection) string) (result []string) //返回字符串数组
2.7.1.3、检测或获取节点属性值
Attr(), RemoveAttr(), SetAttr() //获取,移除,设置属性的值
AddClass(), HasClass(), RemoveClass(), ToggleClass()
Html() //获取该节点的html
Length() //返回该Selection的元素个数
Text() //获取该节点的文本值
2.7.1.4、在文档树之间来回跳转(常用的查找节点方法
Children() //返回selection中各个节点下的孩子节点
Contents() //获取当前节点下的所有节点
Find() //查找获取当前匹配的元素
Next() //下一个元素
Prev() //上一个元素
2.8、goquery爬虫应用
package main
import (
"fmt"
"github.com/PuerkitoBio/goquery"
)
// main crawls the GORM docs: for every sidebar link it loads the linked
// page and prints that page's title and article HTML.
func main() {
	indexURL := "https://gorm.io/zh_CN/docs/"
	doc, _ := goquery.NewDocument(indexURL)
	doc.Find(".sidebar-link").Each(func(i int, sel *goquery.Selection) {
		link, _ := sel.Attr("href")
		baseURL := "https://gorm.io/zh_CN/docs/"
		detailURL := baseURL + link
		fmt.Printf("detail_url: %v\n", detailURL)
		// Reuse the same document variable for the detail page, exactly as
		// the original did.
		doc, _ = goquery.NewDocument(detailURL)
		pageTitle := doc.Find(".article-title").Text()
		pageHTML, _ := doc.Find(".article").Html()
		fmt.Printf("title: %v\n", pageTitle)
		fmt.Printf("content: %v\n", pageHTML)
	})
}
3、爬取豆瓣top250
package main
import (
"fmt"
"net/http"
"regexp"
"strconv"
"github.com/PuerkitoBio/goquery"
)
// petch downloads one page of the Douban top-250 list (base url plus the
// page offset) and prints the fields extracted from each movie entry.
func petch(url, page string) {
	url = url + page
	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Printf("err: %v\n", err)
		return
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36")
	req.Header.Add("Cookie", "__gads=ID=a03841e8dc5e3a01:T=1614315354:S=ALNI_MbPtvxPEZ3zQwJR9ZymNtNXqk5T2w; Hm_lvt_39b794a97f47c65b6b2e4e1741dcba38=1617680418,1617680467,1617691441,1617698379; _ga_R55C9JJH2H=GS1.1.1625526333.1.0.1625526335.0; __utmz=226521935.1627719195.1.1.utmcsr=zzk.cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; Hm_lvt_1408eee61ba676e12f93290ce7650e83=1629337233; sc_is_visitor_unique=rx11857110.1636164233.13F2C484526C4FA2010957F320FEEA18.1.1.1.1.1.1.1.1.1-9614694.1623747052.1.1.1.1.1.1.1.1.1; __utma=226521935.142504143.1614315356.1627719195.1636780252.2; UM_distinctid=17d4af395843ea-0b45d9f907fcac-978183a-144000-17d4af39585871; _ga_3Q0DVSGN10=GS1.1.1638963943.3.1.1638963943.60; _ga_4CQQXWHK3C=GS1.1.1641964040.3.0.1641964040.0; _ga=GA1.2.142504143.1614315356; Hm_lvt_d8d668bc92ee885787caab7ba4aa77ec=1640569355,1642119762; gr_user_id=3bf443cf-dc53-4d5f-ae5c-5bbb3ca819c3; Hm_lvt_7a41cb9dd1d636656563500edd2ddba8=1646182206; _gid=GA1.2.1450982996.1646736611; .Cnblogs.AspNetCore.Cookies=CfDJ8GsLOKiGtk1Au0UP1SouGdVYsphv8fTJFxTIvJxScUQCqJc5Ugl21LPkwOqhwGAvgS5GW7vDZEpxDA7VMMVyvZdtskQrPLqPj8aNRhFU7bN1vaTnWjRCgmVBKWnkfSOvS71t8xcJFwfWROB6_UEPt9uMWrWdRYlvvInER3kWX2s1rsrDUpUA9HoJ6BaIsnxBv10Xvhixq7gF4187lbmr1ODbLLo8VMRKOUWMrUC3GZHBBRRNP9qLoGvOYLLCwbGfoPEQvbCzXjJTfjM1cLCC0Ajnf4MT3Q-BpwoSmxFKarrunefNYaiVPwGYpJjsxfXFvEQN8rXVlr9MSCcicJepFRs5aQfZZ7z8o2PQomfcn2TZGG8pvdSrCqIESt0fpd9FN3cwwPdqs9aj6MiBEAk4GUeI0_TvTczhW11QHDxyRlFQUtaWaR6JJIcv9xCIC4cMjfOc592R9VjEpdRCqnK0d4NdHsFaDC3UE2SDDjkEmF5qmx7RHdJkPljghmzXC4TdtAX5WhiZMqcV2FJgiH3DjmtPZG0iuSgx9m4qNxYBY7rQpT6JK6MonuNJjOL5LUzbvA; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1646742153,1646796896,1646800170,1646879057; __utmc=59123430; __utmz=59123430.1646882996.44.25.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=59123430.142504143.1614315356.1646796901.1646882995.44; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1646883549; __utmb=59123430.3.10.1646882996")
	resp, err := client.Do(req)
	if err != nil {
		fmt.Printf("err: %v\n", err)
		// Must return: resp is nil on error, and the original fell through
		// to resp.Body.Close(), which panics.
		return
	}
	defer resp.Body.Close()
	d, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		fmt.Printf("err: %v\n", err)
		return
	}
	// One <li> per movie inside the .grid_view list.
	d.Find(".grid_view > li").Each(func(i int, aa *goquery.Selection) {
		title := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > a:nth-child(1) > span:nth-child(1)").Text()
		img := aa.Find("div:nth-child(1) > div:nth-child(1) > a:nth-child(2) > img:nth-child(1)")
		imgTmp, ok := img.Attr("src")
		info := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > p:nth-child(1)").Text()
		dir, actor, year := InfoSpite(info)
		score := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > span:nth-child(2)").Text()
		quote := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > p:nth-child(3) > span:nth-child(1)").Text()
		// Only print entries that have a poster image, as the original did.
		if ok {
			fmt.Printf("title: %v\n", title)
			fmt.Printf("imgTmp: %v\n", imgTmp)
			fmt.Printf("dir: %v\n", dir)
			fmt.Printf("actor: %v\n", actor)
			fmt.Printf("year: %v\n", year)
			fmt.Printf("score: %v\n", score)
			fmt.Printf("quote: %v\n", quote)
			fmt.Println("++++++++++++++++++++++++++++++++++++++++++++")
		}
	})
}
// InfoSpite splits the raw info text of one movie entry into its
// director, actor and year fragments, using the same patterns as before.
func InfoSpite(info string) (dir, actor, year string) {
	raw := []byte(info)
	dirRe := regexp.MustCompile(` 导演:(.*) `)
	dir = string(dirRe.Find(raw))
	actorRe := regexp.MustCompile(`主演: (.*) `)
	actor = string(actorRe.Find(raw))
	yearRe := regexp.MustCompile(`(\d+)`)
	year = string(yearRe.Find(raw))
	return
}
// main walks the 10 pages of the Douban top-250 list (25 entries per
// page) and crawls each one sequentially.
func main() {
	baseURL := "https://movie.douban.com/top250?start="
	for i := 0; i < 10; i++ {
		fmt.Printf("正在爬取第%d页…………\n", i+1)
		offset := strconv.Itoa(i * 25)
		petch(baseURL, offset)
	}
}
4、多线程爬虫
package main
import (
"fmt"
"net/http"
"regexp"
"strconv"
"sync"
"time"
"github.com/PuerkitoBio/goquery"
)
// MovieData holds the fields scraped for a single movie entry.
type MovieData struct {
Title string `json:"title"` // movie title
Director string `json:"director"` // director fragment from the info text
Picture string `json:"picture"` // poster image URL
Actor string `json:"actor"` // actor fragment from the info text
Year string `json:"year"` // release year digits from the info text
Score string `json:"score"` // rating score
Quote string `json:"quote"` // one-line quote shown under the entry
}
// petch downloads one Douban top-250 page, scrapes every movie entry
// into a MovieData, and — when ch is non-nil — signals completion on ch.
func petch(url string, ch chan bool, i int) {
	// Signal completion on every exit path.  The original only sent on ch
	// at the end of the happy path, so an early error would leave
	// ChannelStart blocked forever waiting on the channel.
	defer func() {
		if ch != nil {
			ch <- true
		}
	}()
	client := &http.Client{}
	req, err := http.NewRequest("GET", url, nil)
	if err != nil {
		fmt.Printf("err: %v\n", err)
		return
	}
	req.Header.Set("User-Agent", "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/99.0.4844.51 Safari/537.36")
	req.Header.Add("Cookie", "__gads=ID=a03841e8dc5e3a01:T=1614315354:S=ALNI_MbPtvxPEZ3zQwJR9ZymNtNXqk5T2w; Hm_lvt_39b794a97f47c65b6b2e4e1741dcba38=1617680418,1617680467,1617691441,1617698379; _ga_R55C9JJH2H=GS1.1.1625526333.1.0.1625526335.0; __utmz=226521935.1627719195.1.1.utmcsr=zzk.cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; Hm_lvt_1408eee61ba676e12f93290ce7650e83=1629337233; sc_is_visitor_unique=rx11857110.1636164233.13F2C484526C4FA2010957F320FEEA18.1.1.1.1.1.1.1.1.1-9614694.1623747052.1.1.1.1.1.1.1.1.1; __utma=226521935.142504143.1614315356.1627719195.1636780252.2; UM_distinctid=17d4af395843ea-0b45d9f907fcac-978183a-144000-17d4af39585871; _ga_3Q0DVSGN10=GS1.1.1638963943.3.1.1638963943.60; _ga_4CQQXWHK3C=GS1.1.1641964040.3.0.1641964040.0; _ga=GA1.2.142504143.1614315356; Hm_lvt_d8d668bc92ee885787caab7ba4aa77ec=1640569355,1642119762; gr_user_id=3bf443cf-dc53-4d5f-ae5c-5bbb3ca819c3; Hm_lvt_7a41cb9dd1d636656563500edd2ddba8=1646182206; _gid=GA1.2.1450982996.1646736611; .Cnblogs.AspNetCore.Cookies=CfDJ8GsLOKiGtk1Au0UP1SouGdVYsphv8fTJFxTIvJxScUQCqJc5Ugl21LPkwOqhwGAvgS5GW7vDZEpxDA7VMMVyvZdtskQrPLqPj8aNRhFU7bN1vaTnWjRCgmVBKWnkfSOvS71t8xcJFwfWROB6_UEPt9uMWrWdRYlvvInER3kWX2s1rsrDUpUA9HoJ6BaIsnxBv10Xvhixq7gF4187lbmr1ODbLLo8VMRKOUWMrUC3GZHBBRRNP9qLoGvOYLLCwbGfoPEQvbCzXjJTfjM1cLCC0Ajnf4MT3Q-BpwoSmxFKarrunefNYaiVPwGYpJjsxfXFvEQN8rXVlr9MSCcicJepFRs5aQfZZ7z8o2PQomfcn2TZGG8pvdSrCqIESt0fpd9FN3cwwPdqs9aj6MiBEAk4GUeI0_TvTczhW11QHDxyRlFQUtaWaR6JJIcv9xCIC4cMjfOc592R9VjEpdRCqnK0d4NdHsFaDC3UE2SDDjkEmF5qmx7RHdJkPljghmzXC4TdtAX5WhiZMqcV2FJgiH3DjmtPZG0iuSgx9m4qNxYBY7rQpT6JK6MonuNJjOL5LUzbvA; Hm_lvt_866c9be12d4a814454792b1fd0fed295=1646742153,1646796896,1646800170,1646879057; __utmc=59123430; __utmz=59123430.1646882996.44.25.utmcsr=cnblogs.com|utmccn=(referral)|utmcmd=referral|utmcct=/; __utma=59123430.142504143.1614315356.1646796901.1646882995.44; Hm_lpvt_866c9be12d4a814454792b1fd0fed295=1646883549; __utmb=59123430.3.10.1646882996")
	req.Header.Set("Connection", "keep-alive")
	req.Header.Set("Cache-Control", "max-age=0")
	req.Header.Set("sec-ch-ua-mobile", "?0")
	req.Header.Set("Upgrade-Insecure-Requests", "1")
	req.Header.Set("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/avif,image/webp,*/*;q=0.8")
	req.Header.Set("Sec-Fetch-Site", "none")
	req.Header.Set("Sec-Fetch-Mode", "navigate")
	req.Header.Set("Sec-Fetch-User", "?1")
	req.Header.Set("Sec-Fetch-Dest", "document")
	req.Header.Set("Referer", "https://movie.douban.com/chart")
	req.Header.Set("Accept-Language", "zh-CN,zh;q=0.8,zh-TW;q=0.7,zh-HK;q=0.5,en-US;q=0.3,en;q=0.2")
	resp, err := client.Do(req)
	if err != nil {
		fmt.Printf("err: %v\n", err)
		// Must return: resp is nil on error, and the original fell through
		// to resp.Body.Close(), which panics.
		return
	}
	defer resp.Body.Close()
	d, err := goquery.NewDocumentFromReader(resp.Body)
	if err != nil {
		fmt.Printf("err: %v\n", err)
		return
	}
	// One <li> per movie inside the .grid_view list.
	d.Find(".grid_view > li").Each(func(i int, aa *goquery.Selection) {
		var movieData MovieData
		title := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(1) > a:nth-child(1) > span:nth-child(1)").Text()
		img := aa.Find("div:nth-child(1) > div:nth-child(1) > a:nth-child(2) > img:nth-child(1)")
		imgTmp, ok := img.Attr("src")
		info := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > p:nth-child(1)").Text()
		dir, actor, year := InfoSpite(info)
		score := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > div:nth-child(2) > span:nth-child(2)").Text()
		quote := aa.Find("div:nth-child(1) > div:nth-child(2) > div:nth-child(2) > p:nth-child(3) > span:nth-child(1)").Text()
		// Only keep entries that have a poster image, as the original did.
		if ok {
			movieData.Title = title
			movieData.Director = dir
			movieData.Picture = imgTmp
			movieData.Actor = actor
			movieData.Year = year
			movieData.Score = score
			movieData.Quote = quote
			//fmt.Println(movieData)
		}
	})
}
// InfoSpite splits the raw info text of one movie entry into its
// director, actor and year fragments, using the same patterns as before.
func InfoSpite(info string) (dir, actor, year string) {
	raw := []byte(info)
	dir = string(regexp.MustCompile(` 导演:(.*) `).Find(raw))
	actor = string(regexp.MustCompile(`主演: (.*) `).Find(raw))
	year = string(regexp.MustCompile(`(\d+)`).Find(raw))
	return
}
// NormalStart crawls the 10 list pages one after another and reports the
// elapsed wall-clock time.
func NormalStart(url string) {
	start := time.Now()
	for i := 0; i < 10; i++ {
		// Build each page URL from the unchanged base.  The original did
		// `url = url + page`, so offsets accumulated across iterations
		// ("...start=0", then "...start=025", "...start=02550", ...).
		pageURL := url + strconv.Itoa(i*25)
		petch(pageURL, nil, i)
	}
	elapsed := time.Since(start)
	fmt.Printf("NormalStart Time %s \n", elapsed)
}
// ChannelStart crawls the 10 list pages concurrently, one goroutine per
// page, and uses an unbuffered channel to wait until all have finished.
func ChannelStart(url string) {
	ch := make(chan bool)
	start := time.Now()
	for i := 0; i < 10; i++ {
		// Build each page URL from the unchanged base.  The original did
		// `url = url + page`, so offsets accumulated across iterations.
		pageURL := url + strconv.Itoa(i*25)
		go petch(pageURL, ch, i)
	}
	// Wait for all 10 workers to signal completion.
	for i := 0; i < 10; i++ {
		<-ch
	}
	elapsed := time.Since(start)
	fmt.Printf("ChannelStart Time %s \n", elapsed)
}
// wg coordinates the WaitGroupStart worker goroutines.
var wg sync.WaitGroup

// WaitGroupStart crawls the 10 list pages concurrently, one goroutine
// per page, using a WaitGroup to wait for all of them.
func WaitGroupStart(url string) {
	start := time.Now()
	wg.Add(10)
	for i := 0; i < 10; i++ {
		go func(i int) {
			defer wg.Done()
			// Build a goroutine-local URL.  The original mutated the shared
			// `url` variable from all 10 goroutines — a data race — and also
			// accumulated offsets across iterations.
			pageURL := url + strconv.Itoa(i*25)
			petch(pageURL, nil, i)
		}(i)
	}
	wg.Wait()
	elapsed := time.Since(start)
	fmt.Printf("WaitGroupStart Time %s \n", elapsed)
}
// main runs the three crawling strategies in sequence so their timings
// can be compared.
func main() {
	url := "https://movie.douban.com/top250?start="
	NormalStart(url)    // single-threaded crawl
	ChannelStart(url)   // one goroutine per page, channel synchronization
	WaitGroupStart(url) // one goroutine per page, WaitGroup synchronization
}
运行结果:
D:\GO\src\github.com\pachong>go run douban.go
NormalStart Time 2.16376s
ChannelStart Time 1.1002293s
WaitGroupStart Time 401.7712ms
5、colly
5.1、简介
Colly 是一个采用 Go 语言编写的 Web 爬虫框架,旨在提供一个能够写任何爬虫/采集器/蜘蛛的简洁模板。通过 Colly ,你可以轻松从网站中提取结构化的数据,然后进行数据挖掘、处理或归档。
5.2、项目特性
- 清晰明了的 API
- 速度快(单个内核上的请求数大于1k)
- 管理每个域的请求延迟和最大并发数
- 自动 cookie 和会话处理
- 同步/异步/并行抓取
- 高速缓存
- 自动处理非 Unicode 编码
- 支持 Robots.txt
- 支持 Google App Engine
- 通过环境变量进行配置
- 可扩展
5.3、安装colly
go get -u github.com/gocolly/colly
5.4、第一个colly应用
package main
import (
"fmt"
"github.com/gocolly/colly"
)
// main is the minimal colly example: visit the docs index and follow
// every ".sidebar-link", printing each URL before the request is sent.
func main() {
	collector := colly.NewCollector()
	collector.OnHTML(".sidebar-link", func(e *colly.HTMLElement) {
		e.Request.Visit(e.Attr("href"))
	})
	collector.OnRequest(func(r *colly.Request) {
		fmt.Println("url", r.URL)
	})
	collector.Visit("https://gorm.io/zh_CN/docs/")
}
5.5、回调方法的使用
colly提供了如下回调方法:
5.5.1、OnRequest
在请求之前调用
5.5.2、OnError
在请求期间发生错误时调用
5.5.3、OnResponseHeaders
在收到响应标头后调用
5.5.4、OnResponse
收到响应后调用
5.5.5、OnHTML
在 OnResponse 之后,如果接收到的内容是 HTML,则立即调用
5.5.6、OnXML
在 OnHTML 之后,如果接收到的内容是 HTML 或 XML,则立即调用
5.5.7、OnScraped
在 OnXML 回调之后调用,即本次页面抓取结束时调用
5.5.8、实例1:
package main
import (
"fmt"
"github.com/gocolly/colly"
)
// main registers one handler for each colly callback so the order in
// which they fire can be observed on a single visit.
func main() {
c := colly.NewCollector()
// Fires before the request is sent.
c.OnRequest(func(r *colly.Request) {
fmt.Println("请求前调用:OnRequest")
})
// Fires if an error occurs during the request.
c.OnError(func(_ *colly.Response, err error) {
fmt.Println("发生错误调用:OnError")
})
// Fires once the response has been received.
c.OnResponse(func(r *colly.Response) {
fmt.Println("获得响应后调用:OnResponse")
})
// Fires after OnResponse when the content is HTML, once per match of
// the "a[href]" selector.
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
fmt.Println("OnResponse收到html内容后调用:OnHTML")
})
// Fires after OnHTML when the content is HTML or XML, using an XPath
// expression instead of a CSS selector.
c.OnXML("//h1", func(e *colly.XMLElement) {
fmt.Println("OnResponse收到xml内容后调用:OnXML")
})
// Fires last, after scraping of the page has finished.
c.OnScraped(func(r *colly.Response) {
fmt.Println("结束", r.Request.URL)
})
c.Visit("https://gorm.io/zh_CN/docs/")
}
5.6、页面爬取和解析
页面爬取和解析重点方法是OnHTML
回调方法
c.OnHTML("a[href]", func(e *colly.HTMLElement) {
e.Request.Visit(e.Attr("href"))
})
第一个参数是:goquery选择器,可以元素名称、ID或者class选择器,第二个参数是根据第一个选择器获得HTML元素结构如下:
// HTMLElement is colly's representation of one matched HTML element,
// reproduced here from the library source for reference.
type HTMLElement struct {
// Name is the name of the tag
Name string
Text string
attributes []html.Attribute
// Request is the request object of the element's HTML document
Request *Request
// Response is the Response object of the element's HTML document
Response *Response
// DOM is the goquery parsed DOM object of the page. DOM is relative
// to the current HTMLElement
DOM *goquery.Selection
// Index stores the position of the current element within all the elements matched by an OnHTML callback
Index int
}
这里可以获得元素名称Name,元素内容Text,属性attributes,请求Request,响应Response和DOM;DOM是一个goquery Selection,可以继续解析...
5.7、colly框架重构爬虫应用
5.7.1、基本案例:
package main
import (
"fmt"
"github.com/gocolly/colly"
)
// main crawls the GORM docs with colly: sidebar links (other than the
// index itself) are followed, and each visited page's title and article
// HTML are printed.
func main() {
	collector := colly.NewCollector()
	// Follow every sidebar link except the index page we started from.
	collector.OnHTML(".sidebar-link", func(e *colly.HTMLElement) {
		if href := e.Attr("href"); href != "index.html" {
			collector.Visit(e.Request.AbsoluteURL(href))
		}
	})
	// Print the page title.
	collector.OnHTML(".article-title", func(h *colly.HTMLElement) {
		fmt.Printf("title: %v\n", h.Text)
	})
	// Print the article body as HTML.
	collector.OnHTML(".article", func(h *colly.HTMLElement) {
		articleHTML, _ := h.DOM.Html()
		fmt.Printf("content: %v\n", articleHTML)
	})
	collector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})
	collector.Visit("https://gorm.io/zh_CN/docs/")
}
5.7.2、单线程
package main
import (
"fmt"
"time"
"github.com/gocolly/colly"
)
// pase fetches a single docs page with its own collector and prints the
// page's title and article HTML.
func pase(url string) {
	collector := colly.NewCollector()
	// Print the page title.
	collector.OnHTML(".article-title", func(h *colly.HTMLElement) {
		fmt.Printf("title: %v\n", h.Text)
	})
	// Print the article body as HTML.
	collector.OnHTML(".article", func(h *colly.HTMLElement) {
		articleHTML, _ := h.DOM.Html()
		fmt.Printf("content: %v\n", articleHTML)
	})
	collector.OnRequest(func(r *colly.Request) {
		fmt.Println("Visiting", r.URL.String())
	})
	collector.Visit(url)
}
// getLink visits the docs index, follows every sidebar link except the
// index itself, and calls pase on each visited URL (including the
// starting one) via the OnRequest hook.
// NOTE(review): the named return `link` is never assigned, so getLink
// always returns ""; either populate it or drop the return value.
func getLink(url string) (link string) {
c := colly.NewCollector()
c.OnHTML(".sidebar-link", func(e *colly.HTMLElement) {
href := e.Attr("href")
if href != "index.html" {
c.Visit(e.Request.AbsoluteURL(href))
}
})
// OnRequest fires for every visit (the index and each followed link);
// each URL is handed to pase for synchronous scraping.
c.OnRequest(func(r *colly.Request) {
// //fmt.Println("Visiting", r.URL.String())
// var links Links
s := r.URL.String()
pase(s)
// links.link = s
})
c.Visit(url)
return
}
// Links was intended to collect visited URLs; it is currently unused
// (see the commented-out code in getLink).
type Links struct {
link string
}
// main times a full single-threaded crawl of the GORM docs, starting
// from the index page.
func main() {
	begin := time.Now()
	getLink("https://gorm.io/zh_CN/docs/")
	fmt.Printf("爬取 Time %s \n", time.Since(begin))
}
运行结果
爬取 Time 30.5143806s
5.7.3、多线程
待补充
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 开源Multi-agent AI智能体框架aevatar.ai,欢迎大家贡献代码
· Manus重磅发布:全球首款通用AI代理技术深度解析与实战指南