[Daily] The Go Programming Language (Go语言圣经): a concurrent web crawler

Two versions, both from the book's chapter on concurrency: crawler.go spawns one goroutine per discovered link and caps the crawl at 20 concurrent HTTP requests with a counting semaphore (a buffered channel of empty structs), while crawler2.go instead starts a fixed pool of 20 long-lived crawler goroutines and feeds them de-duplicated links over a channel.

crawler.go 

package main

import (
	"fmt"
	"log"
	"os"

	"gopl.io/ch5/links"
)

func main() {
	worklist := make(chan []string)

	// Start with the command-line arguments.
	go func() { worklist <- os.Args[1:] }()

	// Crawl the web concurrently, de-duplicating links as they arrive.
	// Note: as in the book's version, this loop never terminates,
	// because worklist is never closed.
	seen := make(map[string]bool)
	for list := range worklist {
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				go func(link string) {
					worklist <- crawl(link)
				}(link)
			}
		}
	}
}

// tokens is a counting semaphore used to
// enforce a limit of 20 concurrent requests.
var tokens = make(chan struct{}, 20)

// crawl prints a URL and extracts every link found on that page.
func crawl(url string) []string {
	fmt.Println(url)
	tokens <- struct{}{} // acquire a token
	list, err := links.Extract(url)
	<-tokens // release the token
	if err != nil {
		log.Print(err)
	}
	return list
}
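
Both programs depend on a links package and its Extract function. The book ships this as gopl.io/ch5/links; the sketch below is a minimal reimplementation along the same lines, assuming the golang.org/x/net/html parser is available. It fetches a page, parses it as HTML, and resolves every <a href> against the request URL.

links.go (sketch)

package links

import (
	"fmt"
	"net/http"

	"golang.org/x/net/html"
)

// Extract makes an HTTP GET request to the given URL, parses the
// response as HTML, and returns the links found in the document.
func Extract(url string) ([]string, error) {
	resp, err := http.Get(url)
	if err != nil {
		return nil, err
	}
	defer resp.Body.Close()
	if resp.StatusCode != http.StatusOK {
		return nil, fmt.Errorf("getting %s: %s", url, resp.Status)
	}

	doc, err := html.Parse(resp.Body)
	if err != nil {
		return nil, fmt.Errorf("parsing %s as HTML: %v", url, err)
	}

	var links []string
	var visit func(n *html.Node)
	visit = func(n *html.Node) {
		if n.Type == html.ElementNode && n.Data == "a" {
			for _, a := range n.Attr {
				if a.Key != "href" {
					continue
				}
				// Resolve relative links against the request URL.
				link, err := resp.Request.URL.Parse(a.Val)
				if err != nil {
					continue // ignore malformed URLs
				}
				links = append(links, link.String())
			}
		}
		for c := n.FirstChild; c != nil; c = c.NextSibling {
			visit(c)
		}
	}
	visit(doc)
	return links, nil
}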

crawler2.go 

package main

import (
	"fmt"
	"log"
	"os"

	"gopl.io/ch5/links"
)

func main() {
	worklist := make(chan []string)  // lists of URLs, may have duplicates
	unseenLinks := make(chan string) // de-duplicated URLs

	// Start with the command-line arguments.
	go func() { worklist <- os.Args[1:] }()

	// Create 20 crawler goroutines to fetch each unseen link.
	for i := 0; i < 20; i++ {
		go func() {
			for link := range unseenLinks {
				foundLinks := crawl(link)
				// Send the results back in a new goroutine: main is
				// the only reader of worklist, so a direct send here
				// could deadlock the whole pool.
				go func() { worklist <- foundLinks }()
			}
		}()
	}

	// The main goroutine de-duplicates worklist items
	// and sends the unseen ones to the crawlers.
	seen := make(map[string]bool)
	for list := range worklist {
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				unseenLinks <- link
			}
		}
	}
}

// crawl prints a URL and extracts every link found on that page.
func crawl(url string) []string {
	fmt.Println(url)
	list, err := links.Extract(url)
	if err != nil {
		log.Print(err)
	}
	return list
}

(Compared with the original listing, the unused "strings" import and the commented-out strings.HasPrefix domain filter are removed; an import that is never used is a compile error in Go.)
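
Either program takes one or more seed URLs as command-line arguments and prints each URL as it is fetched. Assuming the links package above is available on the module path, a run might look like:

$ go run crawler.go http://gopl.io/

One subtlety in crawler2.go: unseenLinks is unbuffered, so the main de-duplication loop blocks until one of the 20 workers is free to accept a link, which naturally throttles how fast new work is dispatched.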

  

  
