[日常] Go语言圣经--并发的web爬虫
两种实现方式:
crawler.go
// Concurrent web crawler (The Go Programming Language, §8.6).
// A buffered channel of empty structs acts as a counting semaphore,
// capping the number of simultaneous fetches at 20.
package main

import (
	"fmt"
	"os"

	"links"
	//"log"
)

func main() {
	worklist := make(chan []string)

	// n counts the number of sends to worklist that are still pending.
	// When it drops to zero there is no outstanding work, so the loop
	// can exit — this fixes the original version, which ranged over a
	// never-closed channel and therefore never terminated.
	var n int

	// Start with the command-line arguments.
	n++
	go func() { worklist <- os.Args[1:] }()

	// Crawl the web concurrently, de-duplicating as we go.
	seen := make(map[string]bool)
	for ; n > 0; n-- {
		list := <-worklist
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				n++
				// Shadowed parameter avoids capturing the loop
				// variable (required before Go 1.22).
				go func(link string) {
					worklist <- crawl(link)
				}(link)
			}
		}
	}
}

// tokens is a counting semaphore: at most 20 goroutines may be inside
// links.Extract at the same time, limiting open connections.
var tokens = make(chan struct{}, 20)

// crawl prints url, fetches the page it names, and returns all links
// found on it. Extraction errors are deliberately ignored (best effort).
func crawl(url string) []string {
	fmt.Println(url)
	tokens <- struct{}{} // acquire a token
	list, err := links.Extract(url)
	<-tokens // release the token
	if err != nil {
		//log.Print(err)
	}
	return list
}
crawler2.go
// Alternative crawler design (The Go Programming Language, §8.6):
// instead of one goroutine per link, 20 long-lived worker goroutines
// consume de-duplicated URLs from the unseenLinks channel.
//
// Fix over the original: the unused "strings" import (referenced only
// inside commented-out code) was removed — Go rejects unused imports
// with a compile error.
package main

import (
	"fmt"
	"os"

	"links"
	//"log"
)

func main() {
	worklist := make(chan []string)  // lists of URLs, may contain duplicates
	unseenLinks := make(chan string) // de-duplicated URLs

	// Start with the command-line arguments.
	go func() { worklist <- os.Args[1:] }()

	// Create 20 crawler goroutines to fetch each unseen link.
	for i := 0; i < 20; i++ {
		go func() {
			for link := range unseenLinks {
				//if strings.HasPrefix(link, "http://www.lypeng.com") {
				foundLinks := crawl(link)
				// Send on a fresh goroutine to avoid deadlock:
				// the main goroutine may itself be blocked
				// sending to unseenLinks at this moment.
				go func() { worklist <- foundLinks }()
				//}
			}
		}()
	}

	// The main goroutine de-duplicates worklist items and sends the
	// unseen ones to the crawlers.
	// NOTE: as in the book, this loop never terminates — worklist is
	// never closed and no pending-work count is kept.
	seen := make(map[string]bool)
	for list := range worklist {
		for _, link := range list {
			if !seen[link] {
				seen[link] = true
				unseenLinks <- link
			}
		}
	}
}

// crawl prints url, fetches the page it names, and returns all links
// found on it. Extraction errors are deliberately ignored (best effort).
func crawl(url string) []string {
	fmt.Println(url)
	list, err := links.Extract(url)
	if err != nil {
		//log.Print(err)
	}
	return list
}
十年开发经验程序员,离职全心创业中,历时三年开发出的产品《唯一客服系统》
一款基于Golang+Vue开发的在线客服系统,软件著作权编号:2021SR1462600。一套可私有化部署的网站在线客服系统,编译后的二进制文件可直接使用,无需搭建开发环境,下载zip解压即可,仅依赖MySQL数据库,是一个开箱即用的全渠道在线客服系统,致力于帮助广大开发者/公司快速部署整合私有化客服功能。
开源地址:唯一客服(开源学习版)
官网地址:唯一客服官网