Go: A Concurrent Web Crawler

The program below crawls pages of a Baidu Tieba forum concurrently: one goroutine fetches and saves each page, and an unbuffered channel reports when each page is done.

package main

import (
    "fmt"
    "net/http"
    "os"
    "strconv"
)

//base URL of the target Tieba forum; the page offset is appended to the pn parameter
var url = "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn="

//HttpGet issues a GET request and returns the page body as a string
func HttpGet(url string) (result string) {
    //issue the HTTP request
    response, err := http.Get(url)
    if err != nil {
        fmt.Println(err.Error())
        return
    }
    //close the body when the function returns
    defer response.Body.Close()
    buf := make([]byte, 1024*6)
    for {
        n, err := response.Body.Read(buf)
        if n > 0 {
            result += string(buf[:n])
        }
        //io.EOF or a read error ends the loop
        if err != nil {
            break
        }
    }
    return
}

//SaveFileToLocal fetches one page and writes it to ./page/<index>.html
func SaveFileToLocal(index int, c chan<- int) {
    //always report the page number so doWork never blocks on a failure
    defer func() { c <- index }()
    //page index maps to a post offset of (index-1)*50
    index_str := strconv.Itoa((index - 1) * 50)
    //fetch the page content
    result := HttpGet(url + index_str)
    if result == "" {
        return
    }
    //resolve the current working directory
    path, _ := os.Getwd()
    //make sure the output directory exists
    os.MkdirAll(path+"/page", 0755)
    //create the output file
    f, err := os.Create(path + "/page/" + strconv.Itoa(index) + ".html")
    if err != nil {
        fmt.Println(err)
        return
    }
    //write the content to the file
    f.WriteString(result)
    f.Close()
}

func doWork(start, end int) {
    //unbuffered channel used to count finished pages
    page := make(chan int)
    for i := start; i <= end; i++ {
        //one goroutine per page
        go SaveFileToLocal(i, page)
    }
    for i := start; i <= end; i++ {
        //block until a goroutine reports its page number
        fmt.Printf("page %d crawled\n", <-page)
    }
}
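
doWork starts one goroutine per page, so a large page range launches all of its requests at once. Below is a minimal sketch of a bounded variant, assuming we are willing to cap concurrency with a buffered channel used as a semaphore; the limit parameter and the fetch callback are illustrative stand-ins, not part of the original program, and the function would drop into the same file:

//sketch only: cap the number of pages fetched at the same time
func doWorkLimited(start, end, limit int, fetch func(page int)) {
    //buffered channel acting as a semaphore with `limit` slots
    sem := make(chan struct{}, limit)
    done := make(chan int)
    for i := start; i <= end; i++ {
        go func(page int) {
            sem <- struct{}{}        //acquire a slot
            defer func() { <-sem }() //release the slot when done
            fetch(page)
            done <- page
        }(i)
    }
    for i := start; i <= end; i++ {
        fmt.Printf("page %d crawled\n", <-done)
    }
}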

func main() {
    var start, end int
    fmt.Print("Enter the start page: ")
    fmt.Scan(&start)
    fmt.Print("Enter the end page: ")
    fmt.Scan(&end)
    if start <= 0 || end < start {
        fmt.Println("invalid page range")
        return
    }

    doWork(start, end)
}
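
Running the program prompts for a start and an end page, then writes one <index>.html file per page into a page/ directory under the working directory. The counting loop in doWork can also be replaced with sync.WaitGroup, which is the more common way to wait for a batch of goroutines. A minimal, self-contained sketch, assuming a placeholder fetch callback in place of SaveFileToLocal:

package main

import (
    "fmt"
    "sync"
)

//doWorkWG waits for all page goroutines with a WaitGroup instead of
//counting completions on a channel
func doWorkWG(start, end int, fetch func(page int)) {
    var wg sync.WaitGroup
    for i := start; i <= end; i++ {
        wg.Add(1)
        go func(page int) {
            defer wg.Done()
            fetch(page)
            fmt.Printf("page %d crawled\n", page)
        }(i)
    }
    //block until every goroutine has called Done
    wg.Wait()
}

func main() {
    doWorkWG(1, 3, func(page int) { /* fetch and save the page here */ })
}

The trade-off: the WaitGroup version reports completion from inside each goroutine, while the channel version lets the caller decide what to do with each result as it arrives.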

 

posted @ 2019-07-11 15:32  样子2018