// [GO] 百度贴吧的爬虫 (Baidu Tieba crawler)

package main

import (
    "fmt"
    "strconv"
    "net/http"
    "os"
    "io"
)

//百度贴吧的地址规律
//第一页:https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8(&pn=0)
//第二页:https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=50
//第三页:https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=100
//所以它的最后的数字每加50,代表着下一页

//整体提取的思路:
//1、先拿地址
//2、爬
//3、取
//4、存

func HttpGet(url string) (result string, err error) {
    resp, err1 := http.Get(url)
    if err != nil {
        err = err1
        return
    }
    defer resp.Body.Close()
    //读取网页的body内容
    buf := make([]byte, 4*1024)
    for true {
        n, err := resp.Body.Read(buf)
        if err != nil {
            if err == io.EOF{
                fmt.Println("文件读取完毕")
                break
            }else {
                fmt.Println("resp.Body.Read err = ", err)
                break
            }
        }
        result += string(buf[:n])
    }
    return
}

// DoWork crawls forum pages start through end (1-based, inclusive) and saves
// each page's HTML to a local file named "<pn>.html", where pn is the page's
// offset in the URL (page i maps to pn=(i-1)*50).
//
// BUG FIX: the original ignored the error from f.WriteString, so a failed or
// partial write went unnoticed. os.WriteFile handles create/write/close in
// one call and its error is now checked.
func DoWork(start, end int) {
    fmt.Printf("正在爬取第%d页到%d页\n", start, end)
    for i := start; i <= end; i++ {
        // Page i corresponds to query parameter pn=(i-1)*50.
        pn := strconv.Itoa((i - 1) * 50)
        url := "https://tieba.baidu.com/f?kw=%E7%BB%9D%E5%9C%B0%E6%B1%82%E7%94%9F&ie=utf-8&pn=" + pn

        // Fetch the whole page; on failure, skip to the next page.
        result, err := HttpGet(url)
        if err != nil {
            fmt.Println("http.Get err = ", err)
            continue
        }

        // Persist the page; a write failure is reported but does not stop
        // the crawl of the remaining pages.
        filename := pn + ".html"
        if err := os.WriteFile(filename, []byte(result), 0644); err != nil {
            fmt.Println("os.WriteFile err = ", err)
            continue
        }
    }
}

// main reads the start and end page numbers from stdin and kicks off the
// crawl.
//
// BUG FIX: the original ignored fmt.Scan errors (non-numeric input left
// start/end at zero and crawled a nonsense range) and never enforced the
// "start >= 1" constraint stated in its own prompt.
func main() {
    var start, end int

    fmt.Printf("请输入起始页>=1:> ")
    if _, err := fmt.Scan(&start); err != nil {
        fmt.Println("fmt.Scan err = ", err)
        return
    }

    fmt.Printf("请输入结束页:> ")
    if _, err := fmt.Scan(&end); err != nil {
        fmt.Println("fmt.Scan err = ", err)
        return
    }

    // Reject ranges the crawler cannot meaningfully process.
    if start < 1 || end < start {
        fmt.Println("页码范围无效: start 必须 >=1 且 end >= start")
        return
    }

    DoWork(start, end)
}

// posted @ 2018-09-26 10:19  蟒城贝勒爷  阅读(255)  评论(0) 收藏