Learning golang from scratch--2.2--how to crawl cnblogs data 👉 advanced: an object-oriented, single-task version

 

Entry file main.go-----------------------------------code 👇----------------------------------

package main

import (
    "../sole/cnblogs/parserabc"
    "../sole/engine"
)

func main() {
    // Call Run: issue the request, then parse the fetched body into data.
    // Note: the parserabc directory declares package parser, so the qualifier is parser.
    engine.Run(engine.Request{
        Url:        "https://www.cnblogs.com/",
        ParserFunc: parser.ParseCategory,
    })
}

 

engine file 1: engine.go----the Run block-----------------------------------code 👇----------------------------------

package engine

import (
    "log"

    "../fetcher"
)
// Run works the request queue: fetch each URL, parse its body, and enqueue any new requests.
func Run(seeds ...Request) {
    var requests []Request
    for _, r := range seeds {
        requests = append(requests, r)
    }
    for len(requests) > 0 {
        r := requests[0]
        requests = requests[1:]
        // Log the URL about to be fetched.
        log.Printf("Fetching %s", r.Url)
        // Fetch the body for this URL (at first, the cnblogs homepage passed in from main).
        body, err := fetcher.Fetch(r.Url)
        if err != nil {
            log.Printf("Fetch:error"+
                "fetching url %s:%v",
                r.Url, err)
            continue
        }
        // Parse the body into further Requests and Items.
        parseResult := r.ParserFunc(body)
        // Flatten the parsed requests into the queue.
        requests = append(requests, parseResult.Requests...)
        // Log each item's name.
        for _, item := range parseResult.Items {
            log.Printf("Got item %v", item)
        }
    }
}
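Note the design here: the requests slice acts as a FIFO queue. Run pops from the head and appends newly parsed requests to the tail, so the crawl proceeds breadth-first; popping from the tail instead would make it depth-first.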

engine file 2: types.go----data types block-----------------------------------code 👇----------------------------------

package engine
// Request pairs a Url with the ParserFunc used to parse its body.
type Request struct {
    Url        string
    ParserFunc func([]byte) ParseResult
}
// ParseResult holds the requests and items extracted from one page.
type ParseResult struct {
    Requests []Request
    Items    []interface{}
}

// NilParser is a no-op parser for leaf pages we don't descend into yet.
func NilParser([]byte) ParseResult {
    return ParseResult{}
}

fetcher file 1: fetcher.go------------fetch block-----------------------------------code 👇----------------------------------

package fetcher

import (
    "fmt"
    "io/ioutil"
    "net/http"
)
// Fetch GETs the url and returns the response body.
func Fetch(url string) ([]byte, error) {
    resp, err := http.Get(url)
    if err != nil {
        return nil, err
    }
    defer resp.Body.Close()

    if resp.StatusCode != http.StatusOK {
        return nil,
            fmt.Errorf("wrong status code :%d",
                resp.StatusCode)
    }
    return ioutil.ReadAll(resp.Body)
}

cnblogs file 1: category.go------------cnblogs parsing block-----------------------------------code 👇----------------------------------

package parser

import (
    "regexp"

    "../../engine"
)

const text = `<a href="(https://[0-9a-zA-Z]+\.cnblogs\.com/)"[^>]*>([^<]+)</a>`

// ParseCategory parses the cnblogs homepage body, splits the matches into a
// result of type engine.ParseResult, and returns it for the ParserFunc set in main.
func ParseCategory(
    contents []byte) engine.ParseResult {
    re := regexp.MustCompile(text)
    matches := re.FindAllSubmatch(contents, -1)
    result := engine.ParseResult{}
    for _, m := range matches {
        result.Items = append(
            result.Items, string(m[2]))
        result.Requests = append(
            result.Requests, engine.Request{
                Url:        string(m[1]),
                ParserFunc: engine.NilParser,
            })
    }
    return result
}
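To make the two capture groups concrete, here is a standalone sketch; the sample anchor tag below is made up for illustration, not taken from the real homepage:

package main

import (
    "fmt"
    "regexp"
)

func main() {
    re := regexp.MustCompile(`<a href="(https://[0-9a-zA-Z]+\.cnblogs\.com/)"[^>]*>([^<]+)</a>`)
    // A made-up anchor in the shape ParseCategory expects.
    sample := []byte(`<a href="https://home.cnblogs.com/" title="demo">园子</a>`)
    for _, m := range re.FindAllSubmatch(sample, -1) {
        // m[0] is the whole match, m[1] the URL group, m[2] the link text.
        fmt.Printf("url=%s name=%s\n", m[1], m[2])
    }
    // Prints: url=https://home.cnblogs.com/ name=园子
}

This is why ParseCategory stores m[2] (the link text) as an item and m[1] (the URL) as the next request.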

cnblogs file 2: category_test.go------------test for the category parser-----------------------------------code 👇----------------------------------

package parser

import (
    "io/ioutil"
    "testing"
)

func TestParseCategory(t *testing.T) {
    // Read the saved test data.
    contents, err := ioutil.ReadFile("category_test_data.html")
    if err != nil {
        panic(err)
    }

    // Parse the data.
    result := ParseCategory(contents)

    // The expected number of matches, known in advance.
    const resultSize = 10
    // The first three URLs, checked by hand.
    expectedUrls := []string{
        "https://home.cnblogs.com/",
        "https://q.cnblogs.com/",
        "https://ing.cnblogs.com/",
    }
    // The first three item names, checked by hand.
    expectedCities := []string{"园子", "博问", "闪存"}

    // Check the number of requests against the known count.
    if len(result.Requests) != resultSize {
        t.Errorf("result should have %d requests; but had %d",
            resultSize, len(result.Requests))
    }
    // Compare the leading URLs.
    for i, url := range expectedUrls {
        if result.Requests[i].Url != url {
            t.Errorf("expected url #%d: %s; but was %s",
                i, url, result.Requests[i].Url)
        }
    }
    // Check the number of items against the known count.
    if len(result.Items) != resultSize {
        t.Errorf("result should have %d items; but had %d",
            resultSize, len(result.Items))
    }
    // Compare the leading item names.
    for i, city := range expectedCities {
        if result.Items[i].(string) != city {
            t.Errorf("expected item #%d: %s; but was %s",
                i, city, result.Items[i].(string))
        }
    }
}

cnblogs file 3: category_test_data.html------------data needed by category_test.go-----------------------------------👇----------------------------------

Open the cnblogs homepage, press F12, then right-click, select all, copy, and save the markup as category_test_data.html.
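If you'd rather not copy the markup by hand, a minimal one-off sketch can save it for you. It reuses fetcher.Fetch; the import path is an assumption and depends on where you place this file relative to the layout above:

package main

import (
    "io/ioutil"
    "log"

    "../sole/fetcher" // assumed path; adjust to your own layout
)

// One-off helper: download the homepage once and freeze it as test data,
// so the test stays deterministic while the live page keeps changing.
func main() {
    body, err := fetcher.Fetch("https://www.cnblogs.com/")
    if err != nil {
        log.Fatal(err)
    }
    // 0644: owner read/write, everyone else read-only.
    if err := ioutil.WriteFile("category_test_data.html", body, 0644); err != nil {
        log.Fatal(err)
    }
}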

 

 

The comments were added directly on this blog page as an afterthought; if you copy everything verbatim and it won't compile, deleting the comments will definitely fix it.

Main page: the cnblogs homepage. Next-level pages = the pages I want. Next-next-level pages = the feature I'll implement next time.

For now this only extracts the pages I want from the homepage body; how to follow those pages and pull data from the level below them, I'll write up when I have time. A rough sketch of that next step follows below.
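As a rough sketch of that next step (ParseSubPage is a hypothetical name and its regex is a placeholder, since the real pattern depends on the sub-page markup, which isn't inspected here), a second-level parser slots straight into the existing types:

package parser

import (
    "regexp"

    "../../engine"
)

// Placeholder pattern: the real one depends on the sub-page markup.
var postRe = regexp.MustCompile(`<a[^>]*class="postTitle2"[^>]*>([^<]+)</a>`)

// ParseSubPage is a hypothetical second-level parser: it pulls post titles
// out of a sub-site page and, for now, enqueues nothing further.
func ParseSubPage(contents []byte) engine.ParseResult {
    result := engine.ParseResult{}
    for _, m := range postRe.FindAllSubmatch(contents, -1) {
        result.Items = append(result.Items, string(m[1]))
    }
    return result
}

Wiring it in is one line: in ParseCategory, hand out ParserFunc: ParseSubPage instead of engine.NilParser, and Run's queue does the rest.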

posted @ 2019-04-15 20:32 子明