我也来写个小爬虫 ^_^

今天下班抽了点时间看了下印象笔记，整理了一个礼拜 node 的 API 笔记……然后去慕课网看了 Scott 老师讲的 node 系列视频教程。于是自己写了一个小小的爬虫，爬的是自己写的博客章节，里面的一些 ES6 语法和 API 我就不一一细说，大家可以去看文档：http://nodeapi.ucdok.com/#/api/ 。好了，话不多说，直接上代码：

'use strict';
{
    const http = require(`http`);
    
    const cheerio = require(`cheerio`);

    const fs = require(`fs`);

    // The blog index page to scrape.
    let url = `http://www.cnblogs.com/tween`;

    // Fetch the page, parse it, and dump the result to blog.txt.
    // NOTE: `creatTxt` and `getContent` are `let`-declared further down;
    // this only works because the callbacks fire asynchronously, after
    // both assignments have run.
    http.get(url, (res) => {

        // Decode the response as UTF-8 on the stream itself. The original
        // `content += data` stringified raw Buffers chunk-by-chunk, which
        // can split a multi-byte character at a chunk boundary and corrupt
        // the Chinese text.
        res.setEncoding(`utf8`);

        let content = ``;

        res.on(`data`, (chunk) => {

            content += chunk;

        }).on(`end`, () => {

            creatTxt(getContent(content));

        });

    }).on(`error`,() => console.log(`获取数据失败`));

    // Flatten the structure produced by getContent into plain text and
    // write it to blog.txt in the working directory.
    // `content`: array of { time, blog } entries; every `blog` item is an
    // object whose property values are already-formatted text lines.
    // (Name kept as `creatTxt` [sic] — the http.get callback above calls it.)
    let creatTxt = content => {

        let txt = ``;

        for(const day of content){
            txt += day.time;
            // Append every field of the post record in insertion order
            // (title line, then the read/comment line), one blank line
            // between posts. The original shadowed `v` in nested loops.
            for(const post of day.blog){
                for(const name in post){
                    txt += post[name];
                }
                txt += `\n`;
            }
        }

        // Async write; the callback only reports success or the error.
        fs.writeFile(`blog.txt`,txt,'utf-8',(err) => {

            err?console.log(err):console.log(`写入成功`);

        });

    };
    // Parse the blog index HTML into an array of
    // { time, blog: [{ title, read }] } records, one per `.day` section
    // (each section groups the posts published on one date).
    let getContent = content => {

        let $ = cheerio.load(content);

        let blogs = $(`.day`);

        let arr = [];

        blogs.each( (index, item) => {

            let _this = $(item);

            let time = _this.find(`.dayTitle`).text();

            let indexBlog = [];

            _this.find(`.postTitle`).each((index, item) => {

                let title = $(item).text().trim();

                let list = _this.find(`.postDesc`).eq(index).text();

                // `String.prototype.match` returns null when no "(123)"
                // counters are present, so the original `match(...)[0]`
                // crashed with a TypeError on such posts; `[1]` could also
                // be undefined when only one counter matched. Fall back to
                // "(0)" for any missing counter instead of throwing.
                let counts = list.match(/\(\d+\)/g) || [];

                let read = (counts[0] || `(0)`).trim();

                let comment = (counts[1] || `(0)`).trim();

                indexBlog[index] = {
                    title:`\t${title}\n`,
                    read:`\t阅读:${read} 评论:${comment}\n`,
                };
            });
            arr[index] = {
                time:`${index+1} 、${time.trim()}\n`,
                blog:indexBlog
            };

        });
        return arr;
    };
}

运行后会在同目录下创建一个 blog.txt，里面的内容就是爬到的数据。

posted @ 2016-04-21 23:54  BigPanda  阅读(264)  评论(0编辑  收藏  举报