Loading

博客园首页好文监控(回复邮箱地址即可订阅通知)

现在博客园首页文章质量参差不齐,比如我这篇就要水了。于是弄了个小爬虫定时去爬首页的文章,超过1000点击的就自动发送邮件。
https://github.com/kklldog/cnblogs_notice

接口

博客园的首页列表其实是有ajax接口的,阅读量就在这里面,使用cheerio就可以抽取出来。
https://www.cnblogs.com/mvc/AggSite/PostList.aspx

使用request发送请求

var request = require('request');

/**
 * Send a request for `url` through the shared `req` helper.
 *
 * No HTTP method is set here, so whatever the underlying `request` call
 * defaults to is used.
 *
 * @param {string} url - Address to request.
 * @param {Function} callback - Forwarded to `req` as the success callback.
 * @param {Function} [errCallback] - Forwarded to `req` as the failure callback.
 * @param {number} [trytimes] - Forwarded to `req` as the retry budget.
 */
var get = function (url, callback, errCallback, trytimes) {
    var option = {
        url: url,
        timeout: 30000
    };
    req(option, callback, errCallback, trytimes);
}

/**
 * Send a POST request for `url` through the shared `req` helper.
 *
 * @param {string} url - Address to post to.
 * @param {*} body - Value passed as the request's `body` option.
 * @param {boolean} isJson - Value passed as the request's `json` option.
 * @param {Function} callback - Forwarded to `req` as the success callback.
 * @param {Function} [errCallback] - Forwarded to `req` as the failure callback.
 * @param {number} [trytimes] - Forwarded to `req` as the retry budget.
 */
var post = function (url, body, isJson, callback, errCallback, trytimes) {
    var option = {
        method: 'POST',
        url: url,
        body: body,
        json: isJson,
        timeout: 30000
    };
    req(option, callback, errCallback, trytimes);
}

/**
 * Run `request(option, ...)` and retry on transport errors.
 *
 * Every failed attempt is logged. While the retry budget is positive the same
 * request is issued again with the budget reduced by one; once it is used up,
 * `errCallback` (if given) receives the last error.
 *
 * @param {Object} option - Options object handed to `request` unchanged.
 * @param {Function} callback - Called with the response object on success.
 * @param {Function} [errCallback] - Called with the error after the final failure.
 * @param {number} [trytimes] - Retries allowed after the first attempt; 5 when omitted.
 */
var req = function (option, callback, errCallback, trytimes) {
    var retriesLeft = trytimes === undefined ? 5 : trytimes;

    request(option, function (err, res) {
        if (!err) {
            callback(res);
            return;
        }

        console.error('request ' + option.url + ' error .');
        console.error(err);

        if (retriesLeft > 0) {
            req(option, callback, errCallback, retriesLeft - 1);
            return;
        }

        if (errCallback) {
            errCallback(err);
        }
    });
}

// Public surface of the request helper: the two convenience wrappers plus the
// retrying primitive they are built on.
Object.assign(exports, {
    get: get,
    post: post,
    req: req
});

使用cheerio抽取数据

    // Parse the post-list HTML and inspect every post entry in it.
    var $ = cheerio.load(body);
    $('div.post_item_body').each((index, postBody) => {
        var name = $(postBody).find('a.titlelnk').text();
        $(postBody).find('span.article_view a').each((i, e) => {
            var link = $(e).attr('href');
            var text = $(e).text();
            // The view counter is taken from between the first '(' and the
            // first ')' of the link text (presumably rendered like "label(123)").
            var sIndex = text.indexOf('(');
            var eIndex = text.indexOf(')');
            // slice() replaces the deprecated substr(); the guard keeps the old
            // result (an empty string) when ')' is missing or precedes '('.
            var viewCount = eIndex > sIndex ? text.slice(sIndex + 1, eIndex) : '';
            // Explicit radix so the text is always read as decimal. A
            // non-numeric value yields NaN, which fails the comparison below.
            var intViewCount = parseInt(viewCount, 10);
            console.log(link + ' ' + viewCount + ' ' + name);
            if (intViewCount > 1000) {
                trySendMail(link, name,mailAddress);
            }
        });

    });

使用mongodb储存数据


var Db = require('mongodb').Db;
var Server = require('mongodb').Server;
var MongoClient = require('mongodb').MongoClient;

// Shared database handle used by every helper below; it stays undefined until
// the connect callback in init() assigns it.
var db;

/**
 * Connect to the local "notice" database and store the handle in `db`.
 *
 * The connection is established asynchronously, so the other helpers must not
 * be used before it completes. The optional callback lets callers wait for
 * that moment and learn about connection failures instead of only seeing a
 * log line.
 *
 * @param {Function} [callback] - Called as `callback(err)` when connecting
 *   fails, or `callback(null, database)` once `db` has been assigned.
 */
var init = function (callback) {
    MongoClient.connect("mongodb://localhost:27017/notice", (err, database) => {
        if (err) {
            console.error(err);
            if (callback) {
                callback(err);
            }
            return;
        }
        console.log('connect to db success');
        db = database;
        if (callback) {
            callback(null, database);
        }
    });
}

/**
 * Insert `data` into the collection named `collName`.
 *
 * @param {string} collName - Target collection name.
 * @param {*} data - Value handed to the collection's `insert` call.
 * @param {Function} [callback] - Called with the driver result on success.
 * @param {Function} [errCallback] - Called with the error on failure. Without
 *   it a failed insert is only logged, as before.
 */
var insert = function (collName, data, callback, errCallback) {
    var coll = db.collection(collName);
    coll.insert(data, (err, r) => {
        if (err) {
            console.error(err);
            // Report the failure so the caller is not left waiting forever.
            if (errCallback) {
                errCallback(err);
            }
            return;
        }
        console.log('save to ' + collName + ' success !');
        if (callback) {
            callback(r);
        }
    });
};

/**
 * Fetch one page of documents from the collection named `collName`.
 *
 * @param {string} collName - Collection to query.
 * @param {Object} filter - Query passed to `find`.
 * @param {number} skip - Number of documents to skip.
 * @param {number} limit - Maximum number of documents to return.
 * @param {Function} callback - Called with the resulting array, or with an
 *   empty array when the query fails (the error is logged).
 * @param {Object} [sort] - Sort specification; defaults to `{ videoId: 1 }`,
 *   the order that was previously hard-coded.
 */
var queryPage = function (collName, filter, skip, limit, callback, sort) {
    var coll = db.collection(collName);
    var order = sort == null ? { videoId: 1 } : sort;
    coll.find(filter).sort(order).skip(skip).limit(limit).toArray((err, r) => {
        if (!err) {
            callback(r);
        }
        else {
            console.error(err);
            callback([]);
        }
    });
}

/**
 * Remove the documents matching `filter` from the collection named `collName`.
 *
 * @param {string} collName - Target collection name.
 * @param {Object} filter - Selector handed to the collection's `remove` call.
 * @param {Function} [callback] - Called with the driver result on success.
 * @param {Function} [errCallback] - Called with the error on failure. Without
 *   it a failed removal is only logged, as before.
 */
var remove = function (collName, filter, callback, errCallback) {
    var coll = db.collection(collName);
    coll.remove(filter, (err, r) => {
        if (err) {
            console.error(err);
            // Report the failure so the caller is not left waiting forever.
            if (errCallback) {
                errCallback(err);
            }
            return;
        }
        console.log('remove to ' + collName + ' success !');
        if (callback) {
            callback(r);
        }
    });
}

/**
 * Load every document matching `filter` from the collection named `collName`.
 *
 * @param {string} collName - Collection to query.
 * @param {Object} filter - Query passed to `find`.
 * @param {Function} callback - Called with the resulting array, or with an
 *   empty array when the query fails (the error is logged).
 */
var find = function (collName, filter, callback) {
    var cursor = db.collection(collName).find(filter);
    cursor.toArray(function (err, docs) {
        if (err) {
            console.error(err);
            callback([]);
            return;
        }
        callback(docs);
    });
}

/**
 * Apply `updateObj` as a `$set` to the documents matching `filter`.
 *
 * @param {string} collName - Target collection name.
 * @param {Object} filter - Selector handed to the collection's `update` call.
 * @param {Object} updateObj - Fields to set; wrapped in `{ $set: ... }`.
 * @param {Function} [callback] - Called with the driver result on success.
 * @param {Function} [errCallback] - Called with the error on failure.
 */
var update = function (collName, filter, updateObj, callback, errCallback) {
    var coll = db.collection(collName);
    coll.update(filter, { $set: updateObj }, (err, r) => {
        if (!err) {
            console.log('update to ' + collName + ' success !');
            if (callback) {
                callback(r);
            }
        }
        else {
            console.error(err);
            // errCallback is optional, like callback. Calling it unguarded
            // would throw a TypeError inside the driver callback.
            if (errCallback) {
                errCallback(err);
            }
        }
    });
}

// Public surface of the storage helper: the CRUD wrappers plus the connection
// bootstrap.
Object.assign(exports, {
    insert: insert,
    queryPage: queryPage,
    remove: remove,
    find: find,
    update: update,
    init: init
});

使用node-schedule来执行定时任务

var schedule = require('node-schedule');
var cnblogs =require('./cnblogs');

// Job body handed to the scheduler: delegates to the cnblogs module's filter().
// NOTE(review): the meaning of the arguments (1, 10) is defined in ./cnblogs,
// which is not visible here — presumably a page range or page index/size; confirm.
var filter = function(){
    cnblogs.filter(1,10);
}

/**
 * Register `filter` with node-schedule using the rule
 * `{ hour: 10, minute: 1 }` (10:01 under node-schedule's object-rule
 * semantics) and log that scheduling is set up.
 */
var initSchedule = function () {
    // The minute was written as `01`, a legacy octal literal. That form is a
    // SyntaxError in strict mode and ES modules, so use the plain decimal.
    schedule.scheduleJob({ hour: 10, minute: 1 }, filter);
    console.log('schedule inited .');
}

订阅

回复邮件地址就可以自动订阅推送 😃

我的博客即将搬运同步至腾讯云+社区,邀请大家一同入驻:https://cloud.tencent.com/developer/support-plan

posted @ 2017-08-10 10:07  Agile.Zhou  阅读(423)  评论(7)  编辑  收藏  举报