node爬虫

const cheerio = require('cheerio');
const superagent = require('superagent');
const request = require('request');
var fs = require('fs');

// Search terms to crawl; getData reads the `bing` list from this file.
const _data = require('./public/data/frequency.json');
// Previously saved results file (loaded here, not referenced by the code below).
const result = require('./public/data/frequency-result.json');

// Base URL that the encoded search term is appended to.
let url = '';
// Delay in milliseconds before the next request; re-rolled on every getData call.
let random = 1000;
// Position of the term currently being crawled in _data.bing.
let index = 0;

// Accumulated crawl results, one list per category.
const _obj = {
  jia: [],
  yi: [],
  bing: [],
};

/**
 * Extract the count that sits between the '共' and '个' markers of the
 * summary line scraped from the result page.
 *
 * @param {string} summary - Text of the summary paragraph.
 * @returns {string} The trimmed text between the markers, or '' when either
 *   marker is missing.
 */
function extractResultCount(summary) {
  const start = summary.indexOf('共');
  const end = summary.indexOf('个', start + 1);
  if (start === -1 || end === -1) {
    return '';
  }
  // slice() takes (start, end); the previous substr(start, length) call only
  // produced the right text when '共' happened to be the first character.
  return summary.slice(start + 1, end).trim();
}

/**
 * Fetch the result page for one search term, record the reported count in
 * `_obj.bing`, persist all results to disk, and schedule the next term after
 * a random delay of 1-5 seconds.
 *
 * A failed request is logged and skipped (nothing is recorded for that term)
 * so that a single network error no longer aborts the whole crawl.
 *
 * @param {string} search - The term appended (URI-encoded) to `url`.
 */
var getData = function (search) {
  clearTimeout(timeout);
  random = Math.ceil(Math.random() * 5) * 1000;

  // Advance to the next term and queue its request, if any terms remain.
  const scheduleNext = function () {
    ++index;
    // Keep the original cap of 500 requests, but never run past the end of
    // the term list (which would request `undefined`).
    const limit = Math.min(500, _data.bing.length);
    if (index < limit) {
      timeout = setTimeout(function () {
        getData(_data.bing[index]);
      }, random);
    }
  };

  superagent.get(url + encodeURI(search))
    .set('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8')
    .set('Accept-Encoding', 'gzip, deflate')
    .set('Accept-Language', 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7')
    .set('Cache-Control', 'max-age=0')
    .set('Connection', 'keep-alive')
    .set('Cookie', '')
    .set('Host', '')
    .set('Referer', '')
    .set('Upgrade-Insecure-Requests', '1')
    .set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
    .end(function (err, res) {
      // Error guard: without a usable response there is nothing to parse,
      // so log the failure and move on to the next term.
      if (err || !res) {
        console.log('err', search, err && err.message);
        scheduleNext();
        return;
      }

      const $ = cheerio.load(res.text);
      const summary = $('.panel-heading p').eq(1).text();
      console.log(index, search, summary);

      const entry = {};
      entry[search] = extractResultCount(summary.trim());
      _obj.bing.push(entry);

      // Rewrite the whole results file after every term so progress survives
      // an interrupted run.
      fs.writeFile('public/data/frequency-result.json', JSON.stringify(_obj), function (writeErr) {
        if (writeErr) throw writeErr;
        console.log('It\'s saved!');
      });

      scheduleNext();
    });
};

// Kick off the crawl: after the initial delay, request the term at the
// current index. getData re-arms this same timer for every following term.
var timeout = setTimeout(() => {
  getData(_data.bing[index]);
}, random);

/**
 * Convert the first sheet of ./public/data/frequency.xlsx into
 * public/data/frequency.json.
 *
 * The output object has three lists (`jia`, `yi`, `bing`) filled from sheet
 * columns 1, 5 and 9 respectively. Row 0 is skipped (presumably a header
 * row) and empty cells are ignored.
 */
var processExcel = function () {
  const xlsx = require('node-xlsx');
  const workbookPath = "./public/data/frequency.xlsx";
  const sheets = xlsx.parse(workbookPath);

  // Output list name -> zero-based column index in the sheet.
  const columns = { jia: 1, yi: 5, bing: 9 };

  // Collect the non-empty cells of each mapped column and write them out.
  function filterData(data) {
    if (!Array.isArray(data) || data.length === 0) {
      // Previously this case crashed with a TypeError on data[0][1].
      console.log('frequency.xlsx: first sheet has no rows, nothing written');
      return;
    }
    console.log(data[0][1]);

    const terms = { jia: [], yi: [], bing: [] };

    data.slice(1).forEach(function (row) {
      if (!row) {
        return;
      }
      Object.keys(columns).forEach(function (key) {
        const cell = row[columns[key]];
        if (cell) {
          terms[key].push(cell);
        }
      });
    });

    console.log(terms.jia.length, terms.yi.length, terms.bing.length);

    fs.writeFile('public/data/frequency.json', JSON.stringify(terms), function (err) {
      if (err) throw err;
      console.log('It\'s saved!');
    });
  }

  if (!sheets || sheets.length === 0) {
    console.log('frequency.xlsx: workbook has no sheets, nothing written');
    return;
  }

  filterData(sheets[0].data);
};

// processExcel();

  以上是一个简单的爬虫:使用 superagent 获取 HTML,用 cheerio 解析 HTML,然后以类似 jQuery 的选择器方式获取元素。

 

 

 

  

posted on 2014-11-14 19:13  j.w  阅读(374)  评论(0)  编辑  收藏  举报