node爬虫
const cheerio = require('cheerio');
const superagent = require('superagent');
// NOTE(review): `request` is not referenced anywhere in the visible code; kept
// because other parts of the project may rely on it being loaded.
const request = require('request');
const fs = require('fs');
// Search terms to look up, grouped under the keys `jia`, `yi` and `bing`
// (this file is produced by processExcel below).
const _data = require('./public/data/frequency.json');
// NOTE(review): previously saved results are loaded here but never read in the
// visible code; every run starts from an empty `_obj` and overwrites the file.
const result = require('./public/data/frequency-result.json');

// Base search URL; the URI-encoded search term is appended to it.
// It is an empty string in this listing and must be filled in before running.
const url = '';

// Upper bound on how many terms of `_data.bing` are queried in a single run.
const MAX_REQUESTS = 500;

// Delay (ms) before the next request; re-randomised on every getData call.
let random = 1000;
// Position of the next term to query inside `_data.bing`.
let index = 0;
// Handle of the pending timer that will trigger the next request.
let timeout;

// Accumulated results. Only `bing` is filled by getData.
const _obj = { jia: [], yi: [], bing: [] };

/**
 * Extracts the text located between the first '共' and the following '个'.
 *
 * @param {string} text - Summary text taken from the result page.
 * @returns {string} The trimmed text between the two markers, or '' when
 *   either marker is missing.
 */
function extractCount(text) {
  const start = text.indexOf('共');
  if (start === -1) {
    return '';
  }
  const end = text.indexOf('个', start + 1);
  if (end === -1) {
    return '';
  }
  // Slice by position instead of substr(start, length): the old code passed
  // an absolute index as a length, which only worked when '共' was the very
  // first character and exactly one space surrounded the number.
  return text.slice(start + 1, end).trim();
}

/**
 * Schedules the request for the term at `index` after `random` milliseconds.
 * Does nothing once MAX_REQUESTS terms, or all available terms, were handled,
 * so `getData` is never called with an undefined term.
 */
function scheduleNext() {
  const total = Math.min(MAX_REQUESTS, _data.bing.length);
  if (index < total) {
    timeout = setTimeout(function () {
      getData(_data.bing[index]);
    }, random);
  }
}

/**
 * Fetches the result page for one search term, extracts the count shown in the
 * second `<p>` of `.panel-heading`, appends `{ [search]: count }` to
 * `_obj.bing`, rewrites `public/data/frequency-result.json` with everything
 * collected so far, and schedules the next term.
 *
 * @param {string} search - Term to look up; it is URI-encoded and appended to `url`.
 */
var getData = function (search) {
  clearTimeout(timeout);
  // Wait 1-5 seconds (whole seconds) before the following request.
  random = Math.ceil(Math.random() * 5) * 1000;

  superagent.get(url + encodeURI(search))
    .set('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8')
    .set('Accept-Encoding', 'gzip, deflate')
    .set('Accept-Language', 'zh-CN,zh;q=0.9,en;q=0.8,zh-TW;q=0.7')
    .set('Cache-Control', 'max-age=0')
    .set('Connection', 'keep-alive')
    .set('Cookie', '')
    .set('Host', '')
    .set('Referer', '')
    .set('Upgrade-Insecure-Requests', '1')
    .set('User-Agent', 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.87 Safari/537.36')
    .end(function (err, res) {
      // Error interception: report the failure but keep the crawl going.
      if (err) {
        console.log('err');
      }

      // Without a response body there is nothing to parse; previously this
      // path crashed on `res.text`. Skip the term and move on to the next one.
      if (!res || typeof res.text !== 'string') {
        ++index;
        scheduleNext();
        return;
      }

      const $ = cheerio.load(res.text);
      const summary = $('.panel-heading p').eq(1).text();
      console.log(index, search, summary);

      const num = extractCount(summary.trim());
      _obj.bing.push({ [search]: num });

      // Persist after every term so partial progress survives an interruption.
      fs.writeFile('public/data/frequency-result.json', JSON.stringify(_obj), function (writeErr) {
        if (writeErr) throw writeErr;
        console.log('It\'s saved!');
      });

      ++index;
      scheduleNext();
    });
};

// Kick off the crawl with the first term (after the initial 1000 ms delay).
scheduleNext();

/**
 * Reads the first sheet of `./public/data/frequency.xlsx`, collects the
 * non-empty cells of columns 1, 5 and 9 (0-based, header row skipped) into the
 * `jia`, `yi` and `bing` lists, and writes them to `public/data/frequency.json`.
 */
var processExcel = function () {
  const xlsx = require('node-xlsx');
  const _path = "./public/data/frequency.xlsx";
  const obj = xlsx.parse(_path);

  /**
   * @param {Array<Array>} data - Rows of the sheet; row 0 is treated as the header.
   */
  function filterData(data) {
    console.log(data[0][1]);
    const columns = { jia: [], yi: [], bing: [] };

    for (let row = 1; row < data.length; row++) {
      if (data[row][1]) {
        columns.jia.push(data[row][1]);
      }
      if (data[row][5]) {
        columns.yi.push(data[row][5]);
      }
      if (data[row][9]) {
        columns.bing.push(data[row][9]);
      }
    }

    console.log(columns.jia.length, columns.yi.length, columns.bing.length);
    fs.writeFile('public/data/frequency.json', JSON.stringify(columns), function (err) {
      if (err) throw err;
      console.log('It\'s saved!');
    });
  }

  filterData(obj[0].data);
};

// processExcel();
以上是一个简单的爬虫：采用 superagent 获取 HTML，用 cheerio 解析 HTML，然后以类似 jQuery 的方式获取元素。