Nothing much.
Last year, in a fit of enthusiasm (I do this a lot), I decided to build a project with GitHub and Node.
The rough idea: use Node.js to crawl job postings off the major recruitment sites, then use d3.js and GitHub Pages to render various statistical charts, broken down by date.
I had no idea how to automate the crawling, so the plan at the time was to crawl the data by hand once a week, generate JSON, and push it to the GitHub Pages repo; the page would then look at the current date to decide which week's data to fetch.
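The display side never got written, but given the layout the crawler below produces (one result/&lt;date&gt;/&lt;city&gt;.json per city, plus a result/config.js declaring the current revision), a minimal sketch of the page might have looked like this. It assumes d3 v4 and result/config.js are loaded via script tags before it runs, and that the page contains an `<svg id="chart" width="800">` element; the hardcoded city and the placeholder bar chart are purely illustrative:

```js
// A minimal sketch, not the real thing: draws one horizontal bar per job
// category using the count field the crawler fills in.
// Assumes `revision` comes from result/config.js and d3 v4 is on the page.
var city = "北京"; // illustrative: hardcode one city's file

d3.json("result/" + revision + "/" + city + ".json", function (error, dataPool) {
    if (error) throw error;

    // Flatten {mainClass: [items]} into [{name, count}] rows
    var rows = [];
    for (var mainClass in dataPool) {
        dataPool[mainClass].forEach(function (item) {
            if (item.count) rows.push({ name: item.name, count: item.count });
        });
    }

    var barHeight = 16;
    var svg = d3.select("#chart")
        .attr("height", rows.length * barHeight);
    var width = +svg.attr("width");

    var x = d3.scaleLinear()
        .domain([0, d3.max(rows, function (d) { return d.count; })])
        .range([0, width - 120]);

    // One labeled bar per category
    var bar = svg.selectAll("g").data(rows).enter().append("g")
        .attr("transform", function (d, i) {
            return "translate(100," + i * barHeight + ")";
        });

    bar.append("rect")
        .attr("width", function (d) { return x(d.count); })
        .attr("height", barHeight - 2);

    bar.append("text")
        .attr("x", -4)
        .attr("y", barHeight / 2)
        .attr("dy", "0.35em")
        .attr("text-anchor", "end")
        .text(function (d) { return d.name; });
});
```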
I did get some initial data crawled, and then, for a pile of reasons, never took it any further. Of course, that's just the excuse I've cooked up for my three-minute enthusiasm.
Below is the main file that crawled the lagou data. Whether it still runs, I'm too lazy to find out; consider this a memo to myself and, while I'm at it, a blog update.
```js
var superagent = require("superagent");
var cheerio = require("cheerio");
var async = require("async");
var fs = require("fs");

var rootUrl = "https://www.lagou.com";
var $;
// Target cities ('全国' = nationwide); lagou expects them URL-encoded
var locations = [
    encodeURI('全国'), encodeURI('北京'), encodeURI('上海'), encodeURI('杭州'),
    encodeURI('广州'), encodeURI('深圳'), encodeURI('成都')
];

// Test mode: re-parse a previously saved page instead of hitting the site.
// Pass a city so the output JSON file gets a proper name.
fs.readFile('./result/class_1481010149483.txt', (err, data) => {
    if (err) console.error(err);
    parse(data, locations[0]);
});

/* Live mode: fetch the home page for each city in turn.
scrawlLocation(0);
function scrawlLocation(index) {
    superagent
        .get(rootUrl)
        .set("index_location_city", locations[index])
        .end(function (err, res) {
            // Save the raw HTML so it can be re-parsed later without re-crawling
            var file = fs.createWriteStream("./result/class_" + Date.now() + ".txt");
            console.log(locations[index]);
            parse(res.text, locations[index]);
            file.write(res.text);
            file.end();
            if (index + 1 < locations.length) {
                scrawlLocation(index + 1);
            }
        });
}
*/

var today = new Date();
var curDir;

// Extract the job-category menu from the home page HTML and dump it to JSON
function parse(content, currentLocation) {
    var dataPool = {};
    var file;
    // getMonth() is zero-based, hence the +1
    var todayStr = today.getFullYear() + "-" + (today.getMonth() + 1) + "-" + today.getDate();
    curDir = "./result/" + todayStr + "/";
    if (!fs.existsSync(curDir)) {
        fs.mkdirSync(curDir);
        // Record the current revision for the page side; quoted so config.js is valid JS
        file = fs.createWriteStream("./result/config.js");
        file.write("var revision = '" + todayStr + "';");
        file.end();
    }
    $ = cheerio.load(content, { ignoreWhitespace: true });
    var mainClass;
    var classData;
    $('div[class="menu_box"]').each(function (k, v) {
        mainClass = parserMainClass(v); // menu_main job_hopping
        classData = [];
        parseSecondClass($(v).children()[1], classData); // menu_sub dn
        dataPool[mainClass] = classData;
    });
    file = fs.createWriteStream(curDir + decodeURI(currentLocation) + ".json");
    file.write(JSON.stringify(dataPool));
    file.end();
    startScrawlCount(curDir);
}

// Name of a top-level category (the <h2> inside each menu_box)
function parserMainClass(value) {
    var h2Item = $(value).children().children()[0];
    var title = h2Item.children[0].data;
    return title.trim();
}

// Walk the <dl>/<dt>/<dd> sub-menu and collect every category link
function parseSecondClass(value, classArr) {
    var item;
    var arr = value.children;
    var len = arr.length;
    var data, len1, arr1, item1, len2, arr2, item2;
    for (var i = 0; i < len; i++) { // each <dl>
        item = arr[i];
        if (item.type === "text") continue;
        arr1 = item.children;
        len1 = arr1.length;
        for (var j = 0; j < len1; j++) {
            item1 = arr1[j];
            if (item1.type === "text") continue;
            if (item1.name === "dt") {
                // <dt> holds the group's headline category
                item1 = item1.children[1];
                data = {};
                data.name = item1.children[0].data;
                data.isMain = 1;
                data.href = item1.attribs["href"].substring(2);
                data.dataLgTjId = item1.attribs["data-lg-tj-id"];
                data.dataLgTjNo = item1.attribs["data-lg-tj-no"];
                data.dataLgTjCid = item1.attribs["data-lg-tj-cid"];
                classArr.push(data);
            } else if (item1.name === "dd") {
                // <dd> holds the rest of the group's categories
                arr2 = item1.children;
                len2 = arr2.length;
                for (var k = 0; k < len2; k++) {
                    item2 = arr2[k];
                    if (item2.type === "text") continue;
                    data = {};
                    data.name = item2.children[0].data;
                    data.isMain = 0;
                    data.href = item2.attribs["href"].substring(2);
                    data.dataLgTjId = item2.attribs["data-lg-tj-id"];
                    data.dataLgTjNo = item2.attribs["data-lg-tj-no"];
                    data.dataLgTjCid = item2.attribs["data-lg-tj-cid"];
                    classArr.push(data);
                }
            }
        }
    }
}

const JOB_PER_PAGE = 15;

// Second pass: revisit every category page and record how many jobs it lists
function startScrawlCount(dir) {
    var files = fs.readdirSync(dir);
    scrawlFile(files, 0, dir); // one city file at a time
}

function scrawlFile(files, index, dir) {
    var file = files[index];
    var location = encodeURI(file.split(".")[0]); // file name is the city
    var data;
    fs.readFile(dir + file, { encoding: 'utf8', flag: "r+" }, (err, content) => {
        if (err) console.error(err);
        data = JSON.parse(content);
        var total = 0;
        var complete = 0;
        for (var k in data) {
            total++;
            var tarr = data[k];
            // At most 3 requests in flight per category group
            async.eachLimit(tarr, 3, function (item, callback) {
                superagent
                    .get(item.href)
                    .set("index_location_city", location)
                    .end(function (err, res) {
                        if (err) console.error(err);
                        $ = cheerio.load(res.text);
                        console.log(item.href);
                        // #tab_pos holds text like "325职位" or "500+职位"
                        var arr = $("#tab_pos").text().match(/\d+[+]?/);
                        if (arr) { // match() returns null when nothing matched
                            var countStr = arr[0];
                            if (countStr.indexOf("+") == -1) {
                                item.count = parseInt(countStr);
                            } else {
                                // "500+" style: estimate from the highest pager index instead
                                var arr1 = $(".page_no");
                                var maxIndex = 1;
                                var tempIndex;
                                var pageItem;
                                for (var i = 0; i < arr1.length; i++) {
                                    pageItem = arr1[i];
                                    tempIndex = parseInt(pageItem.attribs["data-index"]);
                                    maxIndex = tempIndex > maxIndex ? tempIndex : maxIndex;
                                }
                                item.count = maxIndex * JOB_PER_PAGE;
                            }
                        }
                        callback(err, res);
                    });
            }, function (err) {
                if (err) console.error(err);
                complete++;
                console.log(files[index] + ":" + complete + "/" + total);
                if (complete == total) {
                    // All groups done for this city: write counts back, move on
                    var wfile = fs.createWriteStream(dir + file);
                    wfile.write(JSON.stringify(data));
                    wfile.end();
                    if (index + 1 < files.length) {
                        scrawlFile(files, index + 1, dir);
                    }
                }
            });
        }
    });
}
```
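A few notes on how this hangs together: the paired /**/ comments near the top are a crude switch between test mode (re-parsing a saved page) and live crawling. In live mode, scrawlLocation walks the city list serially via recursion, saving each home page and handing it to parse, which writes one JSON file per city under result/&lt;date&gt;/ plus a result/config.js recording the current revision. startScrawlCount then re-reads each city file and visits every category link, with async.eachLimit capping concurrency at 3, presumably to avoid hammering lagou; when a page only reports "500+", the count is estimated as the highest pager index times 15 jobs per page.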
This is the npm dependency file (package.json):
{ "name": "node-crawl", "version": "1.0.0", "description": "", "main": "app.js", "scripts": { "test": "echo \"Error: no test specified\" && exit 1" }, "keywords": [], "author": "", "license": "ISC", "dependencies": { "asyn": "0.0.1", "async": "^2.1.4", "cheerio": "^0.22.0", "express": "^4.14.0", "superagent": "^3.1.0" } }
And this is what the crawled data looked like at the time:
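Judging purely from the code above, each city's file came out shaped roughly like this (the field values below are made up for illustration):

```js
// Illustrative shape only — one key per menu_box main class,
// each holding the categories scraped from the <dt>/<dd> menu.
{
    "技术": [
        {
            "name": "Java",
            "isMain": 0,
            "href": "www.lagou.com/zhaopin/Java/",
            "dataLgTjId": "...",
            "dataLgTjNo": "...",
            "dataLgTjCid": "...",
            "count": 7500 // filled in by the second pass; "500+" pages are estimated
        }
        // ...more categories
    ],
    "产品": [ /* ... */ ]
    // ...and so on
}
```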
Reading this back, I feel utterly shameless.