Node.js 爬虫
1: 引入 request 模块,用于发起 HTTP 请求
2: 引入 iconv-lite,将 GB2312 编码的响应内容解码为字符串
3: 引入 cheerio 模块(类似 jQuery),用于解析 HTML 并提取数据
编写 request.js:
var request = require('request');
var iconv = require('iconv-lite');
var cheerio = require('cheerio');
var fs = require('fs');
/**
 * Fetch a page and resolve with its body decoded from GB2312.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string>} Resolves with the decoded HTML when the response
 *   status is 200. Rejects with the transport error when the request itself
 *   fails, or with an Error naming the status code for any non-200 response.
 */
const requestPromise = (url) => {
  return new Promise((resolve, reject) => {
    // `encoding: null` is passed so the body can be decoded by iconv below
    // instead of being interpreted as UTF-8 by the HTTP client.
    request(url, { encoding: null }, (error, response, body) => {
      // A transport failure leaves `response` undefined, so it has to be
      // handled before the status code is inspected.
      if (error) {
        reject(error);
        return;
      }
      if (!response || response.statusCode !== 200) {
        const status = response ? response.statusCode : 'unknown';
        reject(new Error(`Request to ${url} failed with status ${status}`));
        return;
      }
      // The decoded value is handed on as-is; no further re-encoding step is
      // applied to it.
      resolve(iconv.decode(body, 'gb2312'));
    });
  });
};
// Origin that getList and getMovieDetail prepend to every path they request.
const host = 'http://www.baidu.com';
// NOTE(review): presumably the listing-page path prefix (the crawl loop builds
// paths of the same shape) — confirm; the functions below shadow this name
// with their own `url` parameter.
const url = '/html........';
/**
 * Crawl one listing page and process every movie link found on it.
 *
 * @param {string} url - Path of the listing page; `host` is prepended.
 * @returns {Promise<void>} Settles once every linked detail page has been
 *   processed. Rejects if the listing page itself cannot be fetched; a failing
 *   detail page is logged and does not fail the listing.
 */
const getList = async (url) => {
  const html = await requestPromise(host + url);
  const $ = cheerio.load(html);

  const hrefs = [];
  $('.co_content8 ul table tbody tr:nth-child(2) td:nth-child(2) b a:nth-child(2)').each((i, item) => {
    const href = $(item).attr('href');
    // Anchors without an href would otherwise be requested as `host + undefined`.
    if (href) {
      hrefs.push(href);
    }
  });

  // Await the detail requests so the returned promise covers the whole page
  // and no rejection is left floating; they still run concurrently.
  await Promise.all(
    hrefs.map(async (href) => {
      try {
        await getMovieDetail(href);
      } catch (err) {
        console.error(`Failed to process movie detail ${href}:`, err);
      }
    })
  );
};
/**
 * Fetch one movie detail page, extract its fields and append them as a JSON
 * record to ./public/index.js.
 *
 * @param {string} url - Path of the detail page; `host` is prepended.
 * @returns {Promise<void>} Settles after the append has finished. Rejects if
 *   the page cannot be fetched; a failed write is logged but not propagated.
 */
const getMovieDetail = async (url) => {
  const html = await requestPromise(host + url);
  const $ = cheerio.load(html);
  const movie = {
    // NOTE(review): this selector does not share the #Zoom root used by the
    // other two fields — confirm it matches the target page.
    name: $('#main-outer > div > div > div.translate-wrap > div.translateio > div.translate-main.clearfix > div.trans-left > div > div.input-wrap').text(),
    desc: $('#Zoom > span > p:nth-child(1)').text(),
    // If no src value is returned, JSON.stringify omits this key.
    picture: $('#Zoom > span > p:nth-child(1) >img:nth-child(3)').attr('src'),
  };

  // One record per line, so that consecutive appends stay separable.
  const record = `${JSON.stringify(movie)}\n`;

  // Wrap the callback API so callers that await this function know the write
  // has completed.
  await new Promise((resolve) => {
    fs.appendFile('./public/index.js', record, (err) => {
      if (err) {
        console.error(`Failed to save movie from ${url}:`, err);
      }
      resolve();
    });
  });
};
// Paths of listing pages 1..255. getList prepends `host` itself, so only the
// path is stored here; template literals need backticks for `${i}` to be
// interpolated.
const arr = [];
for (let i = 1; i <= 255; i++) {
  arr.push(`/html........${i}.html`);
}

// Crawl the pages strictly one after another, each page exactly once. The
// chain is seeded with a resolved promise so the first step has something to
// `.then` on, and each step handles its own failure so one bad page does not
// stop the remaining ones.
arr.reduce(
  (chain, pagePath) =>
    chain
      .then(() => getList(pagePath))
      .catch((err) => {
        console.error(`Failed to crawl ${pagePath}:`, err);
      }),
  Promise.resolve()
);