nodejs爬虫

1:引入request模块,用于发送HTTP请求
2:引入iconv-lite,将gb2312编码的响应内容解码为字符串
3:引入cheerio模块,用类似jQuery的选择器语法解析HTML并提取数据
编写request.js
var request = require('request');
var iconv = require('iconv-lite');
var cheerio = require('cheerio');
var fs = require('fs');
/**
 * Fetches a page and resolves with its body decoded from GB2312.
 *
 * @param {string} url - Absolute URL to request.
 * @returns {Promise<string>} Resolves with the decoded HTML when the response
 *   status is 200. Rejects with the transport error reported by `request`, or
 *   with an Error naming the unexpected status.
 */
const requestPromise = (url) => {
    return new Promise((resolve, reject) => {
        // `encoding: null` asks `request` for the raw Buffer so iconv can decode it.
        request(url, { encoding: null }, (error, response, body) => {
            if (error) {
                reject(error);
                return;
            }
            if (!response || response.statusCode !== 200) {
                const status = response ? response.statusCode : 'no response';
                reject(new Error(`Request to ${url} failed with status ${status}`));
                return;
            }
            // iconv.decode already yields a JavaScript string; no further conversion is needed.
            resolve(iconv.decode(body, 'gb2312'));
        });
    });
};
// Listing-page path (elided in this post). NOTE(review): nothing in the visible
// code reads this binding — every later `url` is a function parameter.
const url = '/html........';
// Site origin that getList and getMovieDetail prepend to the paths they request.
const host = 'http://www.baidu.com';
/**
 * Crawls one listing page and scrapes every movie it links to.
 *
 * @param {string} url - Path of the listing page, relative to `host`.
 * @returns {Promise<void>} Resolves once every linked detail page has been
 *   processed; rejects if the listing page or any detail page fails.
 */
const getList = async (url) => {
    const html = await requestPromise(host + url);
    const $ = cheerio.load(html);
    const detailPaths = [];
    $('.co_content8 ul table tbody tr:nth-child(2) td:nth-child(2) b a:nth-child(2)').each((i, item) => {
        const href = $(item).attr('href');
        // An anchor without href would otherwise be requested as host + "undefined".
        if (href) {
            detailPaths.push(href);
        }
    });
    // Await the detail pages so the returned promise covers the whole page's work
    // and their failures reach the caller instead of floating unhandled.
    await Promise.all(detailPaths.map((path) => getMovieDetail(path)));
};

/**
 * Scrapes one movie detail page and appends the result to ./public/index.js
 * as a single JSON line.
 *
 * @param {string} url - Path of the detail page, relative to `host`.
 * @returns {Promise<void>} Resolves after the append attempt has finished.
 *   A failed write is logged rather than thrown; a failed page request rejects.
 */
const getMovieDetail = async (url) => {
    const html = await requestPromise(host + url);
    const $ = cheerio.load(html);
    const movie = {
        // NOTE(review): this selector does not resemble the '#Zoom' selectors below —
        // confirm it matches the target page. `.text()` keeps the field a plain string.
        name: $('#main-outer > div > div > div.translate-wrap > div.translateio > div.translate-main.clearfix > div.trans-left > div > div.input-wrap').text(),
        desc: $('#Zoom > span > p:nth-child(1)').text(),
        picture: $('#Zoom > span > p:nth-child(1) >img:nth-child(3)').attr('src'),
    };
    // Wrap the callback API so callers can wait for the write to complete.
    await new Promise((resolve) => {
        // One record per line keeps consecutive appends separable.
        fs.appendFile('./public/index.js', `${JSON.stringify(movie)}\n`, (err) => {
            if (err) {
                console.error(err);
            }
            resolve();
        });
    });
};
// Listing-page paths, kept relative because getList prepends `host` itself.
const arr = [];
for (let i = 1; i <= 255; i++) {
    // Backticks are required for ${i} to be interpolated.
    arr.push(`/html........${i}.html`);
}

/**
 * Crawls the listing pages one at a time, in order.
 * A page that fails is logged and skipped so the remaining pages are still crawled.
 *
 * @returns {Promise<void>} Resolves after every page has been attempted.
 */
const crawlAll = async () => {
    for (const pageUrl of arr) {
        try {
            await getList(pageUrl);
        } catch (err) {
            console.error(`Failed to crawl ${pageUrl}:`, err);
        }
    }
};

crawlAll().catch((err) => console.error(err));
posted on 2020-11-25 09:28  C_WangFly  阅读(78)  评论(0)  编辑  收藏  举报