Node.js 爬虫获取网页信息

var http = require("https"); 
var fs = require('fs');
var iconv = require('iconv-lite');
var cheerio = require('cheerio')
/**
 * Download a page and decode its body using the given character set.
 *
 * The request is issued through the module bound to `http`, which this file
 * loads from "https". The raw response bytes are collected and decoded in one
 * pass with iconv-lite so that multi-byte characters split across chunks
 * (e.g. GBK-encoded Chinese text) are not corrupted.
 *
 * @param {string} url - Address of the page to fetch; it is logged before the request.
 * @param {string} charset - Encoding name passed to `iconv.decode`.
 * @returns {Promise<string>} Resolves with the decoded response body. Rejects
 *   if the request fails, the response stream errors, or decoding throws.
 */
function getWeb(url, charset) {
  console.log(url);
  return new Promise((resolve, reject) => {
    const req = http.get(url, (res) => {
      const chunks = [];
      res
        .on('data', (chunk) => {
          // Each chunk is a Buffer; keep them raw until the body is complete.
          chunks.push(chunk);
        })
        .on('end', () => {
          try {
            // Decode the whole body at once so characters are never cut mid-sequence.
            resolve(iconv.decode(Buffer.concat(chunks), charset));
          } catch (err) {
            // E.g. an unsupported charset: surface it instead of crashing the process.
            reject(err);
          }
        })
        .on('error', reject);
    });
    // Connection-level failures (DNS, refused connection, TLS) are emitted on
    // the request object, not on the response; without this listener they
    // would be thrown as an unhandled 'error' event.
    req.on('error', reject);
  });
}

/**
 * Collect link targets from a run of numbered listing pages.
 *
 * For every page number from 1 to `page` (inclusive) the URL
 * `${host}${path}${n}.html` is fetched with `getWeb`, parsed with cheerio, and
 * the `href` of each element matching `positon` is appended to the result with
 * `host` prefixed to it.
 *
 * @param {string} host - Prefix used both for the listing URLs and for each collected href.
 * @param {string} path - Path fragment placed between `host` and the page number.
 * @param {number} page - Number of listing pages to visit; 0 yields an empty array.
 * @param {string} positon - Selector passed to cheerio to locate the link elements.
 * @param {string} charset - Encoding name forwarded to `getWeb`.
 * @returns {Promise<string[]>} The collected URLs in page order, then document order.
 */
async function getSrcArr(host, path, page, positon, charset) {
  const detailUrls = [];
  // Pages are fetched one at a time, which keeps the results in page order.
  for (let pageNo = 1; pageNo <= page; pageNo++) {
    const html = await getWeb(`${host}${path}${pageNo}.html`, charset);
    const $ = cheerio.load(html);
    $(positon).each((_, el) => {
      const href = $(el).attr('href');
      // Elements without an href would otherwise produce `${host}undefined`.
      if (href !== undefined) {
        detailUrls.push(host + href);
      }
    });
  }
  // An async function already wraps its return value in a promise.
  return detailUrls;
}

 

 

 

posted @ 2020-03-16 12:29  当当和瓶瓶  阅读(622)  评论(0)  编辑  收藏  举报