// Node.js crawler: fetch web page information (nodejs爬虫获取网页信息)
var http = require("https");
var fs = require('fs');
var iconv = require('iconv-lite');
var cheerio = require('cheerio')
/**
 * Download a page and decode its body to a string using the given charset.
 *
 * The request is issued through the module-level `http` binding (which this
 * file assigns from `require("https")`), and the raw bytes are decoded with
 * `iconv.decode` so that non-UTF-8 pages do not come out garbled.
 *
 * @param {string} url - Address to fetch; it is also logged to the console.
 * @param {string} charset - Encoding name handed to `iconv.decode`.
 * @returns {Promise<string>} Resolves with the decoded response body.
 *   Rejects if the request fails, the response stream errors, or decoding
 *   throws (previously these cases left the promise pending forever or
 *   raised an uncaught exception).
 */
function getWeb(url, charset) {
  console.log(url);
  return new Promise((resolve, reject) => {
    http
      .get(url, (res) => {
        // Each chunk is a Buffer; collect them and decode once at the end so
        // multi-byte characters split across chunks are decoded correctly.
        const chunks = [];
        res
          .on('data', (chunk) => {
            chunks.push(chunk);
          })
          .on('end', () => {
            try {
              resolve(iconv.decode(Buffer.concat(chunks), charset));
            } catch (err) {
              // A throw inside an event handler would otherwise be uncaught.
              reject(err);
            }
          })
          .on('error', reject);
      })
      // Request-level failures (DNS, connection refused, TLS) are emitted on
      // the request object, not on the response.
      .on('error', reject);
  });
}
/**
 * Collect link URLs from a run of numbered listing pages.
 *
 * For each page number from 1 to `page` (inclusive) this fetches
 * `${host}${path}${n}.html` via `getWeb`, parses the HTML with cheerio, and
 * reads the `href` attribute of every element matched by `positon`. Each
 * `href` is prefixed with `host` (so hrefs are presumably site-relative —
 * confirm against the target site).
 *
 * Pages are fetched one after another, so results keep page order.
 *
 * @param {string} host - Prefix used both for the page URL and for each link.
 * @param {string} path - Path segment placed between `host` and the page number.
 * @param {number} page - Last page number to fetch; nothing is fetched if < 1.
 * @param {string} positon - Selector passed to cheerio to locate the link elements.
 * @param {string} charset - Encoding name forwarded to `getWeb`.
 * @returns {Promise<string[]>} The collected URLs. Matched elements that have
 *   no `href` attribute are skipped rather than yielding "...undefined".
 */
async function getSrcArr(host, path, page, positon, charset) {
  const links = [];
  for (let pageNo = 1; pageNo <= page; pageNo++) {
    const html = await getWeb(`${host}${path}${pageNo}.html`, charset);
    const $ = cheerio.load(html);
    $(positon).each((_, element) => {
      const href = $(element).attr('href');
      if (href !== undefined) {
        links.push(`${host}${href}`);
      }
    });
  }
  // An async function already wraps its return value in a promise.
  return links;
}