Node爬取网站数据
npm安装cheerio和axios
npm install cheerio
npm install axios
利用cheerio抓取对应网站中的标签,再根据标签中的链接使用axios获取对应页面的数据
const cheerio = require('cheerio');
//获取HTML文档的内容
const request = require('request');
const {writeFile,mkDir} = require('../lcf');
const fs = require('fs');
const url = require('url');
const path = require("path");
const axios = require("axios");
const { title } = require('process');
// Listing page of the crawled site; getImgList appends "?page=N" to it for each page it visits.
const httpUrl = "https://www.pkdoutu.com/article/list";
/**
 * Pause for a number of milliseconds.
 *
 * @param {number} millSeconds - Delay before the returned promise settles.
 * @returns {Promise<string>} Resolves with the string "success" once the timer fires.
 */
async function wait(millSeconds) {
  const settleAfterDelay = (done) => {
    // setTimeout forwards its extra argument to the callback, so `done("success")` runs after the delay.
    setTimeout(done, millSeconds, "success");
  };
  return new Promise(settleAfterDelay);
}
/**
 * Promise wrapper around the callback-style `request.get`.
 *
 * @param {string|object} options - URL string or `request` options object; forwarded unchanged.
 * @returns {Promise<{response: object, body: *}>} Resolves with the response object and its body.
 *   Rejects with the error reported by `request`.
 */
function req(options) {
  return new Promise((resolve, reject) => {
    // Forward the caller's argument. The previous code passed the Node `url` module and an
    // undeclared `headers` identifier, so every call rejected with a ReferenceError.
    request.get(options, (err, response, body) => {
      if (err) {
        reject(err);
        return;
      }
      resolve({ response, body });
    });
  });
}
/**
 * Read the page count from the listing page's pager.
 *
 * Fetches `httpUrl`, selects every `.page-link` element and returns the text of the
 * second-to-last one.
 *
 * @returns {Promise<string>} The pager label text (a string, not a number).
 */
async function getPageNum() {
  const listing = await req(httpUrl);
  const $ = cheerio.load(listing.body);
  const pagerLinks = $('.page-link');
  // Index is computed from the length (not a negative index) so short pagers behave as before.
  const lastNumberedLink = pagerLinks.eq(pagerLinks.length - 2);
  return lastNumberedLink.text();
}
/**
 * Crawl the listing pages one by one and hand each page URL to `parsePage`.
 *
 * @param {number} [maxPages=10] - Upper bound on the number of listing pages to visit.
 *   The loop also stops at the page count reported by `getPageNum()` when that value
 *   parses to a positive integer, so pages past the end of the listing are not requested.
 * @returns {Promise<void>} Resolves once every page has been attempted. A failure on one
 *   page is logged and does not stop the remaining pages.
 */
async function getImgList(maxPages = 10) {
  const reportedTotal = Number.parseInt(await getPageNum(), 10);
  // Fall back to maxPages when the pager text is not a usable number.
  const lastPage = reportedTotal > 0 ? Math.min(maxPages, reportedTotal) : maxPages;

  for (let page = 1; page <= lastPage; page++) {
    if (page !== 1) {
      // Delay grows with the page number (3000 ms * page), as in the original.
      // NOTE(review): presumably request throttling — confirm the growth is intended.
      await wait(3000 * page);
    }
    try {
      // Awaited so a rejection is handled here instead of becoming an unhandled rejection.
      await parsePage(`${httpUrl}?page=${page}`);
    } catch (err) {
      console.error("页面解析失败:", page, err);
    }
  }
}
/**
 * Parse one listing page and download every album that has no local directory yet.
 *
 * For each `#home .col-sm-9>a` link, the album title is the `.random_title` text up to its
 * first digit, with characters that are invalid in file names removed. When `img/<title>`
 * does not exist, it is created through `mkDir` and the album page is passed to `getImg`.
 *
 * @param {string} url - Listing page URL to fetch.
 * @returns {Promise<void>} Resolves after every album on the page has been attempted.
 *   A failure in one album is logged and does not stop the others.
 */
async function parsePage(url) {
  const { body } = await req(url);
  const $ = cheerio.load(body);
  const titlePrefix = /(.*?)\d/i;
  const forbiddenChars = /[/*?:<>|\"\\\\]+/g;

  // Collect synchronously: cheerio's `.each` does not wait for async callbacks.
  const albums = [];
  $('#home .col-sm-9>a').each((index, ele) => {
    albums.push({
      pageUrl: $(ele).attr('href'),
      rawTitle: $(ele).find('.random_title').text(),
    });
  });

  for (const { pageUrl, rawTitle } of albums) {
    // A title without any digit used to throw on `exec(...)[1]`; use the whole text instead.
    const match = titlePrefix.exec(rawTitle);
    const title = (match ? match[1] : rawTitle.trim()).replace(forbiddenChars, "");
    console.log(title);

    // An empty title would resolve to the shared "img/" directory, so skip it.
    if (title === "" || fs.existsSync("img/" + title)) {
      continue;
    }
    try {
      // `await` is harmless if mkDir is synchronous.
      // NOTE(review): mkDir comes from '../lcf'; whether it returns a promise is not visible here.
      await mkDir("img/" + title);
      console.log("创建目录成功:", title);
      await getImg(pageUrl, title);
    } catch (err) {
      console.error("图集下载失败:", title, err);
    }
  }
}
/**
 * Download every image of one album page into `img/<title>/`.
 *
 * Each `.pic-content img` element's `src` is fetched as a stream with axios and written to
 * `img/<title>/<title>-<index><ext>`. Downloads start 50 ms apart (50 ms * element index).
 *
 * @param {string} pageUrl - Album page URL to fetch.
 * @param {string} title - Album title; used as the directory name and file-name prefix.
 *   The directory `img/<title>` must already exist.
 * @returns {Promise<void>} Resolves after every image has been written or has failed.
 *   A failed image is logged and does not reject the returned promise.
 */
async function getImg(pageUrl, title) {
  const { body } = await req(pageUrl);
  const $ = cheerio.load(body);

  // Collect synchronously: cheerio's `.each` does not wait for async callbacks.
  const sources = [];
  $('.pic-content img').each((index, ele) => {
    sources.push($(ele).attr('src'));
  });

  // Fetch one image and write it to disk; resolves once the file is fully flushed.
  const download = async (imgUrl, index) => {
    if (!imgUrl) {
      // An <img> without `src` would otherwise crash path.extname.
      return;
    }
    await wait(50 * index);
    // Drop any query string or fragment so it does not end up in the file extension.
    const extName = path.extname(imgUrl.split(/[?#]/)[0]);
    const imgPath = `img/${title}/${title}-${index}${extName}`;
    try {
      const res = await axios.get(imgUrl, { responseType: 'stream' });
      await new Promise((resolve, reject) => {
        // Open the file only after the request succeeded, so failures leave no empty file.
        const ws = fs.createWriteStream(imgPath);
        ws.on('finish', resolve);
        ws.on('error', reject);
        res.data.on('error', (err) => {
          ws.destroy();
          reject(err);
        });
        res.data.pipe(ws);
      });
      console.log("图片加载完成:" + imgPath);
    } catch (err) {
      console.error("图片下载失败:" + imgPath, err);
    }
  };

  await Promise.all(sources.map((imgUrl, index) => download(imgUrl, index)));
}
// Start the crawl. Report a fatal failure explicitly instead of leaving the promise unhandled.
getImgList().catch((err) => {
  console.error("爬取失败:", err);
  process.exitCode = 1;
});
如果对应网站有SSL证书,可以利用puppeteer模拟浏览器进行操作
npm install puppeteer
let puppeteer = require('puppeteer');
/**
 * Puppeteer demo: open taobao.com in a visible browser, take a screenshot, collect the
 * service-menu links, then type a query into the search box and click the search button.
 *
 * Selector API used here:
 *  - `page.$(selector)` resolves to one ElementHandle, or null when nothing matches.
 *  - `page.$$eval(selector, fn)` runs `fn` inside the page with all matching elements and
 *    resolves to its serialisable return value.
 *
 * @returns {Promise<void>} Rejects when the browser cannot be launched or an expected
 *   element is missing.
 */
async function test() {
  const options = {
    defaultViewport: {
      width: 1400,
      height: 800,
    },
    headless: false,
  };
  // NOTE(review): the browser is never closed; presumably intentional for a headed demo — confirm.
  const browser = await puppeteer.launch(options);
  const page = await browser.newPage();

  // Register before navigating so console output from page load is also forwarded.
  page.on('console', (eventMsg) => {
    console.log(eventMsg.text());
  });

  await page.goto('https://www.taobao.com/');
  await page.screenshot({ path: "screenshot.png" });

  // `page.$$` does not use a callback; `$$eval` is the call that evaluates one in the page.
  const serviceLinks = await page.$$eval(".service-bd a", (elements) =>
    elements
      .filter((item) => item.getAttribute("href") !== "#")
      .map((item) => ({
        href: item.getAttribute("href"),
        text: item.innerHTML,
      }))
  );
  console.log(serviceLinks);

  const searchInput = await page.$('#q');
  if (!searchInput) {
    throw new Error('search input "#q" not found');
  }
  await searchInput.focus();
  await page.keyboard.type("台灯");

  const searchBtn = await page.$(".btn-search");
  if (!searchBtn) {
    throw new Error('search button ".btn-search" not found');
  }
  await searchBtn.click();

  // A commented-out login sketch that embedded an account name and password was removed.
  // If login automation is needed, read the credentials from environment variables.
}
// Run the demo. Report a failure explicitly instead of leaving the promise unhandled.
test().catch((err) => {
  console.error("puppeteer demo failed:", err);
  process.exitCode = 1;
});