Node.js爬取百度图片瀑布流,使用class类封装。
// Crawl high-resolution images from a Baidu image search result page.
const phantom = require('phantom');
const express = require('express');
const app = express();
const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request');

// Upper bound on scroll steps, so a page that never yields enough
// thumbnails cannot keep the crawler scrolling forever.
const MAX_SCROLL_STEPS = 30;
// Scrolling stops as soon as this many `.imgbox` elements are present.
const MIN_IMAGE_COUNT = 20;
// Directory the downloaded images are written to.
const IMAGE_DIR = './image';

const server = app.listen(2000, function () {
  const host = server.address().address;
  const port = server.address().port;
  console.log('Your App is running at http://%s:%s', host, port);
});

/**
 * Extract the file extension from an image URL.
 * The query string and fragment are ignored, and only the last path
 * segment is inspected, so the result never contains `/`, `?` or `#`.
 * @param {string} url - Image URL.
 * @returns {string} Extension without the dot; `jpg` when none is found.
 */
function getImageExtension(url) {
  const cleanUrl = url.split(/[?#]/)[0];
  const fileName = cleanUrl.slice(cleanUrl.lastIndexOf('/') + 1);
  const match = /\.([A-Za-z0-9]+)$/.exec(fileName);
  return match ? match[1] : 'jpg';
}

class stealData {
  constructor() {
    // Search result page to crawl.
    this.base_url = 'https://image.baidu.com/search/index?tn=baiduimage&ipn=r&ct=201326592&cl=2&lm=-1&st=-1&fm=index&fr=&hs=0&xthttps=111111&sf=1&fmq=&pv=&ic=0&nc=1&z=&se=1&showtab=0&fb=0&width=&height=&face=0&istype=2&ie=utf-8&word=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&oq=%E5%8A%A8%E6%BC%AB+%E5%A3%81%E7%BA%B8&rsp=-1';
    this.current_page = 1;
    // Collected `{ down_loda_url }` entries, one per thumbnail.
    this.result_list = [];
    // cheerio selection of `.imgitem` elements, filled in by pageScroll().
    this.a = '';
    // Single PhantomJS process shared by every step (created lazily).
    this.instance = null;
  }

  /**
   * Return the shared PhantomJS instance, creating it on first use.
   * @returns {Promise<object>} The phantom instance.
   */
  async getInstance() {
    if (!this.instance) {
      this.instance = await phantom.create();
    }
    return this.instance;
  }

  /**
   * Exit the shared PhantomJS instance if one is running. Safe to call twice.
   * @returns {Promise<void>}
   */
  async closeInstance() {
    if (this.instance) {
      const instance = this.instance;
      this.instance = null;
      await instance.exit();
    }
  }

  /**
   * Run the whole crawl: load the result page, collect the detail-page
   * links, then download every image. Errors are logged, not rethrown,
   * and the PhantomJS process is always shut down afterwards.
   * @returns {Promise<void>}
   */
  async init() {
    try {
      await this.openNet(); // open the result page
      await this.getLoadPictures(); // collect detail-page links
      await this.imgSave(0); // download the images
    } catch (e) {
      console.log(e);
    } finally {
      // Exit PhantomJS on failure too, otherwise the child process leaks.
      await this.closeInstance();
    }
  }

  /**
   * Wait between requests to reduce the risk of the IP being blocked.
   * @param {number} time - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  sleep(time) {
    return new Promise((resolve) => {
      console.log(`自动睡眠中,${time / 1000}秒后重新发送请求......`);
      setTimeout(() => {
        resolve();
      }, time);
    });
  }

  /**
   * Plain delay.
   * @param {number} second - Delay in seconds.
   * @returns {Promise<void>}
   */
  delay(second) {
    return new Promise((resolve) => {
      setTimeout(resolve, second * 1000);
    });
  }

  /**
   * Open the result page and scroll it until enough thumbnails are loaded.
   * @returns {Promise<void>}
   */
  async openNet() {
    await this.pageScroll(0);
  }

  /**
   * Open `base_url` once and scroll down step by step until at least
   * MIN_IMAGE_COUNT `.imgbox` elements exist or MAX_SCROLL_STEPS steps
   * were tried. The `.imgitem` selection of the last snapshot (the one
   * with the most images) is stored in `this.a`.
   * @param {number} [i=0] - Scroll step to start from (1000px per step).
   * @returns {Promise<void>}
   * @throws {Error} When the page cannot be opened.
   */
  async pageScroll(i = 0) {
    const instance = await this.getInstance();
    const page = await instance.createPage();
    try {
      // Set the viewport before loading so the page lays out at this size.
      await page.property('viewportSize', { width: 1920, height: 1080 });
      const status = await page.open(this.base_url);
      if (status !== 'success') {
        throw new Error(`Failed to open ${this.base_url}: ${status}`);
      }

      let $ = null;
      const lastStep = i + MAX_SCROLL_STEPS;
      for (let step = i; step < lastStep; step++) {
        await page.property('scrollPosition', { left: 0, top: 1000 * step });
        // Give the page time to load the next batch before reading the DOM.
        await this.delay(5);
        $ = cheerio.load(await page.property('content'));
        console.log($('.imgbox').length);
        if ($('.imgbox').length >= MIN_IMAGE_COUNT) {
          break;
        }
      }
      // All elements that hold an image live in this selection.
      this.a = $('.imgitem');
    } finally {
      await page.close();
    }
  }

  /**
   * Build the list of detail-page URLs from the elements in `this.a`
   * and append them to `this.result_list`.
   * @param {*} [a] - Unused; kept for backward compatibility.
   * @returns {Promise<void>}
   */
  async getLoadPictures(a) {
    // Nothing to do when pageScroll() has not produced a selection yet.
    if (!this.a || typeof this.a.each !== 'function') {
      return;
    }
    const result_list = [];
    this.a.each((index) => {
      const href = this.a.eq(index).find('a').attr('href');
      // Skip items without a link instead of storing a bogus URL.
      if (href) {
        result_list.push({
          down_loda_url: `https://image.baidu.com${href}`,
        });
      }
    });
    this.result_list.push(...result_list);
  }

  /**
   * Visit every detail page starting at index `i`, read the full-size
   * image URL from `#currentImg` and download it. A failure on one
   * image is logged and the loop moves on to the next one.
   * @param {number} [i=0] - Index in `result_list` to start from.
   * @returns {Promise<void>}
   */
  async imgSave(i = 0) {
    const instance = await this.getInstance();
    const page = await instance.createPage();
    try {
      for (let index = i; index < this.result_list.length; index++) {
        try {
          // Open the detail page of the full-size image.
          const status = await page.open(this.result_list[index].down_loda_url);
          await this.delay(2);
          const $ = cheerio.load(await page.property('content'));
          const src = $('#currentImg').attr('src');
          console.log('statue', status);
          console.log('src', src);
          await this.save(src, index);
        } catch (e) {
          console.log(e);
        }
      }
    } finally {
      await page.close();
    }
  }

  /**
   * Download one image into IMAGE_DIR.
   * @param {string} url - Image URL.
   * @param {number} i - Zero-based index, used for log output only.
   * @returns {Promise<string>} Resolves with the file path once the file
   *   is fully written; rejects on a missing URL or a network/file error.
   */
  save(url, i) {
    return new Promise((resolve, reject) => {
      if (!url) {
        reject(new Error(`No image url found for item ${i + 1}`));
        return;
      }
      console.log(`开始写入第${i + 1}张`);
      if (!fs.existsSync(IMAGE_DIR)) {
        fs.mkdirSync(IMAGE_DIR);
      }
      const ext = getImageExtension(url);
      const file = `${IMAGE_DIR}/${new Date().getTime()}.${ext}`;
      const target = fs.createWriteStream(file);
      target.on('error', reject);
      // Report success only after the data has really been flushed to disk.
      target.on('finish', () => {
        console.log(`写入成功`);
        resolve(file);
      });
      request(url)
        .on('error', (err) => {
          target.destroy();
          reject(err);
        })
        .pipe(target);
    });
  }
}

const thief = new stealData();
thief.init().catch((e) => console.log(e));
做了一些优化:改用 for 循环代替递归,复用同一个 page 逐张打开并下载高清图,并在每次下载后休眠 3 秒以防 IP 被封。
// Crawl high-resolution images from a Baidu image search result page.
const phantom = require('phantom');
const express = require('express');
const app = express();
const fs = require('fs');
const cheerio = require('cheerio');
const request = require('request');

// Upper bound on scroll steps, so a page that never yields enough
// thumbnails cannot keep the crawler scrolling forever.
const MAX_SCROLL_STEPS = 30;
// Scrolling stops as soon as this many `.imgbox` elements are present.
const MIN_IMAGE_COUNT = 20;
// Directory the downloaded images are written to.
const IMAGE_DIR = './Marvel';

const server = app.listen(2000, function () {
  const host = server.address().address;
  const port = server.address().port;
  console.log('Your App is running at http://%s:%s', host, port);
});

/**
 * Extract the file extension from an image URL.
 * The query string and fragment are ignored, and only the last path
 * segment is inspected, so the result never contains `/`, `?` or `#`.
 * @param {string} url - Image URL.
 * @returns {string} Extension without the dot; `jpg` when none is found.
 */
function getImageExtension(url) {
  const cleanUrl = url.split(/[?#]/)[0];
  const fileName = cleanUrl.slice(cleanUrl.lastIndexOf('/') + 1);
  const match = /\.([A-Za-z0-9]+)$/.exec(fileName);
  return match ? match[1] : 'jpg';
}

class stealData {
  constructor() {
    // Search result page to crawl.
    this.base_url = 'https://image.baidu.com/search/index?ct=201326592&z=&tn=baiduimage&word=%E6%BC%AB%E5%A8%81%E5%9B%BE%E7%89%87&pn=0&ie=utf-8&oe=utf-8&cl=2&lm=-1&fr=ala&se=&sme=&width=1920&height=1080';
    this.current_page = 1;
    // Collected `{ down_loda_url }` entries, one per thumbnail.
    this.result_list = [];
    // cheerio selection of `.imgitem` elements, filled in by pageScroll().
    this.a = '';
    this.urllist = [];
    // Single PhantomJS process shared by every step (created lazily).
    this.instance = null;
  }

  /**
   * Return the shared PhantomJS instance, creating it on first use.
   * @returns {Promise<object>} The phantom instance.
   */
  async getInstance() {
    if (!this.instance) {
      this.instance = await phantom.create();
    }
    return this.instance;
  }

  /**
   * Exit the shared PhantomJS instance if one is running. Safe to call twice.
   * @returns {Promise<void>}
   */
  async closeInstance() {
    if (this.instance) {
      const instance = this.instance;
      this.instance = null;
      await instance.exit();
    }
  }

  /**
   * Run the whole crawl: load the result page, collect the detail-page
   * links, then resolve and download every full-size image. Errors are
   * logged, not rethrown, and PhantomJS is always shut down afterwards.
   * @returns {Promise<void>}
   */
  async init() {
    try {
      await this.openNet(); // open the result page
      await this.getLoadPictures(); // collect detail-page links
      await this.getrealPictures(); // resolve full-size URLs and download
    } catch (e) {
      console.log(e);
    } finally {
      // Exit PhantomJS on failure too, otherwise the child process leaks.
      await this.closeInstance();
    }
  }

  /**
   * Wait between requests to reduce the risk of the IP being blocked.
   * @param {number} time - Delay in milliseconds.
   * @returns {Promise<void>}
   */
  sleep(time) {
    return new Promise((resolve) => {
      console.log(`自动睡眠中,${time / 1000}秒后重新发送请求......`);
      setTimeout(() => {
        resolve();
      }, time);
    });
  }

  /**
   * Plain delay.
   * @param {number} second - Delay in seconds.
   * @returns {Promise<void>}
   */
  delay(second) {
    return new Promise((resolve) => {
      setTimeout(resolve, second * 1000);
    });
  }

  /**
   * Open the result page and scroll it until enough thumbnails are loaded.
   * @returns {Promise<void>}
   */
  async openNet() {
    await this.pageScroll(0);
  }

  /**
   * Open `base_url` once and scroll down step by step until at least
   * MIN_IMAGE_COUNT `.imgbox` elements exist or MAX_SCROLL_STEPS steps
   * were tried. The `.imgitem` selection of the last snapshot (the one
   * with the most images) is stored in `this.a`.
   * @param {number} [i=0] - Scroll step to start from (1000px per step).
   * @returns {Promise<void>}
   * @throws {Error} When the page cannot be opened.
   */
  async pageScroll(i = 0) {
    const instance = await this.getInstance();
    const page = await instance.createPage();
    try {
      // Set the viewport before loading so the page lays out at this size.
      await page.property('viewportSize', { width: 1920, height: 1080 });
      const status = await page.open(this.base_url);
      if (status !== 'success') {
        throw new Error(`Failed to open ${this.base_url}: ${status}`);
      }

      let $ = null;
      const lastStep = i + MAX_SCROLL_STEPS;
      for (let step = i; step < lastStep; step++) {
        await page.property('scrollPosition', { left: 0, top: 1000 * step });
        // Give the page time to load the next batch before reading the DOM.
        await this.delay(5);
        $ = cheerio.load(await page.property('content'));
        console.log($('.imgbox').length);
        if ($('.imgbox').length >= MIN_IMAGE_COUNT) {
          break;
        }
      }
      // All elements that hold an image live in this selection.
      this.a = $('.imgitem');
    } finally {
      await page.close();
    }
  }

  /**
   * Build the list of detail-page URLs from the elements in `this.a`
   * and append them to `this.result_list`.
   * @param {*} [a] - Unused; kept for backward compatibility.
   * @returns {Promise<void>}
   */
  async getLoadPictures(a) {
    // Nothing to do when pageScroll() has not produced a selection yet.
    if (!this.a || typeof this.a.each !== 'function') {
      return;
    }
    const result_list = [];
    this.a.each((index) => {
      const href = this.a.eq(index).find('a').attr('href');
      // Skip items without a link instead of storing a bogus URL.
      if (href) {
        result_list.push({
          down_loda_url: `https://image.baidu.com${href}`,
        });
      }
    });
    this.result_list.push(...result_list);
  }

  /**
   * Visit every detail page in `result_list`, read the full-size image
   * URL from `#currentImg` and download it. A failure on one image is
   * logged and the loop continues. The PhantomJS instance is shut down
   * when the loop ends, whether or not every download succeeded.
   * @returns {Promise<void>}
   */
  async getrealPictures() {
    const instance = await this.getInstance();
    const page = await instance.createPage();
    const total = this.result_list.length;
    try {
      for (let i = 0; i < total; i++) {
        // Pause between requests (not before the first) to avoid a ban.
        if (i > 0) {
          await this.sleep(3000);
        }
        try {
          const status = await page.open(this.result_list[i].down_loda_url);
          if (status !== 'success') {
            throw new Error(`Failed to open detail page ${i + 1}: ${status}`);
          }
          await this.delay(2);
          // Read the DOM only after the detail page is open; reading it
          // earlier would return the previously loaded page.
          const $ = cheerio.load(await page.property('content'));
          const src = $('#currentImg').attr('src');
          console.log('src', src);
          await this.save(src, i);
        } catch (e) {
          console.log('errorheyu:', e);
        }
      }
      console.log('跳出下载');
    } finally {
      try {
        await page.close();
      } finally {
        await this.closeInstance();
      }
    }
  }

  /**
   * Download one image into IMAGE_DIR.
   * @param {string} url - Image URL.
   * @param {number} i - Zero-based index, used for log output only.
   * @returns {Promise<string>} Resolves with the file path once the file
   *   is fully written; rejects on a missing URL or a network/file error.
   */
  save(url, i) {
    return new Promise((resolve, reject) => {
      if (!url) {
        reject(new Error(`No image url found for item ${i + 1}`));
        return;
      }
      console.log(`开始写入第${i + 1}张`);
      if (!fs.existsSync(IMAGE_DIR)) {
        fs.mkdirSync(IMAGE_DIR);
      }
      const ext = getImageExtension(url);
      const file = `${IMAGE_DIR}/${new Date().getTime()}.${ext}`;
      const target = fs.createWriteStream(file);
      target.on('error', reject);
      // Report success only after the data has really been flushed to disk.
      target.on('finish', () => {
        console.log(`写入成功`);
        resolve(file);
      });
      request(url)
        .on('error', (err) => {
          target.destroy();
          reject(err);
        })
        .pipe(target);
    });
  }
}

const thief = new stealData();
thief.init().catch((e) => console.log(e));
参考:1、分分钟教你用node写个爬虫
长风破浪会有时,直挂云帆济沧海