使用nodejs+http(s)+events+cheerio+iconv-lite爬取2717网站图片数据到本地文件夹
源代码如下:
// Node prints "(node:9240) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment
// variable to '0' makes TLS connections and HTTPS requests insecure by disabling
// certificate verification." when the line below is enabled. It was a workaround for
// CERT_HAS_EXPIRED errors and, if needed, must be the first statement of the file.
// process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';

const http = require("http");
const https = require("https");
const iconv = require("iconv-lite");
const cheerio = require("cheerio");
const path = require('path');
const fs = require('fs');
const phantom = require('phantom');
const EventEmitter = require('events').EventEmitter;
const request = require('request');

class MyEmitter extends EventEmitter {
}

// Event bus shared by every function and class in this file.
const myEmitter = new MyEmitter();
myEmitter.setMaxListeners(0);

// Pool of User-Agent strings; one is picked at random for every outgoing request.
const userAgents = [
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];

// Selector templates. "normal" matches an image wrapped in a link (clickable, i.e. not
// the last page of a gallery); "fix" matches the bare image used on the last page.
const selector_temple = [
    {
        "normal": "#picBody > p > a > img",
        "fix": "#picBody > p > img"
    },
    {
        "normal": "#picBody > center > a > img",
        "fix": "#picBody > center > img"
    },
    {
        "normal": "#contentV3_article > div.contentV3_body > p > a > img",
        "fix": "#contentV3_article > div.contentV3_body > p > img"
    }
];

// Upper bound on chained 301/302 hops followed for a single image.
const MAX_IMAGE_REDIRECTS = 5;

/**
 * Asynchronous delay.
 * @param {number} time delay in milliseconds
 * @returns {Promise<void>} resolves once the delay has elapsed
 */
function sleep(time = 0) {
    return new Promise((resolve) => {
        setTimeout(resolve, time);
    });
}

/**
 * Pick a random entry of `userAgents`.
 * @returns {string}
 */
function pickUserAgent() {
    return userAgents[Math.floor(Math.random() * userAgents.length)];
}

/**
 * Make an image title usable as a file/directory name by replacing every
 * `\`, `/`, `<`, `>` and `|` with `_`. A missing title becomes an empty string.
 * @param {string|undefined} title
 * @returns {string}
 */
function sanitizeTitle(title) {
    if (title === undefined || title === null) {
        return '';
    }
    return String(title).replace(/[\\/<>|]/g, '_');
}

/**
 * Directory part of a URL path without leading/trailing slash, e.g.
 * "https://www.2717.com/ent/meinvtupian/2019/316305.html" -> "ent/meinvtupian/2019".
 * Works for any host; returns '' when the URL cannot be parsed or has no directory.
 * @param {string} pageUrl
 * @returns {string}
 */
function urlDirectory(pageUrl) {
    let pathname;
    try {
        pathname = new URL(pageUrl).pathname;
    } catch (err) {
        return '';
    }
    const lastSlash = pathname.lastIndexOf('/');
    if (lastSlash <= 0) {
        return '';
    }
    return pathname.substring(1, lastSlash);
}

/**
 * Find the gallery images in a parsed page by trying each selector template in
 * turn ("normal" first, then "fix") until one of them matches.
 * @param $ cheerio document
 * @returns {Array} matched <img> elements, possibly empty
 */
function selectImages($) {
    let imgs = [];
    for (const template of selector_temple) {
        imgs = $(template['normal']).toArray();
        console.log("selector:" + template['normal']);
        if (imgs.length > 0) break;
        imgs = $(template['fix']).toArray();
        console.log("selector:" + template['fix']);
        if (imgs.length > 0) break;
    }
    return imgs;
}

/**
 * Download one image to `filename`, following 301/302 redirects.
 * `onDone` is called exactly once with "ok" or "fail", so callers that count
 * completions can never stall on a request that produced no event.
 * @param {string} src absolute http(s) URL of the image
 * @param {string} filename destination path
 * @param {function(string)} onDone completion callback
 * @param {string} logPrefix text prepended to progress messages
 * @param {number} redirectsLeft remaining redirect hops
 */
function fetchImageToFile(src, filename, onDone, logPrefix = '', redirectsLeft = MAX_IMAGE_REDIRECTS) {
    let settled = false;
    const finish = (status) => {
        if (settled) return;
        settled = true;
        onDone(status);
    };

    let target;
    try {
        target = new URL(src);
    } catch (err) {
        console.log(logPrefix + `下载图片src':${src} '非法,跳过下载,继续下一个`);
        finish("fail");
        return;
    }

    let client;
    if (target.protocol === 'https:') {
        client = https;
    } else if (target.protocol === 'http:') {
        client = http;
    } else {
        console.log(logPrefix + `下载图片src':${src} '非法,跳过下载,继续下一个`);
        finish("fail");
        return;
    }
    const proto = target.protocol.slice(0, -1);

    const options = {
        headers: {
            'Content-Type': 'application/x-www-form-urlencoded',
            'Accept-Charset': 'UTF-8;',
            'User-Agent': pickUserAgent()
        }
    };

    const req = client.get(target, options, (res) => {
        const code = res.statusCode;

        if (code == 200) {
            // Only a successful response is written to disk, so error pages and
            // redirect bodies never end up stored as an "image".
            const writer = fs.createWriteStream(filename);
            const abort = (err) => {
                console.log("download_onepage_event:failed" + err.message);
                res.unpipe(writer);
                writer.destroy();
                // Best-effort removal of the partial file; otherwise a later run
                // could mistake it for a finished download and skip it.
                fs.unlink(filename, () => {});
                finish("fail");
            };
            res.on('error', abort);
            writer.on('error', abort);
            writer.on('finish', () => {
                console.log(logPrefix + new Date().toLocaleString() + ",完成下载:" + filename);
                finish("ok");
            });
            res.pipe(writer);
            return;
        }

        // Drain the unused body so the socket is released.
        res.resume();

        if ((code == 301 || code == 302) && res.headers.location && redirectsLeft > 0) {
            console.log(logPrefix + "未完成下载:" + filename + "," + proto + "返回值:" + code);
            let next;
            try {
                // Location may be relative; resolve it against the current URL.
                next = new URL(res.headers.location, target).toString();
            } catch (err) {
                console.log("download_onepage_event:failed" + err.message);
                finish("fail");
                return;
            }
            console.log(logPrefix + "正在重新跳转正确的URL进行下载:" + next);
            fetchImageToFile(next, filename, finish, logPrefix, redirectsLeft - 1);
            return;
        }

        console.log(logPrefix + "下载:" + filename + " 失败," + proto + "返回值:" + code);
        finish("fail");
    });

    // Connection-level failures (DNS, refused, reset) are emitted on the request,
    // not on the response; without this handler they would crash the process.
    req.on('error', (err) => {
        console.log("download_onepage_event:failed" + err.message);
        finish("fail");
    });
}

/**
 * Crawler for a paginated gallery: page 1 is `starturl`, page N is
 * `<starturl without ".html">_N.html`. Pages are processed one after another;
 * progress is coordinated through events on the shared `myEmitter`.
 */
class Spider2717 extends EventEmitter {
    /**
     * @param {string} _starturl URL of the first gallery page
     * @param {string} _type image category, used as a sub directory of "imgs"
     *        (e.g. meinv, meishi, stars, wenshen, zhiwu)
     * @param {number} _nextpage first page number to crawl
     * @param {number} _lastpage last page number to crawl
     */
    constructor(_starturl = 'https://www.2717.com/ent/meinvtupian/2019/316305.html',
                _type = 'meinv',
                _nextpage = 1,
                _lastpage = 1) {
        super();
        // {src, title} records of the images found on the page currently being processed.
        this.data = [];
        this.starturl = _starturl;
        this.nextpage = _nextpage;
        this.lastpage = _lastpage;
        this.type = _type;

        // Save directory: imgs/<type>/<directory part of the start URL path>.
        const tmpstr = urlDirectory(this.starturl);
        this.savedir = path.join('imgs', this.type, tmpstr).toString();
        console.log("savedir:" + this.savedir);
        // recursive: true creates missing parents and does not fail when the directory exists.
        fs.mkdirSync(this.savedir, {recursive: true});

        // false after the most recent page HTML download failed.
        this.get_html_flag = true;
        // Number of images of the current page that have finished (ok, failed or skipped).
        this.downloaded_imagepage_count = 0;
        // Name of the event emitted once per finished image.
        this.download_onepage_event = "download_onepage_event";
    }

    /**
     * Download the HTML of `url`, decode it as GBK and emit
     * `event_name(html, pno)` on `myEmitter`; on failure an empty string is emitted.
     * @param {string} url
     * @param {number} pno page number passed through to the event
     * @param {string} event_name
     */
    spidermeinvtupian(url, pno, event_name = 'html') {
        request({
            url: url,
            headers: {'User-Agent': pickUserAgent()},
            timeout: 5000,
            encoding: null // keep the body as a Buffer so iconv can decode it
        }, (error, response, body) => {
            // Arrow function: `this` must stay the spider instance.
            if (!error && response.statusCode == 200) {
                const html = iconv.decode(body, 'gbk').toString();
                this.get_html_flag = true;
                myEmitter.emit(event_name, html, pno);
                return;
            }
            // `error` is null when the server answered with a non-200 status.
            const reason = error ? error.message : "HTTP " + response.statusCode;
            console.log("获取 " + url + " 失败!--" + reason);
            this.get_html_flag = false;
            myEmitter.emit(event_name, '', pno);
        });
    }

    /**
     * Extract the `src` and `alt` of the gallery images from `html`, append them
     * to `this.data` and emit `event_name(this.data, pno)`.
     * @param {string} html
     * @param {number} pno
     * @param {string} event_name
     */
    getTupianData(html, pno, event_name = 'images') {
        const $ = cheerio.load(html);
        const imgs = selectImages($);
        console.log("total page1:" + imgs.length);
        for (const img of imgs) {
            const src = $(img).attr('src');
            const title = $(img).attr("alt");
            this.data.push({src, title});
        }
        myEmitter.emit(event_name, this.data, pno);
    }

    /**
     * Start the download of every image record in `data`.
     * @param {Array<{src: string, title: string}>} data
     * @param {number} pno page number
     */
    downloadphoto(data, pno) {
        if (data.length === 0) {
            // No image on this page (or its HTML could not be fetched): report one
            // completion so the crawl advances to the next page instead of stalling.
            myEmitter.emit(this.download_onepage_event, "fail", pno);
            return;
        }
        for (let i = 0; i < data.length; i++) {
            data[i].title = sanitizeTitle(data[i].title);
            this.downloadfile(data[i].src, data[i].title, i, pno);
        }
    }

    /**
     * Download `src` to `filename` after a random delay of up to `delaytime` ms and
     * emit `download_onepage_event(status, no)` exactly once when finished.
     * @param {string} src image URL
     * @param {number} no index of the image on the current page
     * @param {string} filename destination path
     * @param {number} delaytime upper bound of the random delay in milliseconds
     */
    calldownload = (src, no, filename, delaytime) => {
        if (src == undefined || src.length == 0) {
            console.log(`下载图片src':${src} '非法,跳过下载,继续下一个`);
            myEmitter.emit(this.download_onepage_event, "fail", no);
            return;
        }
        const delay = Math.random() * (delaytime || 0);
        setTimeout(() => {
            fetchImageToFile(src, filename, (status) => {
                myEmitter.emit(this.download_onepage_event, status, no);
            });
        }, delay);
    };

    /**
     * Save image `src` under `<savedir>/<title>/<pno><ext>`, skipping files that
     * already exist locally with a size above 1024 bytes.
     * @param {string} src image URL
     * @param {string} title sanitized image title, used as directory name
     * @param {number} no index of the image on the current page
     * @param {number} pno page number, used as file name
     */
    downloadfile = (src, title, no, pno) => {
        try {
            console.log("src:" + src);
            const dirpath = path.join(this.savedir, title).toString();
            fs.mkdirSync(dirpath, {recursive: true});
            // The first image of a page keeps the plain "<pno><ext>" name; further
            // images get an index suffix so they do not overwrite each other.
            const basename = no > 0 ? pno + "_" + no : String(pno);
            const filename = path.join(dirpath, basename + path.extname(src));
            if (fs.existsSync(filename)) {
                const stat = fs.statSync(filename);
                if (stat.size > 1024) {
                    console.log("本地文件:" + filename + "已经存在,系统跳过下载");
                    myEmitter.emit(this.download_onepage_event, "ingore", no);
                    return;
                }
            }
            console.log(new Date().toLocaleDateString() + ",正在下载:" + filename);
            this.calldownload(src, no, filename, 100);
        } catch (e) {
            console.log(e);
            myEmitter.emit(this.download_onepage_event, "ingore", no);
        }
    };

    /**
     * Start crawling from `this.nextpage` up to `this.lastpage`. When the last page is
     * done, "download_allpage_event" is emitted and "ok" is written to save.txt.
     */
    startSpider = () => {
        // html -> extract image records
        const onHtml = (html, pno) => {
            this.getTupianData(html, pno);
        };
        // image records -> download files
        const onImages = (data, pno) => {
            this.downloadphoto(data, pno);
        };
        // one image finished -> maybe advance to the next page
        const onImageDone = (status, pno) => {
            console.log("download_onepage_event=>status:" + status);
            this.downloaded_imagepage_count++;
            if (this.downloaded_imagepage_count < this.data.length) {
                return;
            }
            console.log("某单页图片数据抓取完毕!");
            this.downloaded_imagepage_count = 0;
            this.data = [];
            this.nextpage++;
            if (this.nextpage <= this.lastpage) {
                console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
                this.spiderpage(this.nextpage);
                return;
            }
            console.log("所有页面图片数据抓取完毕!");
            myEmitter.emit("download_allpage_event", "ok");
            this.data = [];
            // Detach only this spider's handlers from the shared bus.
            myEmitter.removeListener("html", onHtml);
            myEmitter.removeListener("images", onImages);
            myEmitter.removeListener(this.download_onepage_event, onImageDone);
            // Completion marker.
            fs.writeFileSync('save.txt', "ok");
        };

        myEmitter.on("html", onHtml);
        myEmitter.on("images", onImages);
        this.downloaded_imagepage_count = 0;
        this.data = [];
        myEmitter.on(this.download_onepage_event, onImageDone);

        console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
        this.spiderpage(this.nextpage);
    };

    /**
     * Fetch gallery page `pageno`.
     * @param {number} pageno 1-based page number
     */
    spiderpage = (pageno) => {
        let url;
        if (pageno === 1) {
            url = this.starturl;
        } else {
            // ".../316305.html" -> ".../316305_<pageno>.html"
            url = this.starturl.substring(0, this.starturl.length - 5) + "_" + pageno + ".html";
        }
        console.log("url:" + url);
        this.spidermeinvtupian(url, pageno);
    };
}

/**
 * Download the HTML of `url` with the `request` module, decode it as GBK and
 * emit `event_name(html)` on `myEmitter`; on failure an empty string is emitted.
 * @param {string} url
 * @param {string} event_name
 */
function get_html_by_request(url, event_name = 'get_html') {
    request({
        url: url,
        headers: {'User-Agent': pickUserAgent()},
        encoding: null, // keep the body as a Buffer so iconv can decode it
        strictSSL: true
    }, function (error, response, body) {
        if (!error && response.statusCode == 200) {
            const html = iconv.decode(body, 'gbk').toString();
            myEmitter.emit(event_name, html);
            return;
        }
        // `response` is undefined when the request failed before any answer arrived.
        const reason = error ? error.message : response.statusCode;
        console.log("获取 " + url + " 失败:" + reason);
        myEmitter.emit(event_name, '');
    });
}

/**
 * Load `url` in PhantomJS and return the page content.
 * @param {string} url
 * @returns {Promise<string|*>}
 */
async function get_html_from_url_by_phantom(url) {
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();
        await page.on('onResourceRequested', function (requestData) {
            console.info('Requesting', requestData.url);
        });
        // `page.settings = ...` is not supported by the phantom module; settings
        // have to be applied through page.property('settings', ...).
        await page.property('settings', {
            javascriptEnabled: true,
            loadImages: true,
            userAgent: pickUserAgent()
        });
        await page.open(url);
        return await page.property('content');
    } finally {
        // Always shut the PhantomJS process down, even when loading failed.
        await instance.exit();
    }
}

/**
 * Read the gallery title and the total page count from a page.
 * @param {string} html
 * @returns {{title: string, count: number}} `count` is -1 when the page carries no
 *          usable pagination info
 */
function getPageinfo(html) {
    const $ = cheerio.load(html);
    const hs = $('div.warp.mar.oh > div.warp.oh > h1').toArray();
    const title = $(hs[0]).text();

    const lis = $('#pageinfo').toArray();
    let count = -1;
    if (lis.length > 0) {
        const parsed = Number($(lis[0]).attr('pageinfo'));
        // A missing or non-numeric attribute must not leak NaN to the callers.
        if (Number.isFinite(parsed)) {
            count = parsed;
        }
    }
    return {'title': title, 'count': count};
}

//---------------------------------------------------------------------------
/**
 * Downloader for a page without pagination: every image found in the given HTML
 * is saved as `<save_dir>/<index><ext>`.
 */
class SpiderOnePageBuff {
    /**
     * @param {string} _html page HTML
     * @param {string} _event_name event emitted on `myEmitter` once per finished image
     * @param {string} _save_dir destination directory
     */
    constructor(_html, _event_name, _save_dir) {
        this.savedir = _save_dir;
        // recursive: true creates missing parents and does not fail when the directory exists.
        fs.mkdirSync(this.savedir, {recursive: true});
        this.clsname = 'SpiderOnePageBuff=>';
        this.downloaded_one_image = 'downloaded_one_image';
        this.html = _html;
        this.data = [];
        this.imgs = [];
        // Fall back to the class default when the caller passes no event name.
        this.event_name = _event_name || this.downloaded_one_image;
        // Number of images that have finished (ok, failed or skipped).
        this.process_event_finish_count = 0;
    }

    /**
     * Download `src` to `filename` after a random delay of up to `delaytime` ms and
     * emit `this.event_name(status, no)` exactly once when finished.
     * @param {string} src image URL
     * @param {number} no index of the image on the page
     * @param {string} filename destination path
     * @param {number} delaytime upper bound of the random delay in milliseconds
     */
    calldownload = (src, no, filename, delaytime) => {
        if (src == undefined || src.length == 0) {
            console.log(this.clsname + `下载图片src':${src} '非法,跳过下载,继续下一个`);
            if (this.data[no]) {
                this.data[no].flag = true;
            }
            myEmitter.emit(this.event_name, "fail", no);
            return;
        }
        const delay = Math.random() * (delaytime || 0);
        setTimeout(() => {
            fetchImageToFile(src, filename, (status) => {
                // Must be the event start_spider listens on, otherwise completion
                // is never detected.
                myEmitter.emit(this.event_name, status, no);
            }, this.clsname);
        }, delay);
    };

    /**
     * Make a title usable as a file name.
     * @param {string} _title
     * @returns {string}
     */
    preprocess_title(_title) {
        return sanitizeTitle(_title);
    }

    /**
     * Collect the images of `this.html` into `this.imgs` / `this.data` and, when at
     * least one was found, emit `event_name(this.imgs)`.
     * @param {string} event_name
     */
    spider_one_image = (event_name = 'get_one_image') => {
        const $ = cheerio.load(this.html);
        this.imgs = selectImages($);
        console.log("spider_one_image=>total page1:" + this.imgs.length);
        for (const img of this.imgs) {
            const src = $(img).attr('src');
            const title = this.preprocess_title($(img).attr("alt"));
            this.data.push({src, title});
        }
        if (this.imgs.length > 0) {
            myEmitter.emit(event_name, this.imgs);
        }
    };

    /**
     * Entry point: register the handlers and start processing the page. Emits
     * "download_allpage_event" once every image has finished.
     */
    start_spider = () => {
        this.process_event_finish_count = 0;

        const onImageDone = (status, no) => {
            console.log(this.clsname + "status:" + status);
            console.log("this.event_name:" + this.event_name);
            this.process_event_finish_count++;
            if (this.process_event_finish_count >= this.imgs.length) {
                // Every image is accounted for, successfully or not.
                this.process_event_finish_count = 0;
                this.data = [];
                myEmitter.removeListener(this.event_name, onImageDone);
                myEmitter.removeListener("get_one_image", onImagesFound);
                myEmitter.emit("download_allpage_event", "ok");
            }
        };

        const onImagesFound = () => {
            // Snapshot: `this.data` is reset by onImageDone when the last image finishes.
            const items = this.data;
            for (let i = 0; i < items.length; i++) {
                const filename = path.join(this.savedir, (i + 1) + path.extname(items[i].src));
                console.log(this.clsname + new Date().toLocaleDateString() + ",正在下载:" + filename);
                if (fs.existsSync(filename)) {
                    const stat = fs.statSync(filename);
                    if (stat.size > 1024) {
                        console.log(this.clsname + "本地文件:" + filename + "已经存在,系统跳过下载");
                        myEmitter.emit(this.event_name, "ingore", i);
                        // Skip only this image; the remaining ones still have to be processed.
                        continue;
                    }
                }
                this.calldownload(items[i].src, i, filename, 3000);
            }
        };

        myEmitter.on(this.event_name, onImageDone);
        myEmitter.on("get_one_image", onImagesFound);
        this.spider_one_image();
    };
}

/**
 * Crawl the gallery that starts at `url`. The page is fetched once to read its
 * title and page count; a page without pagination is handled by
 * SpiderOnePageBuff, a paginated gallery by Spider2717.
 * @param {string} url first page of the gallery
 * @param {string} [type] category directory under "imgs"; when omitted the page
 *        title is used instead
 */
function main(url = 'https://www.2717.com/word/dongwushijie/2018/313620.html', type) {
    let pagecount = 0;
    let title = '';
    let html_buff = '';
    const hasType = type !== undefined;

    // Fired once every page/image has been processed; currently nothing to do.
    myEmitter.on("download_allpage_event", status => {
    });

    myEmitter.once('get_html', html => {
        const data = getPageinfo(html);
        pagecount = data['count'];
        title = data['title'];
        html_buff = html;
        console.log(title, pagecount);

        if (pagecount <= 0) {
            // Page without pagination.
            myEmitter.on("main_download_one_image", status => {
                console.log("下载单个图片完成!!!=状态" + status);
                // Completion marker.
                fs.writeFileSync('save.txt', "ok");
            });
            const savedir = hasType
                ? path.join('imgs', type, title).toString()
                : path.join('imgs', title).toString();
            const spiderbuff = new SpiderOnePageBuff(html_buff, "main_download_one_image", savedir);
            spiderbuff.start_spider();
        } else {
            // Gallery spread over several pages.
            const typestr = hasType ? type : title;
            const spider = new Spider2717(url, typestr, 1, pagecount);
            spider.startSpider();
        }
    });

    // Kick off: fetch the first page.
    get_html_by_request(url);
}

/*
 * Entry point. Only the URL of the first gallery page is required:
 *   node <this file> [url] [type]
 *
 * Example start URLs:
 *   https://www.2717.com/ent/meinvtupian/2019/316305.html
 *   https://www.2717.com/beautiful/zhuomianbeijing/2019/314110.html
 *   https://www.2717.com/beautiful/zhuomianbeijing/2019/313774.html
 *   https://www.2717.com/beautiful/zhuomianbeijing/2018/313450.html
 *   https://lq.2717.com/kbtp/2018/313409.html
 *   https://lq.2717.com/kbtp/2017/184385.html
 */
let url = 'https://www.2717.com/beautiful/qichetuku/2015/17388.html';
let type = '美女图片';

// slice() instead of splice(): process.argv must not be modified.
const cliArgs = process.argv.slice(2);
if (cliArgs.length > 0) {
    url = cliArgs[0];
}
if (cliArgs.length > 1) {
    type = cliArgs[1];
}
main(url, type);
本次本来想通过继承 events 的事件驱动类来写爬虫,但经过测试死活不行,后来只有使用外部 events 实例的 on、emit 方法才通过;但是如下测试代码通过继承 events 又可以:
const EventsDemo = require('events');

/**
 * Small demo of an event emitter built by inheriting from `events`:
 * the instance registers listeners on itself and emits to itself.
 */
class MyEvents extends EventsDemo {
    constructor() {
        super();
    }

    /** Emit "aaa" with two arguments. */
    callA() {
        console.log("call A");
        this.emit("aaa", "a", 123);
    }

    /** Emit "bbb" with three arguments. */
    callB() {
        console.log("call B");
        this.emit('bbb', 'b', 123, 456);
    }

    /**
     * Register the demo listeners and fire every event once.
     * All listeners are arrow functions, so `this` inside them is still the
     * MyEvents instance.
     */
    start() {
        this.on("test", (p1, p2, p3) => {
            const msg = "p1=" + p1 + "," + "p2=" + p2 + "," + "p3=" + p3;
            console.log(msg);
        });
        this.emit("test", 1, "abc", 3.1415926);
        console.log("==================================================");

        this.on("aaa", (p1, p2) => {
            const msg = "callA:" + "p1=" + p1 + "," + "p2=" + p2;
            console.log(msg);
        });
        this.on('bbb', (p1, p2, p3) => {
            const msg = "callB:" + "p1=" + p1 + "," + "p2=" + p2 + "," + "p3=" + p3;
            console.log(msg);
        });
        this.callA();
        this.callB();
    }
}

/**
 * Entry point.
 */
// Declared with const: the bare assignment created an implicit global.
const myevent = new MyEvents();
myevent.start();
这个问题有点诡异,知道的朋友请指教,谢谢。