使用nodejs+http(s)+events+cheerio+iconv-lite爬取2717网站图片数据到本地文件夹

 

源代码如下:

 

 
//(node:9240) Warning: Setting the NODE_TLS_REJECT_UNAUTHORIZED environment variable to '0' makes TLS connections and HTTPS requests insecure by disabling certificate verification.
//解决 javascript – Node.js请求CERT_HAS_EXPIRED问题,下面这句置首
// process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
//end

let http = require("http");
let https = require("https");
let iconv = require("iconv-lite");
let cheerio = require("cheerio");
let path = require('path');
let fs = require('fs');
const phantom = require('phantom');

let EventEmitter = require('events').EventEmitter;

/**
 * Event bus type shared by every crawler component in this file.
 * It adds nothing to EventEmitter; it only gives the bus its own class name.
 */
class MyEmitter extends EventEmitter {}

// Single shared bus instance. setMaxListeners(0) lifts the listener-count
// warning threshold and returns the emitter, so the call can be chained.
const myEmitter = new MyEmitter().setMaxListeners(0);

// const util = require('util');

const request = require('request');
//var url = "https://www.baidu.com/";
//const getPromise = util.promisify(request.get);
// Pool of browser User-Agent strings; callers in this file pick one at random
// per request. NOTE(review): several entries appear more than once, which makes
// those strings proportionally more likely to be picked — confirm whether that
// weighting is intended before de-duplicating.
const userAgents = [
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11',
    'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0) ,Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)',
    'Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre',
    'Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.8.0.12) Gecko/20070731 Ubuntu/dapper-security Firefox/1.5.0.12',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)',
    'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6',
    'Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6',
    'Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)',
    'Opera/9.25 (Windows NT 5.1; U; en), Lynx/2.8.5rel.1 libwww-FM/2.14 SSL-MM/1.4.1 GNUTLS/1.2.9',
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/61.0.3163.100 Safari/537.36'
];

// Selector templates, tried in order until one matches.
// Each pair is [normal, fix]: "normal" matches an <img> wrapped in a link,
// "fix" matches the same <img> placed directly in the container (no link).
let selector_temple = [
    ["#picBody > p > a > img", "#picBody > p >  img"],
    ["#picBody > center > a > img", "#picBody > center > img"],
    [
        "#contentV3_article > div.contentV3_body > p > a > img",
        "#contentV3_article > div.contentV3_body > p > img"
    ]
].map(([normal, fix]) => ({normal, fix}));

/**
 * Promise-based delay.
 * @param {number} time delay in milliseconds (defaults to 0)
 * @returns {Promise<void>} settles with no value once the timer fires
 */
function sleep(time = 0) {
    // setTimeout calls `resolve` with no arguments, so the promise fulfils with undefined.
    return new Promise((resolve) => setTimeout(resolve, time));
}


/**
 * Crawler for a multi-page picture gallery on 2717.com.
 *
 * Pictures are stored as `imgs/<type>/<directory part of the start URL>/<picture title>/<page number><ext>`.
 * The steps are chained through the module-level `myEmitter` bus:
 *   "html"                   page markup fetched ('' when the fetch failed)
 *   "images"                 picture list extracted from the markup
 *   "download_onepage_event" one picture finished ("ok" / "fail" / "ingore")
 *   "download_allpage_event" every page in nextpage..lastpage has been processed
 */
class Spider2717 extends EventEmitter {

    /** Upper bound on chained 301/302 responses followed for a single picture. */
    static MAX_REDIRECTS = 5;

    /** Milliseconds of socket inactivity after which a picture download is abandoned. */
    static DOWNLOAD_TIMEOUT = 30000;

    /**
     * Turn a picture title into a usable directory name.
     * Every `/`, `\`, `<`, `>` and `|` becomes `_` (all occurrences, not only the first).
     * @param {string|undefined|null} title the `alt` text of the picture
     * @returns {string} sanitized title, or 'untitled' when there is no title
     */
    static sanitizeTitle(title) {
        if (title == null || title === '') {
            return 'untitled';
        }
        return String(title).replace(/[\\/<>|]/g, '_');
    }

    /**
     * URL of gallery page `pageno`: page 1 is the start URL itself, later pages
     * insert `_<pageno>` before the trailing ".html" (5 characters).
     * @param {string} starturl URL of the first gallery page
     * @param {number} pageno 1-based page number
     * @returns {string}
     */
    static pageUrl(starturl, pageno) {
        if (pageno === 1) {
            return starturl;
        }
        return starturl.substring(0, starturl.length - 5) + "_" + pageno + ".html";
    }

    /**
     * Directory part of the start URL's path, without a leading slash
     * (e.g. ".../ent/meinvtupian/2019/316305.html" -> "ent/meinvtupian/2019").
     * Works for any host; an unparsable URL yields ''.
     * @param {string} starturl
     * @returns {string}
     */
    static relativeDir(starturl) {
        let pathname;
        try {
            pathname = new URL(starturl).pathname;
        } catch (err) {
            return '';
        }
        return path.posix.dirname(pathname).replace(/^\/+/, '');
    }

    /**
     * @param {string} _starturl URL of the first gallery page
     * @param {string} _type category name, used as a directory level below `imgs`
     * @param {number} _nextpage first page to crawl
     * @param {number} _lastpage last page to crawl (inclusive)
     * @throws whatever fs.mkdirSync throws when the save directory cannot be created
     */
    constructor(_starturl = 'https://www.2717.com/ent/meinvtupian/2019/316305.html',
                _type = 'meinv',
                _nextpage = 1,
                _lastpage = 1
    ) {
        super();

        // {src, title} records of the page currently being processed
        this.data = [];
        this.starturl = _starturl;
        this.nextpage = _nextpage;
        this.lastpage = _lastpage;
        this.type = _type;

        this.savedir = path.join('imgs', this.type, Spider2717.relativeDir(this.starturl));
        console.log("savedir:" + this.savedir);
        // mkdirSync is synchronous and takes no callback; with `recursive` it
        // creates missing parents and does not fail when the directory exists.
        fs.mkdirSync(this.savedir, {recursive: true});

        // false after the most recent page fetch failed
        this.get_html_flag = true;
        // number of pictures of the current page that have finished (any status)
        this.downloaded_imagepage_count = 0;
        // name of the "one picture finished" event
        this.download_onepage_event = "download_onepage_event";
    }

    /**
     * Fetch `url`, decode it as GBK and emit the markup on `myEmitter`.
     * On any failure an empty string is emitted so the pipeline keeps moving.
     * @param {string} url page to fetch
     * @param {number} pno page number forwarded with the event
     * @param {string} event_name event to emit, as `(html, pno)`
     */
    spidermeinvtupian(url, pno, event_name = 'html') {
        const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
        request({
            url: url,
            // the User-Agent must be sent as a header; a top-level "UserAgent" option is ignored
            headers: {'User-Agent': userAgent},
            timeout: 5000,
            encoding: null // keep the body as a Buffer so it can be decoded from GBK
        }, (error, response, body) => {
            // arrow function: `this` must stay bound to the spider instance
            if (!error && response.statusCode == 200) {
                this.get_html_flag = true;
                myEmitter.emit(event_name, iconv.decode(body, 'gbk'), pno);
                return;
            }
            // `error` is null when the server answered with a non-200 status
            const reason = error ? error.message : "HTTP " + response.statusCode;
            console.log("获取 " + url + " 失败!--" + reason);
            this.get_html_flag = false;
            myEmitter.emit(event_name, '', pno);
        });
    }

    /**
     * Extract picture `src`/`alt` pairs from the markup, append them to
     * `this.data` and emit `this.data` on `myEmitter`.
     * @param {string} html page markup
     * @param {number} pno page number forwarded with the event
     * @param {string} event_name event to emit, as `(data, pno)`
     */
    getTupianData(html, pno, event_name = 'images') {
        const $ = cheerio.load(html);

        // Try every template, "normal" before "fix", and keep the first non-empty match.
        let imgs = [];
        for (let i = 0; i < selector_temple.length && imgs.length === 0; i++) {
            for (const key of ['normal', 'fix']) {
                imgs = $(selector_temple[i][key]).toArray();
                console.log("selector:" + selector_temple[i][key]);
                if (imgs.length > 0) break;
            }
        }
        console.log("total page1:" + imgs.length);

        for (const img of imgs) {
            this.data.push({src: $(img).attr('src'), title: $(img).attr("alt")});
        }
        myEmitter.emit(event_name, this.data, pno);
    }

    /**
     * Start the download of every picture record of one page.
     * @param {Array<{src: string, title: string}>} data picture records
     * @param {number} pno page number, used as the file name
     */
    downloadphoto(data, pno) {
        if (data.length === 0) {
            // Nothing to download: report once so the page counter advances
            // instead of waiting forever for a completion event.
            console.log("第" + pno + "页没有找到图片,跳过该页");
            myEmitter.emit(this.download_onepage_event, "fail", pno);
            return;
        }
        for (let i = 0; i < data.length; i++) {
            data[i].title = Spider2717.sanitizeTitle(data[i].title);
            this.downloadfile(data[i].src, data[i].title, i, pno);
        }
    }

    /**
     * Download `src` into `filename` after a random delay and report the outcome
     * exactly once through `download_onepage_event` ("ok" or "fail").
     * @param {string} src picture URL (absolute, or relative to the start URL)
     * @param {number} no index of the picture on the current page
     * @param {string} filename destination file
     * @param {number} delaytime upper bound of the random start delay, in ms
     * @param {number} redirects redirects already followed (internal)
     */
    calldownload = (src, no, filename, delaytime = 0, redirects = 0) => {
        const report = (status) => myEmitter.emit(this.download_onepage_event, status, no);

        if (src == undefined || src.length == 0) {
            console.log(`下载图片src':${src} '非法,跳过下载,继续下一个`);
            report("fail");
            return;
        }

        // Accept absolute URLs as-is; resolve protocol-relative/relative ones against the start URL.
        let target = null;
        try {
            target = new URL(src);
        } catch (absoluteError) {
            try {
                target = new URL(src, this.starturl);
            } catch (relativeError) {
                target = null;
            }
        }
        const scheme = target === null ? '' : target.protocol.slice(0, -1);
        const client = scheme === 'https' ? https : (scheme === 'http' ? http : null);
        if (client === null) {
            // Without this report the page counter would never reach its target.
            console.log("不支持的图片地址:" + src + ",跳过下载");
            report("fail");
            return;
        }

        setTimeout(() => {
            let settled = false;
            const finish = (status) => {
                if (!settled) {
                    settled = true;
                    report(status);
                }
            };
            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];

            const req = client.get(target, {headers: {'User-Agent': userAgent}}, (res) => {
                const code = res.statusCode;

                if (code === 200) {
                    // Open the file only for a real picture body, never for redirect/error pages.
                    const writer = fs.createWriteStream(filename);
                    const abort = (err) => {
                        console.log("download_onepage_event:failed" + err.message);
                        writer.destroy();
                        // drop the partial file so the size-based "already downloaded" check is not fooled
                        fs.unlink(filename, () => {});
                        finish("fail");
                    };
                    res.on('error', abort);
                    res.on('aborted', () => abort(new Error('aborted')));
                    writer.on('error', abort);
                    // 'finish' fires after the data has been flushed to the file
                    writer.on('finish', () => {
                        console.log(new Date().toLocaleString() + ",完成下载:" + filename);
                        finish("ok");
                    });
                    res.pipe(writer);
                    return;
                }

                res.resume(); // discard the body so the socket is released

                const location = res.headers.location;
                if ((code === 301 || code === 302) && location && redirects < Spider2717.MAX_REDIRECTS) {
                    console.log("未完成下载:" + filename + "," + scheme + "返回值:" + code);
                    console.log("正在重新跳转正确的URL进行下载:" + location);
                    let next = '';
                    try {
                        next = new URL(location, target).href;
                    } catch (err) {
                        next = '';
                    }
                    settled = true; // the follow-up request reports the outcome
                    this.calldownload(next, no, filename, delaytime, redirects + 1);
                    return;
                }

                console.log("下载:" + filename + " 失败," + scheme + "返回值:" + code);
                finish("fail");
            });

            // DNS/connection/TLS failures arrive here; unhandled they would crash the process.
            req.on('error', (err) => {
                console.log("download_onepage_event:failed" + err.message);
                finish("fail");
            });
            req.setTimeout(Spider2717.DOWNLOAD_TIMEOUT, () => {
                req.destroy(new Error('timeout'));
            });
        }, Math.random() * delaytime);
    };

    /**
     * Decide where a picture goes and download it unless a plausible copy
     * (larger than 1024 bytes) is already on disk.
     * @param {string} src picture URL
     * @param {string} title sanitized picture title, used as directory name
     * @param {number} no index of the picture on the current page
     * @param {number} pno page number, used as file name
     */
    downloadfile = (src, title, no, pno) => {
        try {
            console.log("src:" + src);

            const dirpath = path.join(this.savedir, title);
            fs.mkdirSync(dirpath, {recursive: true});

            const filename = path.join(dirpath, pno + path.extname(src));
            if (fs.existsSync(filename) && fs.statSync(filename).size > 1024) {
                console.log("本地文件:" + filename + "已经存在,系统跳过下载");
                myEmitter.emit(this.download_onepage_event, "ingore", no);
                return;
            }
            console.log(new Date().toLocaleDateString() + ",正在下载:" + filename);
            this.calldownload(src, no, filename, 100);
        } catch (e) {
            // e.g. an undefined src or an unwritable directory: count the picture as handled
            console.log(e);
            myEmitter.emit(this.download_onepage_event, "ingore", no);
        }
    };

    /**
     * Wire the pipeline listeners on `myEmitter` and start with `this.nextpage`.
     * When the last page is done, "download_allpage_event" is emitted, the
     * pipeline listeners are removed and `save.txt` is written as a marker.
     */
    startSpider = () => {
        myEmitter.on("html", (html, pno) => {
            this.getTupianData(html, pno);
        });
        myEmitter.on("images", (data, pno) => {
            this.downloadphoto(data, pno);
        });

        this.downloaded_imagepage_count = 0;
        this.data = [];

        myEmitter.on(this.download_onepage_event, (status, no) => {
            console.log("download_onepage_event=>status:" + status);
            this.downloaded_imagepage_count++;
            if (this.downloaded_imagepage_count < this.data.length) {
                return; // pictures of this page are still pending
            }

            console.log("某单页图片数据抓取完毕!");
            this.downloaded_imagepage_count = 0;
            this.data = [];

            this.nextpage++;
            if (this.nextpage <= this.lastpage) {
                console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
                this.spiderpage(this.nextpage);
                return;
            }

            console.log("所有页面图片数据抓取完毕!");
            myEmitter.emit("download_allpage_event", "ok");
            this.data = [];
            myEmitter.removeAllListeners("html");
            myEmitter.removeAllListeners("images");
            myEmitter.removeAllListeners(this.download_onepage_event);
            // completion marker
            fs.writeFileSync('save.txt', "ok");
        });

        console.log("开启第" + this.nextpage + "页面数据抓取。。。。。。。。。。。。。。。");
        this.spiderpage(this.nextpage);
    };

    /**
     * Fetch one gallery page.
     * @param {number} pageno 1-based page number
     */
    spiderpage = (pageno) => {
        const url = Spider2717.pageUrl(this.starturl, pageno);
        console.log("url:" + url);
        this.spidermeinvtupian(url, pageno);
    }
}

/**
 * Fetch `url` with the `request` module, decode the body as GBK and emit it
 * on `myEmitter`. On any failure (network error or non-200 status) an empty
 * string is emitted instead, so listeners are always called exactly once.
 * @param {string} url page to fetch
 * @param {string} event_name event to emit, as `(html)`
 */
function get_html_by_request(url, event_name = 'get_html') {
    const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
    request({
        url: url,
        // the User-Agent must be sent as a header; a top-level "UserAgent" option is ignored
        headers: {'User-Agent': userAgent},
        encoding: null, // keep the body as a Buffer so it can be decoded from GBK
        strictSSL: true
    }, function (error, response, body) {
        if (!error && response && response.statusCode == 200) {
            myEmitter.emit(event_name, iconv.decode(body, 'gbk'));
            return;
        }
        // On a network error `response` is undefined, so report the error message instead.
        const reason = error ? error.message : (response ? response.statusCode : "unknown");
        console.log("获取 " + url + " 失败:" + reason);
        myEmitter.emit(event_name, '');
    });
}


/**
 * Load `url` in a PhantomJS page and return the page's `content` property.
 * The PhantomJS instance is always shut down, even when creating the page or
 * opening the URL fails, so no instance is left running.
 * @param {string} url page to open
 * @returns {Promise<string|*>} the value of the page's `content` property
 */
async function get_html_from_url_by_phantom(url) {
    const instance = await phantom.create();
    try {
        const page = await instance.createPage();
        await page.on('onResourceRequested', function (requestData) {
            console.info('Requesting', requestData.url);
        });

        // Random User-Agent per page. The library asks for page.property('settings', ...)
        // rather than assigning page.settings; await it so it is applied before open().
        const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];
        await page.property('settings', {
            javascriptEnabled: true,
            loadImages: true,
            userAgent: userAgent
        });

        const status = await page.open(url);
        if (status !== 'success') {
            console.warn("phantom open " + url + " => " + status);
        }

        return await page.property('content');
    } finally {
        await instance.exit();
    }
}

/**
 * Read the gallery title and the total page count from a page's markup.
 * The count comes from the `pageinfo` attribute of the `#pageinfo` element;
 * it is -1 when that element is missing or its attribute is not a number.
 * @param {string} html page markup
 * @returns {{title: string, count: number}}
 */
function getPageinfo(html) {
    const $ = cheerio.load(html);

    // title: text of the first matching <h1>
    const headings = $('div.warp.mar.oh > div.warp.oh > h1').toArray();
    const title = $(headings[0]).text();

    // page count
    let count = -1;
    const holders = $('#pageinfo').toArray();
    if (holders.length > 0) {
        const parsed = Number($(holders[0]).attr('pageinfo'));
        // A missing/garbled attribute parses to NaN, which would slip past
        // callers' `count <= 0` checks; report it as "no paging" instead.
        if (Number.isFinite(parsed)) {
            count = parsed;
        }
    }

    return {'title': title, 'count': count};
}

//---------------------------------------------------------------------------
/**
 * Downloader for a page without paging: every picture found in the supplied
 * markup is saved as `<save dir>/<1-based index><ext>`.
 *
 * Each picture reports its outcome ("ok" / "fail" / "ingore") on `myEmitter`
 * under the event name passed to the constructor; once all pictures have
 * reported, "download_allpage_event" is emitted.
 */
class SpiderOnePageBuff {

    /** Upper bound on chained 301/302 responses followed for a single picture. */
    static MAX_REDIRECTS = 5;

    /** Milliseconds of socket inactivity after which a picture download is abandoned. */
    static DOWNLOAD_TIMEOUT = 30000;

    /**
     * @param {string} _html markup of the page to scan
     * @param {string} _event_name event emitted on `myEmitter` for every finished picture
     * @param {string} _save_dir directory the pictures are written to (created if missing)
     * @throws whatever fs.mkdirSync throws when the directory cannot be created
     */
    constructor(_html, _event_name, _save_dir) {
        this.savedir = _save_dir;
        // mkdirSync is synchronous and takes no callback; failures are thrown.
        fs.mkdirSync(this.savedir, {recursive: true});

        this.clsname = 'SpiderOnePageBuff=>';
        // fallback event name, used only when no event name was supplied
        this.downloaded_one_image = 'downloaded_one_image';

        this.html = _html;

        this.data = [];
        this.imgs = [];

        this.event_name = _event_name;
        /**
         * Number of pictures that have reported an outcome so far.
         * @type {number}
         */
        this.process_event_finish_count = 0;
    }

    /**
     * Download `src` into `filename` after a random delay and report the outcome
     * exactly once on the per-picture event ("ok" or "fail").
     * @param {string} src absolute picture URL
     * @param {number} no index of the picture in `this.data`
     * @param {string} filename destination file
     * @param {number} delaytime upper bound of the random start delay, in ms
     * @param {number} redirects redirects already followed (internal)
     */
    calldownload = (src, no, filename, delaytime = 0, redirects = 0) => {
        // Must be the event start_spider counts on, otherwise completion never fires.
        const eventName = this.event_name || this.downloaded_one_image;
        const report = (status) => myEmitter.emit(eventName, status, no);

        if (src == undefined || src.length == 0) {
            console.log(this.clsname + `下载图片src':${src} '非法,跳过下载,继续下一个`);
            if (this.data[no]) {
                this.data[no].flag = true;
            }
            report("fail");
            return;
        }

        let target = null;
        try {
            target = new URL(src);
        } catch (err) {
            target = null;
        }
        const scheme = target === null ? '' : target.protocol.slice(0, -1);
        const client = scheme === 'https' ? https : (scheme === 'http' ? http : null);
        if (client === null) {
            console.log(this.clsname + "不支持的图片地址:" + src + ",跳过下载");
            report("fail");
            return;
        }

        setTimeout(() => {
            let settled = false;
            const finish = (status) => {
                if (!settled) {
                    settled = true;
                    report(status);
                }
            };
            const userAgent = userAgents[Math.floor(Math.random() * userAgents.length)];

            const req = client.get(target, {headers: {'User-Agent': userAgent}}, (res) => {
                const code = res.statusCode;

                if (code === 200) {
                    // Open the file only for a real picture body, never for redirect/error pages.
                    const writer = fs.createWriteStream(filename);
                    const abort = (err) => {
                        console.log(this.clsname + "下载:" + filename + " 失败," + err.message);
                        writer.destroy();
                        // drop the partial file so the size-based "already downloaded" check is not fooled
                        fs.unlink(filename, () => {});
                        finish("fail");
                    };
                    res.on('error', abort);
                    res.on('aborted', () => abort(new Error('aborted')));
                    writer.on('error', abort);
                    writer.on('finish', () => {
                        console.log(this.clsname + new Date().toLocaleString() + ",完成下载:" + filename);
                        finish("ok");
                    });
                    res.pipe(writer);
                    return;
                }

                res.resume(); // discard the body so the socket is released

                const location = res.headers.location;
                if ((code === 301 || code === 302) && location && redirects < SpiderOnePageBuff.MAX_REDIRECTS) {
                    console.log(this.clsname + "未完成下载:" + filename + "," + scheme + "返回值:" + code);
                    console.log(this.clsname + "正在重新跳转正确的URL进行下载:" + location);
                    let next = '';
                    try {
                        next = new URL(location, target).href;
                    } catch (err) {
                        next = '';
                    }
                    settled = true; // the follow-up request reports the outcome
                    this.calldownload(next, no, filename, delaytime, redirects + 1);
                    return;
                }

                console.log(this.clsname + "下载:" + filename + " 失败," + scheme + "返回值:" + code);
                finish("fail");
            });

            // DNS/connection/TLS failures arrive here; unhandled they would crash the process.
            req.on('error', (err) => {
                console.log(this.clsname + "下载:" + filename + " 失败," + err.message);
                finish("fail");
            });
            req.setTimeout(SpiderOnePageBuff.DOWNLOAD_TIMEOUT, () => {
                req.destroy(new Error('timeout'));
            });
        }, Math.random() * delaytime);
    }

    /**
     * Replace characters that are unsafe in file names (`\`, `/`, `<`, `>`, `|`)
     * with `_`; every occurrence is replaced.
     * @param {string|undefined|null} _title
     * @returns {string} sanitized title ('' when there is none)
     */
    preprocess_title(_title) {
        if (_title == null) {
            return '';
        }
        return String(_title).replace(/[\\/<>|]/g, '_');
    }

    /**
     * Collect the pictures of `this.html` into `this.imgs` / `this.data` and,
     * when at least one was found, emit the matched elements on `myEmitter`.
     * @param {string} event_name event to emit, as `(imgs)`
     */
    spider_one_image = (event_name = 'get_one_image') => {
        const $ = cheerio.load(this.html);

        // Try every template, "normal" before "fix", and keep the first non-empty match.
        this.imgs = [];
        for (let i = 0; i < selector_temple.length && this.imgs.length === 0; i++) {
            for (const key of ['normal', 'fix']) {
                this.imgs = $(selector_temple[i][key]).toArray();
                console.log("selector:" + selector_temple[i][key]);
                if (this.imgs.length > 0) break;
            }
        }
        console.log("spider_one_image=>total page1:" + this.imgs.length);

        for (const img of this.imgs) {
            const src = $(img).attr('src');
            const title = this.preprocess_title($(img).attr("alt"));
            this.data.push({src, title});
        }

        if (this.imgs.length > 0)
            myEmitter.emit(event_name, this.imgs);
    };

    /**
     * Entry point: scan the markup and download every picture found.
     * Emits "download_allpage_event" with "ok" once every picture has reported,
     * or with "fail" when the page contains no picture at all.
     */
    start_spider = () => {
        const completionEvent = this.event_name || this.downloaded_one_image;
        this.process_event_finish_count = 0;

        const onImageDone = (status, no) => {
            console.log(this.clsname + "status:" + status);
            console.log("this.event_name:" + this.event_name);
            this.process_event_finish_count++;
            if (this.process_event_finish_count >= this.imgs.length) {
                // every picture has reported (successfully or not)
                this.process_event_finish_count = 0;
                this.data = [];
                myEmitter.removeListener(completionEvent, onImageDone);
                myEmitter.emit("download_allpage_event", "ok");
            }
        };
        myEmitter.on(completionEvent, onImageDone);

        myEmitter.once("get_one_image", () => {
            // Snapshot: the completion handler clears this.data while we iterate.
            const pending = this.data.slice();
            pending.forEach((item, i) => {
                const filename = path.join(this.savedir, (i + 1) + path.extname(item.src || ''));
                console.log(this.clsname + new Date().toLocaleDateString() + ",正在下载:" + filename);

                // A plausible local copy (> 1024 bytes) is kept; only THIS picture is
                // skipped, the remaining ones are still processed.
                if (fs.existsSync(filename) && fs.statSync(filename).size > 1024) {
                    console.log(this.clsname + "本地文件:" + filename + "已经存在,系统跳过下载");
                    myEmitter.emit(completionEvent, "ingore", i);
                    return;
                }
                this.calldownload(item.src, i, filename, 3000);
            });
        });

        this.spider_one_image();

        if (this.imgs.length === 0) {
            console.log(this.clsname + "页面中没有找到图片");
            myEmitter.removeListener(completionEvent, onImageDone);
            myEmitter.emit("download_allpage_event", "fail");
        }
    }
}


//抓取页面入口url地址
//!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
// let url = 'https://www.2717.com/beautiful/zhuomianbeijing/2013/4499.html';

/**
 * Crawl one gallery: fetch its first page, read title and page count, then
 * hand over to SpiderOnePageBuff (no paging) or Spider2717 (several pages).
 * @param {string} url URL of the gallery's first page
 * @param {string} [type] category directory below `imgs`; when omitted the
 *        gallery title takes its place
 */
function main(url = 'https://www.2717.com/word/dongwushijie/2018/313620.html', type) {
    // Checked explicitly instead of via `arguments.length`, so that
    // main(url, undefined) no longer passes `undefined` into path.join.
    const hasType = type !== undefined && type !== null && type !== '';

    // `once`: the first page is fetched exactly one time per call.
    myEmitter.once('get_html', html => {
        if (!html) {
            // get_html_by_request emits '' on failure; there is nothing to crawl.
            console.log("获取页面失败,无法抓取:" + url);
            return;
        }

        const data = getPageinfo(html);
        const pagecount = data['count'];
        const title = data['title'];
        console.log(title, pagecount);

        // `!(x > 0)` also routes a NaN page count to the single-page path.
        if (!(pagecount > 0)) {
            myEmitter.on("main_download_one_image", status => {
                console.log("下载单个图片完成!!!=状态" + status);
                // completion marker
                fs.writeFileSync('save.txt', "ok");
            });

            const savedir = hasType
                ? path.join('imgs', type, title)
                : path.join('imgs', title);

            const spiderbuff = new SpiderOnePageBuff(html, "main_download_one_image", savedir);
            spiderbuff.start_spider();
            return;
        }

        // gallery split over several pages
        const spider = new Spider2717(
            url,
            hasType ? type : title,
            1,
            pagecount
        );
        spider.startSpider();
    });

    // kick off: the listener above runs once the markup has been fetched
    get_html_by_request(url);
}

/**
 * 主调用
 * 只需要指定抓取图片首页url
 */
/*
性感红唇美女暗黑哥特风高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2019/314110.html
清新浪漫的蓝天白云纯美风景图片高清壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313774.html
世外桃源田园山水风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313773.html
祖国山河壮丽的自然风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313772.html
上帝视角俯瞰不一样的自然美景图片 https://www.2717.com/beautiful/zhuomianbeijing/2019/313771.html
小巧可爱的七星瓢虫动物图片壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2019/313769.html
雨后如珠似玉的花卉水珠梦幻特写图壁纸片 https://www.2717.com/beautiful/zhuomianbeijing/2019/313768.html
神奇瑰丽的西藏圣象天门风景图片壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2019/313767.html
大自然雄伟雪山美景高清壁纸图片素材 https://www.2717.com/beautiful/zhuomianbeijing/2018/313723.html
唯美图文手机背景高清壁纸图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313722.html
甜美可爱的冬日圣诞女孩手机高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313721.html
联想桌面壁纸高清图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313635.html
香港乐坛天后容祖儿图片桌面壁纸下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313634.html
刘德华主演电影高清桌面壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313608.html
美女明星杨蓉白色吊带性感连衣裙高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313590.html
死侍双刀耍酷高清壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313572.html
马思纯露肩性感写真高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313571.html
温馨幸福的韩系情侣高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313558.html
韩国女神美女IU拼接图片大全分享 https://www.2717.com/beautiful/zhuomianbeijing/2018/313557.html
你和我的倾城时光金瀚高清剧照图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313556.html
李易峰高清手机壁纸图片下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313555.html
最新超级可爱的萌娃拼接图片大全 https://www.2717.com/beautiful/zhuomianbeijing/2018/313552.html
偶像练习生陈立农高清壁纸写真图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313532.html
白敬亭帅气时尚高清壁纸写真图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313531.html
悲伤逆流成河顾森湘高清壁纸图片 https://www.2717.com/beautiful/zhuomianbeijing/2018/313517.html
可盐可甜的爱豆高清锁屏壁纸图片大全 https://www.2717.com/beautiful/zhuomianbeijing/2018/313505.html
2016年1月日历精选清新护眼壁纸图片5下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313494.html
奔驰梅赛德斯SLK55汽车壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313489.html
延禧攻略 清宫浮世绘版海报壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313487.html
海洋世界里的动物蓝色图片桌面壁纸1下载 https://www.2717.com/beautiful/zhuomianbeijing/2018/313485.html
OL制服美女美腿丝袜性感图片桌面壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313470.html
飞檐走壁的美女个性壁纸 https://www.2717.com/beautiful/zhuomianbeijing/2018/313466.html

*/
// Command line: node <script> [url] [type]
// Other start URLs that were used while testing:
//   https://www.2717.com/ent/meinvtupian/2019/316305.html
//   https://www.2717.com/beautiful/zhuomianbeijing/2019/314110.html
//   https://lq.2717.com/kbtp/2018/313409.html
//   https://lq.2717.com/kbtp/2017/184385.html
//   https://www.2717.com/beautiful/zhuomianbeijing/2018/313450.html
let url = 'https://www.2717.com/beautiful/qichetuku/2015/17388.html';
let type = '美女图片';

// `slice` leaves process.argv untouched. The variable is deliberately not
// called `arguments`: that name cannot be declared in strict mode / ES modules.
const cliArgs = process.argv.slice(2);
if (cliArgs.length > 0) {
    url = cliArgs[0];
}
if (cliArgs.length > 1) {
    type = cliArgs[1];
}
main(url, type);

 

 
本次本来想通过继承events的事件驱动类来写爬虫,但经过测试死活不行,后来只有使用外部events实例的on、emit方法才通过。但是如下测试代码通过继承events又可以:

let EventsDemo = require('events');


/**
 * Minimal demo of an EventEmitter subclass that registers listeners on itself
 * and then emits the matching events from its own methods.
 */
class MyEvents extends EventsDemo {

    /** Logs "call A" and emits "aaa" with two arguments. */
    callA() {
        console.log("call A");
        this.emit("aaa", "a", 123);
    }

    /** Logs "call B" and emits "bbb" with three arguments. */
    callB() {
        console.log("call B");
        this.emit('bbb', 'b', 123, 456);
    }

    /** Registers the demo listeners and fires every event once. */
    start() {
        const showTest = (p1, p2, p3) => {
            console.log("p1=" + p1 + "," + "p2=" + p2 + "," + "p3=" + p3);
        };
        this.on("test", showTest);
        this.emit("test", 1, "abc", 3.1415926);

        console.log("==================================================");

        const showA = (p1, p2) => {
            console.log("callA:" + "p1=" + p1 + "," + "p2=" + p2);
        };
        const showB = (p1, p2, p3) => {
            console.log("callB:" + "p1=" + p1 + "," + "p2=" + p2 + "," + "p3=" + p3);
        };
        this.on("aaa", showA);
        this.on('bbb', showB);

        this.callA();
        this.callB();
    }
}



/**
 * 主函数
 */
//main();

// Declared explicitly: assigning to an undeclared name creates an implicit
// global and is a ReferenceError in strict mode / ES modules.
const myevent = new MyEvents();
myevent.start();

这个问题有点诡异,知道的朋友请指教,谢谢。




 

 
posted @ 2019-12-10 06:15  中国人醒来了  阅读(1118)  评论(0编辑  收藏  举报