Node.js 简单的数据采集示例,使用 http + cheerio + iconv-lite

源码如下:(collect-http.js)

// 文件名:collect-http.js
/**
 * Node.Js 简单的数据采集示例,使用 http+cheerio+iconv-lite
 * 安装依赖:npm install cheerio iconv-lite
 * 注意事项:Node 内置的 http/https 模块不会自动解压 gzip 响应(gzip:true 是 request 库的选项),遇到 gzip 页面需用 zlib 自行解压
 */
const NmHttps = require('https')
const NmHttp = require('http')
const NmFs = require('fs');
const NmPath = require('path');
const NmCheerio = require('cheerio')
// Node 环境当中不支持 GBK 编码,所以需要引用 iconv-lite 模块来转码
const NmIconv = require('iconv-lite')
// 图片数据缓冲区
const NmBufferHelper = require('bufferhelper');

// Demo targets: one HTML page to scrape and one image to download directly.
const url = 'https://www.163.com';
const imgurl = 'https://pic2020.lianzhixiu.com/2016/1123/19/2.jpg';

// The image collectors write into ./img; create it up front so the first
// download does not fail with ENOENT on a fresh checkout.
NmFs.mkdirSync('./img', { recursive: true });

collectHtml(url);
// Both image collectors are run on the same URL to demonstrate the two
// approaches (streaming vs. buffering); they target the same output file.
collectImage(imgurl);
collectImage2(imgurl);

/**
 * Fetch an HTML page, decode it, download the images it references through
 * `data-original` attributes, and save the decoded markup to
 * ./collect-http-163.html.
 *
 * @param {string} url - Absolute http(s) URL of the page to fetch.
 * @param {string} [encoding='GBK'] - Charset name handed to iconv-lite for
 *     decoding the response body. Node has no built-in GBK decoder, which is
 *     why iconv-lite is used; the default keeps the previous hard-coded value.
 */
function collectHtml(url, encoding = 'GBK') {
    // Pick the client that matches the scheme so plain-http pages work too.
    const client = url.startsWith('https') ? NmHttps : NmHttp;
    client.get(url, (res) => {
        const chunks = [];
        res.on('data', (chunk) => {
            chunks.push(chunk);
        });
        res.on('error', (error) => {
            console.log('===', error);
        });
        res.on('end', () => {
            // Decode only after all chunks are joined: a multi-byte character
            // may be split across two chunks.
            const html = NmIconv.decode(Buffer.concat(chunks), encoding);
            const $ = NmCheerio.load(html);
            $('img').each((index, item) => {
                const src = item.attribs['data-original'];
                console.log('===', src);
                if (!src) {
                    return;
                }
                // Resolve relative and protocol-relative ("//host/a.jpg")
                // references against the page URL before downloading.
                let imageUrl;
                try {
                    imageUrl = new URL(src, url).href;
                } catch (error) {
                    console.log('=== skip invalid image url:', src);
                    return;
                }
                collectImage(imageUrl);
            });
            NmFs.writeFile('./collect-http-163.html', html, (error) => {
                if (error) {
                    console.log('write failed', error);
                    return;
                }
                console.log('write success');
            });
        });
    }).on('error', (error) => {
        // Without this handler a DNS/connection failure is an unhandled
        // 'error' event and terminates the process.
        console.log('===', error);
    });
}

/**
 * Download one image and stream it straight to ./img/<file name>.
 *
 * The file name is the last path segment of the URL with any query string
 * removed. Empty, malformed, or non-http(s) URLs are skipped with a log line
 * instead of throwing; failures during the download are logged as well.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 */
function collectImage(url) {
    if (!url) {
        return;
    }
    let protocol;
    try {
        protocol = new URL(url).protocol;
    } catch (error) {
        console.log('=== skip invalid image url:', url);
        return;
    }
    if (protocol !== 'https:' && protocol !== 'http:') {
        console.log('=== skip unsupported image url:', url);
        return;
    }
    const client = protocol === 'https:' ? NmHttps : NmHttp;
    const posQuery = url.indexOf('?');
    const pathInfo = NmPath.parse(posQuery > -1 ? url.substring(0, posQuery) : url);
    const savePath = `./img/${pathInfo.base}`;
    client.get(url, (res) => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
            // Do not store redirect/error pages as image files.
            console.log('=== skip image, status', res.statusCode, url);
            res.resume(); // drain the body so the socket is released
            return;
        }
        // Open the file only once a successful response has arrived, so a
        // failed request does not leave an empty file behind.
        const stream = NmFs.createWriteStream(savePath);
        stream.on('error', (error) => {
            console.log('===', error);
            res.destroy();
        });
        res.on('error', (error) => {
            console.log('===', error);
            stream.destroy();
        });
        res.pipe(stream);
    }).on('error', (error) => {
        // Connection-level failure (DNS, refused, TLS, ...).
        console.log('===', error);
    });
}

/**
 * Download one image by buffering the whole response in memory (using the
 * bufferhelper module) and then writing it to ./img/<file name> in one call.
 *
 * The file name is the last path segment of the URL with any query string
 * removed. Empty, malformed, or non-http(s) URLs are skipped with a log line
 * instead of throwing; failures during the download are logged as well.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 */
function collectImage2(url) {
    if (!url) {
        return;
    }
    let protocol;
    try {
        protocol = new URL(url).protocol;
    } catch (error) {
        console.log('=== skip invalid image url:', url);
        return;
    }
    if (protocol !== 'https:' && protocol !== 'http:') {
        console.log('=== skip unsupported image url:', url);
        return;
    }
    // One code path for both schemes; only the client module differs.
    const client = protocol === 'https:' ? NmHttps : NmHttp;
    const posQuery = url.indexOf('?');
    const pathInfo = NmPath.parse(posQuery > -1 ? url.substring(0, posQuery) : url);
    const savePath = `./img/${pathInfo.base}`;
    client.get(url, (res) => {
        if (res.statusCode < 200 || res.statusCode >= 300) {
            // Do not store redirect/error pages as image files.
            console.log('=== skip image, status', res.statusCode, url);
            res.resume(); // drain the body so the socket is released
            return;
        }
        const buffer = new NmBufferHelper();
        res.on('data', (chunk) => {
            buffer.concat(chunk);
        });
        res.on('end', () => {
            NmFs.writeFile(savePath, buffer.toBuffer(), (error) => {
                // `error` is null when the write succeeded.
                console.log('==', error);
            });
        });
        res.on('error', (error) => {
            console.log('===', error);
        });
    }).on('error', (error) => {
        // Connection-level failure (DNS, refused, TLS, ...).
        console.log('===', error);
    });
}

运行:

node collect-http.js
posted on 2021-02-05 15:52  sochishun  阅读(223)  评论(0)  编辑  收藏  举报