xgqfrms™, xgqfrms® : xgqfrms's official website of cnblogs! xgqfrms™, xgqfrms® : xgqfrms's official website of GitHub!

TypeScript & Node.js crawler All In One

TypeScript & Node.js crawler All In One


"use strict";

/**
 *
 * @author xgqfrms
 * @license MIT
 * @copyright xgqfrms
 * @created 2022-04-01
 * @modified
 *
 * @description TypeScript & Node.js crawler All In One
 * @augments
 * @example
 * @link https://www.cnblogs.com/xgqfrms/p/16086580.html
 *
 */

import fs from "fs";
import path from "path";

import superagent from "superagent";
import * as CheerioAPI from "cheerio";

// Short alias for console.log, used by the rest of this file.
const log = console.log;


// Both calls print the same `__dirname` value; only the labels differ.
// The labels record that `__dirname` is usable under CommonJS (✅) but not under ESM (❌).
log('ESM ❌ __dirname = \n', __dirname);
log('commonjs ✅ __dirname = \n', __dirname);

/** One course card extracted from the crawled page. */
type Course = {
  // img?: string;
  /** `src` of the course image (the parser falls back to '' when it is missing). */
  img: string;
  /** Text of the first `.course-desc` element of the card. */
  title: string;
  /** Numeric value parsed from the second `.course-desc` element of the card. */
  value: number;
}
/** Result of one crawl: the parsed courses plus the time they were parsed. */
interface Courses {
  /** Milliseconds since the epoch, taken from `Date.now()` by the parser. */
  timestamp: number;
  // timestamp: Date;
  courses: Course[];
}
/** Shape of the persisted JSON file: one course list per crawl timestamp. */
interface Content {
  [prop: number]: Course[];
}
/**
 * Crawls a course listing page, extracts every `.course-item` card and
 * appends the result to `../data/courses.json` (resolved against `__dirname`),
 * keyed by the crawl timestamp.
 *
 * Constructing an instance immediately starts one crawl.
 */
class Crawler {
  private token: string = '';
  // NOTE(review): the default host `cdn.xgqfrms` has no top-level domain — confirm the real URL.
  private url: string = 'https://cdn.xgqfrms/typescript/crawler/index.html';
  private HTMLStr: string = '';
  public loading: boolean = false;

  /**
   * @param url Optional page to crawl; when omitted the built-in default URL is used.
   */
  constructor(url?: string) {
    if (url !== undefined) {
      this.url = url;
    }
    // A constructor cannot await init(), so report a failure here instead of
    // letting it surface as an unhandled promise rejection.
    this.init().catch((err: unknown) => {
      console.error('❌ crawler failed =\n', err);
    });
  }

  /**
   * Runs one full crawl: download the page, parse it, persist the result.
   *
   * @returns Promise that resolves once the JSON file has been written.
   * @throws Rejects when the download, the parsing or the file access fails.
   */
  async init(): Promise<void> {
    const html = await this.getHTMLStr();
    const courses = this.parseHTML(html);
    this.jsonGenerator(courses);
  }

  /**
   * Appends one crawl result to `../data/courses.json`, creating the folder
   * and the file when they do not exist yet, then logs the stored content.
   *
   * @param courses Crawl result; stored under the key `courses.timestamp`.
   * @throws When the folder or file cannot be accessed, or when an existing
   *   file does not contain valid JSON.
   */
  jsonGenerator(courses: Courses): void {
    const folder = path.resolve(__dirname, '../data');
    log('folder = \n', folder)
    if (!fs.existsSync(folder)) {
      // Create the very folder that was checked above. A relative './data'
      // would be resolved against the process working directory, which is
      // not necessarily the parent of __dirname.
      fs.mkdirSync(folder, { recursive: true });
      log('✅ create folder')
    } else {
      log('❌ create folder')
    }
    const filePath = path.join(folder, 'courses.json');
    // Start from the previously stored crawls so earlier results are kept.
    const fileContent: Content = fs.existsSync(filePath)
      ? JSON.parse(fs.readFileSync(filePath, 'utf-8'))
      : {};
    // Append the current crawl.
    fileContent[courses.timestamp] = courses.courses;
    fs.writeFileSync(filePath, JSON.stringify(fileContent, null, 4));
    // Read the file back so the log shows what was actually stored.
    const json = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    log('json =', json);
  }

  /**
   * Extracts all course cards from an HTML document.
   *
   * @param html Markup of the course listing page.
   * @returns The parsed courses and the time (`Date.now()`) they were parsed.
   */
  parseHTML(html: string): Courses {
    // cheerio exposes a jQuery-like API over the HTML string.
    const $ = CheerioAPI.load(html);
    // toArray() yields a real array, so neither `any` nor the
    // `--downlevelIteration` compiler option is needed to iterate.
    const courses: Course[] = $('.course-item').toArray().map((item) => {
      const card = $(item);
      const desc = card.find('.course-desc');
      // The second description holds the learner count behind a fixed label.
      const learners = Number(desc.eq(1).text().replace('当前课程学习人数:', ''));
      return {
        img: card.find('.course-img').attr('src') ?? '',
        title: desc.eq(0).text(),
        // NaN would be serialised as `null` by JSON.stringify; store 0 instead,
        // which is also what an empty description already produces.
        value: Number.isFinite(learners) ? learners : 0,
      };
    });
    const timestamp = Date.now();
    return {
      timestamp,
      courses,
    };
  }

  /**
   * Downloads the page at `this.url` and caches its body in `this.HTMLStr`.
   *
   * @returns The response body, or '' when the response carries no text.
   * @throws Rejects when the request fails.
   */
  async getHTMLStr(): Promise<string> {
    const res = await superagent.get(this.url);
    this.HTMLStr = res.text ?? '';
    // Return the normalised value so the result is always a string.
    return this.HTMLStr;
  }
}

export default Crawler;
export {
  Crawler,
};

superagent

$ npm i -D superagent


# 类型注解
$ npm i -D @types/superagent


// Three ways to consume a superagent request: callback, promise chain, async/await.
const superagent = require('superagent');

// callback
superagent
  .post('/api/pet')
  .send({ name: 'Manny', species: 'cat' }) // sends a JSON post body
  .set('X-API-Key', 'foobar') // request header
  .set('accept', 'json') // request header
  .end((err, res) => {
    // Calling the end function will send the request
  });

// promise with then/catch
superagent.post('/api/pet').then(console.log).catch(console.error);

// promise with async/await
(async () => {
  try {
    const res = await superagent.post('/api/pet');
    console.log(res);
  } catch (err) {
    // a failed request ends up here
    console.error(err);
  }
})();

https://www.npmjs.com/package/superagent

https://github.com/visionmedia/superagent

cheerio

DOM string parser

$ npm i -D cheerio

const cheerio = require('cheerio');
// Parse an HTML string; `$` is then used like jQuery on the parsed document.
const $ = cheerio.load('<h2 class="title">Hello world</h2>');

// Replace the heading text and add a second class.
$('h2.title').text('Hello there!');
$('h2').addClass('welcome');

// Serialise the modified document back to an HTML string.
$.html();
// <html><head></head><body><h2 class="title welcome">Hello there!</h2></body></html>

https://www.npmjs.com/package/cheerio

https://github.com/cheeriojs/cheerio

https://cheerio.js.org/

demo

cheerio get image tag src

  /**
   * Extracts every `.course-item` card from an HTML string with cheerio.
   *
   * @param html Markup to parse.
   * @returns The parse time (`Date.now()`) and the extracted cards. `img` is
   *   `undefined` when a card has no `.course-img` element with a `src`.
   */
  parseHTML(html: string) {
    // cheerio exposes a jQuery-like API over the HTML string.
    const $ = CheerioAPI.load(html);
    // toArray() yields a real array, so neither `any` nor the
    // `--downlevelIteration` compiler option is needed to iterate.
    const data = $('.course-item').toArray().map((item) => {
      const card = $(item);
      const desc = card.find('.course-desc');
      return {
        img: card.find('.course-img').attr('src'),
        title: desc.eq(0).text(),
        value: desc.eq(1).text(),
      };
    });
    console.log('data =', data);
    const timestamp = Date.now();
    return {
      timestamp,
      // Hand the parsed cards to the caller instead of only logging them.
      courses: data,
    };
  }

https://www.tabnine.com/code/javascript/functions/cheerio/src

https://stackoverflow.com/questions/47542338/cheerio-get-image-src-with-no-class

Node.js path.resolve

path.resolve([...paths])

path.resolve('/foo/bar', './baz');
// Returns: '/foo/bar/baz'

path.resolve('/foo/bar', '/tmp/file/');
// Returns: '/tmp/file'

path.resolve('wwwroot', 'static_files/png/', '../gif/image.gif');
// If the current working directory is /home/myself/node,
// this returns '/home/myself/node/wwwroot/static_files/gif/image.gif'

https://nodejs.org/api/path.html#pathresolvepaths

https://stackoverflow.com/questions/35048686/whats-the-difference-between-path-resolve-and-path-join

pdf crawler / pdf 爬虫

Node.js download pdf files / Node.js 下载 pdf 文件

// esm / ts

https://stackoverflow.com/questions/25945714/how-to-download-pdf-file-from-url-in-node-js

const fs = require("fs");
var path = require("path");
const { exit } = require("process");
const log = console.log;
const request = require("request");
// const request = require("request-promise-native");

// Download folder, resolved next to the directory that contains this script.
const folder = path.resolve(__dirname, '../pdf');

// log('folder', folder);

// `recursive: true` creates the folder (and any missing parents) and does
// nothing when it already exists, so no separate existsSync check is needed.
fs.mkdirSync(folder, { recursive: true });


/**
 * Downloads the PDF at `url` and writes it to `filename`.
 *
 * @param url Address of the PDF to fetch.
 * @param filename Path of the local file to write.
 * @returns Promise that resolves once the file has been written; it rejects
 *   when the request fails or the server answers with a non-2xx status.
 */
async function downloadPDF(url, filename) {
  log('🚧 pdf downloading ...');
  // Plain `request` is callback based: `request.get()` returns a Request
  // object, not a promise, so awaiting it directly never yields the body.
  // Wrap the callback form to obtain the response body.
  const pdfBuffer = await new Promise((resolve, reject) => {
    request.get({
      uri: url,
      // a null encoding makes request deliver the body as a Buffer
      encoding: null,
      // encoding: 'utf-8',
    }, (err, res, body) => {
      if (err) {
        reject(err);
      } else if (res.statusCode < 200 || res.statusCode >= 300) {
        // do not save an error page under a .pdf name
        reject(new Error(`request failed with status ${res.statusCode}`));
      } else {
        resolve(body);
      }
    });
  });
  fs.writeFileSync(filename, pdfBuffer);
  log('✅ pdf finished!');
  // exit 0;
}

const url = 'https://cs193p.sites.stanford.edu/sites/g/files/sbiybj16636/files/media/file/l1.pdf';
// path.join uses the platform's separator instead of a hard-coded '/'.
const filename = path.join(folder, 'cs193p-2021-l1.pdf');


// log('filename =', filename);

// downloadPDF returns a promise: report a failure instead of leaving the
// rejection unhandled, and signal it through the process exit code.
downloadPDF(url, filename).catch((err) => {
  console.error('❌ pdf download failed!', err);
  process.exitCode = 1;
});


https://github.com/request/request
https://github.com/request/request#promises--asyncawait
https://github.com/request/request-promise-native

"use strict";

/**
 *
 * @author xgqfrms
 * @license MIT
 * @copyright xgqfrms
 * @created 2022-04-01
 * @modified
 *
 * @description  Node.js pdf crawler
 * @augments
 * @example
 * @link
 *
 */

// 0. commonjs module using `require` keyword
const fs = require("fs");
const path = require("path");
const { exit } = require("process");

// const request = require("request");
// request 解析 pdf 错误
// 1. use `request-promise-native` instead of plain `request`: awaiting plain `request` did not produce a valid PDF (and was too slow)
const request = require("request-promise-native");

const log = console.log;

// 2. custom download folder
const folder = path.resolve(__dirname, '../pdf');
// log('folder', folder);

// 3. make sure the folder exists: `recursive: true` creates it (and any
//    missing parents) and does nothing when it is already there
fs.mkdirSync(folder, { recursive: true });

/**
 * Fetches the PDF at `url`, saves it as `filename` and then ends the process.
 *
 * @param url Address of the PDF to fetch.
 * @param filename Path of the local file to write.
 */
async function downloadPDF(url, filename) {
  log('🚧 pdf downloading ...');
  // A null encoding requests the raw response body rather than decoded text.
  const options = {
    uri: url,
    encoding: null,
  };
  const body = await request.get(options);
  // 4. store the downloaded bytes on the local file system
  fs.writeFileSync(filename, body);
  log('✅ pdf download finished!');
  // 5. the download is done, so terminate the process with a success code
  exit(0);
}

const url = 'https://cs193p.sites.stanford.edu/sites/g/files/sbiybj16636/files/media/file/l1.pdf';
// path.join uses the platform's separator instead of a hard-coded '/'.
const filename = path.join(folder, 'cs193p-2021-l1.pdf');
// log('filename =', filename);

// downloadPDF exits with 0 on success; on failure report the error and exit
// with a non-zero code instead of leaving the rejection unhandled.
downloadPDF(url, filename).catch((err) => {
  console.error('❌ pdf download failed!', err);
  exit(1);
});


https://nodejs.org/docs/v0.4.12/api/http.html#response.writeHead

refs

Stanford University Spring 2021 CS193p Course pdf

https://www.youtube.com/watch?v=--qKOhdgJAs



©xgqfrms 2012-2020

www.cnblogs.com/xgqfrms 发布文章使用:只允许注册用户才可以访问!

原创文章,版权所有©️xgqfrms, 禁止转载 🈲️,侵权必究⚠️!


posted @ 2022-04-01 13:48  xgqfrms  阅读(102)  评论(12编辑  收藏  举报