TypeScript & Node.js crawler All In One
TypeScript & Node.js crawler All In One
"use strict";
/**
*
* @author xgqfrms
* @license MIT
* @copyright xgqfrms
* @created 2022-04-01
* @modified
*
* @description TypeScript & Node.js crawler All In One
* @augments
* @example
* @link https://www.cnblogs.com/xgqfrms/p/16086580.html
*
*/
import fs from "fs";
import path from "path";
import superagent from "superagent";
import * as CheerioAPI from "cheerio";
// Short alias for console.log; the code below uses it for all diagnostic output.
const log = console.log;
// Print `__dirname` under two labels. The labels record the author's finding that
// `__dirname` is available when compiled to CommonJS (✅) but not under ESM (❌).
log('ESM ❌ __dirname = \n', __dirname);
log('commonjs ✅ __dirname = \n', __dirname);
/** One course entry, as built by `Crawler.parseHTML` from a `.course-item` element. */
type Course = {
// img?: string;
/** `src` attribute of the `.course-img` element; `parseHTML` stores `''` when it is missing. */
img: string;
/** Text of the first `.course-desc` element. */
title: string;
/** Second `.course-desc` text converted to a number after the label prefix is removed. */
value: number;
}
/** The result of one crawl: the parsed courses plus the time they were parsed. */
interface Courses {
/** `Date.now()` value taken when parsing finished (milliseconds since the Unix epoch). */
timestamp: number;
// timestamp: Date;
courses: Course[];
}
/** Shape of the persisted JSON file: one list of courses per crawl, keyed by `Courses.timestamp`. */
interface Content {
[prop: number]: Course[];
}
/**
 * Crawls a course listing page and appends the parsed snapshot to a JSON file.
 *
 * Constructing an instance immediately starts one crawl:
 * 1. download the HTML at `url`,
 * 2. turn every `.course-item` element into a `Course`,
 * 3. store the result in `<__dirname>/../data/courses.json`, keyed by its timestamp.
 */
class Crawler {
  // Not read anywhere in this class.
  private token: string = '';
  private url: string = 'https://cdn.xgqfrms/typescript/crawler/index.html';
  // Last HTML document downloaded by `getHTMLStr` ('' until the first download).
  private HTMLStr: string = '';
  // Never updated by this class.
  public loading: boolean = false;

  constructor() {
    // A constructor cannot `await`, so attach a rejection handler here. Without it a
    // network or file-system failure surfaces as an unhandled promise rejection.
    this.init().catch((err: unknown) => {
      console.error('❌ crawler failed =\n', err);
    });
  }

  /**
   * Runs one full crawl: download, parse, persist.
   *
   * @returns a promise that settles when the JSON file has been written
   * @throws rejects when the download or any file-system call fails
   */
  async init(): Promise<void> {
    const html = await this.getHTMLStr();
    const courses = this.parseHTML(html);
    this.jsonGenerator(courses);
  }

  /**
   * Appends one crawl result to `courses.json`, creating the data folder first if needed.
   *
   * @param courses - the snapshot to store; its `timestamp` becomes the key in the file
   * @throws when the folder or file cannot be created, read, parsed or written
   */
  jsonGenerator(courses: Courses): void {
    const folder = path.resolve(__dirname, '../data');
    log('folder = \n', folder);
    if (!fs.existsSync(folder)) {
      // Create the resolved folder itself. A relative './data' would be created under
      // the current working directory, which need not be where `filePath` points.
      // The second argument of mkdirSync is a mode/options value, never a path segment.
      fs.mkdirSync(folder, { recursive: true });
      log('✅ create folder');
    } else {
      log('❌ create folder');
    }

    const filePath = path.join(folder, 'courses.json');
    let fileContent: Content = {};
    if (fs.existsSync(filePath)) {
      // Start from the snapshots stored by earlier runs.
      fileContent = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    }
    // Add this snapshot under its timestamp.
    fileContent[courses.timestamp] = courses.courses;
    fs.writeFileSync(filePath, JSON.stringify(fileContent, null, 4));

    // Read the file back so the log shows what actually ended up on disk.
    const json = JSON.parse(fs.readFileSync(filePath, 'utf-8'));
    log('json =', json);
  }

  /**
   * Extracts every `.course-item` from an HTML document.
   *
   * @param html - the HTML source to parse
   * @returns the parsed courses together with the time parsing finished
   */
  parseHTML(html: string): Courses {
    // cheerio gives a jQuery-like API over an HTML string.
    const $ = CheerioAPI.load(html);
    const courses: Course[] = [];
    // `toArray()` yields a plain array of elements, so the loop needs neither an `any`
    // cast nor the `--downlevelIteration` compiler option.
    for (const item of $('.course-item').toArray()) {
      const $item = $(item);
      const img = $item.find('.course-img').attr('src') ?? '';
      const desc = $item.find('.course-desc');
      const title = desc.eq(0).text();
      // The second description holds the label followed by the number; strip the label
      // and convert the remainder.
      const value = Number(desc.eq(1).text().replace('当前课程学习人数:', ''));
      courses.push({ img, title, value });
    }
    return {
      timestamp: Date.now(),
      courses,
    };
  }

  /**
   * Downloads the page at `url` and remembers its body in `HTMLStr`.
   *
   * @returns the response body, or `''` when the response carries no text
   * @throws rejects when the request fails
   */
  async getHTMLStr(): Promise<string> {
    const res = await superagent.get(this.url);
    this.HTMLStr = res.text ?? '';
    // Return the normalised value so the caller and the cached field always agree.
    return this.HTMLStr;
  }
}
export default Crawler;
export {
Crawler,
};
superagent
$ npm i -D superagent
# 类型注解
$ npm i -D @types/superagent
// Usage sample for superagent: the same POST request issued in three calling styles.
const superagent = require('superagent');
// Style 1: callback
superagent
.post('/api/pet')
.send({ name: 'Manny', species: 'cat' }) // sends a JSON post body
.set('X-API-Key', 'foobar')
.set('accept', 'json')
.end((err, res) => {
// Calling the end function will send the request
});
// Style 2: promise with then/catch (the response goes to console.log, an error to console.error)
superagent.post('/api/pet').then(console.log).catch(console.error);
// Style 3: promise with async/await, wrapped in an immediately invoked async function
(async () => {
try {
const res = await superagent.post('/api/pet');
console.log(res);
} catch (err) {
console.error(err);
}
})();
https://www.npmjs.com/package/superagent
https://github.com/visionmedia/superagent
cheerio
DOM string parser
$ npm i -D cheerio
// Usage sample for cheerio: load an HTML string, then edit it with jQuery-style calls.
const cheerio = require('cheerio');
const $ = cheerio.load('<h2 class="title">Hello world</h2>');
// Replace the heading text, then add a second class to it.
$('h2.title').text('Hello there!');
$('h2').addClass('welcome');
// Serialise the document; the expected result is shown in the comment below.
$.html();
// <html><head></head><body><h2 class="title welcome">Hello there!</h2></body></html>
https://www.npmjs.com/package/cheerio
https://github.com/cheeriojs/cheerio
demo
cheerio get image tag src
/**
 * Demo: read the image `src` and the two description texts of every `.course-item`,
 * print the collected entries, and return the time the page was parsed.
 *
 * @param html - the HTML source to parse
 * @returns an object holding only the parse timestamp; the entries are logged, not returned
 */
parseHTML(html: string) {
  // cheerio exposes a jQuery-like API over an HTML string.
  const $ = CheerioAPI.load(html);
  // Typed as `any` so the selection can be iterated without `--downlevelIteration`.
  const nodes: any = $('.course-item');
  const data = Array.from(nodes).map((node: any) => {
    const $node = $(node);
    const img = $node.find('.course-img').attr('src');
    const desc = $node.find('.course-desc');
    return {
      img,
      title: desc.eq(0).text(),
      value: desc.eq(1).text(),
    };
  });
  console.log('data =', data);
  return {
    timestamp: Date.now(),
  };
}
https://www.tabnine.com/code/javascript/functions/cheerio/src
https://stackoverflow.com/questions/47542338/cheerio-get-image-src-with-no-class
Node.js path.resolve
path.resolve([...paths])
path.resolve('/foo/bar', './baz');
// Returns: '/foo/bar/baz'
path.resolve('/foo/bar', '/tmp/file/');
// Returns: '/tmp/file'
path.resolve('wwwroot', 'static_files/png/', '../gif/image.gif');
// If the current working directory is /home/myself/node,
// this returns '/home/myself/node/wwwroot/static_files/gif/image.gif'
https://nodejs.org/api/path.html#pathresolvepaths
https://stackoverflow.com/questions/35048686/whats-the-difference-between-path-resolve-and-path-join
pdf crawler / pdf 爬虫
Node.js download pdf files / Node.js 下载 pdf 文件
// esm / ts
https://stackoverflow.com/questions/25945714/how-to-download-pdf-file-from-url-in-node-js
const fs = require("fs");
const path = require("path");
const { exit } = require("process");
const log = console.log;
const request = require("request");

// Download target: the `pdf` folder next to this script's directory.
const folder = path.resolve(__dirname, '../pdf');
// `recursive: true` makes this a no-op when the folder already exists.
fs.mkdirSync(folder, { recursive: true });

/**
 * Downloads a PDF and writes it to the local file system.
 *
 * @param url - address of the PDF file
 * @param filename - path of the file to write
 * @returns a promise that resolves once the file has been written
 * @throws rejects when the request fails, the server answers with a non-2xx status,
 *         or the file cannot be written
 */
async function downloadPDF(url, filename) {
  log('🚧 pdf downloading ...');
  // Plain `request` is callback/stream based and does not return a promise, so awaiting
  // `request.get(...)` directly never yields the file bytes. Wrap the callback form.
  const pdfBuffer = await new Promise((resolve, reject) => {
    // `encoding: null` makes `request` hand over the body as a raw Buffer.
    request.get({ uri: url, encoding: null }, (err, res, body) => {
      if (err) {
        reject(err);
        return;
      }
      if (res.statusCode < 200 || res.statusCode >= 300) {
        // Do not save an error page under a `.pdf` name.
        reject(new Error(`unexpected HTTP status ${res.statusCode} for ${url}`));
        return;
      }
      resolve(body);
    });
  });
  fs.writeFileSync(filename, pdfBuffer);
  log('✅ pdf finished!');
}

const url = 'https://cs193p.sites.stanford.edu/sites/g/files/sbiybj16636/files/media/file/l1.pdf';
const filename = path.join(folder, 'cs193p-2021-l1.pdf');
// Report a failed download explicitly instead of leaving the rejection unhandled.
downloadPDF(url, filename).catch((err) => {
  console.error('❌ pdf download failed =\n', err);
  exit(1);
});
https://github.com/request/request
https://github.com/request/request#promises--asyncawait
https://github.com/request/request-promise-native
"use strict";
/**
*
* @author xgqfrms
* @license MIT
* @copyright xgqfrms
* @created 2022-04-01
* @modified
*
* @description Node.js pdf crawler
* @augments
* @example
* @link
*
*/
// 0. CommonJS module: dependencies are loaded with the `require` keyword
const fs = require("fs");
const path = require("path");
const { exit } = require("process");
// 1. use `request-promise-native` rather than plain `request`: the plain package does
//    not return a promise, so awaiting it does not produce the PDF bytes
const request = require("request-promise-native");
const log = console.log;
// 2. custom download folder: the `pdf` folder next to this script's directory
const folder = path.resolve(__dirname, '../pdf');
// 3. create the folder when it does not exist (`recursive: true` is a no-op otherwise)
fs.mkdirSync(folder, { recursive: true });

/**
 * Downloads a PDF, writes it to the local file system, then ends the process.
 *
 * @param url - address of the PDF file
 * @param filename - path of the file to write
 * @returns a promise; on success the process exits with code 0 before it resolves
 * @throws rejects when the request fails or the file cannot be written
 */
async function downloadPDF(url, filename) {
  log('🚧 pdf downloading ...');
  // `encoding: null` makes the library return the body as a raw Buffer.
  const pdfBuffer = await request.get({
    uri: url,
    encoding: null,
  });
  // 4. write file to local file system
  fs.writeFileSync(filename, pdfBuffer);
  log('✅ pdf download finished!');
  // 5. exit the terminal after download finished
  exit(0);
}

const url = 'https://cs193p.sites.stanford.edu/sites/g/files/sbiybj16636/files/media/file/l1.pdf';
const filename = path.join(folder, 'cs193p-2021-l1.pdf');
// Report a failed download explicitly and exit non-zero instead of leaving the
// rejection unhandled.
downloadPDF(url, filename).catch((err) => {
  console.error('❌ pdf download failed =\n', err);
  exit(1);
});
https://nodejs.org/docs/v0.4.12/api/http.html#response.writeHead
refs
Stanford University Spring 2021 CS193p Course pdf
https://www.youtube.com/watch?v=--qKOhdgJAs
©xgqfrms 2012-2020
www.cnblogs.com/xgqfrms 发布文章使用:只允许注册用户才可以访问!
原创文章,版权所有©️xgqfrms, 禁止转载 🈲️,侵权必究⚠️!
本文首发于博客园,作者:xgqfrms,原文链接:https://www.cnblogs.com/xgqfrms/p/16086580.html
未经授权禁止转载,违者必究!