如何实现一个具有自动翻页功能的 Node.js 爬虫 All In One
网络爬虫
是一种从互联网抓取数据信息的自动化程序
;
爬虫原理分析
- 分析待爬取网站的种子 URL 格式、页面结构,确定目标链接唯一标识和翻页参数
- 通过种子 URL 把网页下载为 HTML 字符串格式
- 解析 HTML 字符串,动态读取页面的总分页数量
- 使用循环,动态生成 URL
- 把当前爬取页面的目标链接,全部按行写入到一个本地文件中
- 待完成目标链接的收集后,再按行读取本地文件,设置安全的下载频率,批量下载目标资源(
.pdf
/.mp4
等文件)
https://nodejs.dev/en/learn/writing-files-with-nodejs/
爬虫待选方案
- Node.js
- Puppeteer (headless Chrome)
- Shell Script
- Python Script
- node-fetch
- axios
... 等
注意: 爬虫翻页时需要遵守网站的反爬虫策略,如果爬取频率过快,可能会导致 IP 被封。
HTML string => HTML DOM
解析 HTML 字符串
$ npm i -S cheerio
// Minimal cheerio walkthrough: load an HTML string, then query,
// traverse, and mutate it with a jQuery-like API (server-side, no DOM).
import * as cheerio from 'cheerio';
// const cheerio = require('cheerio');
// const $ = cheerio.load('html string');
const $ = cheerio.load('<h2 class="title">Hello world</h2>');
// selector ✅ — CSS selectors work exactly like jQuery's
$('h2.title').text();
// "Hello world"
// traverse the DOM ✅ — returns '' here: no .subtitle exists in the sample
$('h2.title').find('.subtitle').text();
// manipulate the element — text(value) sets, after() inserts a sibling
$('h2.title').text('Hello there!');
$('h2').after('<h3>How are you?</h3>');
pure js solution ❌
// Standard-library-only setup for the plain https downloader below.
import fs from 'node:fs';
import https from 'node:https';
// import http from 'node:http';
// import * as http2 from 'http2';
import path from 'path';
// ESM has no __dirname; path.resolve() with no args yields the current
// working directory ✅ — NOTE(review): this is the CWD at launch time,
// not the directory containing this module.
const __dirname = path.resolve();
console.log('__dirname', __dirname)
/**
 * Download `url` over HTTPS into the local file `dest`.
 *
 * @param {string} url - https:// URL to fetch
 * @param {string} dest - local file path to write
 * @param {(msg?: string) => void} [cb] - invoked once: with no argument on
 *   success (after the file handle is closed), or with an error message on
 *   failure. Errors are reported via `cb`, never thrown.
 */
async function download(url, dest, cb) {
  const file = fs.createWriteStream(dest);
  https.get(url, (res) => {
    // Stream the response body straight to disk.
    res.pipe(file);
    file.on('finish', function () {
      // close() is async; pass cb so it fires only after the handle is released.
      file.close(cb);
    });
    // The original called .on(err => …) without the 'error' event name,
    // so the handler could never fire; name the event explicitly.
    file.on('error', (err) => {
      // Best-effort cleanup of the partial file; fs.unlink requires a
      // callback (omitting it throws TypeError on modern Node).
      fs.unlink(dest, () => {});
      if (cb) {
        cb(err.message);
      }
    });
  }).on('error', (err) => {
    // Request-level failure (DNS, TLS, refused connection, …).
    fs.unlink(dest, () => {});
    if (cb) {
      cb(err.message);
    }
  });
}
// ❌ throw er; // Unhandled 'error' event
// ❌ Error: EISDIR: illegal operation on a directory
const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp4`
// Passing __dirname (a directory) as dest deliberately reproduces the
// EISDIR error documented above; the commented call below is the
// working form with a real file path.
download(url, __dirname, (msg) => console.log(`msg =`, msg))
// download(url, `./video.mp4`, (msg) => console.log(`msg =`, msg))
// await fetch(url)
// .then(res => res.arrayBuffer())
// // .then(res => res.blob())
// .then(bytes => {
// let id = setInterval(() => {
// console.log(`⏳ loading ...`)
// }, 1000);
// fs.writeFile(`./video.mp4`, JSON.stringify(bytes), () => {
// clearInterval(id)
// console.log(`✅ finished =`, bytes)
// })
// })
// // .then(bytes => fs.writeFileSync(`./video.mp4`, bytes))
// .catch(err => console.error(err));
demos
"use strict";
/**
*
* @author xgqfrms
* @license MIT
* @copyright xgqfrms
* @created 2023-08-24
* @modified
*
* @description
* @difficulty Easy Medium Hard
* @time_complexity O(n)
* @space_complexity O(n)
* @augments
* @example
* @link https://www.cnblogs.com/xgqfrms/p/17655286.html
* @solutions
*
* @best_solutions
*
*/
// const log = console.log;
import * as cheerio from 'cheerio'
import url from 'node:url';
import fs from 'node:fs';
/**
 * Paginated web crawler: walks every page of a listing, collects article
 * links, then scrapes each article's HTML for lagou CDN .mp4 URLs and
 * writes them (one per line) to ./strs.md.
 */
class WebCrawler {
  /**
   * @param {string} url - seed URL of the paginated listing page
   * @param {string} pagination - query-string key used for paging, e.g. `page`
   * @param {object} [links] - link-name configuration (stored, not yet used)
   * @throws {Error} when `url` or `pagination` is missing
   */
  constructor(url, pagination, links) {
    console.log(`node.js 🕷️`)
    if (!url || !pagination) {
      throw new Error(`必传参数缺失 ❌\nurl=${url}\npagination=${pagination}`)
    }
    this.index = 0
    this.url = url
    this.pagination = pagination
    this.links = links
    this.items = []
    this.groups = []
  }
  // For every collected article, fetch its HTML and extract all matching
  // .mp4 URLs into this.groups, then persist them.
  async getVideos() {
    // Sequential on purpose: one article at a time keeps the request rate low.
    for (const item of this.items) {
      const html = await fetch(item.href).then(res => res.text())
      // Fresh regex literal each iteration avoids stateful `lastIndex`
      // surprises with the `g` flag.
      const reg = /https\:\/\/edu-vod\.lagou\.com\/sv\/(.*)\.mp4/ig
      // matchAll expresses "collect every match" directly; the original
      // abused replaceAll purely for its side effect.
      for (const match of html.matchAll(reg)) {
        this.groups.push({
          url: match[0],
          title: item.title,
        })
      }
    }
    console.log(`✅ groups`, this.groups.length, this.groups);
    await this.writeFile()
  }
  // Write the collected video URLs, one per line, to ./strs.md.
  async writeFile() {
    let strs = ``;
    for (const group of this.groups) {
      strs += `${group.url}\n`
    }
    // fs.promises.writeFile returns a real Promise, so `await` actually
    // waits; the original awaited the callback API's `undefined` return.
    await fs.promises.writeFile(`./strs.md`, strs)
    console.log(`OK ✅`)
  }
  // Derive the scheme+host origin of the seed URL, for absolutizing hrefs.
  getOrigin() {
    const urlObject = new URL(this.url);
    return urlObject.origin;
    // https://nodejs.org/api/url.html
  }
  // Fetch one listing page (selected by this.index) and collect its
  // article links into this.items.
  async getItems() {
    const html = await fetch(`${this.url}?${this.pagination}=${this.index}`).then(res => res.text())
    const $ = cheerio.load(html)
    const items = $(`[class="article-title"]`)
    const origin = this.getOrigin();
    for (const item of items) {
      // `+` binds tighter than `??`, so the original `origin + attr ?? ''`
      // never applied the fallback; parenthesize the attribute lookup.
      const href = origin + ($(item).attr('href') ?? '');
      const title = $(item).attr('title') ?? '';
      const text = $(item).text() ?? '';
      this.items.push({
        href,
        title,
        text,
      })
    }
  }
  // Read the pagination widget on the seed page to learn the total page
  // count, then crawl every page and finally the videos.
  async getPages() {
    const html = await fetch(`${this.url}`).then(res => res.text())
    const $ = cheerio.load(html)
    // Ant Design pagination items reveal the total number of pages.
    const pages = $(`li[class^="ant-pagination-item"]`)
    for (let i = 0; i < pages.length; i++) {
      this.index += 1;
      console.log(`✅ index =`, i, this.index)
      // Sequential: polite crawling, one page fetch at a time.
      await this.getItems()
    }
    await this.getVideos()
  }
  // Public entry point.
  async parseHTML() {
    await this.getPages()
  }
}
function test() {
const url = `http://abczzz.cn/archives/136`
const pagination = `page`
const links = {
targetLink: `targetLink`,
pageLink: `pageLink`,
fileLink: `fileLink`,
}
const crawler = new WebCrawler(url, pagination, links)
crawler.parseHTML()
}
test();
export default WebCrawler;
export {
WebCrawler,
};
/Users/xgqfrms-mm/Documents/github/node-web-framework-all-in-one/000-xyz/crawler/server.js
#!/usr/bin/env bash
# author: xgqfrms
# created: 2023.08.24
# description: video downloader for web crawler

# Fresh download directory on every run.
rm -rf videos
mkdir videos

# $1 is the first script argument: a file containing one URL per line.
# IFS= and -r make read keep leading whitespace and literal backslashes;
# redirecting the file avoids a useless `cat` and the subshell a pipe creates.
while IFS= read -r line
do
  # Shell variables should be double-quoted to survive spaces/globs.
  echo "✅ $line"
  # Strip any stray newline characters from the URL.
  url=$(echo "${line}" | tr -d '\n')
  # Enter the download directory.
  cd videos
  # -O/--remote-name: write output to a file named after the remote file ✅
  curl "$url" -O
  # Leave the download directory.
  cd ../
  # Throttle: wait 3 seconds between downloads to stay polite.
  sleep 3
done < "$1"
# usage: ./video.sh ./strs.md
$ ./video.sh ./strs.md
(🐞 反爬虫测试!打击盗版⚠️)如果你看到这个信息, 说明这是一篇剽窃的文章,请访问 https://www.cnblogs.com/xgqfrms/ 查看原创文章!
node-fetch
import fetch from "node-fetch";
import path from 'node:path';
import {fileURLToPath} from 'node:url';
// import fs from 'node:fs';
import {createWriteStream} from 'node:fs';
import {pipeline} from 'node:stream';
import {promisify} from 'node:util'
// const __filename = fileURLToPath(import.meta.url);
// const __dirname = path.dirname(__filename);
// console.log(`import.meta.url`, import.meta.url)
// console.log(`__dirname`, __dirname)
/**
 * Download `url` to the local file `path` via node-fetch streaming.
 * Resolves only after the response body has been fully piped to disk.
 * Network/HTTP errors are logged, not rethrown (best-effort, matching
 * the original's .catch behavior).
 *
 * @param {string} url - remote resource URL
 * @param {string} path - local destination file path
 */
async function downloadFile(url, path) {
  const streamPipeline = promisify(pipeline);
  // The original never awaited the fetch chain, so this async function
  // resolved immediately and completion/errors were unobservable.
  await fetch(url).then(async (res) => {
    if (!res.ok) {
      throw new Error(`unexpected response ${res.statusText}`);
    }
    console.log(`✅ res =`, res)
    // pipeline handles backpressure and cleans up both streams on error.
    return await streamPipeline(res.body, createWriteStream(path));
  }).catch(err => {
    console.log(`❌ err =`, err)
  }).finally(() => {
    console.log(`finally 👻`)
  })
}
// async function downloadFile(url, path) {
// const streamPipeline = promisify(pipeline);
// const res = await fetch(url)
// if (!res.ok) {
// throw new Error(`unexpected response ${res.statusText}`);
// }
// // console.log(`✅ res =`, res)
// await streamPipeline(res.body, createWriteStream(path));
// }
// const url = `https://edu-vod.lagou.com/sv/2daa3bb9-1765150ee6e/2daa3bb9-1765150ee6e.mp4`
// const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp4`
// NOTE(review): `.mp5` looks like a deliberate bad URL to exercise the
// error path (the working `.mp4` form is commented out above) — confirm.
const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp5`
await downloadFile(url, "./test.mp4");
/*
$ node ./node-fetch.js
*/
axios
import axios from 'axios';
// import axios, {isCancel, AxiosError} from 'axios';
// import fs from 'node:fs';
import {createWriteStream} from 'node:fs';
// import path from 'path';
// import { fileURLToPath } from 'url';
// const __filename = fileURLToPath(import.meta.url);
// const __dirname = path.dirname(__filename);
// console.log(`import.meta.url`, import.meta.url)
// console.log(`__dirname`, __dirname)
// async function downloadFile(url, path) {
// const res = await axios({
// url,
// method: "GET",
// responseType: "stream",
// });
// console.log(`✅ content-type =`, res.headers['content-type'])
// res.data.pipe(createWriteStream(path));
// }
/**
 * Download `url` to the local file `path` via an axios response stream.
 * Resolves only after the file has been fully flushed to disk; errors
 * are logged, not rethrown (best-effort, matching the original .catch).
 *
 * @param {string} url - remote resource URL
 * @param {string} path - local destination file path
 */
async function downloadFile(url, path) {
  await axios({
    url,
    method: "GET",
    responseType: "stream",
  }).then(res => {
    console.log(`✅ content-type =`, res.headers['content-type'])
    // pipe() returns immediately, so the original promise settled before
    // the write finished; wrap the write stream so we settle on 'finish'
    // (or reject on a stream 'error').
    return new Promise((resolve, reject) => {
      res.data.pipe(createWriteStream(path))
        .on('finish', resolve)
        .on('error', reject);
    });
  }).catch(err => {
    console.log(`❌ err =`, err)
  }).finally(() => {
    console.log(`finally 👻`)
  })
}
// const url = `https://edu-vod.lagou.com/sv/2daa3bb9-1765150ee6e/2daa3bb9-1765150ee6e.mp4`
const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp4`
// Top-level await: this file must run as an ES module.
await downloadFile(url, "./test.mp4");
/*
$ node ./axios.js
*/
refs
https://github.com/xgqfrms/node-web-framework-all-in-one
https://www.cnblogs.com/xgqfrms/tag/爬虫/
https://www.cnblogs.com/xgqfrms/p/16086580.html
©xgqfrms 2012-2021
www.cnblogs.com/xgqfrms 发布文章使用:只允许注册用户才可以访问!
原创文章,版权所有©️xgqfrms, 禁止转载 🈲️,侵权必究⚠️!
本文首发于博客园,作者:xgqfrms,原文链接:https://www.cnblogs.com/xgqfrms/p/17655286.html
未经授权禁止转载,违者必究!