
How to implement a Node.js crawler with automatic pagination (All In One)

A web crawler is an automated program that extracts data from the internet.

How the crawler works

  1. Analyze the seed URL format and page structure of the target site, and identify the unique pattern of the target links and the pagination parameter
  2. Download the page behind the seed URL as an HTML string
  3. Parse the HTML string and dynamically read the total number of pages
  4. Generate the URL for each page in a loop (see the sketch after this list)
  5. Write all target links found on the currently crawled page into a local file, one per line
  6. Once all target links have been collected, read the local file back line by line, set a safe download rate, and batch-download the target resources (.pdf / .mp4 files, etc.)
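
Steps 4 and 5 can be sketched in a few lines of Node.js. A minimal sketch; the seed URL, the page query parameter name, and the page count below are hypothetical placeholders, the real values come from steps 1 and 3:

import fs from 'node:fs';

// hypothetical seed URL and page count (placeholders)
const seedURL = 'https://example.com/articles';
const totalPages = 10;

// step 4: generate one URL per page in a loop
const pageURLs = [];
for (let page = 1; page <= totalPages; page++) {
  pageURLs.push(`${seedURL}?page=${page}`);
}

// step 5: write the collected links to a local file, one per line
fs.writeFileSync('./links.txt', pageURLs.join('\n'));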

https://nodejs.dev/en/learn/writing-files-with-nodejs/


Candidate crawler solutions

  • Node.js
  • Puppeteer (headless Chrome)
  • Shell Script
  • Python Script
  • node-fetch
  • axios
    ... etc.

Note: when paginating, the crawler must respect the site's anti-crawling policy; crawling too fast may get your IP banned.
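
A simple way to keep the request rate low is to sleep between requests. A minimal sketch, reusing the hypothetical pageURLs list from above; the 3-second delay is an assumption, tune it to the target site:

// resolve after ms milliseconds, so callers can await a pause
const sleep = (ms) => new Promise((resolve) => setTimeout(resolve, ms));

for (const pageURL of pageURLs) {
  const html = await fetch(pageURL).then((res) => res.text());
  // ... parse html here ...
  await sleep(3000); // wait 3 seconds before the next request
}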

HTML string => HTML DOM

Parsing the HTML string

$ npm i -S cheerio

import * as cheerio from 'cheerio';
// const cheerio = require('cheerio');

// const $ = cheerio.load('html string');
const $ = cheerio.load('<h2 class="title">Hello world</h2>');

// selector ✅
$('h2.title').text();
// "Hello world"

// traverse the DOM ✅
$('h2.title').find('.subtitle').text();

// manipulate the element
$('h2.title').text('Hello there!');
$('h2').after('<h3>How are you?</h3>');


https://cheerio.js.org/
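
The same API covers step 3 above (reading the total page count). A minimal sketch, assuming the pager is rendered as li elements whose class starts with ant-pagination-item, as in the getPages() method of the demo below; seedURL is a placeholder:

import * as cheerio from 'cheerio';

const seedURL = 'https://example.com/articles'; // placeholder
const html = await fetch(seedURL).then((res) => res.text());
const $ = cheerio.load(html);
// count the pagination items to get the total number of pages
const totalPages = $('li[class^="ant-pagination-item"]').length;
console.log(`totalPages =`, totalPages);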

pure js solution ❌

import fs from 'node:fs';
import https from 'node:https';
// import http from 'node:http';
// import * as http2 from 'node:http2';
import path from 'node:path';

// defaults to the current working directory ✅
const __dirname = path.resolve();

console.log('__dirname', __dirname);

async function download(url, dest, cb) {
  const file = fs.createWriteStream(dest);
  const request = https.get(url, res => {
    // stream the response straight into the file
    res.pipe(file);
    file.on('finish', function() {
      file.close(cb);
      // close() is async; cb is called after close completes.
    }).on('error', err => {
      // delete the (partial) file asynchronously
      fs.unlink(dest, () => {});
      if (cb) {
        cb(err.message);
      }
    });
  })
  .on('error', err => {
    // delete the (partial) file asynchronously
    fs.unlink(dest, () => {});
    if (cb) {
      cb(err.message);
    }
  });
}

// ❌ Error: EISDIR: illegal operation on a directory
// (dest must be a file path, not a directory; see the corrected call below)
const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp4`

download(url, __dirname, (msg) => console.log(`msg =`, msg))
// download(url, `./video.mp4`, (msg) => console.log(`msg =`, msg))

// await fetch(url)
//   .then(res => res.arrayBuffer())
//   // .then(res => res.blob())
//   .then(bytes => {
//     let id = setInterval(() => {
//       console.log(`⏳ loading ...`)
//     }, 1000);
//     fs.writeFile(`./video.mp4`, JSON.stringify(bytes), () => {
//       clearInterval(id)
//       console.log(`✅ finished =`, bytes)
//     })
//   })
//   // .then(bytes => fs.writeFileSync(`./video.mp4`, bytes))
//   .catch(err => console.error(err));

demos

"use strict";

/**
 *
 * @author xgqfrms
 * @license MIT
 * @copyright xgqfrms
 * @created 2023-08-24
 * @modified
 *
 * @description
 * @difficulty Easy Medium Hard
 * @time_complexity O(n)
 * @space_complexity O(n)
 * @augments
 * @example
 * @link https://www.cnblogs.com/xgqfrms/p/17655286.html
 * @solutions
 *
 * @best_solutions
 *
 */


// const log = console.log;

import * as cheerio from 'cheerio'
import url from 'node:url';
import fs from 'node:fs';

class WebCrawler {
  constructor(url, pagination, links) {
    console.log(`node.js 🕷️`)
    if(!url || !pagination) {
      throw new Error(`Required arguments missing ❌\nurl=${url}\npagination=${pagination}`)
    }
    this.index = 0
    this.url = url
    this.pagination = pagination
    this.links = links
    this.items = []
    this.groups = []
  }
  async getVideos() {
    for (const item of this.items) {
      const html = await fetch(item.href).then(res => res.text())
      // regular expression for the target video URLs
      const reg = /https\:\/\/edu-vod\.lagou\.com\/sv\/(.*)\.mp4/ig
      // const reg = /https\:\/\/edu-vod\.lagou\.com(.*)\.mp4/ig
      // collect every match (matchAll is clearer than abusing replaceAll)
      for (const [group] of html.matchAll(reg)) {
        this.groups.push({
          url: group,
          title: item.title,
        })
      }
    }
    console.log(`✅ groups`, this.groups.length, this.groups);
    await this.writeFile()
  }
  async writeFile() {
    let strs = ``;
    for (const group of this.groups) {
      strs += `${group.url}\n`
    }
    // fs.writeFile is callback-based, so there is nothing to await here
    fs.writeFile(`./strs.md`, strs, () => {
      console.log(`OK ✅`)
    })
    // ❌ TypeError [ERR_INVALID_ARG_TYPE]: The "data" argument must be of type string or an instance of Buffer, TypedArray, or DataView. Received an instance of Array
    // const data = JSON.stringify(this.items, null, 4)
    // await fs.writeFile(`./data.json`, data, () => {
    //   console.log(`OK ✅`)
    // })
  }
  getOrigin() {
    const urlObject = new URL(this.url);
    return urlObject.origin;
    // https://nodejs.org/api/url.html
  }
  async getItems() {
    const html = await fetch(`${this.url}?${this.pagination}=${this.index}`).then(res => res.text())
    const $ = cheerio.load(html)
    const items = $(`[class="article-title"]`)
    const origin = this.getOrigin();
    for (const item of items) {
      const href = origin + ($(item).attr('href') ?? '');
      const title = $(item).attr('title') ?? '';
      const text = $(item).text() ?? '';
      this.items.push({
        href,
        title,
        text,
      })
    }
  }
  async getPages() {
    const html = await fetch(`${this.url}`).then(res => res.text())
    const $ = cheerio.load(html)
    const pages = $(`li[class^="ant-pagination-item"]`)
    for (let i = 0; i < pages.length; i++) {
      this.index += 1;
      console.log(`✅ index =`, i, this.index)
      await this.getItems()
    }
    await this.getVideos()
  }
  async parseHTML() {
    await this.getPages()
  }
}

function test() {
  const url = `http://abczzz.cn/archives/136`
  const pagination = `page`
  const links = {
    targetLink: `targetLink`,
    pageLink: `pageLink`,
    fileLink: `fileLink`,
  }
  const crawler = new WebCrawler(url, pagination, links)
  crawler.parseHTML()
}

test();

export default WebCrawler;
export {
  WebCrawler,
};

/Users/xgqfrms-mm/Documents/github/node-web-framework-all-in-one/000-xyz/crawler/server.js


#!/usr/bin/env bash

# author: xgqfrms
# created: 2023.08.24
# description: video downloader for the web crawler

# download directory
rm -rf videos
mkdir videos

# $1 is the first argument passed to the shell script
# read line reads the file line by line
cat "$1" | while read -r line
do
  # wrap shell variables in double quotes, e.g. echo "$line"
  echo "✅ $line"
  # one URL per line; strip the trailing newline
  url=$(echo "${line}" | tr -d '\n')
  # enter the download directory
  cd videos
  # -O/--remote-name: write output to a local file named like the remote file ✅
  curl "$url" -O
  # leave the directory
  cd ../
  # wait 3 seconds
  sleep 3
done

# ./video.sh ./strs.md

$ ./video.sh ./strs.md



node-fetch


import fetch from "node-fetch";

import path from 'node:path';
import {fileURLToPath} from 'node:url';

// import fs from 'node:fs';
import {createWriteStream} from 'node:fs';
import {pipeline} from 'node:stream';
import {promisify} from 'node:util'

// const __filename = fileURLToPath(import.meta.url);
// const __dirname = path.dirname(__filename);
// console.log(`import.meta.url`, import.meta.url)
// console.log(`__dirname`, __dirname)


async function downloadFile(url, path) {
  const streamPipeline = promisify(pipeline);
  // return the chain so that callers can actually await the download
  return fetch(url).then(async (res) => {
    if (!res.ok) {
      throw new Error(`unexpected response ${res.statusText}`);
    }
    console.log(`✅ res =`, res)
    return await streamPipeline(res.body, createWriteStream(path));
  }).catch(err => {
    console.log(`❌ err =`, err)
  }).finally(() => {
    console.log(`finally 👻`)
  })
}

// async function downloadFile(url, path) {
//   const streamPipeline = promisify(pipeline);
//   const res = await fetch(url)
//   if (!res.ok) {
//     throw new Error(`unexpected response ${res.statusText}`);
//   }
//   // console.log(`✅ res =`, res)
//   await streamPipeline(res.body, createWriteStream(path));
// }


// const url = `https://edu-vod.lagou.com/sv/2daa3bb9-1765150ee6e/2daa3bb9-1765150ee6e.mp4`
// const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp4`
// note: ".mp5" is a deliberately broken URL, to exercise the error branch ❌
const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp5`
await downloadFile(url, "./test.mp4");

/*

$ node ./node-fetch.js

*/


axios

import axios from 'axios';
// import axios, {isCancel, AxiosError} from 'axios';

// import fs from 'node:fs';
import {createWriteStream} from 'node:fs';

// import path from 'path';
// import { fileURLToPath } from 'url';

// const __filename = fileURLToPath(import.meta.url);
// const __dirname = path.dirname(__filename);
// console.log(`import.meta.url`, import.meta.url)
// console.log(`__dirname`, __dirname)

// async function downloadFile(url, path) {
//   const res = await axios({
//     url,
//     method: "GET",
//     responseType: "stream",
//   });
//   console.log(`✅ content-type =`, res.headers['content-type'])
//   res.data.pipe(createWriteStream(path));
// }

async function downloadFile(url, path) {
  await axios({
    url,
    method: "GET",
    responseType: "stream",
  }).then(res => {
    console.log(`✅ content-type =`, res.headers['content-type'])
    // pipe() starts streaming; it does not wait for the file to finish writing
    return res.data.pipe(createWriteStream(path));
  }).catch(err => {
    console.log(`❌ err =`, err)
  }).finally(() => {
    console.log(`finally 👻`)
  })
}


// const url = `https://edu-vod.lagou.com/sv/2daa3bb9-1765150ee6e/2daa3bb9-1765150ee6e.mp4`
const url = `https://cdn.xgqfrms.xyz/video/web-testing.mp4`
await downloadFile(url, "./test.mp4");


/*

$ node ./axios.js

*/
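
Because res.data.pipe(createWriteStream(path)) returns as soon as piping starts, the promise above can resolve before the file is fully written. If the caller needs to wait until the data is flushed to disk, one option (a sketch, not part of the original demo) is finished from node:stream/promises:

import axios from 'axios';
import { createWriteStream } from 'node:fs';
import { finished } from 'node:stream/promises';

async function downloadFileAndWait(url, path) {
  const res = await axios({ url, method: 'GET', responseType: 'stream' });
  const writer = createWriteStream(path);
  res.data.pipe(writer);
  // resolves only after the write stream has finished writing the file
  await finished(writer);
}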

refs

https://github.com/xgqfrms/node-web-framework-all-in-one

https://www.cnblogs.com/xgqfrms/tag/爬虫/

https://www.cnblogs.com/xgqfrms/p/16086580.html


