日常生活的交流与学习

首页 新随笔 联系 管理

E:\公众号文章采集\fi_filter_过滤器\src\exact_新浪博客手机版提取连接.js

const fs = require('fs');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;

// Parse every file in ./html with jsdom + jQuery and append the href of each
// link found inside #js_content to ./urls.txt, one URL per CRLF-terminated line.
fs.readdir('./html', function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to iterate over.
    return console.error(err);
  }
  // Empty the output file exactly once, before any link is appended. The
  // previous code opened a truncating write stream for every input file while
  // appends were still in flight, which could discard links already written.
  fs.writeFileSync('./urls.txt', '');
  files.forEach((file) => {
    fs.readFile('./html/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require('jQuery')(window);
      let index = 1;
      // #js_content a           href  hyperlinks

      $('#js_content a').each(function () {
        // Synchronous append keeps the links in document order.
        try {
          fs.appendFileSync('./urls.txt', `${$(this).attr('href')}\r\n`);
        } catch (appendErr) {
          console.log('append txt failed');
          return;
        }
        console.log(index++ + '__append file success');
      });

      //--------------------------------------------------------
      //#js_content_overlay ul li                   data-link  topic pages

      // $("#js_content_overlay ul li").each(function () {
      //   fs.appendFile(
      //     "./urls.txt",
      //     `第一章\r\n${$(this).attr("data-link")}\r\n`,
      //     (err) => {
      //       if (err) {
      //         return console.log("append txt failed");
      //       }
      //       console.log(index++ + "__append file success");
      //     }
      //   );
      // });

      //-----------------------------------------------
      // Sina blog (mobile version): extract links
      // $("body ul li a").each(function () {
      //   fs.appendFile(
      //     "./urls.txt",
      //     `第一章\r\n${$(this).attr("href")}\r\n`,
      //     (err) => {
      //       if (err) {
      //         return console.log("append txt failed");
      //       }
      //       console.log(index++ + "__append file success");
      //     }
      //   );
      // });
      //----------------------------------------
      // WeChat official-account home page: extract links
      // $("span[hrefs]").each(function () {
      //   fs.appendFile(
      //     "./urls.txt",
      //     `第一章\r\n${$(this).attr("hrefs")}\r\n`,
      //     (err) => {
      //       if (err) {
      //         return console.log("append txt failed");
      //       }
      //       console.log(index++ + "__append file success");
      //     }
      //   );
      // });
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_倒序.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// For every .txt page dump in ./html, write its blog list (last entry first)
// as Markdown links "[title](url)" to ./html/目录/新浪博客目录.txt.
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  // createWriteStream does not create folders, so make sure the target exists.
  fs.mkdirSync(path + '/目录', { recursive: true });
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] !== 'txt') return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      // const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
      // NOTE(review): every input file writes to this same output path, so with
      // several .txt inputs the streams overwrite each other - confirm intent.
      const writeStream = fs.createWriteStream(
        path + '/目录/' + '新浪博客目录.txt',
        'utf-8'
      );
      const items = $('#pl-home-bloglist > article > ul>li').get().reverse();
      for (const ele of items) {
        const title = $(ele).find('h2').html();
        const url = $(ele).find('a').attr('data-link');
        // stream.write() throws on null/undefined, so a missing title or link
        // is written as empty text instead of aborting the whole run.
        writeStream.write(`[${title ?? ''}](${url ?? ''})\n\n`);
        console.log(title);
        console.log(url);
      }
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_公众号_历史消息_原创_时间正序.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;

// Names of the entries in the target folder.
const fileList = fs.readdirSync(path);

// Only names ending in ".html" are processed.
const htmlSuffix = /\.(html)$/;

fileList
  .filter((name) => htmlSuffix.test(name))
  .forEach((name) => {
    const $ = cheerio.load(fs.readFileSync(path + '/' + name, 'utf-8'));
    const writeStream = fs.createWriteStream(
      path + '/' + name.split('.')[0] + '_讯飞有声_超链接_正序.txt',
      'utf-8'
    );
    // Message cards, ordered by ascending msgid.
    const cards = Array.from($('#js_history_list').children());
    cards.sort((a, b) => a.attribs.msgid - b.attribs.msgid);

    cards.forEach((card) => {
      const $card = cheerio.load(card);
      const link = $card('h4').attr('hrefs');
      const originMark = $card('#copyright_logo').html();
      // Card type read from the h4's parent: APPMSG (article), VIDEO, TEXT.
      const type = $card('h4').parent().attr('data-type');
      // Keep only articles flagged as original.
      if (originMark !== '原创' || type !== 'APPMSG') return;
      console.log(link);
      writeStream.write(link);
      writeStream.write('\n');
    });
    writeStream.end();
  });

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_公众号_历史消息_原创_时间正序_markdown.js

const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, //月份
    'd+': this.getDate(), //日
    'h+': this.getHours(), //小时
    'm+': this.getMinutes(), //分
    's+': this.getSeconds(), //秒
    'q+': Math.floor((this.getMonth() + 3) / 3), //季度
    S: this.getMilliseconds(), //毫秒
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};

let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;

// Names of the entries in the target folder.
const fileList = fs.readdirSync(path);

// For each .html history page, write "<name>_link_正序.md" with one Markdown
// link per original article, ordered by ascending msgid.
for (let file of fileList) {
  let suffixReg = /\.(html)$/;
  if (!suffixReg.test(file)) continue;

  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  const writeStream = fs.createWriteStream(
    path + '/' + file.split('.')[0] + '_link_正序.md',
    'utf-8'
  );
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    let isOrigin = $1('#copyright_logo').html();
    // Card type read from the h4's parent: APPMSG (article), VIDEO, TEXT.
    let type = $1('h4').parent().attr('data-type');
    // Filter first: only original articles are exported, and the time/title
    // nodes read below may be absent on other card types.
    if (isOrigin !== '原创' || type !== 'APPMSG') continue;

    let link = $1('h4').attr('hrefs') ?? '';
    let rawTime = $1('p.weui_media_extra_info').html();
    if (rawTime === null) {
      console.warn('skipped a card without publish time in ' + file);
      continue;
    }
    // Turn e.g. 2020年7月2日 into 2020年07月02日.
    let time = rawTime.split('<span')[0].trim();
    time = time.replace('年', '/').replace('月', '/').replace('日', '');
    time = new Date(time).format('yyyy年MM月dd日');
    // The title is the text after the copyright <span>; fall back to the whole
    // heading when that span is not inside the h4.
    let parts = ($1('h4').html() ?? '').split('</span>');
    let title = (parts[1] ?? parts[0]).trim();
    console.log(time + '_' + title);
    // One Markdown link per line.
    writeStream.write(`[${time}_${title}](${link})`);
    writeStream.write('\n');
  }
  writeStream.end();
}

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_公众号_历史消息_原创_时间正序_markdown_当前文件夹.js

const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, //月份
    'd+': this.getDate(), //日
    'h+': this.getHours(), //小时
    'm+': this.getMinutes(), //分
    's+': this.getSeconds(), //秒
    'q+': Math.floor((this.getMonth() + 3) / 3), //季度
    S: this.getMilliseconds(), //毫秒
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};
// Work on the folder that contains this script.
let path = __dirname;
// Names of the entries in that folder (not recursive).
const fileList = fs.readdirSync(path);

// For each .txt history dump, write "<name>_link_正序.md" with one Markdown
// link per original article, ordered by ascending msgid.
for (let file of fileList) {
  console.log('---------' + file + '----------');
  let suffixReg = /\.(txt)$/;
  if (!suffixReg.test(file)) continue;

  // Load the file into cheerio.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();
  // No cards means this file is not a history page (e.g. a topic page or a
  // hand-made link list). A cheerio collection is always truthy, so the test
  // has to look at its length.
  if (js_history_list.length === 0) continue;

  // Open the output only for files that really contain cards.
  const writeStream = fs.createWriteStream(
    path + '/' + file.split('.')[0] + '_link_正序.md',
    'utf-8'
  );
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    let isOrigin = $1('#copyright_logo').html();
    // Card type read from the h4's parent: APPMSG (article), VIDEO, TEXT.
    let type = $1('h4').parent().attr('data-type');
    // Filter first: the time/title nodes read below may be absent on other
    // card types.
    if (isOrigin !== '原创' || type !== 'APPMSG') continue;

    let link = $1('h4').attr('hrefs') ?? '';
    let rawTime = $1('p.weui_media_extra_info').html();
    if (rawTime === null) {
      console.warn('skipped a card without publish time in ' + file);
      continue;
    }
    // Turn e.g. 2020年7月2日 into 2020年07月02日.
    let time = rawTime.split('<span')[0].trim();
    time = time.replace('年', '/').replace('月', '/').replace('日', '');
    time = new Date(time).format('yyyy年MM月dd日');
    // The title is the text after the copyright <span>; fall back to the whole
    // heading so the literal text "undefined" is never written.
    let parts = ($1('h4').html() ?? '').split('</span>');
    let title = (parts[1] ?? parts[0]).trim();
    console.log(time + '_' + title);
    // One Markdown link per line.
    writeStream.write(`[${time}_${title}](${link})`);
    writeStream.write('\n');
  }
  writeStream.end();
}

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_公众号_历史消息_原创_时间正序_markdown_当前文件夹_合并.js

const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, //月份
    'd+': this.getDate(), //日
    'h+': this.getHours(), //小时
    'm+': this.getMinutes(), //分
    's+': this.getSeconds(), //秒
    'q+': Math.floor((this.getMonth() + 3) / 3), //季度
    S: this.getMilliseconds(), //毫秒
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};
// Folder to process (switch to __dirname to use the script's own folder).
// let path = __dirname;
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Names of the entries in that folder (not recursive).
const fileList = fs.readdirSync(path);
// A single output file collects the links of every history dump.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);

for (let file of fileList) {
  console.log('---------' + file + '----------');
  let suffixReg = /\.(txt)$/;
  if (!suffixReg.test(file)) continue;

  // Load the file into cheerio.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();
  // No cards means this file is not a history page (e.g. a topic page or a
  // hand-made link list). A cheerio collection is always truthy, so the test
  // has to look at its length.
  if (js_history_list.length === 0) continue;

  // The file name (without extension) becomes a level-1 Markdown heading.
  writeStream.write(`# ${file.split('.')[0]}\n`);
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    // Publish date shown in the card header.
    let time = $1('.weui_msg_card_hd:first-child').html();
    if (time === null) {
      console.warn('skipped a card without header date in ' + file);
      continue;
    }
    time = timeConvert(time);
    const msgList = $1('.weui_msg_card_bd').children();
    for (let msg of msgList) {
      const $2 = cheerio.load(msg);
      // Message type read from the h4's parent: APPMSG (article), VIDEO, TEXT.
      let type = $2('h4').parent().attr('data-type');
      // Original articles carry a copyright <span> inside the h4 title.
      let isOrigin = $2('#copyright_logo').html();
      // Filter before touching the title: messages without an h4 (e.g. plain
      // text) would otherwise fail on a null heading.
      if (isOrigin !== '原创' || type !== 'APPMSG') continue;

      // The title is the text after the copyright <span>.
      let parts = ($2('h4').html() ?? '').split('</span>');
      let title = (parts[1] ?? parts[0]).trim();
      let link = $2('h4').attr('hrefs') ?? '';
      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();

// 对日期进行处理,将2020年7月2日-->2020年07月02日
function timeConvert(time) {
  time = time.replace('年', '/').replace('月', '/').replace('日', '');
  let newTime = new Date(time).format('yyyy年MM月dd日');
  return newTime;
}

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_公众号_历史消息_原创_时间正序_markdown_当前文件夹_合并_优化.js

const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
  var o = {
    'M+': this.getMonth() + 1, //月份
    'd+': this.getDate(), //日
    'h+': this.getHours(), //小时
    'm+': this.getMinutes(), //分
    's+': this.getSeconds(), //秒
    'q+': Math.floor((this.getMonth() + 3) / 3), //季度
    S: this.getMilliseconds(), //毫秒
  };
  if (/(y+)/.test(fmt)) {
    fmt = fmt.replace(
      RegExp.$1,
      (this.getFullYear() + '').substr(4 - RegExp.$1.length)
    );
  }
  for (var k in o) {
    if (new RegExp('(' + k + ')').test(fmt)) {
      fmt = fmt.replace(
        RegExp.$1,
        RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
      );
    }
  }
  return fmt;
};
// Work on the folder that contains this script.
let path = __dirname;
// let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Names of the entries in that folder (not recursive).
const fileList = fs.readdirSync(path);
// A single output file collects the links of every history dump.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);

for (let file of fileList) {
  console.log('---------' + file + '----------');
  let suffixReg = /\.(txt)$/;
  if (!suffixReg.test(file)) continue;

  // Load the file into cheerio.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards of the page.
  let js_history_list = $('#js_history_list').children();

  // No cards means this file is not a history page (e.g. a topic page or a
  // hand-made link list). A cheerio collection is always truthy, so the test
  // has to look at its length.
  if (js_history_list.length === 0) continue;

  // The file name (without extension) becomes a level-1 Markdown heading.
  writeStream.write(`# ${file.split('.')[0]}\n`);
  // Sort the cards by ascending msgid.
  let history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (let ele of history_list) {
    const $1 = cheerio.load(ele);
    // Publish date shown in the card header.
    let time = $1('.weui_msg_card_hd:first-child').html();
    if (time === null) {
      console.warn('skipped a card without header date in ' + file);
      continue;
    }
    time = timeConvert(time);
    const msgList = $1('.weui_msg_card_bd').children();
    for (let msg of msgList) {
      const $2 = cheerio.load(msg);
      // Message type read from the h4's parent: APPMSG (article), VIDEO, TEXT.
      let type = $2('h4').parent().attr('data-type');
      // Only article messages are considered.
      if (type !== 'APPMSG') continue;
      // Original articles carry a copyright <span> inside the h4 title.
      let isOrigin = $2('#copyright_logo').html();
      if (isOrigin !== '原创') continue;

      // The title is the text after the copyright <span>.
      let parts = ($2('h4').html() ?? '').split('</span>');
      let title = (parts[1] ?? parts[0]).trim();
      let link = $2('h4').attr('hrefs') ?? '';
      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();

// 对日期进行处理,将2020年7月2日-->2020年07月02日
function timeConvert(time) {
  time = time.replace('年', '/').replace('月', '/').replace('日', '');
  let newTime = new Date(time).format('yyyy年MM月dd日');
  return newTime;
}

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_微信公众号_历史消息_提取链接.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;

// Names of the entries in the input folder.
const fileList = fs.readdirSync(path);

// createWriteStream does not create folders; without this an absent output
// folder makes the stream emit an unhandled 'error' event.
fs.mkdirSync(path + '/目录', { recursive: true });

// For each .txt history dump, write the "hrefs" attribute of every h4 inside
// #js_history_list to a same-named file in the 目录 sub-folder, one per line.
for (let file of fileList) {
  console.log(file);
  if (file.split('.')[1] === 'txt') {
    const data = fs.readFileSync(path + '/' + file, 'utf8');
    const $ = cheerio.load(data);
    // 1. Extract the link list
    const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
    $('#js_history_list h4').each((index, ele) => {
      console.log('--------' + index);
      // Headings without the attribute still produce an (empty) line.
      writeStream.write($(ele).attr('hrefs') ?? '');
      writeStream.write('\n');
    });
    writeStream.end();
  }
}

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_微信公众号_数据清洗.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// Data cleaning: rewrite every .txt page dump in ./html in place with all
// <script> and <link> elements removed.
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] !== 'txt') return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      $('script').remove();
      $('link').remove();
      // Build the new content before the file is reopened: opening the write
      // stream truncates the source, so a failure above must not reach it.
      const cleaned = $('html').html() ?? '';

      // 1. Overwrite the file at the same path with the cleaned markup.
      const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
      writeStream.write(cleaned);
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_微信公众号_话题_提取链接.js

const fs = require("fs");
const cheerio = require("cheerio");
// Input folder for the topic-page link extraction. Only the directory listing
// below is live; the extraction loop itself is commented out.
let path = `./html`;

// Read the folder; returns an array of entry names.
const fileList = fs.readdirSync(path);

// console.log(fileList);

// Iterate over the file list (disabled).
// for (let file of fileList) {
//   console.log(file);
//   if (file.split('.')[1] === 'txt') {
//     fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
//       const $ = cheerio.load(data);

//       // 1. Extract the link list
//       const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
//       $(
//         '#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li'
//       ).each((index, ele) => {
//         writeStream.write($(ele).attr('data-link'));
//         writeStream.write('\n');
//       });
//       writeStream.end();
//     });
//   }
// }

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_微信公众号_页面_提取链接.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;

// For each .html page in ./html, write the href of every link inside
// #js_content to a same-named file in the 目录 sub-folder, one per line.
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  // createWriteStream does not create folders, so make sure the target exists.
  fs.mkdirSync(path + '/目录', { recursive: true });
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] !== 'html') return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);

      // 1. Extract the link list
      const writeStream = fs.createWriteStream(
        path + '/目录/' + file,
        'utf-8'
      );
      $('#js_content a').each((index, ele) => {
        // An anchor without href yields undefined, which stream.write()
        // rejects; write an empty line for it instead.
        writeStream.write($(ele).attr('href') ?? '');
        writeStream.write('\n');
      });
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_微信公众号_页面_提取链接_同步.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;

// Names of the entries in the input folder.
const fileList = fs.readdirSync(path);

// createWriteStream does not create folders, so make sure the target exists.
fs.mkdirSync(path + '/目录', { recursive: true });

// For each .txt page dump, write the href of every link inside #js_content to
// a same-named file in the 目录 sub-folder, one per line.
for (let file of fileList) {
  if (file.split('.')[1] === 'txt') {
    // Read the file content.
    const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
    // Build the DOM tree with cheerio.
    const $ = cheerio.load(fileContent);
    // Open the output stream.
    const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf8');
    $('#js_content a').each((index, ele) => {
      // An anchor without href yields undefined, which stream.write()
      // rejects; write an empty line for it instead.
      writeStream.write($(ele).attr('href') ?? '');
      writeStream.write('\n');
    });
    // Close the output stream.
    writeStream.end();
  }
}

// fs.readdir(path, function (err, files) {
//   files.forEach((file) => {
//     console.log(file);
//     if (file.split('.')[1] === 'html') {
//       fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
//         const $ = cheerio.load(data);

//         // 1. 提取目录
//         const writeStream = fs.createWriteStream(
//           path + '/目录/' + file,
//           'utf-8'
//         );
//         $('#js_content a').each((index, ele) => {
//           writeStream.write($(ele).attr('href'));
//           writeStream.write('\n');
//         });
//         writeStream.end();
//       });
//     }
//   });
// });

E:\公众号文章采集\fi_filter_过滤器\src\extract_link_正序.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `./html`;
// For every .txt page dump in ./html, write its blog list (in page order) as
// Markdown links "[title](url)" to ./html/目录/新浪博客目录.txt.
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  // createWriteStream does not create folders, so make sure the target exists.
  fs.mkdirSync(path + '/目录', { recursive: true });
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] !== 'txt') return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      // const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
      // NOTE(review): every input file writes to this same output path, so with
      // several .txt inputs the streams overwrite each other - confirm intent.
      const writeStream = fs.createWriteStream(
        path + '/目录/' + '新浪博客目录.txt',
        'utf-8'
      );
      $('#pl-home-bloglist > article > ul>li').each((position, ele) => {
        const title = $(ele).find('h2').html();
        const url = $(ele).find('a').attr('data-link');
        // stream.write() throws on null/undefined, so a missing title or link
        // is written as empty text instead of aborting the whole run.
        writeStream.write(`[${title ?? ''}](${url ?? ''})\n\n`);
        console.log(title);
        console.log(url);
      });
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\filter_html.js

const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
// Strip boilerplate (marker spans, title, sections, images, scripts, meta and
// comment blocks) from every .html file in the folder and rewrite it in place.
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] !== "html") return;
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      // p/span/a elements mixed into the body that contain specific text
      $("span:contains('***')").remove();
      $("span:contains('--- TBC ---')").remove();
      $("span:contains('支持原创翻译')").remove();
      $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
      $("a:contains('阅读全文')").remove();
      $("p:contains('因此,在《给弟子的忠告》')").remove();
      // article title
      $("#activity-name").remove();
      // all section elements (audio blocks)
      $("section").remove();
      // all h3 headings
      $("h3").remove();
      // topic tags
      $("#js_tags").remove();
      // all images
      $("img").remove();
      // all script elements
      $("script").remove();
      // account name and publish time
      $("div#meta_content").remove();
      // comment area at the bottom
      $("div.comment").remove();

      // Build the result before reopening the file: opening the write stream
      // truncates the source, so a failure above must not reach that point.
      const html = $("html").html();
      if (typeof html !== "string") {
        return console.error("no <html> content produced for " + file);
      }
      const writeStream = fs.createWriteStream(path + "/" + file, "utf-8");
      writeStream.write(html);
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息`;

// Names of the entries in the folder.
const fileList = fs.readdirSync(path);

// Rewrite each .html file in place without its <script> and <link> elements.
fileList
  .filter((name) => name.split('.')[1] === 'html')
  .forEach((name) => {
    const target = path + '/' + name;
    const $ = cheerio.load(fs.readFileSync(target, 'utf-8'));
    const writeStream = fs.createWriteStream(target, 'utf-8');
    // Elements to drop, removed in this order.
    ['script', 'link'].forEach((selector) => {
      $(selector).remove();
    });
    // Write the remaining markup of the <html> element.
    writeStream.write($('html').html());
    writeStream.end();
  });

function filterContent($) {
  // 包含特定文字,掺杂在正文中的p标签,或者span标签
  // $("section:contains('相关阅读')").remove();
  // $("span:contains('--- TBC ---')").remove();
  // $("span:contains('支持原创翻译')").remove();
  // $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
  // $("a:contains('阅读全文')").remove();
  // $("p:contains('因此,在《给弟子的忠告》')").remove();
  //标题移除
  // $('#activity-name').remove();
  // 所有音频标签
  // $('section').remove();
  // 所有的h3标签
  // $('h3').remove();
  // 话题标签
  // $('#js_tags').remove();
  // 所有的img图片;
  // $('img').remove();
  // 所有的script标签
  $('script').remove();
  //公众号名称 时间信息
  // $('div#meta_content').remove();
  //底部评论信息
  // $('div.comment').remove();
}

E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_听心坊.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\听心坊\\`;

// Names of the entries in the folder.
const fileList = fs.readdirSync(path);

// For each .html article, read the index that follows '·' in the
// "【明亮说" span and save a copy of the page named "<index><file>".
for (let file of fileList) {
  console.log(file);
  if (file.split('.')[1] === 'html') {
    const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
    const $ = cheerio.load(fileContent);

    // Content filters ---------------------------------------------------
    // p/span elements mixed into the body that contain specific text
    // $("span:contains('如欲与陈明亮先生交流,请登陆:')").remove();
    // $("span:contains('facebook.com/profile.php?id=100039436871466')").remove();
    // $("span:contains('更多陈明亮的文章,请前往')").remove();
    // $("span:contains('更多的音频')").remove();
    // $("span:contains('摄影')").remove();
    // Numeric index embedded in the article text
    // console.log($("span:contains('【明亮说')").text());
    let idx = $("span:contains('【明亮说')").text().split('·')[1];
    console.log(idx);
    // Without a '·' there is no index; the old code then produced a file whose
    // name started with the literal text "undefined".
    if (idx === undefined) {
      console.warn('no index found, skipped: ' + file);
      continue;
    }

    // // test breakpoint
    // writeStream.write($('html').html());
    // writeStream.end();
    // break;
    // // test breakpoint

    // $("a:contains('阅读全文')").remove();
    // $("p:contains('因此,在《给弟子的忠告》')").remove();
    // article title
    // $('#activity-name').remove();
    // all section elements (audio blocks)
    // $('section').remove();
    // all h3 headings
    // $('h3').remove();
    // topic tags
    // $('#js_tags').remove();
    // all images
    // $('img').remove();
    // all script elements
    // $('script').remove();
    // account name and publish time
    // $('div#meta_content').remove();
    // comment area at the bottom
    // $('div.comment').remove();
    // Write the html -----------------------------------------------------
    const writeStream = fs.createWriteStream(path + '/' + idx + file, 'utf-8');
    writeStream.write($('html').html());
    writeStream.end();
  }
}

E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_旭然之光.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\煦然之光`;
// Re-serialise every .html file in the folder through cheerio and write it
// back in place. All content filters are currently disabled.
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] !== 'html') return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.error(err);
      }
      const $ = cheerio.load(data);
      // p/span elements mixed into the body that contain specific text
      // $("section:contains('相关阅读')").remove();
      // $("span:contains('--- TBC ---')").remove();
      // $("span:contains('支持原创翻译')").remove();
      // $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
      // $("a:contains('阅读全文')").remove();
      // $("p:contains('因此,在《给弟子的忠告》')").remove();
      // article title
      // $('#activity-name').remove();
      // all section elements (audio blocks)
      // $('section').remove();
      // all h3 headings
      // $('h3').remove();
      // topic tags
      // $('#js_tags').remove();
      // all images
      // $('img').remove();
      // all script elements
      // $('script').remove();
      // account name and publish time
      // $('div#meta_content').remove();
      // comment area at the bottom
      // $('div.comment').remove();

      // Build the result before reopening the file: opening the write stream
      // truncates the source, so a failure above must not reach that point.
      const html = $('html').html() ?? '';
      const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
      writeStream.write(html);
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_阿知事业林.js

const fs = require('fs');
const cheerio = require('cheerio');
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;

// Names of the entries in the folder.
const fileList = fs.readdirSync(path);

// Selectors of the elements stripped from every page, in removal order.
const removedSelectors = [
  'img', // all images
  'script', // all script elements
  '#meta_content', // author and publish time
  '.comment', // comments
  '#js_tags', // topic tags
  '#js_sponsor_ad_area', // sponsor / ad area
  'a', // hyperlinks
];

// Optional filters, currently disabled:
// $("section:contains('相关阅读')").remove();
// $("span:contains('--- TBC ---')").remove();
// $("span:contains('支持原创翻译')").remove();
// $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
// $("a:contains('阅读全文')").remove();
// $("p:contains('因此,在《给弟子的忠告》')").remove();
// $('section').remove();
// $('h3').remove();
// $('div#meta_content').remove();
// $('div.comment').remove();

// Rewrite each .html file in place without the elements listed above.
fileList.forEach((name) => {
  console.log(name);
  if (name.split('.')[1] !== 'html') return;
  const target = path + '/' + name;
  // Parse the page, then reopen the same file for writing.
  const $ = cheerio.load(fs.readFileSync(target, 'utf8'));
  const writeStream = fs.createWriteStream(target, 'utf-8');
  for (const selector of removedSelectors) {
    $(selector).remove();
  }
  // Write the remaining markup of the <html> element.
  writeStream.write($('html').html());
  writeStream.end();
});

E:\公众号文章采集\fi_filter_过滤器\src\filter_html_不死甘露.js

const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
// Strip book/translator boilerplate, title, sections, images, scripts, meta and
// comment blocks from every .html file in the folder and rewrite it in place.
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] !== "html") return;
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);

      // p/span/a/strong elements mixed into the body that contain specific text
      $("span:contains('***')").remove();
      $("span:contains('--- TBC ---')").remove();
      $("span:contains('支持原创翻译')").remove();
      $("a:contains('阅读全文')").remove();
      $("strong:contains('不死甘露')").remove();
      $("strong:contains('关于永恒的开示录')").remove();
      $("strong:contains('THE NECTAR OF IMMORTALITY')").remove();

      $("span:contains('室利·尼萨迦达塔·马哈拉吉 著')").remove();
      $("span:contains('灵智宝鬘翻译团队 中译')").remove();
      $("p:contains('喜欢作者')").remove();
      $("p:contains('——')").remove();

      // article title
      $("#activity-name").remove();
      // all section elements (audio blocks)
      $("section").remove();
      // all h3 headings
      $("h3").remove();
      // topic tags
      $("#js_tags").remove();
      // all images
      $("img").remove();
      // all script elements
      $("script").remove();
      // account name and publish time
      $("div#meta_content").remove();
      // comment area at the bottom
      $("div.comment").remove();

      // Build the result before reopening the file: opening the write stream
      // truncates the source, so a failure above must not reach that point.
      const html = $("html").html();
      if (typeof html !== "string") {
        return console.error("no <html> content produced for " + file);
      }
      const writeStream = fs.createWriteStream(path + "/" + file, "utf-8");
      writeStream.write(html);
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\html2txt.js

const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
let path = `E:\\公众号文章采集\\公众号HTML\\养猫学习`;

// For every .html article in the folder, write "<name>.txt" containing the
// text of #activity-name (title) followed by the text of #js_content (body).
fs.readdir(path, function (err, files) {
  if (err) {
    // Without a directory listing there is nothing to process.
    return console.error(err);
  }
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] !== "html") return;
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.error(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      // Extract the text first so the output is only created for pages that
      // parsed successfully.
      const title = $("#activity-name").text();
      const content = $("#js_content").text();
      const writeStream = fs.createWriteStream(
        path + "/" + file.split(".")[0] + ".txt",
        "utf-8"
      );
      // title
      writeStream.write(title);
      // body
      writeStream.write(content);
      writeStream.end();
    });
  });
});

E:\公众号文章采集\fi_filter_过滤器\src\html2txt_cheerio.js

const fs = require('fs');
const cheerio = require('cheerio');

let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;

// Names of the entries in the folder.
const fileList = fs.readdirSync(path);

// For each .html file, write "<name>.txt" holding the text of <body> followed
// by the marker "endendend".
fileList.forEach((name) => {
  console.log(name);
  if (name.split('.')[1] !== 'html') return;
  // Parse the page with cheerio.
  const $ = cheerio.load(fs.readFileSync(path + '/' + name, 'utf8'));
  const writeStream = fs.createWriteStream(
    path + '/' + name.split('.')[0] + '.txt',
    'utf-8'
  );
  // // title
  // writeStream.write($('#activity-name').text());
  // // content
  writeStream.write($('body').text());
  // The final chunk is passed to end(), which writes it and then closes.
  writeStream.end('endendend');
});

E:\公众号文章采集\fi_filter_过滤器\src\任意路径文件写入.js

const fs = require('fs');

const writeFileRecursive = function (path, buffer, callback) {
  // 前面的文件路径
  let lastPath = path.substring(0, path.lastIndexOf('/'));
  // 递归创建目录
  fs.mkdir(lastPath, { recursive: true }, (err) => {
    if (err) return callback(err);
    fs.writeFile(path, buffer, function (err) {
      if (err) return callback(err);
      return callback(null);
    });
  });
};

// Demo call: writes "hello" into a nested folder that may not exist yet.
const buffer = 'hello';
writeFileRecursive('./public/test/test.txt', buffer, (err) => {
  // Stop on failure so that "write success" is only printed for a real success.
  if (err) return console.error(err);
  console.info('write success');
});

E:\公众号文章采集\fi_filter_过滤器\src\crawler\crawler.ts

import superagent from "superagent";
import { load, CheerioAPI } from "cheerio";

import { log } from "console";
import { createWriteStream } from "fs";

/**
 * Downloads one web page and answers simple queries against its parsed DOM.
 *
 * Usage: `setUrl()`, then `await init()`, then any query or `save()` call.
 */
export default class Crawler {
  private url = ``;
  // Parsed document; stays undefined until init() has resolved.
  private $?: CheerioAPI;

  constructor() {}

  /** Stores the address that the next `init()` call will fetch. */
  setUrl(url: string) {
    this.url = url;
  }

  /** Fetches the configured URL with superagent and parses the response text with cheerio. */
  async init() {
    const res = await superagent.get(this.url);
    this.$ = load(res.text);
  }

  /**
   * Writes the inner HTML of the page's `<html>` element to `path` as UTF-8.
   *
   * @throws Error when called before `init()` has completed.
   */
  save(path: string) {
    // Serialise before opening the stream: write() accepts strings/buffers,
    // not a cheerio selection, and a failed lookup must not leave an empty file.
    const markup = this.dom()("html").html() ?? "";
    const writeStream = createWriteStream(path, "utf-8");
    writeStream.write(markup);
    writeStream.end();
  }

  /** Logs and returns the text of the `#activity-name` element. */
  getTitle() {
    const title = this.dom()("#activity-name").text();
    log(title);
    return title;
  }

  /**
   * Logs and returns the text of the `<script>` element(s) whose content
   * contains `function htmlDecode(str)`.
   */
  // NOTE(review): presumably the publish time is embedded in that script — confirm.
  getTime() {
    const scriptText = this.dom()(
      "script:contains('function htmlDecode(str)')"
    ).text();
    log(scriptText);
    return scriptText;
  }

  /** Returns the text of every `<span>` that contains the phrase "萨特桑指出". */
  getContent() {
    const quotes = this.dom()("span:contains('萨特桑指出')");
    return quotes.text();
  }

  /** Returns the parsed document, failing with a clear message when `init()` has not run. */
  private dom(): CheerioAPI {
    if (!this.$) {
      throw new Error("Crawler.init() must complete before the page is queried");
    }
    return this.$;
  }
}

E:\公众号文章采集\fi_filter_过滤器\src\crawler\index.ts

import { log } from "console";
import Crawler from "./crawler";

// Article page used for this manual run.
const articleUrl = "https://mp.weixin.qq.com/s/EgZhFJTzsgfYzZZ-4_SI4Q";

const crawler = new Crawler();
crawler.setUrl(articleUrl);

// Download and parse the page, then log the script text that getTime() looks up.
await crawler.init();
crawler.getTime();
// crawler.getTitle();
// crawler.save("");

// const content = crawler.getContent();

// log(content);

E:\公众号文章采集\fi_filter_过滤器\src\filter\01_灵智宝鬘_话题_尼萨迦达塔.ts

import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
// const basePath = process.cwd();
// E:\公众号文章采集\公众号HTML\灵智宝鬘
// Folder holding the saved article pages; filtered copies go to "<basePath>/out".
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘";
const outPath = join(basePath, "out");

// Make sure the output folder exists; a failure is only logged.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Names of all entries directly inside basePath.
const fileList = readdirSync(basePath);

// Full paths of the regular files whose extension is ".html".
const pureFilePathList = fileList
  .filter(
    (entry) =>
      lstatSync(join(basePath, entry)).isFile() && extname(entry) === ".html"
  )
  .map((entry) => join(basePath, entry));

// pureFilePathList.forEach((filePath) => {
//   extractTopic(filePath);
// });

// Clean each page and store the remaining markup under the same file name in outPath.
pureFilePathList.forEach((srcPath) => {
  const dom: CheerioAPI = loadHtmlDom(srcPath);
  filterDom(dom);
  const sink = createWriteStream(getOutFilePath(srcPath), "utf-8");
  sink.write(dom("html").html());
  sink.end();
});

// ====================================================================================================================
// 移除标签的原则,尽量少移除p标签,有可能会误把正文内容移除
/**
 * Removes unwanted nodes from a loaded article page, in place.
 *
 * Each selector is applied in the listed order and every match is removed.
 */
function filterDom($: CheerioAPI) {
  const unwantedSelectors = [
    // Topic tags
    "#js_tags",
    // Spans containing the translator credit
    "span:contains('灵智宝鬘翻译团队 中译')",
    // Centered paragraphs containing the book title
    "p[style*='white-space: normal;text-align: center;']:contains('我是那')",
    // Centered paragraphs containing the subtitle
    "p[style*='white-space: normal;text-align: center;']:contains('室利·尼萨迦达塔·马哈拉吉的开示录')",
    // Grey (rgb 136,136,136) spans containing the author credit
    "span[style*='color: rgb(136, 136, 136)']:contains('室利·尼萨迦达塔·马哈拉吉 著')",
    // Red span and strong elements
    "span[style*='color: rgb(255, 76, 65)']",
    "strong[style*='color: rgb(255, 76, 65)']",
    // div elements with class "comment"
    "div.comment",
  ];

  for (const selector of unwantedSelectors) {
    $(selector).remove();
  }
}

/** Reads the file at `filePath` as UTF-8 text and parses it with cheerio's `load`. */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}

/**
 * Collects the `data-jump_url` attribute of every `#js_articles > div` element.
 *
 * Elements with a missing or empty attribute are skipped.
 *
 * @returns The URLs in document order; an empty array when nothing matches.
 */
function extractLink($: CheerioAPI) {
  const collected: string[] = [];
  const cards = $("#js_articles > div");
  if (!cards.length) return collected;

  cards.each((_, card) => {
    const jumpUrl = $(card).attr("data-jump_url");
    if (jumpUrl) {
      collected.push(jumpUrl);
    }
  });

  return collected;
}

/**
 * Ensures that a directory exists at `absPath`, creating it and any missing
 * parent folders.
 *
 * @param absPath Path of the folder that must exist.
 * @throws The `mkdirSync` error when the folder cannot be created, for
 *   example when `absPath` already exists as a regular file.
 */
function exitsFolder(absPath: string) {
  // With `recursive: true`, mkdirSync succeeds for an existing directory, so
  // no stat probe is needed. Dropping the catch-all also stops a regular file
  // at `absPath` from being silently accepted as the folder.
  mkdirSync(absPath, { recursive: true });
}

/**
 * Formats the current local time as `YYYY年MM月DD日H时M分S秒`.
 *
 * Month and day are zero-padded to two digits; hour, minute and second are not.
 */
function getCurDate() {
  const now = new Date();
  const pad2 = (value: number) => ("0" + value).slice(-2);

  return `${now.getFullYear()}年${pad2(now.getMonth() + 1)}月${pad2(
    now.getDate()
  )}日${now.getHours()}时${now.getMinutes()}分${now.getSeconds()}秒`;
}

/** Returns the path inside `outPath` that has the same file name as `filePath`. */
function getOutFilePath(filePath: string) {
  const fileName = basename(filePath);
  return join(outPath, fileName);
}

E:\公众号文章采集\fi_filter_过滤器\src\topic\01_非推送_链接_一行一个.ts

import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
// Work in the directory the script was started from; results go to "./out".
const basePath = process.cwd();
const outPath = join(basePath, "out");

// Make sure the output folder exists; a failure is only logged.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Names of all entries directly inside basePath.
const fileList = readdirSync(basePath);

// Full paths of the regular files whose extension is ".txt" or ".html".
const pureFilePathList = fileList
  .filter((entry) => lstatSync(join(basePath, entry)).isFile())
  .filter((entry) => [".txt", ".html"].includes(extname(entry)))
  .map((entry) => join(basePath, entry));

// Produce one link list per input file.
for (const filePath of pureFilePathList) {
  extractTopic(filePath);
}

/**
 * Loads the page at `filePath`, extracts its links and writes them, one per
 * line, to the output file derived from `filePath`.
 */
function extractTopic(filePath: string) {
  const dom: CheerioAPI = loadHtmlDom(filePath);
  const links = extractLink(dom);
  const sink = createWriteStream(getOutFilePath(filePath), "utf-8");

  for (const link of links) {
    sink.write(link);
    sink.write("\n");
  }
  sink.end();
}

/** Parses the UTF-8 text of the file at `filePath` into a cheerio document. */
function loadHtmlDom(filePath: string): CheerioAPI {
  const markup = readFileSync(filePath, "utf-8");
  const dom = load(markup);
  return dom;
}

/**
 * Collects the `data-link` attribute of every `li` inside the album list of
 * `#js_content_overlay`.
 *
 * An element with a missing or empty attribute contributes an empty string,
 * so the result has one entry per matched element.
 */
function extractLink($: CheerioAPI) {
  const albumItemSelector =
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li";
  const collected: string[] = [];

  $(albumItemSelector).each((_, item) => {
    const link = $(item).attr("data-link");
    collected.push(link || "");
  });

  return collected;
}

/**
 * Ensures that a directory exists at `absPath`, creating it and any missing
 * parent folders.
 *
 * @param absPath Path of the folder that must exist.
 * @throws The `mkdirSync` error when the folder cannot be created, for
 *   example when `absPath` already exists as a regular file.
 */
function exitsFolder(absPath: string) {
  // With `recursive: true`, mkdirSync succeeds for an existing directory, so
  // no stat probe is needed. Dropping the catch-all also stops a regular file
  // at `absPath` from being silently accepted as the folder.
  mkdirSync(absPath, { recursive: true });
}

/**
 * Returns the current local time as `YYYY年MM月DD日H时M分S秒`.
 *
 * Only month and day are zero-padded to two digits.
 */
function getCurDate() {
  const now = new Date();
  const twoDigits = (n: number) => `0${n}`.slice(-2);

  const parts = [
    now.getFullYear(),
    "年",
    twoDigits(now.getMonth() + 1),
    "月",
    twoDigits(now.getDate()),
    "日",
    now.getHours(),
    "时",
    now.getMinutes(),
    "分",
    now.getSeconds(),
    "秒",
  ];
  return parts.join("");
}

/**
 * Builds the output path for the link list extracted from `filePath`:
 * `<outPath>/<current date>_目录_<input name without extension>.txt`.
 */
function getOutFilePath(filePath: string) {
  // Strip only the real extension. Cutting at the first "." truncated dotted
  // names, so "a.b.html" and "a.c.html" were mapped to the same output file.
  const stem = basename(filePath, extname(filePath));
  return join(outPath, getCurDate() + "_目录_" + stem + ".txt");
}

E:\公众号文章采集\fi_filter_过滤器\src\topic\02_推送_链接_一行一个.ts

import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
// Input files are taken from the current working directory; output goes to "./out".
const basePath = process.cwd();
const outPath = join(basePath, "out");

// Create the output folder when needed; a failure is only logged.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Names of all entries directly inside basePath.
const fileList = readdirSync(basePath);

// Keep regular files ending in ".txt" or ".html", as full paths.
const pureFilePathList = fileList
  .filter((entry) => lstatSync(join(basePath, entry)).isFile())
  .filter((entry) => {
    const ext = extname(entry);
    return ext === ".html" || ext === ".txt";
  })
  .map((entry) => join(basePath, entry));

// Produce one link list per input file.
for (const filePath of pureFilePathList) {
  extractTopic(filePath);
}

/**
 * Loads the page at `filePath`, extracts its links and writes them, one per
 * line, to the output file derived from `filePath`.
 *
 * No output file is created when the page yields no links.
 */
function extractTopic(filePath: string) {
  const dom: CheerioAPI = loadHtmlDom(filePath);
  const links = extractLink(dom);
  if (links.length === 0) return;

  const sink = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const link of links) {
    sink.write(link);
    sink.write("\n");
  }
  sink.end();
}

/** Reads `filePath` as UTF-8 and hands the text to cheerio's `load`. */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}

/**
 * Gathers the `data-jump_url` value of each `#js_articles > div` element.
 *
 * Elements whose attribute is missing or empty are left out.
 *
 * @returns The URLs in document order; an empty array when nothing matches.
 */
function extractLink($: CheerioAPI) {
  const urls: string[] = [];
  const articleCards = $("#js_articles > div");
  if (!articleCards.length) return urls;

  articleCards.each((_, card) => {
    const jumpUrl = $(card).attr("data-jump_url");
    if (jumpUrl) {
      urls.push(jumpUrl);
    }
  });

  return urls;
}

/**
 * Ensures that a directory exists at `absPath`, creating it and any missing
 * parent folders.
 *
 * @param absPath Path of the folder that must exist.
 * @throws The `mkdirSync` error when the folder cannot be created, for
 *   example when `absPath` already exists as a regular file.
 */
function exitsFolder(absPath: string) {
  // With `recursive: true`, mkdirSync succeeds for an existing directory, so
  // no stat probe is needed. Dropping the catch-all also stops a regular file
  // at `absPath` from being silently accepted as the folder.
  mkdirSync(absPath, { recursive: true });
}

/**
 * Builds a timestamp of the current local time in the form
 * `YYYY年MM月DD日H时M分S秒` (month and day two digits, the rest unpadded).
 */
function getCurDate() {
  const stamp = new Date();

  const datePart =
    stamp.getFullYear() +
    "年" +
    String(stamp.getMonth() + 1).padStart(2, "0") +
    "月" +
    String(stamp.getDate()).padStart(2, "0") +
    "日";
  const timePart =
    stamp.getHours() + "时" + stamp.getMinutes() + "分" + stamp.getSeconds() + "秒";

  return datePart + timePart;
}

/**
 * Builds the output path for the link list extracted from `filePath`:
 * `<outPath>/<current date>_目录_<input name without extension>.txt`.
 */
function getOutFilePath(filePath: string) {
  // Strip only the real extension. Cutting at the first "." truncated dotted
  // names, so "a.b.html" and "a.c.html" were mapped to the same output file.
  const stem = basename(filePath, extname(filePath));
  return join(outPath, getCurDate() + "_目录_" + stem + ".txt");
}

E:\公众号文章采集\fi_filter_过滤器\src\txt\01_合集.ts

import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";

// 这个process.cwd()就是当前执行程序的文件夹
// const basePath = process.cwd();
// E:\公众号文章采集\公众号HTML\灵智宝鬘
// Folder with the already filtered pages; the merged text file goes to "<basePath>/txt".
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘\\out";
const outFileName = "灵智宝鬘_尼萨迦达塔_我是那";
const outPath = join(basePath, "txt");

// Make sure the output folder exists; a failure is only logged.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Names of all entries directly inside basePath.
const fileList = readdirSync(basePath);

// Full paths of the regular files whose extension is ".html".
const pureFilePathList = fileList
  .filter(
    (entry) =>
      lstatSync(join(basePath, entry)).isFile() && extname(entry) === ".html"
  )
  .map((entry) => join(basePath, entry));

// pureFilePathList.forEach((filePath) => {
//   extractTopic(filePath);
// });

// Single output file that receives every page as one numbered chapter.
const outFilePath = getOutFilePath();
const writeStream = createWriteStream(outFilePath, "utf-8");

pureFilePathList.forEach((filePath, index) => {
  const dom: CheerioAPI = loadHtmlDom(filePath);
  const { title, pubDate, content } = extractText(dom);

  // Chapter layout: blank line, "第N章" heading, title, publish date, body.
  const chapterLines = ["", `第${index + 1}章`, title, pubDate, content];
  for (const line of chapterLines) {
    writeStream.write(line);
    writeStream.write("\n");
  }

  // Progress output uses the zero-based index.
  log(`${index}_${filePath}`);
});
writeStream.end();

// ====================================================================================================================

/** Loads the file at `filePath` (UTF-8) and returns the cheerio document built from it. */
function loadHtmlDom(filePath: string): CheerioAPI {
  const markup = readFileSync(filePath, "utf-8");
  const dom = load(markup);
  return dom;
}

/**
 * Ensures that a directory exists at `absPath`, creating it and any missing
 * parent folders.
 *
 * @param absPath Path of the folder that must exist.
 * @throws The `mkdirSync` error when the folder cannot be created, for
 *   example when `absPath` already exists as a regular file.
 */
function exitsFolder(absPath: string) {
  // With `recursive: true`, mkdirSync succeeds for an existing directory, so
  // no stat probe is needed. Dropping the catch-all also stops a regular file
  // at `absPath` from being silently accepted as the folder.
  mkdirSync(absPath, { recursive: true });
}

/** Returns the path of the merged text file: `<outPath>/<outFileName>.txt`. */
function getOutFilePath() {
  const fileName = `${outFileName}.txt`;
  return join(outPath, fileName);
}

/**
 * Pulls the text parts of one article page.
 *
 * @returns `title` from `#activity-name`, `pubDate` from `#publish_time`, and
 *   `content` from `#js_content` after it has been passed through `handleContent`.
 */
function extractText($: CheerioAPI) {
  return {
    title: $("#activity-name").text(),
    pubDate: $("#publish_time").text(),
    content: handleContent($("#js_content").text()),
  };
}

/**
 * Puts every speaker label of the dialogue on its own line, preceded by a
 * `[p1000]` marker line, and expands the short labels to their long forms
 * ("尼:" becomes "尼萨迦达塔:", "问:" becomes "提问者:").
 *
 * @param content Plain text of one article.
 * @returns The text with every label occurrence rewritten.
 */
function handleContent(content: string) {
  // Long form that each recognised label is rewritten to.
  const speakerLabels: Record<string, string> = {
    "尼萨迦达塔:": "尼萨迦达塔:",
    "尼:": "尼萨迦达塔:",
    "提问者:": "提问者:",
    "问:": "提问者:",
  };

  // A global regex is needed: `replace` with a string pattern only rewrote the
  // first occurrence of each label. A single pass also keeps inserted text
  // from being matched again.
  return content.replace(/尼萨迦达塔:|尼:|提问者:|问:/g, (label) =>
    replaceContent(speakerLabels[label] ?? label, 1000)
  );
}

/**
 * Builds the replacement for one speaker label: a newline, a `[p<time>]`
 * marker line, then the keyword on its own line.
 *
 * @param keyword Label text to emit after the marker.
 * @param time    Number placed inside the `[p…]` marker; defaults to 1000.
 */
function replaceContent(keyword: string, time: number = 1000) {
  return `\n[p${time.toString()}]\n${keyword}\n`;
}

posted on 2023-04-05 23:49  lazycookie  阅读(84)  评论(0)  编辑  收藏  举报