E:\公众号文章采集\fi_filter_过滤器\src\exact_新浪博客手机版提取连接.js
const fs = require('fs');
const jsdom = require('jsdom');
const { JSDOM } = jsdom;
// Collect the href of every link inside #js_content from each saved page under
// ./html and append them, one per line (CRLF), to ./urls.txt.
//
// Extraction rules used for other page types (swap the selector and attribute
// below; those variants also wrote a `第一章` line before every entry):
//   - topic page:                 $('#js_content_overlay ul li') -> attr 'data-link'
//   - Sina blog (mobile version): $('body ul li a')              -> attr 'href'
//   - WeChat account home page:   $('span[hrefs]')               -> attr 'hrefs'
fs.readdir('./html', function (err, files) {
  if (err) {
    return console.log('read dir failed');
  }
  files.forEach((file) => {
    fs.readFile('./html/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.log('read file failed: ' + file);
      }
      const { window } = new JSDOM(data);
      const $ = require('jQuery')(window);
      // No write stream is opened on ./urls.txt here: opening one (flag "w")
      // truncates the file and would race with the appends below.
      const hrefs = [];
      $('#js_content a').each(function () {
        const href = $(this).attr('href');
        // Anchors without an href would otherwise be written as "undefined".
        if (href) hrefs.push(href);
      });
      if (hrefs.length === 0) return;
      // One append per page keeps that page's links together and in order.
      const chunk = hrefs.map((href) => `${href}\r\n`).join('');
      fs.appendFile('./urls.txt', chunk, (err) => {
        if (err) {
          return console.log('append txt failed');
        }
        hrefs.forEach((_, i) => console.log(i + 1 + '__append file success'));
      });
    });
  });
});
const fs = require('fs');
const cheerio = require('cheerio');
// Build a Markdown link list ("[title](url)" followed by a blank line) from the
// Sina blog list pages saved as .txt under ./html. Entries of each page are
// emitted in reverse page order. Output: ./html/目录/新浪博客目录.txt
let path = `./html`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  // createWriteStream does not create missing parent folders.
  fs.mkdirSync(path + '/目录', { recursive: true });
  // A single stream for the whole run: opening the same target once per input
  // file (flag "w") lets concurrent writers clobber each other.
  const writeStream = fs.createWriteStream(
    path + '/目录/' + '新浪博客目录.txt',
    'utf-8'
  );
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith('.txt')) return;
    const $ = cheerio.load(fs.readFileSync(path + '/' + file, 'utf-8'));
    const items = $('#pl-home-bloglist > article > ul>li').get().reverse();
    for (const ele of items) {
      const title = $(ele).find('h2').html();
      const url = $(ele).find('a').attr('data-link');
      // stream.write() throws on null/undefined, so fall back to ''.
      writeStream.write(`[${title ?? ''}](${url ?? ''})\n\n`);
      console.log(title);
      console.log(url);
    }
  });
  writeStream.end();
});
const fs = require('fs');
const cheerio = require('cheerio');
// For every .html history page in `path`, write the link of each original
// ("原创") article card (type APPMSG) to "<name>_讯飞有声_超链接_正序.txt",
// one link per line, ordered by ascending msgid.
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
const htmlSuffix = /\.(html)$/;
for (const file of fileList) {
  if (!htmlSuffix.test(file)) continue;
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // Strip only the trailing extension so names containing dots stay distinct.
  const writeStream = fs.createWriteStream(
    path + '/' + file.replace(htmlSuffix, '') + '_讯飞有声_超链接_正序.txt',
    'utf-8'
  );
  // All message cards, sorted by ascending msgid.
  const js_history_list = $('#js_history_list').children();
  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (const ele of history_list) {
    const $1 = cheerio.load(ele);
    const link = $1('h4').attr('hrefs');
    const isOrigin = $1('#copyright_logo').html();
    // Card type: article (APPMSG), video (VIDEO) or text (TEXT).
    const type = $1('h4').parent().attr('data-type');
    // A missing link cannot be written (stream.write(undefined) throws).
    if (isOrigin === '原创' && type === 'APPMSG' && link) {
      console.log(link);
      writeStream.write(link);
      writeStream.write('\n');
    }
  }
  writeStream.end();
}
const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
var o = {
'M+': this.getMonth() + 1, //月份
'd+': this.getDate(), //日
'h+': this.getHours(), //小时
'm+': this.getMinutes(), //分
's+': this.getSeconds(), //秒
'q+': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
};
if (/(y+)/.test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + '').substr(4 - RegExp.$1.length)
);
}
for (var k in o) {
if (new RegExp('(' + k + ')').test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
);
}
}
return fmt;
};
// For every .html history page in `path`, write "<name>_link_正序.md" with one
// Markdown link "[<date>_<title>](<link>)" per original ("原创") article card
// (type APPMSG), ordered by ascending msgid.
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
const htmlSuffix = /\.(html)$/;
for (const file of fileList) {
  if (!htmlSuffix.test(file)) continue;
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // Strip only the trailing extension so names containing dots stay distinct.
  const writeStream = fs.createWriteStream(
    path + '/' + file.replace(htmlSuffix, '') + '_link_正序.md',
    'utf-8'
  );
  // All message cards, sorted by ascending msgid.
  const js_history_list = $('#js_history_list').children();
  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (const ele of history_list) {
    const $1 = cheerio.load(ele);
    // Card type: article (APPMSG), video (VIDEO) or text (TEXT).
    const type = $1('h4').parent().attr('data-type');
    const isOrigin = $1('#copyright_logo').html();
    // Filter first: other card kinds may lack the elements read below, and
    // dereferencing their null html() used to abort the whole run.
    if (isOrigin !== '原创' || type !== 'APPMSG') continue;
    const link = $1('h4').attr('hrefs');
    const rawTime = $1('p.weui_media_extra_info').html();
    const rawTitle = $1('h4').html();
    if (!link || rawTime === null || rawTitle === null) continue;
    // Normalise the date, e.g. 2020年7月2日 --> 2020年07月02日.
    let time = rawTime.split('<span')[0].trim();
    time = time.replace('年', '/').replace('月', '/').replace('日', '');
    time = new Date(time).format('yyyy年MM月dd日');
    // The title follows the closing </span> inside the h4; fall back to the
    // whole h4 content when no span is present.
    const title = (rawTitle.split('</span>')[1] ?? rawTitle).trim();
    console.log(time + '_' + title);
    // Write the Markdown line.
    writeStream.write(`[${time}_${title}](${link})`);
    writeStream.write('\n');
  }
  writeStream.end();
}
const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
var o = {
'M+': this.getMonth() + 1, //月份
'd+': this.getDate(), //日
'h+': this.getHours(), //小时
'm+': this.getMinutes(), //分
's+': this.getSeconds(), //秒
'q+': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
};
if (/(y+)/.test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + '').substr(4 - RegExp.$1.length)
);
}
for (var k in o) {
if (new RegExp('(' + k + ')').test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
);
}
}
return fmt;
};
// For every .txt history page in the folder of this script, write
// "<name>_link_正序.md" with one Markdown link "[<date>_<title>](<link>)" per
// original ("原创") article card (type APPMSG), ordered by ascending msgid.
let path = __dirname;
// Read the folder (non-recursive); returns an array of file names.
const fileList = fs.readdirSync(path);
const txtSuffix = /\.(txt)$/;
for (const file of fileList) {
  console.log('---------' + file + '----------');
  if (!txtSuffix.test(file)) continue;
  // Read and parse the file.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards.
  const js_history_list = $('#js_history_list').children();
  // No cards means this file is not a history page (e.g. a topic page or a
  // hand-made link list). A Cheerio selection is always truthy, so emptiness
  // has to be tested through .length.
  if (js_history_list.length === 0) continue;
  // Strip only the trailing extension so names containing dots stay distinct.
  const writeStream = fs.createWriteStream(
    path + '/' + file.replace(txtSuffix, '') + '_link_正序.md',
    'utf-8'
  );
  // Sort cards by ascending msgid.
  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (const ele of history_list) {
    const $1 = cheerio.load(ele);
    // Card type: article (APPMSG), video (VIDEO) or text (TEXT).
    const type = $1('h4').parent().attr('data-type');
    const isOrigin = $1('#copyright_logo').html();
    // Filter first: other card kinds may lack the elements read below, and
    // dereferencing their null html() used to abort the whole run.
    if (isOrigin !== '原创' || type !== 'APPMSG') continue;
    const link = $1('h4').attr('hrefs');
    const rawTime = $1('p.weui_media_extra_info').html();
    const rawTitle = $1('h4').html();
    if (!link || rawTime === null || rawTitle === null) continue;
    // Normalise the date, e.g. 2020年7月2日 --> 2020年07月02日.
    let time = rawTime.split('<span')[0].trim();
    time = time.replace('年', '/').replace('月', '/').replace('日', '');
    time = new Date(time).format('yyyy年MM月dd日');
    // The title follows the closing </span> inside the h4; fall back to the
    // whole h4 content instead of emitting the literal text "undefined".
    const title = (rawTitle.split('</span>')[1] ?? rawTitle).trim();
    console.log(time + '_' + title);
    // Write the Markdown line.
    writeStream.write(`[${time}_${title}](${link})`);
    writeStream.write('\n');
  }
  writeStream.end();
}
const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
var o = {
'M+': this.getMonth() + 1, //月份
'd+': this.getDate(), //日
'h+': this.getHours(), //小时
'm+': this.getMinutes(), //分
's+': this.getSeconds(), //秒
'q+': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
};
if (/(y+)/.test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + '').substr(4 - RegExp.$1.length)
);
}
for (var k in o) {
if (new RegExp('(' + k + ')').test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
);
}
}
return fmt;
};
// Merge the original ("原创") article links of every .txt history page in
// `path` into one Markdown file. Each input contributes a level-1 heading with
// its file name, followed by one "[<date>_<title>](<link>) " line per original
// article (type APPMSG); cards are ordered by ascending msgid.
// To process the folder of this script instead, use:
// let path = __dirname;
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Read the folder (non-recursive); returns an array of file names.
const fileList = fs.readdirSync(path);
// Single output stream shared by all inputs.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);
const txtSuffix = /\.(txt)$/;
for (const file of fileList) {
  console.log('---------' + file + '----------');
  if (!txtSuffix.test(file)) continue;
  // Read and parse the file.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards.
  const js_history_list = $('#js_history_list').children();
  // No cards means this file is not a history page. A Cheerio selection is
  // always truthy, so emptiness has to be tested through .length.
  if (js_history_list.length === 0) continue;
  // File name (without the trailing extension) as a level-1 heading.
  writeStream.write(`# ${file.replace(txtSuffix, '')}\n`);
  // Sort cards by ascending msgid.
  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (const ele of history_list) {
    const $1 = cheerio.load(ele);
    // Date shown in the card header; skip cards without one.
    const rawTime = $1('.weui_msg_card_hd:first-child').html();
    if (rawTime === null) continue;
    const time = timeConvert(rawTime);
    for (const msg of $1('.weui_msg_card_bd').children()) {
      const $2 = cheerio.load(msg);
      // Message type: article (APPMSG), video (VIDEO) or text (TEXT).
      const type = $2('h4').parent().attr('data-type');
      // Original articles carry a #copyright_logo element.
      const isOrigin = $2('#copyright_logo').html();
      // Filter before touching the title: messages without an h4 return null
      // from html(), and calling trim() on it used to abort the whole run.
      if (type !== 'APPMSG' || isOrigin !== '原创') continue;
      const rawTitle = $2('h4').html();
      const link = $2('h4').attr('hrefs');
      if (rawTitle === null || !link) continue;
      // The title follows the closing </span> inside the h4.
      const title = (rawTitle.split('</span>')[1] ?? rawTitle).trim();
      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();
// 对日期进行处理,将2020年7月2日-->2020年07月02日
function timeConvert(time) {
time = time.replace('年', '/').replace('月', '/').replace('日', '');
let newTime = new Date(time).format('yyyy年MM月dd日');
return newTime;
}
const fs = require('fs');
const cheerio = require('cheerio');
// Date原型上添加一个用于格式化日期的函数
Date.prototype.format = function (fmt) {
var o = {
'M+': this.getMonth() + 1, //月份
'd+': this.getDate(), //日
'h+': this.getHours(), //小时
'm+': this.getMinutes(), //分
's+': this.getSeconds(), //秒
'q+': Math.floor((this.getMonth() + 3) / 3), //季度
S: this.getMilliseconds(), //毫秒
};
if (/(y+)/.test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
(this.getFullYear() + '').substr(4 - RegExp.$1.length)
);
}
for (var k in o) {
if (new RegExp('(' + k + ')').test(fmt)) {
fmt = fmt.replace(
RegExp.$1,
RegExp.$1.length == 1 ? o[k] : ('00' + o[k]).substr(('' + o[k]).length)
);
}
}
return fmt;
};
// Merge the original ("原创") article links of every .txt history page in the
// folder of this script into one Markdown file. Each input contributes a
// level-1 heading with its file name, followed by one
// "[<date>_<title>](<link>) " line per original article (type APPMSG); cards
// are ordered by ascending msgid.
let path = __dirname;
// let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;
// Read the folder (non-recursive); returns an array of file names.
const fileList = fs.readdirSync(path);
// Single output stream shared by all inputs.
const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);
const txtSuffix = /\.(txt)$/;
for (const file of fileList) {
  console.log('---------' + file + '----------');
  if (!txtSuffix.test(file)) continue;
  // Read and parse the file.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // All message cards.
  const js_history_list = $('#js_history_list').children();
  // No cards means this file is not a history page. A Cheerio selection is
  // always truthy, so emptiness has to be tested through .length.
  if (js_history_list.length === 0) continue;
  // File name (without the trailing extension) as a level-1 heading.
  writeStream.write(`# ${file.replace(txtSuffix, '')}\n`);
  // Sort cards by ascending msgid.
  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });
  for (const ele of history_list) {
    const $1 = cheerio.load(ele);
    // Date shown in the card header; skip cards without one.
    const rawTime = $1('.weui_msg_card_hd:first-child').html();
    if (rawTime === null) continue;
    const time = timeConvert(rawTime);
    for (const msg of $1('.weui_msg_card_bd').children()) {
      const $2 = cheerio.load(msg);
      // Message type: article (APPMSG), video (VIDEO) or text (TEXT).
      const type = $2('h4').parent().attr('data-type');
      // Only article messages are extracted.
      if (type !== 'APPMSG') continue;
      // Original articles carry a #copyright_logo element.
      const isOrigin = $2('#copyright_logo').html();
      if (isOrigin !== '原创') continue;
      const rawTitle = $2('h4').html();
      const link = $2('h4').attr('hrefs');
      // Guard against a missing h4 / link instead of throwing mid-run.
      if (rawTitle === null || !link) continue;
      // The title follows the closing </span> inside the h4.
      const title = (rawTitle.split('</span>')[1] ?? rawTitle).trim();
      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();
// 对日期进行处理,将2020年7月2日-->2020年07月02日
function timeConvert(time) {
time = time.replace('年', '/').replace('月', '/').replace('日', '');
let newTime = new Date(time).format('yyyy年MM月dd日');
return newTime;
}
const fs = require('fs');
const cheerio = require('cheerio');
// For every .txt history page under ./html, write the `hrefs` attribute of each
// h4 inside #js_history_list (one per line, empty line when the attribute is
// missing) to ./html/目录/<same file name>.
let path = `./html`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
// createWriteStream does not create missing parent folders.
fs.mkdirSync(path + '/目录', { recursive: true });
for (const file of fileList) {
  console.log(file);
  // endsWith also accepts names that contain extra dots.
  if (!file.endsWith('.txt')) continue;
  const data = fs.readFileSync(path + '/' + file, 'utf8');
  const $ = cheerio.load(data);
  // 1. Extract the link list.
  const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
  $('#js_history_list h4').each((index, ele) => {
    console.log('--------' + index);
    writeStream.write($(ele).attr('hrefs') ?? '');
    writeStream.write('\n');
  });
  writeStream.end();
}
const fs = require('fs');
const cheerio = require('cheerio');
// Data cleaning: rewrite every .txt page under ./html in place with its
// <script> and <link> elements removed.
let path = `./html`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith('.txt')) return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.log(err);
      }
      const $ = cheerio.load(data);
      $('script').remove();
      $('link').remove();
      const cleaned = $('html').html();
      // Open (and thereby truncate) the source file only once the new content
      // exists, so a failure above cannot leave it empty.
      const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
      writeStream.write(cleaned);
      writeStream.end();
    });
  });
});
const fs = require("fs");
const cheerio = require("cheerio");
// Folder that holds the saved pages.
let path = `./html`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
// console.log(fileList);
// Disabled below: walk the file list and, for every .txt file, write the
// data-link attribute of each topic entry (one per line) to ./html/目录/<file>.
// for (let file of fileList) {
// console.log(file);
// if (file.split('.')[1] === 'txt') {
// fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
// const $ = cheerio.load(data);
// // 1. Extract the link list
// const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
// $(
// '#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li'
// ).each((index, ele) => {
// writeStream.write($(ele).attr('data-link'));
// writeStream.write('\n');
// });
// writeStream.end();
// });
// }
// }
const fs = require('fs');
const cheerio = require('cheerio');
// For every .html page under ./html, write the href of each link inside
// #js_content (one per line) to ./html/目录/<same file name>.
let path = `./html`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  // createWriteStream does not create missing parent folders.
  fs.mkdirSync(path + '/目录', { recursive: true });
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith('.html')) return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.log(err);
      }
      const $ = cheerio.load(data);
      // 1. Extract the link list.
      const writeStream = fs.createWriteStream(
        path + '/目录/' + file,
        'utf-8'
      );
      $('#js_content a').each((index, ele) => {
        // Anchors without href yield undefined, which write() rejects.
        writeStream.write($(ele).attr('href') ?? '');
        writeStream.write('\n');
      });
      writeStream.end();
    });
  });
});
const fs = require('fs');
const cheerio = require('cheerio');
// For every .txt page under ./html, write the href of each link inside
// #js_content (one per line) to ./html/目录/<same file name>.
let path = `./html`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
// createWriteStream does not create missing parent folders.
fs.mkdirSync(path + '/目录', { recursive: true });
for (const file of fileList) {
  // endsWith also accepts names that contain extra dots.
  if (!file.endsWith('.txt')) continue;
  // Read the file content.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
  // Parse it into a DOM tree with cheerio.
  const $ = cheerio.load(fileContent);
  // Open the output stream.
  const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf8');
  $('#js_content a').each((index, ele) => {
    // Anchors without href yield undefined, which write() rejects.
    writeStream.write($(ele).attr('href') ?? '');
    writeStream.write('\n');
  });
  // Close the output stream.
  writeStream.end();
}
// fs.readdir(path, function (err, files) {
// files.forEach((file) => {
// console.log(file);
// if (file.split('.')[1] === 'html') {
// fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
// const $ = cheerio.load(data);
// // 1. 提取目录
// const writeStream = fs.createWriteStream(
// path + '/目录/' + file,
// 'utf-8'
// );
// $('#js_content a').each((index, ele) => {
// writeStream.write($(ele).attr('href'));
// writeStream.write('\n');
// });
// writeStream.end();
// });
// }
// });
// });
const fs = require('fs');
const cheerio = require('cheerio');
// Build a Markdown link list ("[title](url)" followed by a blank line) from the
// Sina blog list pages saved as .txt under ./html, in page order.
// Output: ./html/目录/新浪博客目录.txt
let path = `./html`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  // createWriteStream does not create missing parent folders.
  fs.mkdirSync(path + '/目录', { recursive: true });
  // A single stream for the whole run: opening the same target once per input
  // file (flag "w") lets concurrent writers clobber each other.
  const writeStream = fs.createWriteStream(
    path + '/目录/' + '新浪博客目录.txt',
    'utf-8'
  );
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith('.txt')) return;
    const $ = cheerio.load(fs.readFileSync(path + '/' + file, 'utf-8'));
    $('#pl-home-bloglist > article > ul>li').each((i, ele) => {
      const title = $(ele).find('h2').html();
      const url = $(ele).find('a').attr('data-link');
      // stream.write() throws on null/undefined, so fall back to ''.
      writeStream.write(`[${title ?? ''}](${url ?? ''})\n\n`);
      console.log(title);
      console.log(url);
    });
  });
  writeStream.end();
});
E:\公众号文章采集\fi_filter_过滤器\src\filter_html.js
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
// Rewrite every .html article in `path` in place after removing boilerplate
// elements (marker text, title, audio sections, headings, tags, images,
// scripts, account/time info and comments).
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  // Removal rules, applied in this order.
  const unwanted = [
    // p/span/a elements mixed into the body that contain specific text
    "span:contains('***')",
    "span:contains('--- TBC ---')",
    "span:contains('支持原创翻译')",
    "span:contains('节选自室利·萨马塔·罗摩达斯')",
    "a:contains('阅读全文')",
    "p:contains('因此,在《给弟子的忠告》')",
    // article title
    "#activity-name",
    // all audio sections
    "section",
    // all h3 headings
    "h3",
    // topic tags
    "#js_tags",
    // all images
    "img",
    // all script elements
    "script",
    // account name and publication time
    "div#meta_content",
    // comments at the bottom
    "div.comment",
  ];
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith(".html")) return;
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.log(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      unwanted.forEach((selector) => $(selector).remove());
      const cleaned = $("html").html();
      // Open (and thereby truncate) the source file only once the new content
      // exists, so a failure above cannot leave it empty.
      const writeStream = fs.createWriteStream(path + "/" + file, "utf-8");
      writeStream.write(cleaned);
      writeStream.end();
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio.js
const fs = require('fs');
const cheerio = require('cheerio');
// Rewrite every .html page in `path` in place with its <script> and <link>
// elements removed.
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
for (const file of fileList) {
  // endsWith also accepts names that contain extra dots.
  if (!file.endsWith('.html')) continue;
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // Filter the content: drop all script and link elements.
  $('script').remove();
  $('link').remove();
  const cleaned = $('html').html();
  // Open (and thereby truncate) the source file only once the new content
  // exists, so a failure above cannot leave it empty.
  const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
  writeStream.write(cleaned);
  writeStream.end();
}
function filterContent($) {
// 包含特定文字,掺杂在正文中的p标签,或者span标签
// $("section:contains('相关阅读')").remove();
// $("span:contains('--- TBC ---')").remove();
// $("span:contains('支持原创翻译')").remove();
// $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
// $("a:contains('阅读全文')").remove();
// $("p:contains('因此,在《给弟子的忠告》')").remove();
//标题移除
// $('#activity-name').remove();
// 所有音频标签
// $('section').remove();
// 所有的h3标签
// $('h3').remove();
// 话题标签
// $('#js_tags').remove();
// 所有的img图片;
// $('img').remove();
// 所有的script标签
$('script').remove();
//公众号名称 时间信息
// $('div#meta_content').remove();
//底部评论信息
// $('div.comment').remove();
}
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_听心坊.js
const fs = require('fs');
const cheerio = require('cheerio');
// For every .html article in `path`, read the index that follows '·' in the
// text of the span containing '【明亮说' and write a copy of the page named
// "<index><original file name>" into the same folder.
let path = `E:\\公众号文章采集\\公众号HTML\\听心坊\\`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
for (const file of fileList) {
  console.log(file);
  // endsWith also accepts names that contain extra dots.
  if (!file.endsWith('.html')) continue;
  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);
  // Content filters (all currently disabled), kept for reference:
  //   span:contains('如欲与陈明亮先生交流,请登陆:')
  //   span:contains('facebook.com/profile.php?id=100039436871466')
  //   span:contains('更多陈明亮的文章,请前往')
  //   span:contains('更多的音频'), span:contains('摄影')
  //   a:contains('阅读全文'), p:contains('因此,在《给弟子的忠告》')
  //   #activity-name (title), section (audio), h3, #js_tags (topic tags),
  //   img, script, div#meta_content (account/time), div.comment (comments)
  // Numeric index embedded in the article text.
  const idx = $("span:contains('【明亮说')").text().split('·')[1];
  console.log(idx);
  // Without a marker the copy would be named "undefined<file>"; skip instead.
  if (idx === undefined) {
    console.log('no index marker found, skipped: ' + file);
    continue;
  }
  // Write the html out.
  const writeStream = fs.createWriteStream(path + '/' + idx + file, 'utf-8');
  writeStream.write($('html').html());
  writeStream.end();
}
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_旭然之光.js
const fs = require('fs');
const cheerio = require('cheerio');
// Re-serialise every .html article in `path` in place through cheerio. All
// content filters are currently disabled; they are listed for reference.
let path = `E:\\公众号文章采集\\公众号HTML\\煦然之光`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith('.html')) return;
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.log(err);
      }
      const $ = cheerio.load(data);
      // Disabled: p/span/a elements mixed into the body that contain specific text
      // $("section:contains('相关阅读')").remove();
      // $("span:contains('--- TBC ---')").remove();
      // $("span:contains('支持原创翻译')").remove();
      // $("span:contains('节选自室利·萨马塔·罗摩达斯')").remove();
      // $("a:contains('阅读全文')").remove();
      // $("p:contains('因此,在《给弟子的忠告》')").remove();
      // Disabled: title, audio sections, h3 headings, topic tags, images,
      // scripts, account/time info, comments
      // $('#activity-name').remove();
      // $('section').remove();
      // $('h3').remove();
      // $('#js_tags').remove();
      // $('img').remove();
      // $('script').remove();
      // $('div#meta_content').remove();
      // $('div.comment').remove();
      const rendered = $('html').html();
      // Open (and thereby truncate) the source file only once the new content
      // exists, so a failure above cannot leave it empty.
      const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
      writeStream.write(rendered);
      writeStream.end();
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_阿知事业林.js
const fs = require('fs');
const cheerio = require('cheerio');
// Rewrite every .html article in `path` in place after removing images,
// scripts, author/time info, comments, topic tags, the sponsor area and all
// hyperlinks.
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
// Removal rules, applied in this order.
const unwanted = [
  'img', // all images
  'script', // all script elements
  '#meta_content', // author and publication time
  '.comment', // comments
  '#js_tags', // topic tags
  '#js_sponsor_ad_area', // sponsor area
  'a', // hyperlinks
];
// Disabled rules kept for reference:
//   section:contains('相关阅读'), span:contains('--- TBC ---'),
//   span:contains('支持原创翻译'), span:contains('节选自室利·萨马塔·罗摩达斯'),
//   a:contains('阅读全文'), p:contains('因此,在《给弟子的忠告》'),
//   section (audio), h3
for (const file of fileList) {
  console.log(file);
  // endsWith also accepts names that contain extra dots.
  if (!file.endsWith('.html')) continue;
  // Read the file content.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
  // Parse it into a DOM tree with cheerio.
  const $ = cheerio.load(fileContent);
  for (const selector of unwanted) {
    $(selector).remove();
  }
  const cleaned = $('html').html();
  // Open (and thereby truncate) the source file only once the new content
  // exists, so a failure above cannot leave it empty.
  const writeStream = fs.createWriteStream(path + '/' + file, 'utf-8');
  writeStream.write(cleaned);
  writeStream.end();
}
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_不死甘露.js
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
// Rewrite every .html article in `path` in place after removing boilerplate
// elements (marker text, book/author credits, title, audio sections, headings,
// tags, images, scripts, account/time info and comments).
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  // Removal rules, applied in this order.
  const unwanted = [
    // p/span/a/strong elements mixed into the body that contain specific text
    "span:contains('***')",
    "span:contains('--- TBC ---')",
    "span:contains('支持原创翻译')",
    "a:contains('阅读全文')",
    "strong:contains('不死甘露')",
    "strong:contains('关于永恒的开示录')",
    "strong:contains('THE NECTAR OF IMMORTALITY')",
    "span:contains('室利·尼萨迦达塔·马哈拉吉 著')",
    "span:contains('灵智宝鬘翻译团队 中译')",
    "p:contains('喜欢作者')",
    "p:contains('——')",
    // article title
    "#activity-name",
    // all audio sections
    "section",
    // all h3 headings
    "h3",
    // topic tags
    "#js_tags",
    // all images
    "img",
    // all script elements
    "script",
    // account name and publication time
    "div#meta_content",
    // comments at the bottom
    "div.comment",
  ];
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith(".html")) return;
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.log(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      unwanted.forEach((selector) => $(selector).remove());
      const cleaned = $("html").html();
      // Open (and thereby truncate) the source file only once the new content
      // exists, so a failure above cannot leave it empty.
      const writeStream = fs.createWriteStream(path + "/" + file, "utf-8");
      writeStream.write(cleaned);
      writeStream.end();
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\html2txt.js
const fs = require("fs");
const jsdom = require("jsdom");
const { JSDOM } = jsdom;
// Convert every .html article in `path` to a .txt file of the same base name
// holding the title text (#activity-name) followed by the body text
// (#js_content).
let path = `E:\\公众号文章采集\\公众号HTML\\养猫学习`;
fs.readdir(path, function (err, files) {
  if (err) {
    return console.log(err);
  }
  files.forEach((file) => {
    console.log(file);
    // endsWith also accepts names that contain extra dots.
    if (!file.endsWith(".html")) return;
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      if (err) {
        return console.log(err);
      }
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      // Strip only the trailing extension so dotted names are not truncated.
      const writeStream = fs.createWriteStream(
        path + "/" + file.replace(/\.html$/, "") + ".txt",
        "utf-8"
      );
      // Title
      writeStream.write($("#activity-name").text());
      // Body
      writeStream.write($("#js_content").text());
      writeStream.end();
    });
  });
});
E:\公众号文章采集\fi_filter_过滤器\src\html2txt_cheerio.js
const fs = require('fs');
const cheerio = require('cheerio');
// Convert every .html article in `path` to a .txt file of the same base name
// holding the text of <body> followed by the marker 'endendend'.
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;
// Read the folder; returns an array of file names.
const fileList = fs.readdirSync(path);
for (const file of fileList) {
  console.log(file);
  // endsWith also accepts names that contain extra dots.
  if (!file.endsWith('.html')) continue;
  // Read the file content.
  const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
  // Parse it into a DOM tree with cheerio.
  const $ = cheerio.load(fileContent);
  // Strip only the trailing extension so dotted names are not truncated.
  const writeStream = fs.createWriteStream(
    path + '/' + file.replace(/\.html$/, '') + '.txt',
    'utf-8'
  );
  // Disabled: title
  // writeStream.write($('#activity-name').text());
  // Body
  writeStream.write($('body').text());
  writeStream.write('endendend');
  writeStream.end();
}
E:\公众号文章采集\fi_filter_过滤器\src\任意路径文件写入.js
const fs = require('fs');
const writeFileRecursive = function (path, buffer, callback) {
// 前面的文件路径
let lastPath = path.substring(0, path.lastIndexOf('/'));
// 递归创建目录
fs.mkdir(lastPath, { recursive: true }, (err) => {
if (err) return callback(err);
fs.writeFile(path, buffer, function (err) {
if (err) return callback(err);
return callback(null);
});
});
};
// Demo: write 'hello' to ./public/test/test.txt, creating the folders on the way.
const buffer = 'hello';
writeFileRecursive('./public/test/test.txt', buffer, (err) => {
  if (err) {
    console.error(err);
    // Do not report success after a failure.
    return;
  }
  console.info('write success');
});
E:\公众号文章采集\fi_filter_过滤器\src\crawler\crawler.ts
import superagent from "superagent";
import { load, CheerioAPI } from "cheerio";
import { log } from "console";
import { createWriteStream } from "fs";
/**
 * Downloads one page and exposes a few cheerio-based queries on it.
 *
 * Usage: `setUrl(url)`, then `await init()`, then any of the getters / `save`.
 */
export default class Crawler {
  private url = ``;
  // Set by init(); undefined until the page has been downloaded.
  private $?: CheerioAPI;
  constructor() {}

  /** Remember the address fetched by init(). */
  setUrl(url: string) {
    this.url = url;
  }

  /**
   * Download the page at the configured url and parse it.
   * @throws Error when no url has been set; request errors are propagated.
   */
  async init() {
    if (!this.url) {
      throw new Error("Crawler.setUrl() must be called before init()");
    }
    const res = await superagent.get(this.url);
    this.$ = load(res.text);
  }

  /**
   * Write the parsed document, serialised as HTML, to `path`.
   * @throws Error when init() has not completed.
   */
  save(path: string) {
    // A stream accepts strings/buffers only, so serialise the document first
    // (and before opening the stream, so no empty file is left on failure).
    const html = this.dom().html();
    const writeStream = createWriteStream(path, "utf-8");
    writeStream.write(html);
    writeStream.end();
  }

  /** Log and return the text of #activity-name. */
  getTitle() {
    const title = this.dom()("#activity-name").text();
    log(title);
    return title;
  }

  /**
   * Log and return the text of the script element(s) containing
   * 'function htmlDecode(str)'.
   */
  getTime() {
    const scriptText = this.dom()(
      "script:contains('function htmlDecode(str)')"
    ).text();
    log(scriptText);
    return scriptText;
  }

  /** Return the text of the span element(s) containing '萨特桑指出'. */
  getContent() {
    const quotes = this.dom()("span:contains('萨特桑指出')");
    return quotes.text();
  }

  /** Return the parsed document or fail with a clear message. */
  private dom(): CheerioAPI {
    if (!this.$) {
      throw new Error(
        "Crawler.init() must complete before the page can be queried"
      );
    }
    return this.$;
  }
}
E:\公众号文章采集\fi_filter_过滤器\src\crawler\index.ts
import { log } from "console";
import Crawler from "./crawler";
const crawler = new Crawler();
crawler.setUrl("https://mp.weixin.qq.com/s/EgZhFJTzsgfYzZZ-4_SI4Q");

// Run inside an async function: top-level await only compiles for ES-module
// targets, and a failed request would otherwise surface as an unhandled
// rejection.
async function main(): Promise<void> {
  await crawler.init();
  crawler.getTime();
  // crawler.getTitle();
  // crawler.save("");
  // const content = crawler.getContent();
  // log(content);
}

main().catch((e: unknown) => {
  log(e);
  process.exitCode = 1;
});
E:\公众号文章采集\fi_filter_过滤器\src\filter\01_灵智宝鬘_话题_尼萨迦达塔.ts
import { log } from "console";
import {
readFileSync,
readdirSync,
lstatSync,
createWriteStream,
mkdirSync,
statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";
// Filter every .html file directly inside basePath and write the result, under
// the same file name, to basePath/out.
// process.cwd() (the folder the program is started from) can be used instead
// of the fixed folder:
// const basePath = process.cwd();
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘";
const outPath = join(basePath, "out");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}
// Read the folder; returns an array of entry names.
const fileList = readdirSync(basePath);
// Absolute paths of the regular files with an .html extension.
const pureFilePathList = fileList
  .filter((fileName) => {
    return lstatSync(join(basePath, fileName)).isFile();
  })
  .filter((fileName) => {
    const fileExt = extname(fileName);
    return fileExt === ".html";
  })
  .map((fileName) => {
    return join(basePath, fileName);
  });
for (const filePath of pureFilePathList) {
  const $: CheerioAPI = loadHtmlDom(filePath);
  filterDom($);
  const outFilePath = getOutFilePath(filePath);
  // html() is typed string | null; a stream cannot write null.
  const html = $("html").html() ?? "";
  const writeStream = createWriteStream(outFilePath, "utf-8");
  writeStream.write(html);
  writeStream.end();
}
// ====================================================================================================================
/**
 * Remove boilerplate elements from a loaded article.
 * Rule of thumb for these rules: remove as few <p> elements as possible,
 * because real body text could be removed by mistake.
 */
function filterDom($: CheerioAPI) {
  const unwanted = [
    // topic tags
    "#js_tags",
    // span containing specific text
    "span:contains('灵智宝鬘翻译团队 中译')",
    // centred paragraphs (style contains
    // "white-space: normal;text-align: center;") with specific text
    "p[style*='white-space: normal;text-align: center;']:contains('我是那')",
    "p[style*='white-space: normal;text-align: center;']:contains('室利·尼萨迦达塔·马哈拉吉的开示录')",
    // span whose style contains color: rgb(136, 136, 136) and whose text
    // contains the author credit
    "span[style*='color: rgb(136, 136, 136)']:contains('室利·尼萨迦达塔·马哈拉吉 著')",
    // span and strong elements whose style contains color: rgb(255, 76, 65)
    "span[style*='color: rgb(255, 76, 65)']",
    "strong[style*='color: rgb(255, 76, 65)']",
    // div elements with class "comment"
    "div.comment",
  ];
  for (const selector of unwanted) {
    $(selector).remove();
  }
}
/** Read the file at `filePath` as UTF-8 text and parse it with cheerio. */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Collect the non-empty `data-jump_url` attribute of every div directly under
 * #js_articles, in document order. Returns an empty array when none match.
 */
function extractLink($: CheerioAPI) {
  const collected: string[] = [];
  $("#js_articles > div").each((_idx, node) => {
    const jumpUrl = $(node).attr("data-jump_url");
    if (jumpUrl) {
      collected.push(jumpUrl);
    }
  });
  return collected;
}
/**
 * Make sure `absPath` exists: when stat fails, the folder is created,
 * including any missing parents ({ recursive: true }).
 */
function exitsFolder(absPath: string) {
  let present = true;
  try {
    statSync(absPath);
  } catch {
    present = false;
  }
  if (!present) {
    mkdirSync(absPath, { recursive: true });
  }
}
/**
 * Current local date and time as "YYYY年MM月DD日HH时MM分SS秒".
 * Every field after the year is zero-padded to two digits, so the strings have
 * a fixed width and sort chronologically.
 */
function getCurDate() {
  const now = new Date();
  const pad = (value: number) => String(value).padStart(2, "0");
  return (
    now.getFullYear() +
    "年" +
    pad(now.getMonth() + 1) +
    "月" +
    pad(now.getDate()) +
    "日" +
    pad(now.getHours()) +
    "时" +
    pad(now.getMinutes()) +
    "分" +
    pad(now.getSeconds()) +
    "秒"
  );
}
/** Map an input file path to the file of the same name inside outPath. */
function getOutFilePath(filePath: string) {
  const fileName = basename(filePath);
  return join(outPath, fileName);
}
E:\公众号文章采集\fi_filter_过滤器\src\topic\01_非推送_链接_一行一个.ts
import { log } from "console";
import {
readFileSync,
readdirSync,
lstatSync,
createWriteStream,
mkdirSync,
statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";
// Work on the folder the program is started from (process.cwd()); results go
// to its "out" sub-folder.
const basePath = process.cwd();
const outPath = join(basePath, "out");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}
// Read the folder; returns an array of entry names.
const fileList = readdirSync(basePath);
// Absolute paths of the regular files with a .txt or .html extension.
const wantedExts = [".txt", ".html"];
const pureFilePathList: string[] = [];
for (const entry of fileList) {
  const absolute = join(basePath, entry);
  if (!lstatSync(absolute).isFile()) continue;
  if (!wantedExts.includes(extname(entry))) continue;
  pureFilePathList.push(absolute);
}
for (const filePath of pureFilePathList) {
  extractTopic(filePath);
}
/**
 * Extracts the links of one saved page and writes them, one per line, to the
 * corresponding output file.
 *
 * @param filePath - Path of the input HTML/TXT file.
 */
function extractTopic(filePath: string) {
  const urlArr = extractLink(loadHtmlDom(filePath));
  const writeStream = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const url of urlArr) {
    writeStream.write(url);
    writeStream.write("\n");
  }
  writeStream.end();
}
/**
 * Loads a file from disk as UTF-8 text and hands it to cheerio for parsing.
 *
 * @param filePath - Path of the file to parse.
 * @returns The cheerio API bound to the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  const markup = readFileSync(filePath, "utf-8");
  const $ = load(markup);
  return $;
}
/**
 * Collects the `data-link` attribute of every `li` inside the album list of
 * the page.
 *
 * @param $ - cheerio API for the loaded document.
 * @returns The non-empty links in document order; an empty array when nothing matches.
 */
function extractLink($: CheerioAPI) {
  const oLinkList = $(
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li"
  );
  const linkArr: string[] = [];
  oLinkList.each((_i, oLink) => {
    const url = $(oLink).attr("data-link");
    // Skip items without a link instead of pushing "", which used to produce
    // blank lines in the one-URL-per-line output file.
    if (url) {
      linkArr.push(url);
    }
  });
  return linkArr;
}
/**
 * Ensures that the directory `absPath` exists, creating it (and any missing
 * parent directories) when necessary.
 *
 * @param absPath - Path of the directory to ensure.
 * @throws If the directory cannot be created, for example because `absPath`
 * already exists as a regular file.
 */
function exitsFolder(absPath: string) {
  // With { recursive: true } mkdirSync is a no-op for an existing directory, so
  // no prior statSync probe is needed. The old probe accepted ANY existing path
  // (including a regular file) as a usable folder and hid the problem until the
  // first write failed.
  mkdirSync(absPath, { recursive: true });
}
/**
 * Formats the current local date and time as a timestamp used in output file
 * names.
 *
 * @returns A string shaped like `2024年01月05日09时05分03秒`; every field after
 * the year is zero-padded to two digits.
 */
function getCurDate() {
  const now = new Date();
  // Pad every field (not only month/day) so the timestamp has a fixed width
  // and file names sort chronologically when compared as text.
  const pad = (value: number) => String(value).padStart(2, "0");
  return (
    `${now.getFullYear()}年${pad(now.getMonth() + 1)}月${pad(now.getDate())}日` +
    `${pad(now.getHours())}时${pad(now.getMinutes())}分${pad(now.getSeconds())}秒`
  );
}
/**
 * Builds the output path for an input file:
 * `<outPath>/<timestamp>_目录_<input name without extension>.txt`.
 *
 * @param filePath - Path of the input file.
 * @returns The path of the .txt file to write inside `outPath`.
 */
function getOutFilePath(filePath: string) {
  // Strip only the final extension. Splitting on "." and taking the first part
  // truncated dotted names ("a.b.html" became "a"), so different inputs could
  // collapse to the same stem.
  const stem = basename(filePath, extname(filePath));
  return join(outPath, `${getCurDate()}_目录_${stem}.txt`);
}
E:\公众号文章采集\fi_filter_过滤器\src\topic\02_推送_链接_一行一个.ts
import { log } from "console";
import {
readFileSync,
readdirSync,
lstatSync,
createWriteStream,
mkdirSync,
statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";
// process.cwd() is the directory the program is executed from.
const basePath = process.cwd();
const outPath = join(basePath, "out");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}
// Read the directory; readdirSync returns an array of entry names.
const fileList = readdirSync(basePath);
// Regular files only, restricted to the .txt / .html extensions, as full paths.
const wantedExtensions = [".txt", ".html"];
const pureFilePathList = fileList
  .filter(
    (fileName) =>
      lstatSync(join(basePath, fileName)).isFile() &&
      wantedExtensions.includes(extname(fileName))
  )
  .map((fileName) => join(basePath, fileName));
for (const filePath of pureFilePathList) {
  extractTopic(filePath);
}
/**
 * Extracts the links of one saved page and writes them, one per line, to the
 * corresponding output file. No file is created when the page yields no links.
 *
 * @param filePath - Path of the input HTML/TXT file.
 */
function extractTopic(filePath: string) {
  const urlArr = extractLink(loadHtmlDom(filePath));
  if (urlArr.length === 0) {
    return;
  }
  const writeStream = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const url of urlArr) {
    writeStream.write(url);
    writeStream.write("\n");
  }
  writeStream.end();
}
/**
 * Reads the file at `filePath` as UTF-8 text and parses it with cheerio.
 *
 * @param filePath - Path of the HTML (or HTML-in-.txt) file to load.
 * @returns The cheerio API bound to the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Collects the `data-jump_url` attribute of every direct `div` child of
 * `#js_articles`.
 *
 * @param $ - cheerio API for the loaded document.
 * @returns The non-empty URLs in document order; an empty array when nothing matches.
 */
function extractLink($: CheerioAPI) {
  const collected: string[] = [];
  for (const node of $("#js_articles > div").toArray()) {
    const jumpUrl = $(node).attr("data-jump_url");
    // Entries without the attribute (or with an empty value) are ignored.
    if (jumpUrl) {
      collected.push(jumpUrl);
    }
  }
  return collected;
}
/**
 * Ensures that the directory `absPath` exists, creating it (and any missing
 * parent directories) when necessary.
 *
 * @param absPath - Path of the directory to ensure.
 * @throws If the directory cannot be created, for example because `absPath`
 * already exists as a regular file.
 */
function exitsFolder(absPath: string) {
  // With { recursive: true } mkdirSync is a no-op for an existing directory, so
  // no prior statSync probe is needed. The old probe accepted ANY existing path
  // (including a regular file) as a usable folder and hid the problem until the
  // first write failed.
  mkdirSync(absPath, { recursive: true });
}
/**
 * Formats the current local date and time as a timestamp used in output file
 * names.
 *
 * @returns A string shaped like `2024年01月05日09时05分03秒`; every field after
 * the year is zero-padded to two digits.
 */
function getCurDate() {
  const now = new Date();
  // Pad every field (not only month/day) so the timestamp has a fixed width
  // and file names sort chronologically when compared as text.
  const pad = (value: number) => String(value).padStart(2, "0");
  return (
    `${now.getFullYear()}年${pad(now.getMonth() + 1)}月${pad(now.getDate())}日` +
    `${pad(now.getHours())}时${pad(now.getMinutes())}分${pad(now.getSeconds())}秒`
  );
}
/**
 * Builds the output path for an input file:
 * `<outPath>/<timestamp>_目录_<input name without extension>.txt`.
 *
 * @param filePath - Path of the input file.
 * @returns The path of the .txt file to write inside `outPath`.
 */
function getOutFilePath(filePath: string) {
  // Strip only the final extension. Splitting on "." and taking the first part
  // truncated dotted names ("a.b.html" became "a"), so different inputs could
  // collapse to the same stem.
  const stem = basename(filePath, extname(filePath));
  return join(outPath, `${getCurDate()}_目录_${stem}.txt`);
}
E:\公众号文章采集\fi_filter_过滤器\src\txt\01_合集.ts
import { log } from "console";
import {
readFileSync,
readdirSync,
lstatSync,
createWriteStream,
mkdirSync,
statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";
// process.cwd() is the directory the program is executed from.
// const basePath = process.cwd();
// E:\公众号文章采集\公众号HTML\灵智宝鬘
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘\\out";
const outFileName = "灵智宝鬘_尼萨迦达塔_我是那";
const outPath = join(basePath, "txt");
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}
// Read the directory; readdirSync returns an array of entry names.
const fileList = readdirSync(basePath);
// Keep only regular files with the .html extension, as full paths.
const pureFilePathList: string[] = [];
for (const fileName of fileList) {
  const fullPath = join(basePath, fileName);
  if (lstatSync(fullPath).isFile() && extname(fileName) === ".html") {
    pureFilePathList.push(fullPath);
  }
}
// Append every article to one combined text file, one numbered chapter each.
const outFilePath = getOutFilePath();
const writeStream = createWriteStream(outFilePath, "utf-8");
pureFilePathList.forEach((filePath, index) => {
  const textContent = extractText(loadHtmlDom(filePath));
  const pieces = [
    "\n",
    `第${index + 1}章`,
    "\n",
    textContent.title,
    "\n",
    textContent.pubDate,
    "\n",
    textContent.content,
    "\n",
  ];
  for (const piece of pieces) {
    writeStream.write(piece);
  }
  log(`${index}_${filePath}`);
});
writeStream.end();
// ====================================================================================================================
/**
 * Loads a file from disk as UTF-8 text and hands it to cheerio for parsing.
 *
 * @param filePath - Path of the HTML file to parse.
 * @returns The cheerio API bound to the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  const markup = readFileSync(filePath, "utf-8");
  const $ = load(markup);
  return $;
}
/**
 * Ensures that the directory `absPath` exists, creating it (and any missing
 * parent directories) when necessary.
 *
 * @param absPath - Path of the directory to ensure.
 * @throws If the directory cannot be created, for example because `absPath`
 * already exists as a regular file.
 */
function exitsFolder(absPath: string) {
  // With { recursive: true } mkdirSync is a no-op for an existing directory, so
  // no prior statSync probe is needed. The old probe accepted ANY existing path
  // (including a regular file) as a usable folder and hid the problem until the
  // first write failed.
  mkdirSync(absPath, { recursive: true });
}
/**
 * Returns the path of the single combined output file:
 * `<outPath>/<outFileName>.txt`.
 */
function getOutFilePath() {
  const fileName = `${outFileName}.txt`;
  return join(outPath, fileName);
}
/**
 * Pulls the title, publish date and body text out of a loaded article page.
 *
 * @param $ - cheerio API for the loaded document.
 * @returns `title` from `#activity-name`, `pubDate` from `#publish_time`, and
 * `content` from `#js_content` after it has been passed through `handleContent`.
 */
function extractText($: CheerioAPI) {
  const textOf = (selector: string) => $(selector).text();
  return {
    title: textOf("#activity-name"),
    pubDate: textOf("#publish_time"),
    content: handleContent(textOf("#js_content")),
  };
}
/**
 * Puts every speaker label in the article text on its own line, preceded by a
 * `[p1000]` marker line, and expands the short labels ("尼:", "问:") to their
 * full forms.
 *
 * @param content - Plain text of the article body.
 * @returns The text with ALL speaker labels rewritten.
 */
function handleContent(content: string) {
  // [label found in the text, label written to the output]
  const speakerLabels: ReadonlyArray<readonly [string, string]> = [
    ["尼萨迦达塔:", "尼萨迦达塔:"],
    ["尼:", "尼萨迦达塔:"],
    ["提问者:", "提问者:"],
    ["问:", "提问者:"],
  ];
  // String.prototype.replace with a string pattern only rewrites the FIRST
  // match, which left every later question/answer unmarked. split/join
  // replaces every occurrence and keeps the replacement text literal.
  return speakerLabels.reduce(
    (text, [label, fullLabel]) =>
      text.split(label).join(replaceContent(fullLabel, 1000)),
    content
  );
}
/**
 * Builds the replacement text for one speaker label.
 *
 * @param keyword - Label to place on its own line.
 * @param time - Number embedded in the `[p<time>]` marker (presumably a pause
 * length for a downstream reader tool; confirm against the consumer).
 * @returns `"\n[p<time>]\n<keyword>\n"`.
 */
function replaceContent(keyword: string, time: number = 1000) {
  return `\n[p${time.toString()}]\n${keyword}\n`;
}