// cheerio 关键字过滤 关键字替换 内容剔除 八曲仙人之歌讲解
import { log } from "console";
import {
readFileSync,
readdirSync,
lstatSync,
createWriteStream,
mkdirSync,
statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI, Cheerio } from "cheerio";
// process.cwd() would be the folder the program is executed from.
// const basePath = process.cwd();
const basePath = "E:\\公众号文章采集\\公众号HTML\\妙高峰上";
const outPath = join(basePath, "out");
// Make sure the output folder exists; a failure is logged and the script carries on.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}
// Names of every entry directly inside basePath.
const fileList = readdirSync(basePath);
// Absolute paths of the regular files whose extension is exactly ".html".
const pureFilePathList = fileList
  .filter((fileName) => lstatSync(join(basePath, fileName)).isFile())
  .filter((fileName) => extname(fileName) === ".html")
  .map((fileName) => join(basePath, fileName));
// pureFilePathList.forEach((filePath) => {
// extractTopic(filePath);
// });
for (const filePath of pureFilePathList) {
  log(filePath);
  const $: CheerioAPI = loadHtmlDom(filePath);
  filterDom($);
  // html() is typed `string | null`; fall back to an empty string so that
  // write() is never handed null (a type error under strictNullChecks and a
  // runtime TypeError in Node).
  const cleanedHtml = $("html").html() ?? "";
  const writeStream = createWriteStream(getOutFilePath(filePath), "utf-8");
  writeStream.write(cleanedHtml);
  writeStream.end();
  // NOTE(review): this unconditional break stops after the first file. It looks
  // like a debugging limiter — confirm before removing it to process every file.
  break;
}
// ====================================================================================================================
/**
 * Strips non-article content from a loaded page, mutating `$` in place.
 *
 * Guiding principle: remove as few `<p>` tags as possible, because a broad
 * `<p>` selector may delete real body text by mistake.
 *
 * Every clean-up step is independent: a step whose selector matches nothing
 * is simply a no-op and never prevents the following steps from running.
 *
 * @param $ - Cheerio document to clean.
 */
function filterDom($: CheerioAPI): void {
  // Topic tags
  $("#js_tags").remove();
  // <span> tags containing this specific text
  $("span:contains('↓↓↓ 请点击左下角“阅读原文”')").remove();
  // style="white-space: normal;text-align: center;"
  // $(
  // "p[style*='white-space: normal;text-align: center;']:contains('我是那')"
  // ).remove();
  // Same style, containing the book title
  // $(
  // "p[style*='white-space: normal;text-align: center;']:contains('室利·尼萨迦达塔·马哈拉吉的开示录')"
  // ).remove();
  // <span> with text color rgb(136, 136, 136) that contains the author line
  // $(
  // "span[style*='color: rgb(136, 136, 136)']:contains('室利·尼萨迦达塔·马哈拉吉 著')"
  // ).remove();
  // Red <span> and <strong> tags
  // $("span[style*='color: rgb(255, 76, 65)']").remove();
  // $("strong[style*='color: rgb(255, 76, 65)']").remove();
  // <div> tags with class "comment"
  $("div.comment").remove();
  // data-id="85560"
  $("section[data-id='85560']").remove();

  // Remove the marker section (data-id="89227") together with every sibling
  // that follows it. nextAll() also covers the very last sibling, and the
  // marker itself is removed even when nothing follows it.
  const tailStart = $("section[data-id='89227']");
  tailStart.nextAll().remove();
  tailStart.remove();

  // Remove everything that follows the article body, e.g. "view full text".
  $("#js_article").nextAll().remove();

  // Caption span: drop all of its siblings first, then the span itself.
  const captionSpan = $("span:contains('(题图:拉玛那')");
  captionSpan.siblings().remove();
  captionSpan.remove();

  // Caption written as a <p> tag
  $("p:contains('题图:')").remove();

  // Appends the literal "哈哈" to the text of the paragraph(s) carrying this
  // exact inline style.
  // NOTE(review): the appended text looks like a placeholder for a text
  // replacement — confirm the intended content.
  const rawEle = $(
    "p[style='margin-bottom: -1px; padding-right: 5px; padding-bottom: 6px; padding-left: 5px; border-bottom-width: 2px; border-bottom-style: solid; border-bottom-color: rgb(172, 29, 16); display: inline-block; line-height: 1.1; font-size: 18px;']"
  );
  const rawText = rawEle.text();
  rawEle.text(rawText + "哈哈");
}
/**
 * Reads an HTML file as UTF-8 text and parses it with cheerio.
 *
 * @param filePath - Path of the HTML file to read.
 * @returns The cheerio document built from the file's contents.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Collects the `data-jump_url` attribute of every direct `<div>` child of
 * `#js_articles`.
 *
 * @param $ - Cheerio document to search.
 * @returns The non-empty attribute values in document order; an empty array
 *          when nothing matches.
 */
function extractLink($: CheerioAPI) {
  const collected: string[] = [];
  for (const node of $("#js_articles > div").toArray()) {
    const jumpUrl = $(node).attr("data-jump_url");
    // Skip elements where the attribute is missing or empty.
    if (jumpUrl) {
      collected.push(jumpUrl);
    }
  }
  return collected;
}
/**
 * Ensures that a directory exists at `absPath`, creating it and any missing
 * parent directories.
 *
 * With `recursive: true`, `mkdirSync` does nothing when the directory already
 * exists, so no prior existence check is needed. If the path exists but is
 * not a directory, the error from `mkdirSync` is propagated instead of being
 * silently ignored.
 *
 * @param absPath - Path of the directory that must exist.
 * @throws The underlying file-system error when the directory cannot be created.
 */
function exitsFolder(absPath: string) {
  mkdirSync(absPath, { recursive: true });
}
/**
 * Formats the current local date and time with Chinese unit suffixes.
 *
 * Month and day are always two digits; hour, minute and second are emitted
 * without zero padding.
 *
 * @returns A string such as "2023年01月05日7时8分9秒".
 */
function getCurDate() {
  const now = new Date();
  // Left-pad a number to two digits.
  const pad2 = (value: number) => `0${value}`.slice(-2);
  return [
    now.getFullYear(),
    "年",
    pad2(now.getMonth() + 1),
    "月",
    pad2(now.getDate()),
    "日",
    now.getHours(),
    "时",
    now.getMinutes(),
    "分",
    now.getSeconds(),
    "秒",
  ].join("");
}
/**
 * Maps an input file path to the path with the same file name inside the
 * output folder (`outPath`).
 *
 * @param filePath - Path of the source file.
 * @returns `outPath` joined with the base name of `filePath`.
 */
function getOutFilePath(filePath: string) {
  const fileName = basename(filePath);
  return join(outPath, fileName);
}