cheerio 关键字过滤 关键字替换 内容剔除 八曲仙人之歌讲解
| import { log } from "console"; |
| import { |
| readFileSync, |
| readdirSync, |
| lstatSync, |
| createWriteStream, |
| mkdirSync, |
| statSync, |
| } from "fs"; |
| import { basename, extname, join, resolve } from "path"; |
| import { load, CheerioAPI, Cheerio } from "cheerio"; |
| |
| |
| |
| |
// Directory that is scanned for article HTML files, and the "out" subfolder
// beneath it that receives the filtered copies.
const basePath = "E:\\公众号文章采集\\公众号HTML\\妙高峰上";
const outPath = join(basePath, "out");

// Create the output folder up front. A failure is only logged, so the script
// keeps running afterwards.
try {
  exitsFolder(outPath);
} catch (err) {
  log(err);
}
| |
| |
// Every directory entry name directly under basePath.
const fileList = readdirSync(basePath);

// Absolute paths of the entries that are regular files (per lstat) and whose
// extension is exactly ".html".
const pureFilePathList = fileList
  .filter(
    (entryName) =>
      lstatSync(join(basePath, entryName)).isFile() &&
      extname(entryName) === ".html"
  )
  .map((entryName) => join(basePath, entryName));
| |
| |
| |
| |
| |
// Load each HTML file, strip the unwanted nodes, and write the result under
// the same file name into the output folder.
for (const filePath of pureFilePathList) {
  log(filePath);
  const $: CheerioAPI = loadHtmlDom(filePath);
  filterDom($);
  const outFilePath = getOutFilePath(filePath);
  const writeStream = createWriteStream(outFilePath, "utf-8");
  // .html() is typed `string | null`; write() rejects null, so fall back to
  // an empty document instead of throwing.
  writeStream.write($("html").html() ?? "");
  writeStream.end();
  // NOTE(review): this `break` stops after the first file, so the remaining
  // entries of pureFilePathList are never processed. It looks like a leftover
  // from testing — confirm before removing.
  break;
}
| |
| |
| |
/**
 * Removes unwanted nodes from a loaded document and appends a suffix to
 * matching heading paragraphs. The document is modified in place.
 *
 * Every step is independent: a selector that matches nothing is simply a
 * no-op and never prevents the later steps from running.
 *
 * @param $ - The cheerio document to clean up.
 */
function filterDom($: CheerioAPI): void {
  // Element with id "js_tags".
  $("#js_tags").remove();

  // Any span whose text contains the "read the original" prompt.
  $("span:contains('↓↓↓ 请点击左下角“阅读原文”')").remove();

  $("div.comment").remove();

  $("section[data-id='85560']").remove();

  // Remove the marker section together with every sibling that follows it.
  // The following siblings are removed first, while the marker is still
  // attached and can be used as the traversal anchor.
  const trailingMarker = $("section[data-id='89227']");
  trailingMarker.nextAll().remove();
  trailingMarker.remove();

  // Remove everything that follows #js_article, keeping the article itself.
  $("#js_article").nextAll().remove();

  // Remove the caption span and all of its siblings.
  const captionSpan = $("span:contains('(题图:拉玛那')");
  captionSpan.siblings().remove();
  captionSpan.remove();

  $("p:contains('题图:')").remove();

  // Append the suffix to each matching paragraph individually, so that
  // several matches do not end up sharing one concatenated text.
  const styledParagraphs = $(
    "p[style='margin-bottom: -1px; padding-right: 5px; padding-bottom: 6px; padding-left: 5px; border-bottom-width: 2px; border-bottom-style: solid; border-bottom-color: rgb(172, 29, 16); display: inline-block; line-height: 1.1; font-size: 18px;']"
  );
  styledParagraphs.each((_, element) => {
    const paragraph = $(element);
    paragraph.text(paragraph.text() + "哈哈");
  });
}
| |
/**
 * Reads a file as UTF-8 text and parses it with cheerio.
 *
 * @param filePath - Path of the HTML file to read.
 * @returns The cheerio document built from the file's contents.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
| |
/**
 * Collects the `data-jump_url` attribute of every direct `div` child of
 * `#js_articles`.
 *
 * @param $ - The cheerio document to search.
 * @returns The non-empty attribute values in document order; an empty array
 *   when nothing matches.
 */
function extractLink($: CheerioAPI): string[] {
  const urls: string[] = [];
  for (const entry of $("#js_articles > div").toArray()) {
    const jumpUrl = $(entry).attr("data-jump_url");
    // Missing and empty attributes are both skipped.
    if (jumpUrl) {
      urls.push(jumpUrl);
    }
  }
  return urls;
}
| |
/**
 * Ensures that a directory exists at the given path, creating it and any
 * missing parent directories when necessary.
 *
 * Calling it for a directory that already exists is a no-op.
 *
 * @param absPath - Path of the directory that must exist.
 * @throws When the directory cannot be created, e.g. because a regular file
 *   already occupies the path.
 */
function exitsFolder(absPath: string): void {
  // With `recursive: true`, mkdirSync accepts an already existing directory,
  // so no separate existence check (and no blanket catch) is needed.
  mkdirSync(absPath, { recursive: true });
}
| |
/**
 * Formats the current local date and time as
 * `<year>年<MM>月<DD>日<hour>时<minute>分<second>秒`.
 *
 * Month and day are zero-padded to two digits; hour, minute and second are
 * emitted without padding.
 *
 * @returns The formatted timestamp string.
 */
function getCurDate() {
  const now = new Date();
  const padTwo = (value: number): string => String(value).padStart(2, "0");

  const datePart = `${now.getFullYear()}年${padTwo(now.getMonth() + 1)}月${padTwo(now.getDate())}日`;
  const timePart = `${now.getHours()}时${now.getMinutes()}分${now.getSeconds()}秒`;

  return datePart + timePart;
}
| |
/**
 * Maps an input file path to the path of its counterpart in the output folder.
 *
 * @param filePath - Path of the source file.
 * @returns `outPath` joined with the file name of `filePath`.
 */
function getOutFilePath(filePath: string): string {
  const fileName = basename(filePath);
  return join(outPath, fileName);
}
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 阿里最新开源QwQ-32B,效果媲美deepseek-r1满血版,部署成本又又又降低了!
· 单线程的Redis速度为什么快?
· SQL Server 2025 AI相关能力初探
· AI编程工具终极对决:字节Trae VS Cursor,谁才是开发者新宠?
· 展开说说关于C#中ORM框架的用法!