Node.js cheerio 提取超链接
| import { log } from "console"; |
| import { |
| readFileSync, |
| readdirSync, |
| lstatSync, |
| createWriteStream, |
| mkdirSync, |
| statSync, |
| } from "fs"; |
| import { basename, join, resolve } from "path"; |
| import { load, CheerioAPI } from "cheerio"; |
| |
// Directory layout, relative to the parent of this script's directory:
//   <parent>/html      input HTML files
//   <parent>/html/out  generated link lists
const basePath = resolve(__dirname, "..");
const htmlPath = join(basePath, "html");
const outPath = join(htmlPath, "out");

// Make sure the output directory is present before anything is written.
// A failure here is only logged; the script keeps going.
try {
  exitsFolder(outPath);
} catch (err) {
  log(err);
}
| |
// Every entry directly inside htmlPath (files and directories alike).
const fileList = readdirSync(htmlPath);

// Keep regular files only; this also leaves out the "out" directory.
const pureFileList = fileList.filter((file) =>
  lstatSync(join(htmlPath, file)).isFile()
);

// Produce one link list per input file.
for (const file of pureFileList) {
  extractTopic(join(htmlPath, file));
}
| |
/**
 * Extracts the links from one HTML file and writes them, one per line,
 * to the output file derived from that file's name.
 *
 * @param filePath Path of the HTML file to process.
 */
function extractTopic(filePath: string) {
  const links = extractLink(loadHtmlDom(filePath));
  const output = createWriteStream(getOutFilePath(filePath), "utf-8");

  for (const link of links) {
    output.write(`${link}\n`);
  }
  output.end();
}
| |
/**
 * Reads an HTML file as UTF-8 text and parses it with cheerio.
 *
 * @param filePath Path of the HTML file to read.
 * @returns The cheerio API bound to the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
| |
/**
 * Collects the `data-link` attribute of every list item in the album
 * content list of the parsed page.
 *
 * @param $ The cheerio API for the parsed document.
 * @returns One entry per matched `li`, in document order; an item without
 *          a `data-link` value contributes an empty string.
 */
function extractLink($: CheerioAPI) {
  const itemSelector =
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li";

  return $(itemSelector)
    .toArray()
    .map((item) => $(item).attr("data-link") || "");
}
| |
/**
 * Ensures that a directory exists at `absPath`, creating it and any
 * missing parent directories.
 *
 * With `recursive: true`, `mkdirSync` does nothing when the directory is
 * already there, so no separate existence check is needed. Unlike a
 * catch-all around a stat call, real problems are reported instead of
 * being mistaken for "directory missing".
 *
 * @param absPath Path of the directory that must exist.
 * @throws If the directory cannot be created, for example because the
 *         path is already occupied by a regular file.
 */
function exitsFolder(absPath: string): void {
  mkdirSync(absPath, { recursive: true });
}
| |
/**
 * Formats a date as `YYYY年MM月DD日HH时mm分` using local time.
 *
 * Month, day, hour and minute are all zero-padded to two digits so the
 * result has a fixed width and sorts chronologically as text.
 *
 * @param date The moment to format; defaults to the current time.
 * @returns The formatted timestamp.
 */
function getCurDate(date: Date = new Date()): string {
  const pad = (value: number): string => String(value).padStart(2, "0");

  const year = date.getFullYear();
  const month = pad(date.getMonth() + 1); // getMonth() is zero-based
  const day = pad(date.getDate());
  const hour = pad(date.getHours());
  const minute = pad(date.getMinutes());

  return `${year}年${month}月${day}日${hour}时${minute}分`;
}
| |
/**
 * Builds the output path for an input file:
 * `<outPath>/<timestamp>_目录_<input name without extension>.txt`.
 *
 * Only the final extension is removed, so a name such as `a.b.html`
 * yields `a.b` rather than being cut at the first dot.
 *
 * @param filePath Path of the input file.
 * @returns Path of the text file to write inside the output directory.
 */
function getOutFilePath(filePath: string): string {
  const fileName = basename(filePath);
  const lastDot = fileName.lastIndexOf(".");
  // lastDot > 0: a leading dot (".name") is part of the name, not an extension.
  const stem = lastDot > 0 ? fileName.slice(0, lastDot) : fileName;

  return join(outPath, getCurDate() + "_目录_" + stem + ".txt");
}
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· PowerShell开发游戏 · 打蜜蜂
· 在鹅厂做java开发是什么体验
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战
2022-04-04 vue报错 Unexpected side effect in "xxx" computed property;
2022-04-04 vuex 需要 全局挂载 Vue.prototype.$store 吗?