// Node.js + cheerio: 提取超链接 (extract hyperlinks)
import { log } from "console";
import {
  readFileSync,
  readdirSync,
  lstatSync,
  createWriteStream,
  mkdirSync,
  statSync,
} from "fs";
import { basename, extname, join, resolve } from "path";
import { load, CheerioAPI } from "cheerio";
// Directory layout: the "html" folder next to this script's parent directory
// holds the input files, and its "out" subfolder receives the generated lists.
const basePath = resolve(__dirname, "..");
const htmlPath = join(basePath, "html");
const outPath = join(htmlPath, "out");

// Make sure the output directory exists; a failure is only logged and the
// script carries on.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

// Read the input directory; this yields an array of entry names.
const fileList = readdirSync(htmlPath);

// Keep regular files only, so sub-directories such as "out" are skipped.
// All entries are checked before any of them is processed.
const pureFileList: string[] = [];
for (const file of fileList) {
  if (lstatSync(join(htmlPath, file)).isFile()) {
    pureFileList.push(file);
  }
}

// Produce one link list per input file.
for (const file of pureFileList) {
  extractTopic(join(htmlPath, file));
}
/**
 * Processes one HTML file: parses it, collects the links returned by
 * `extractLink`, and writes them one per line to the file named by
 * `getOutFilePath`.
 *
 * @param filePath Path of the HTML file to process.
 */
function extractTopic(filePath: string) {
  const links = extractLink(loadHtmlDom(filePath));
  const target = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const link of links) {
    // Each link is terminated by a newline, including the last one.
    target.write(`${link}\n`);
  }
  target.end();
}
/**
 * Parses an HTML file into a cheerio document.
 *
 * @param filePath Path of the HTML file; it is read synchronously as UTF-8.
 * @returns The cheerio API produced by `load` for the file's markup.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
/**
 * Collects the `data-link` attribute of every `li` matched by the album
 * list selector.
 *
 * @param $ Cheerio API for the page to scan.
 * @returns One entry per matched `li`; an item whose `data-link` attribute
 *   is missing or empty contributes an empty string.
 */
function extractLink($: CheerioAPI) {
  const itemSelector =
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li";
  const links: string[] = [];
  $(itemSelector).each((_index, item) => {
    links.push($(item).attr("data-link") ?? "");
  });
  return links;
}
/**
 * Ensures that a directory exists at `absPath`, creating it together with
 * any missing parent directories.
 *
 * `mkdirSync` with `recursive: true` does nothing when the directory is
 * already present, so no prior `statSync` probe is needed. Dropping the probe
 * also stops unrelated stat failures from being mistaken for "missing", and
 * makes a path that exists as a non-directory fail here instead of later at
 * write time.
 *
 * @param absPath Path of the directory to ensure.
 * @throws If the directory cannot be created, for example because `absPath`
 *   already exists as a regular file.
 */
function exitsFolder(absPath: string): void {
  mkdirSync(absPath, { recursive: true });
}
/**
 * Formats a date in local time as `YYYY年MM月DD日HH时mm分`.
 *
 * Month, day, hour and minute are all zero-padded to two digits so the
 * result has a fixed width and sorts chronologically as plain text.
 *
 * @param date Moment to format; defaults to the current time.
 * @returns The formatted timestamp, e.g. `2023年01月05日09时07分`.
 */
function getCurDate(date: Date = new Date()): string {
  // Every padded field is in the range 0-59, so "0" + value followed by
  // slice(-2) always yields exactly two digits.
  const pad = (value: number): string => ("0" + value).slice(-2);

  const year = date.getFullYear();
  const month = pad(date.getMonth() + 1); // getMonth() is zero-based
  const day = pad(date.getDate());
  const hour = pad(date.getHours());
  const minute = pad(date.getMinutes());

  return year + "年" + month + "月" + day + "日" + hour + "时" + minute + "分";
}
/**
 * Builds the path of the text file that receives the links extracted from
 * `filePath`.
 *
 * The file is placed in the output directory and named
 * `<timestamp>_目录_<input name without extension>.txt`. Only the final
 * extension is removed, so a dotted name such as `a.b.html` keeps its full
 * stem (`a.b`) and does not collide with other inputs that share the same
 * first segment.
 *
 * @param filePath Path of the input file.
 * @returns Path of the corresponding output file.
 */
function getOutFilePath(filePath: string) {
  const stem = basename(filePath, extname(filePath));
  return join(outPath, getCurDate() + "_目录_" + stem + ".txt");
}