E:\公众号文章采集\fi_filter_过滤器\src\exact_新浪博客手机版提取连接.js
| const fs = require('fs'); |
| const jsdom = require('jsdom'); |
| const { JSDOM } = jsdom; |
| |
/**
 * Reads every file in ./html, parses it with jsdom + jQuery and appends the
 * href of each link found under #js_content to ./urls.txt, one per line
 * (CRLF terminated).
 *
 * Changes over the previous version:
 *  - no write stream is opened on ./urls.txt any more: it was never written
 *    to, but opening it truncated the file once per input file and raced
 *    with the appends;
 *  - directory and file read errors are reported instead of dereferencing
 *    undefined results;
 *  - anchors without an href are skipped instead of being written as the
 *    text "undefined";
 *  - appends are synchronous, so links of one file keep document order.
 */
fs.readdir('./html', function (err, files) {
  if (err) {
    return console.log('read dir failed');
  }
  files.forEach((file) => {
    fs.readFile('./html/' + file, 'utf-8', (err, data) => {
      if (err) {
        return console.log('read file failed: ' + file);
      }
      const { window } = new JSDOM(data);
      // The npm package name is lowercase; 'jQuery' only resolves on
      // case-insensitive file systems.
      const $ = require('jquery')(window);
      let index = 1;

      $('#js_content a').each(function () {
        const href = $(this).attr('href');
        if (href === undefined) {
          return;
        }
        try {
          fs.appendFileSync('./urls.txt', `${href}\r\n`);
        } catch (appendErr) {
          return console.log('append txt failed');
        }
        console.log(index++ + '__append file success');
      });
    });
  });
});
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `./html`;
/**
 * For every *.txt file in `path`, collects the blog entries matched by
 * '#pl-home-bloglist > article > ul>li' in reverse document order and writes
 * them as Markdown links ("[title](url)" followed by a blank line) to
 * 目录/新浪博客目录.txt.
 *
 * Changes over the previous version: read errors are reported, the output
 * directory is created when missing, the extension test no longer breaks on
 * names containing extra dots, and a missing title/url is written as an
 * empty string (stream.write() throws on null and undefined).
 */
fs.readdir(path, function (err, files) {
  if (err) {
    console.error(err);
    return;
  }
  // Local, destructured require: the identifier `path` already names the
  // input directory in this script.
  const { extname } = require('path');
  // createWriteStream does not create missing directories.
  fs.mkdirSync(path + '/目录/', { recursive: true });

  files.forEach((file) => {
    console.log(file);
    if (extname(file) !== '.txt') {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        console.error(err);
        return;
      }
      const $ = cheerio.load(data);
      const writeStream = fs.createWriteStream(
        path + '/目录/' + '新浪博客目录.txt',
        'utf-8'
      );
      const items = $('#pl-home-bloglist > article > ul>li').get().reverse();
      for (const ele of items) {
        // .html() yields null and .attr() undefined when nothing matches.
        const title = $(ele).find('h2').html() ?? '';
        const url = $(ele).find('a').attr('data-link') ?? '';
        writeStream.write(`[${title}](${url})\n\n`);
        console.log(title);
        console.log(url);
      }
      writeStream.end();
    });
  });
});
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;

const fileList = fs.readdirSync(path);

// For each *.html file: sort the children of #js_history_list by ascending
// msgid and write the `hrefs` attribute of every entry that is marked "原创"
// and whose h4 parent has data-type "APPMSG", one link per line.
fileList
  .filter((name) => /\.(html)$/.test(name))
  .forEach((name) => {
    const $ = cheerio.load(fs.readFileSync(path + '/' + name, 'utf-8'));
    const out = fs.createWriteStream(
      path + '/' + name.split('.')[0] + '_讯飞有声_超链接_正序.txt',
      'utf-8'
    );

    const entries = Array.from($('#js_history_list').children());
    entries.sort((a, b) => a.attribs.msgid - b.attribs.msgid);

    entries.forEach((node) => {
      const $entry = cheerio.load(node);
      const link = $entry('h4').attr('hrefs');
      const badge = $entry('#copyright_logo').html();
      const type = $entry('h4').parent().attr('data-type');
      if (badge !== '原创' || type !== 'APPMSG') {
        return;
      }
      console.log(link);
      out.write(link);
      out.write('\n');
    });
    out.end();
  });
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
| |
/**
 * Formats this date according to a pattern string.
 *
 * Tokens: y (year), M (month), d (day of month), h (hours), m (minutes),
 * s (seconds), q (quarter), S (milliseconds). Only the first run of each
 * token is replaced. A run of n "y" becomes String(year).slice(4 - n). For
 * the other tokens a single letter becomes the bare number and a longer run
 * becomes the number zero-padded to two digits; "S" is always matched one
 * letter at a time.
 *
 * Compared with the previous version this no longer depends on the legacy
 * RegExp.$1 static or on String.prototype.substr, and the method is defined
 * as non-enumerable so it does not show up in for...in over dates.
 *
 * @param {string} fmt pattern, e.g. 'yyyy年MM月dd日'
 * @returns {string} the pattern with its tokens substituted
 */
Object.defineProperty(Date.prototype, 'format', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: function (fmt) {
    const date = this;
    // Order matters only in that it mirrors the historical replacement order.
    const tokens = [
      { pattern: /M+/, value: date.getMonth() + 1 },
      { pattern: /d+/, value: date.getDate() },
      { pattern: /h+/, value: date.getHours() },
      { pattern: /m+/, value: date.getMinutes() },
      { pattern: /s+/, value: date.getSeconds() },
      { pattern: /q+/, value: Math.floor((date.getMonth() + 3) / 3) },
      { pattern: /S/, value: date.getMilliseconds() },
    ];

    let out = fmt.replace(/y+/, (run) =>
      String(date.getFullYear()).slice(4 - run.length)
    );
    for (const token of tokens) {
      const digits = String(token.value);
      out = out.replace(token.pattern, (run) =>
        run.length === 1 ? digits : ('00' + digits).slice(digits.length)
      );
    }
    return out;
  },
});
| |
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test`;

const fileList = fs.readdirSync(path);

// For each *.html file: sort the children of #js_history_list by ascending
// msgid and write one Markdown link "[date_title](link)" per entry that is
// marked "原创" and whose h4 parent has data-type "APPMSG".
fileList
  .filter((name) => /\.(html)$/.test(name))
  .forEach((name) => {
    const $ = cheerio.load(fs.readFileSync(path + '/' + name, 'utf-8'));
    const out = fs.createWriteStream(
      path + '/' + name.split('.')[0] + '_link_正序.md',
      'utf-8'
    );

    const entries = Array.from($('#js_history_list').children());
    entries.sort((a, b) => a.attribs.msgid - b.attribs.msgid);

    entries.forEach((node) => {
      const $entry = cheerio.load(node);
      const link = $entry('h4').attr('hrefs');
      const badge = $entry('#copyright_logo').html();
      // Text in front of the first "<span" of the extra-info paragraph.
      const rawTime = $entry('p.weui_media_extra_info')
        .html()
        .split('<span')[0]
        .trim();
      const type = $entry('h4').parent().attr('data-type');

      if (badge !== '原创' || type !== 'APPMSG') {
        return;
      }

      const slashed = rawTime.replace('年', '/').replace('月', '/').replace('日', '');
      const stamp = new Date(slashed).format('yyyy年MM月dd日');
      // The title is whatever follows the first "</span>" inside the h4.
      const title = $entry('h4').html().split('</span>')[1].trim();
      console.log(stamp + '_' + title);

      out.write(`[${stamp}_${title}](${link})`);
      out.write('\n');
    });
    out.end();
  });
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
| |
/**
 * Formats this date according to a pattern string.
 *
 * Tokens: y (year), M (month), d (day of month), h (hours), m (minutes),
 * s (seconds), q (quarter), S (milliseconds). Only the first run of each
 * token is replaced. A run of n "y" becomes String(year).slice(4 - n). For
 * the other tokens a single letter becomes the bare number and a longer run
 * becomes the number zero-padded to two digits; "S" is always matched one
 * letter at a time.
 *
 * Compared with the previous version this no longer depends on the legacy
 * RegExp.$1 static or on String.prototype.substr, and the method is defined
 * as non-enumerable so it does not show up in for...in over dates.
 *
 * @param {string} fmt pattern, e.g. 'yyyy年MM月dd日'
 * @returns {string} the pattern with its tokens substituted
 */
Object.defineProperty(Date.prototype, 'format', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: function (fmt) {
    const date = this;
    // Order matters only in that it mirrors the historical replacement order.
    const tokens = [
      { pattern: /M+/, value: date.getMonth() + 1 },
      { pattern: /d+/, value: date.getDate() },
      { pattern: /h+/, value: date.getHours() },
      { pattern: /m+/, value: date.getMinutes() },
      { pattern: /s+/, value: date.getSeconds() },
      { pattern: /q+/, value: Math.floor((date.getMonth() + 3) / 3) },
      { pattern: /S/, value: date.getMilliseconds() },
    ];

    let out = fmt.replace(/y+/, (run) =>
      String(date.getFullYear()).slice(4 - run.length)
    );
    for (const token of tokens) {
      const digits = String(token.value);
      out = out.replace(token.pattern, (run) =>
        run.length === 1 ? digits : ('00' + digits).slice(digits.length)
      );
    }
    return out;
  },
});
| |
let path = __dirname;

const fileList = fs.readdirSync(path);

/**
 * For every *.txt file next to this script: sorts the children of
 * #js_history_list by ascending msgid and writes one Markdown link
 * "[date_title](link)" per entry marked "原创" with data-type "APPMSG" to
 * "<name>_link_正序.md".
 *
 * Changes over the previous version:
 *  - an empty history list is detected through `.length`; a Cheerio
 *    selection is always truthy, so `!js_history_list` never skipped a file;
 *  - time and title are only read for entries that will be written, and an
 *    entry lacking either is skipped instead of crashing on `null.split`.
 */
for (const file of fileList) {
  console.log('---------' + file + '----------');
  if (!/\.(txt)$/.test(file)) {
    continue;
  }

  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);

  const js_history_list = $('#js_history_list').children();
  if (js_history_list.length === 0) {
    continue;
  }

  const writeStream = fs.createWriteStream(
    path + '/' + file.split('.')[0] + '_link_正序.md',
    'utf-8'
  );

  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });

  for (const ele of history_list) {
    const $1 = cheerio.load(ele);
    const isOrigin = $1('#copyright_logo').html();
    const type = $1('h4').parent().attr('data-type');
    if (isOrigin !== '原创' || type !== 'APPMSG') {
      continue;
    }

    const link = $1('h4').attr('hrefs');
    const timeHtml = $1('p.weui_media_extra_info').html();
    const titleHtml = $1('h4').html();
    if (timeHtml === null || titleHtml === null) {
      continue;
    }

    let time = timeHtml.split('<span')[0].trim();
    time = time.replace('年', '/').replace('月', '/').replace('日', '');
    time = new Date(time).format('yyyy年MM月dd日');

    let title = titleHtml.split('</span>')[1];
    if (title !== undefined) {
      title = title.trim();
    }
    console.log(time + '_' + title);

    writeStream.write(`[${time}_${title}](${link})`);
    writeStream.write('\n');
  }
  writeStream.end();
}
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
| |
/**
 * Formats this date according to a pattern string.
 *
 * Tokens: y (year), M (month), d (day of month), h (hours), m (minutes),
 * s (seconds), q (quarter), S (milliseconds). Only the first run of each
 * token is replaced. A run of n "y" becomes String(year).slice(4 - n). For
 * the other tokens a single letter becomes the bare number and a longer run
 * becomes the number zero-padded to two digits; "S" is always matched one
 * letter at a time.
 *
 * Compared with the previous version this no longer depends on the legacy
 * RegExp.$1 static or on String.prototype.substr, and the method is defined
 * as non-enumerable so it does not show up in for...in over dates.
 *
 * @param {string} fmt pattern, e.g. 'yyyy年MM月dd日'
 * @returns {string} the pattern with its tokens substituted
 */
Object.defineProperty(Date.prototype, 'format', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: function (fmt) {
    const date = this;
    // Order matters only in that it mirrors the historical replacement order.
    const tokens = [
      { pattern: /M+/, value: date.getMonth() + 1 },
      { pattern: /d+/, value: date.getDate() },
      { pattern: /h+/, value: date.getHours() },
      { pattern: /m+/, value: date.getMinutes() },
      { pattern: /s+/, value: date.getSeconds() },
      { pattern: /q+/, value: Math.floor((date.getMonth() + 3) / 3) },
      { pattern: /S/, value: date.getMilliseconds() },
    ];

    let out = fmt.replace(/y+/, (run) =>
      String(date.getFullYear()).slice(4 - run.length)
    );
    for (const token of tokens) {
      const digits = String(token.value);
      out = out.replace(token.pattern, (run) =>
        run.length === 1 ? digits : ('00' + digits).slice(digits.length)
      );
    }
    return out;
  },
});
| |
| |
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息\\test3`;

const fileList = fs.readdirSync(path);

const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);

/**
 * Merges all *.txt history dumps in `path` into one Markdown file: a
 * "# <file name>" heading per input file followed by one
 * "[date_title](link)" line for every message marked "原创" with data-type
 * "APPMSG", cards ordered by ascending msgid.
 *
 * Changes over the previous version:
 *  - an empty history list is detected through `.length` (a Cheerio
 *    selection is always truthy, so the old check never fired);
 *  - a card without a date header no longer crashes the run;
 *  - the title is only derived for messages that are written, so messages
 *    without an h4 no longer crash on `null.trim()`, and an h4 without
 *    "</span>" falls back to its whole content.
 */
for (const file of fileList) {
  console.log('---------' + file + '----------');
  if (!/\.(txt)$/.test(file)) {
    continue;
  }

  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);

  const js_history_list = $('#js_history_list').children();
  if (js_history_list.length === 0) {
    continue;
  }

  writeStream.write(`# ${file.split('.')[0]}\n`);

  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });

  for (const ele of history_list) {
    const $1 = cheerio.load(ele);

    const rawTime = $1('.weui_msg_card_hd:first-child').html();
    const time = rawTime === null ? '' : timeConvert(rawTime);

    for (const msg of $1('.weui_msg_card_bd').children()) {
      const $2 = cheerio.load(msg);
      const type = $2('h4').parent().attr('data-type');
      const isOrigin = $2('#copyright_logo').html();
      if (isOrigin !== '原创' || type !== 'APPMSG') {
        continue;
      }

      const titleHtml = $2('h4').html() ?? '';
      const title = (titleHtml.split('</span>')[1] ?? titleHtml).trim();
      const link = $2('h4').attr('hrefs');

      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();
| |
| |
/**
 * Normalises a Chinese date such as "2021年3月4日" to zero-padded form
 * ("2021年03月04日").
 *
 * The year/month/day are extracted with a regular expression and passed to
 * the numeric Date constructor, so the result does not depend on how the
 * engine parses non-ISO date strings. Text that does not contain a
 * "YYYY年M月D日" date goes through the previous conversion
 * (Date string parsing + Date.prototype.format).
 *
 * @param {string|null|undefined} time raw date text
 * @returns {string} the normalised date, or '' when `time` is not a string
 */
function timeConvert(time) {
  if (typeof time !== 'string') {
    return '';
  }
  const parts = /(\d{4})年(\d{1,2})月(\d{1,2})日/.exec(time);
  if (parts === null) {
    const slashed = time.replace('年', '/').replace('月', '/').replace('日', '');
    return new Date(slashed).format('yyyy年MM月dd日');
  }
  // Going through Date keeps the historical roll-over of out-of-range days.
  const date = new Date(Number(parts[1]), Number(parts[2]) - 1, Number(parts[3]));
  const pad = (n) => String(n).padStart(2, '0');
  return `${date.getFullYear()}年${pad(date.getMonth() + 1)}月${pad(date.getDate())}日`;
}
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
| |
/**
 * Formats this date according to a pattern string.
 *
 * Tokens: y (year), M (month), d (day of month), h (hours), m (minutes),
 * s (seconds), q (quarter), S (milliseconds). Only the first run of each
 * token is replaced. A run of n "y" becomes String(year).slice(4 - n). For
 * the other tokens a single letter becomes the bare number and a longer run
 * becomes the number zero-padded to two digits; "S" is always matched one
 * letter at a time.
 *
 * Compared with the previous version this no longer depends on the legacy
 * RegExp.$1 static or on String.prototype.substr, and the method is defined
 * as non-enumerable so it does not show up in for...in over dates.
 *
 * @param {string} fmt pattern, e.g. 'yyyy年MM月dd日'
 * @returns {string} the pattern with its tokens substituted
 */
Object.defineProperty(Date.prototype, 'format', {
  configurable: true,
  writable: true,
  enumerable: false,
  value: function (fmt) {
    const date = this;
    // Order matters only in that it mirrors the historical replacement order.
    const tokens = [
      { pattern: /M+/, value: date.getMonth() + 1 },
      { pattern: /d+/, value: date.getDate() },
      { pattern: /h+/, value: date.getHours() },
      { pattern: /m+/, value: date.getMinutes() },
      { pattern: /s+/, value: date.getSeconds() },
      { pattern: /q+/, value: Math.floor((date.getMonth() + 3) / 3) },
      { pattern: /S/, value: date.getMilliseconds() },
    ];

    let out = fmt.replace(/y+/, (run) =>
      String(date.getFullYear()).slice(4 - run.length)
    );
    for (const token of tokens) {
      const digits = String(token.value);
      out = out.replace(token.pattern, (run) =>
        run.length === 1 ? digits : ('00' + digits).slice(digits.length)
      );
    }
    return out;
  },
});
| |
let path = __dirname;

const fileList = fs.readdirSync(path);

const writeStream = fs.createWriteStream(
  path + '/' + '原创文章-超链接-时间顺序-合并.md',
  'utf-8'
);

/**
 * Merges all *.txt history dumps next to this script into one Markdown file:
 * a "# <file name>" heading per input file followed by one
 * "[date_title](link)" line for every "APPMSG" message marked "原创", cards
 * ordered by ascending msgid.
 *
 * Changes over the previous version:
 *  - an empty history list is detected through `.length` (a Cheerio
 *    selection is always truthy, so the old check never fired);
 *  - a card without a date header no longer crashes the run;
 *  - an h4 whose content has no "</span>" falls back to its whole content
 *    instead of crashing on `undefined.trim()`.
 */
for (const file of fileList) {
  console.log('---------' + file + '----------');
  if (!/\.(txt)$/.test(file)) {
    continue;
  }

  const fileContent = fs.readFileSync(path + '/' + file, 'utf-8');
  const $ = cheerio.load(fileContent);

  const js_history_list = $('#js_history_list').children();
  if (js_history_list.length === 0) {
    continue;
  }

  writeStream.write(`# ${file.split('.')[0]}\n`);

  const history_list = Array.from(js_history_list).sort((a, b) => {
    return a.attribs.msgid - b.attribs.msgid;
  });

  for (const ele of history_list) {
    const $1 = cheerio.load(ele);

    const rawTime = $1('.weui_msg_card_hd:first-child').html();
    const time = rawTime === null ? '' : timeConvert(rawTime);

    for (const msg of $1('.weui_msg_card_bd').children()) {
      const $2 = cheerio.load(msg);
      if ($2('h4').parent().attr('data-type') !== 'APPMSG') {
        continue;
      }
      if ($2('#copyright_logo').html() !== '原创') {
        continue;
      }

      const titleHtml = $2('h4').html() ?? '';
      const title = (titleHtml.split('</span>')[1] ?? titleHtml).trim();
      const link = $2('h4').attr('hrefs');

      console.log(time + '_' + title);
      writeStream.write(`[${time}_${title}](${link}) `);
      writeStream.write('\n');
    }
  }
}
writeStream.end();
| |
| |
/**
 * Normalises a Chinese date such as "2021年3月4日" to zero-padded form
 * ("2021年03月04日").
 *
 * The year/month/day are extracted with a regular expression and passed to
 * the numeric Date constructor, so the result does not depend on how the
 * engine parses non-ISO date strings. Text that does not contain a
 * "YYYY年M月D日" date goes through the previous conversion
 * (Date string parsing + Date.prototype.format).
 *
 * @param {string|null|undefined} time raw date text
 * @returns {string} the normalised date, or '' when `time` is not a string
 */
function timeConvert(time) {
  if (typeof time !== 'string') {
    return '';
  }
  const parts = /(\d{4})年(\d{1,2})月(\d{1,2})日/.exec(time);
  if (parts === null) {
    const slashed = time.replace('年', '/').replace('月', '/').replace('日', '');
    return new Date(slashed).format('yyyy年MM月dd日');
  }
  // Going through Date keeps the historical roll-over of out-of-range days.
  const date = new Date(Number(parts[1]), Number(parts[2]) - 1, Number(parts[3]));
  const pad = (n) => String(n).padStart(2, '0');
  return `${date.getFullYear()}年${pad(date.getMonth() + 1)}月${pad(date.getDate())}日`;
}
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `./html`;

const fileList = fs.readdirSync(path);

// For each entry whose second dot-separated name segment is "txt", writes the
// `hrefs` attribute of every h4 under #js_history_list (empty line when the
// attribute is absent) to a same-named file in the 目录 sub-directory.
fileList.forEach((file) => {
  console.log(file);
  if (file.split('.')[1] !== 'txt') {
    return;
  }
  const $ = cheerio.load(fs.readFileSync(path + '/' + file, 'utf8'));
  const out = fs.createWriteStream(path + '/目录/' + file, 'utf-8');

  $('#js_history_list h4').each((position, heading) => {
    console.log('--------' + position);
    out.write($(heading).attr('hrefs') ?? '');
    out.write('\n');
  });
  out.end();
});
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `./html`;
// Rewrites, in place, every entry whose second dot-separated name segment is
// "txt": <script> and <link> elements are removed and the inner HTML of the
// <html> element is written back.
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] !== 'txt') {
      return;
    }
    const target = path + '/' + file;
    fs.readFile(target, 'utf-8', (err, data) => {
      const $ = cheerio.load(data);

      const out = fs.createWriteStream(target, 'utf-8');
      ['script', 'link'].forEach((tag) => $(tag).remove());
      out.write($('html').html());
      out.end();
    });
  });
});
| |
| const fs = require("fs"); |
| const cheerio = require("cheerio"); |
// Input directory and a synchronous listing of its entries; nothing else in
// this fragment consumes the listing.
let path = './html';
const fileList = fs.readdirSync(path);
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `./html`;

/**
 * For every *.html file in `path`, writes the href of each link under
 * #js_content, one per line, to a same-named file in the 目录 sub-directory.
 *
 * Changes over the previous version: read errors are reported, the output
 * directory is created when missing, the extension test no longer breaks on
 * names containing extra dots, and anchors without an href produce an empty
 * line instead of crashing stream.write() with undefined.
 */
fs.readdir(path, function (err, files) {
  if (err) {
    console.error(err);
    return;
  }
  // Local, destructured require: `path` already names the input directory.
  const { extname } = require('path');
  // createWriteStream does not create missing directories.
  fs.mkdirSync(path + '/目录/', { recursive: true });

  files.forEach((file) => {
    console.log(file);
    if (extname(file) !== '.html') {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        console.error(err);
        return;
      }
      const $ = cheerio.load(data);
      const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf-8');
      $('#js_content a').each((index, ele) => {
        writeStream.write($(ele).attr('href') ?? '');
        writeStream.write('\n');
      });
      writeStream.end();
    });
  });
});
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `./html`;

// Local, destructured require: `path` already names the input directory.
const { extname } = require('path');

const fileList = fs.readdirSync(path);

// createWriteStream does not create missing directories.
fs.mkdirSync(path + '/目录/', { recursive: true });

/**
 * For every *.txt file in `path`, writes the href of each link under
 * #js_content, one per line, to a same-named file in the 目录 sub-directory.
 *
 * Anchors without an href now produce an empty line (stream.write() throws
 * on undefined), and the extension test no longer breaks on names that
 * contain extra dots.
 */
for (const file of fileList) {
  if (extname(file) !== '.txt') {
    continue;
  }

  const fileContent = fs.readFileSync(path + '/' + file, 'utf8');
  const $ = cheerio.load(fileContent);
  const writeStream = fs.createWriteStream(path + '/目录/' + file, 'utf8');

  $('#js_content a').each((index, ele) => {
    writeStream.write($(ele).attr('href') ?? '');
    writeStream.write('\n');
  });

  writeStream.end();
}
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| |
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `./html`;
/**
 * For every *.txt file in `path`, collects the blog entries matched by
 * '#pl-home-bloglist > article > ul>li' in document order and writes them as
 * Markdown links ("[title](url)" followed by a blank line) to
 * 目录/新浪博客目录.txt.
 *
 * Changes over the previous version: read errors are reported, the output
 * directory is created when missing, the extension test no longer breaks on
 * names containing extra dots, and a missing title/url is written as an
 * empty string (stream.write() throws on null and undefined).
 */
fs.readdir(path, function (err, files) {
  if (err) {
    console.error(err);
    return;
  }
  // Local, destructured require: `path` already names the input directory.
  const { extname } = require('path');
  // createWriteStream does not create missing directories.
  fs.mkdirSync(path + '/目录/', { recursive: true });

  files.forEach((file) => {
    console.log(file);
    if (extname(file) !== '.txt') {
      return;
    }
    fs.readFile(path + '/' + file, 'utf-8', (err, data) => {
      if (err) {
        console.error(err);
        return;
      }
      const $ = cheerio.load(data);
      const writeStream = fs.createWriteStream(
        path + '/目录/' + '新浪博客目录.txt',
        'utf-8'
      );
      $('#pl-home-bloglist > article > ul>li').each((position, ele) => {
        // .html() yields null and .attr() undefined when nothing matches.
        const title = $(ele).find('h2').html() ?? '';
        const url = $(ele).find('a').attr('data-link') ?? '';
        writeStream.write(`[${title}](${url})\n\n`);
        console.log(title);
        console.log(url);
      });
      writeStream.end();
    });
  });
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\filter_html.js
| const fs = require("fs"); |
| const jsdom = require("jsdom"); |
| const { JSDOM } = jsdom; |
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
// Rewrites, in place, every entry whose second dot-separated name segment is
// "html": the elements matched by the selectors below are removed (in this
// order) and the inner HTML of <html> is written back.
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] !== "html") {
      return;
    }
    const target = path + "/" + file;
    fs.readFile(target, "utf-8", (err, data) => {
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      const out = fs.createWriteStream(target, "utf-8");

      const unwanted = [
        "span:contains('***')",
        "span:contains('--- TBC ---')",
        "span:contains('支持原创翻译')",
        "span:contains('节选自室利·萨马塔·罗摩达斯')",
        "a:contains('阅读全文')",
        "p:contains('因此,在《给弟子的忠告》')",
        "#activity-name",
        "section",
        "h3",
        "#js_tags",
        "img",
        "script",
        "div#meta_content",
        "div.comment",
      ];
      for (const selector of unwanted) {
        $(selector).remove();
      }

      out.write($("html").html());
      out.end();
    });
  });
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio.js
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `E:\\公众号文章采集\\fi_filter_过滤器\\公众号历史消息`;

const fileList = fs.readdirSync(path);

// Rewrites, in place, every entry whose second dot-separated name segment is
// "html": <script> and <link> elements are removed and the inner HTML of
// <html> is written back.
fileList.forEach((file) => {
  if (file.split('.')[1] !== 'html') {
    return;
  }
  const target = path + '/' + file;
  const $ = cheerio.load(fs.readFileSync(target, 'utf-8'));
  const out = fs.createWriteStream(target, 'utf-8');

  ['script', 'link'].forEach((tag) => $(tag).remove());

  out.write($('html').html());
  out.end();
});
| |
/**
 * Removes every <script> element from the loaded document.
 *
 * @param $ Cheerio-style query function bound to the document to clean
 */
function filterContent($) {
  const scripts = $('script');
  scripts.remove();
}
| |
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_听心坊.js
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `E:\\公众号文章采集\\公众号HTML\\听心坊\\`;

const fileList = fs.readdirSync(path);

// For every entry whose second dot-separated name segment is "html": takes
// the text of the spans containing '【明亮说', keeps what follows the first
// '·' (up to the next one) and writes the inner HTML of <html> to a new file
// whose name is that fragment followed by the original file name.
fileList.forEach((file) => {
  console.log(file);
  if (file.split('.')[1] !== 'html') {
    return;
  }
  const $ = cheerio.load(fs.readFileSync(path + '/' + file, 'utf-8'));

  const idx = $("span:contains('【明亮说')").text().split('·')[1];
  console.log(idx);

  const out = fs.createWriteStream(path + '/' + idx + file, 'utf-8');
  out.write($('html').html());
  out.end();
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_旭然之光.js
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `E:\\公众号文章采集\\公众号HTML\\煦然之光`;
// Rewrites, in place, every entry whose second dot-separated name segment is
// "html" with the inner HTML of its <html> element; no elements are removed.
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split('.')[1] !== 'html') {
      return;
    }
    const target = path + '/' + file;
    fs.readFile(target, 'utf-8', (err, data) => {
      const $ = cheerio.load(data);
      const out = fs.createWriteStream(target, 'utf-8');
      const innerHtml = $('html').html();
      out.write(innerHtml);
      out.end();
    });
  });
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_cheerio_阿知事业林.js
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;

const fileList = fs.readdirSync(path);

// Rewrites, in place, every entry whose second dot-separated name segment is
// "html": the elements matched by the selectors below are removed (in this
// order) and the inner HTML of <html> is written back.
fileList.forEach((file) => {
  console.log(file);
  if (file.split('.')[1] !== 'html') {
    return;
  }
  const target = path + '/' + file;
  const $ = cheerio.load(fs.readFileSync(target, 'utf8'));
  const out = fs.createWriteStream(target, 'utf-8');

  const unwanted = [
    'img',
    'script',
    '#meta_content',
    '.comment',
    '#js_tags',
    '#js_sponsor_ad_area',
    'a',
  ];
  for (const selector of unwanted) {
    $(selector).remove();
  }

  out.write($('html').html());
  out.end();
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\filter_html_不死甘露.js
| const fs = require("fs"); |
| const jsdom = require("jsdom"); |
| const { JSDOM } = jsdom; |
let path = `E:\\公众号文章采集\\公众号HTML\\灵智宝鬘`;
// Rewrites, in place, every entry whose second dot-separated name segment is
// "html": the elements matched by the selectors below are removed (in this
// order) and the inner HTML of <html> is written back.
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] !== "html") {
      return;
    }
    const target = path + "/" + file;
    fs.readFile(target, "utf-8", (err, data) => {
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      const out = fs.createWriteStream(target, "utf-8");

      const unwanted = [
        "span:contains('***')",
        "span:contains('--- TBC ---')",
        "span:contains('支持原创翻译')",
        "a:contains('阅读全文')",
        "strong:contains('不死甘露')",
        "strong:contains('关于永恒的开示录')",
        "strong:contains('THE NECTAR OF IMMORTALITY')",
        "span:contains('室利·尼萨迦达塔·马哈拉吉 著')",
        "span:contains('灵智宝鬘翻译团队 中译')",
        "p:contains('喜欢作者')",
        "p:contains('——')",
        "#activity-name",
        "section",
        "h3",
        "#js_tags",
        "img",
        "script",
        "div#meta_content",
        "div.comment",
      ];
      for (const selector of unwanted) {
        $(selector).remove();
      }

      out.write($("html").html());
      out.end();
    });
  });
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\html2txt.js
| const fs = require("fs"); |
| const jsdom = require("jsdom"); |
| const { JSDOM } = jsdom; |
let path = `E:\\公众号文章采集\\公众号HTML\\养猫学习`;

// For every entry whose second dot-separated name segment is "html", writes
// the text of #activity-name followed by the text of #js_content to a .txt
// file named after the first name segment.
fs.readdir(path, function (err, files) {
  files.forEach((file) => {
    console.log(file);
    if (file.split(".")[1] !== "html") {
      return;
    }
    fs.readFile(path + "/" + file, "utf-8", (err, data) => {
      const { window } = new JSDOM(data);
      const $ = require("jQuery")(window);
      const target = path + "/" + file.split(".")[0] + ".txt";
      const out = fs.createWriteStream(target, "utf-8");

      for (const selector of ["#activity-name", "#js_content"]) {
        out.write($(selector).text());
      }
      out.end();
    });
  });
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\html2txt_cheerio.js
| const fs = require('fs'); |
| const cheerio = require('cheerio'); |
| |
let path = `E:\\公众号文章采集\\公众号HTML\\阿知事业林`;

const fileList = fs.readdirSync(path);

// For every entry whose second dot-separated name segment is "html", writes
// the text of <body> followed by the marker "endendend" to a .txt file named
// after the first name segment.
fileList.forEach((file) => {
  console.log(file);
  if (file.split('.')[1] !== 'html') {
    return;
  }
  const $ = cheerio.load(fs.readFileSync(path + '/' + file, 'utf8'));
  const target = path + '/' + file.split('.')[0] + '.txt';
  const out = fs.createWriteStream(target, 'utf-8');

  out.write($('body').text());
  out.write('endendend');

  out.end();
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\任意路径文件写入.js
| const fs = require('fs'); |
| |
/**
 * Writes `buffer` to `path`, creating any missing parent directories first.
 *
 * The parent directory is everything before the last path separator ('/',
 * and additionally '\\' on Windows). When the path has no directory part, or
 * the file sits directly under the root, no directory is created; previously
 * such paths produced an empty directory name, mkdir failed and the file was
 * never written.
 *
 * @param {string} path destination file path
 * @param {string|Buffer} buffer data to write
 * @param {(err: Error|null) => void} callback called with null on success or
 *   with the mkdir/writeFile error
 */
const writeFileRecursive = function (path, buffer, callback) {
  const lastSlash = path.lastIndexOf('/');
  const cut =
    process.platform === 'win32'
      ? Math.max(lastSlash, path.lastIndexOf('\\'))
      : lastSlash;

  const write = () => {
    fs.writeFile(path, buffer, function (err) {
      if (err) return callback(err);
      return callback(null);
    });
  };

  if (cut <= 0) {
    // Bare file name or file directly under the root: nothing to create.
    return write();
  }

  fs.mkdir(path.substring(0, cut), { recursive: true }, (err) => {
    if (err) return callback(err);
    write();
  });
};
| |
// Demo call: writes "hello" to ./public/test/test.txt, creating the folders.
const buffer = 'hello';
writeFileRecursive('./public/test/test.txt', buffer, (err) => {
  if (err) {
    // Stop here: previously "write success" was logged after a failure too.
    console.error(err);
    return;
  }
  console.info('write success');
});
| |
E:\公众号文章采集\fi_filter_过滤器\src\crawler\crawler.ts
| import superagent from "superagent"; |
| import { load, CheerioAPI } from "cheerio"; |
| |
| import { log } from "console"; |
| import { createWriteStream } from "fs"; |
| |
/**
 * Downloads one page with superagent and exposes a few Cheerio queries on it.
 *
 * Usage: `setUrl(url)`, then `await init()`, then any of the getters/`save`.
 * Calling a getter or `save` before `init()` has resolved throws.
 */
export default class Crawler {
  private url = ``;
  // Undefined until init() has loaded a document.
  private $?: CheerioAPI;

  constructor() {}

  /** Sets the address that `init()` will fetch. */
  setUrl(url: string) {
    this.url = url;
  }

  /** Fetches the configured URL and parses the response text with Cheerio. */
  async init() {
    const res = await superagent.get(this.url);
    this.$ = load(res.text);
  }

  /** Returns the loaded document or throws when `init()` has not run yet. */
  private dom(): CheerioAPI {
    if (!this.$) {
      throw new Error("Crawler.init() must complete before the page is queried");
    }
    return this.$;
  }

  /**
   * Writes the loaded document, serialised to HTML, to `path`.
   * (Previously the Cheerio selection object itself was passed to
   * `write()`, which only accepts strings and binary chunks.)
   */
  save(path: string) {
    const writeStream = createWriteStream(path, "utf-8");
    writeStream.write(this.dom().html());
    writeStream.end();
  }

  /** Logs and returns the text of `#activity-name`. */
  getTitle() {
    const title = this.dom()("#activity-name").text();
    log(title);
    return title;
  }

  /**
   * Logs and returns the text of the script element(s) containing
   * `function htmlDecode(str)`. No date is extracted from that text here.
   */
  getTime() {
    const scriptText = this.dom()(
      "script:contains('function htmlDecode(str)')"
    ).text();
    log(scriptText);
    return scriptText;
  }

  /** Returns the text of the spans containing "萨特桑指出". */
  getContent() {
    const quotes = this.dom()("span:contains('萨特桑指出')");
    return quotes.text();
  }
}
| |
E:\公众号文章采集\fi_filter_过滤器\src\crawler\index.ts
| import { log } from "console"; |
| import Crawler from "./crawler"; |
| |
const crawler = new Crawler();

/**
 * Fetches the sample article and logs the script text that `getTime()` finds.
 * Wrapped in a function so the script does not depend on top-level await and
 * so a failed request is reported instead of surfacing as an unhandled
 * rejection.
 */
async function main(): Promise<void> {
  crawler.setUrl("https://mp.weixin.qq.com/s/EgZhFJTzsgfYzZZ-4_SI4Q");
  await crawler.init();
  crawler.getTime();
}

main().catch((e: unknown) => {
  log(e);
  process.exitCode = 1;
});
| |
| |
| |
| |
| |
| |
| |
E:\公众号文章采集\fi_filter_过滤器\src\filter\01_灵智宝鬘_话题_尼萨迦达塔.ts
| import { log } from "console"; |
| import { |
| readFileSync, |
| readdirSync, |
| lstatSync, |
| createWriteStream, |
| mkdirSync, |
| statSync, |
| } from "fs"; |
| import { basename, extname, join, resolve } from "path"; |
| import { load, CheerioAPI } from "cheerio"; |
| |
| |
| |
| |
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘";
const outPath = join(basePath, "out");

// Make sure the output folder exists; a failure is only logged.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

const fileList = readdirSync(basePath);

// Absolute paths of the regular files in basePath that end in ".html".
const pureFilePathList = fileList
  .filter((fileName) => lstatSync(join(basePath, fileName)).isFile())
  .filter((fileName) => extname(fileName) === ".html")
  .map((fileName) => join(basePath, fileName));

// Filter each document and write the inner HTML of <html> to the out folder.
pureFilePathList.forEach((filePath) => {
  const $: CheerioAPI = loadHtmlDom(filePath);
  filterDom($);
  const sink = createWriteStream(getOutFilePath(filePath), "utf-8");
  sink.write($("html").html());
  sink.end();
});
| |
| |
| |
/**
 * Removes, in the listed order, every element matched by a fixed set of
 * selectors from the loaded document (tag area, translator credit, centred
 * title paragraphs, grey author line, red-coloured spans/strongs, comments).
 *
 * @param $ document to clean; it is modified in place
 */
function filterDom($: CheerioAPI) {
  const unwanted = [
    "#js_tags",
    "span:contains('灵智宝鬘翻译团队 中译')",
    "p[style*='white-space: normal;text-align: center;']:contains('我是那')",
    "p[style*='white-space: normal;text-align: center;']:contains('室利·尼萨迦达塔·马哈拉吉的开示录')",
    "span[style*='color: rgb(136, 136, 136)']:contains('室利·尼萨迦达塔·马哈拉吉 著')",
    "span[style*='color: rgb(255, 76, 65)']",
    "strong[style*='color: rgb(255, 76, 65)']",
    "div.comment",
  ];
  for (const selector of unwanted) {
    $(selector).remove();
  }
}
| |
/**
 * Reads `filePath` as UTF-8 text and parses it with Cheerio.
 *
 * @param filePath file to read
 * @returns the Cheerio API bound to the parsed document
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
| |
/**
 * Collects the non-empty `data-jump_url` attributes of the direct `div`
 * children of `#js_articles`, in document order.
 *
 * @param $ loaded document
 * @returns the URLs found; empty when nothing matches
 */
function extractLink($: CheerioAPI) {
  const urls: string[] = [];
  const cards = $("#js_articles > div");
  if (cards.length === 0) return urls;

  for (const card of cards.toArray()) {
    const jumpUrl = $(card).attr("data-jump_url");
    if (jumpUrl) {
      urls.push(jumpUrl);
    }
  }
  return urls;
}
| |
/**
 * Ensures that the directory `absPath` exists, creating it and any missing
 * parents.
 *
 * `mkdirSync` with `recursive: true` succeeds when the directory is already
 * there, so no prior `statSync` is needed. Unlike the stat-then-create
 * version, a path that exists but is not a directory is no longer silently
 * accepted: the call throws, which callers already catch and log.
 *
 * @param absPath directory to create when missing
 * @throws when the directory cannot be created
 */
function exitsFolder(absPath: string) {
  mkdirSync(absPath, { recursive: true });
}
| |
/**
 * Returns the current local date and time as
 * "<year>年<MM>月<DD>日<h>时<m>分<s>秒". Month and day are zero-padded to two
 * digits; hour, minute and second are not padded.
 */
function getCurDate() {
  const now = new Date();
  const twoDigits = (value: number) => ("0" + value).slice(-2);

  return [
    now.getFullYear(),
    "年",
    twoDigits(now.getMonth() + 1),
    "月",
    twoDigits(now.getDate()),
    "日",
    now.getHours(),
    "时",
    now.getMinutes(),
    "分",
    now.getSeconds(),
    "秒",
  ].join("");
}
| |
/**
 * Maps an input file to the file of the same name inside `outPath`.
 *
 * @param filePath path of the input file
 * @returns `outPath` joined with the input's base name
 */
function getOutFilePath(filePath: string) {
  const fileName = basename(filePath);
  return join(outPath, fileName);
}
| |
E:\公众号文章采集\fi_filter_过滤器\src\topic\01_非推送_链接_一行一个.ts
| import { log } from "console"; |
| import { |
| readFileSync, |
| readdirSync, |
| lstatSync, |
| createWriteStream, |
| mkdirSync, |
| statSync, |
| } from "fs"; |
| import { basename, extname, join, resolve } from "path"; |
| import { load, CheerioAPI } from "cheerio"; |
| |
| |
const basePath = process.cwd();
const outPath = join(basePath, "out");

// Make sure the output folder exists; a failure is only logged.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

const fileList = readdirSync(basePath);

// Absolute paths of the regular files in the working directory that end in
// ".txt" or ".html".
const pureFilePathList = fileList
  .filter((fileName) => lstatSync(join(basePath, fileName)).isFile())
  .filter((fileName) => [".txt", ".html"].includes(extname(fileName)))
  .map((fileName) => join(basePath, fileName));

for (const filePath of pureFilePathList) {
  extractTopic(filePath);
}
| |
/**
 * Extracts the links of one saved page and writes them, one per line, to the
 * file returned by `getOutFilePath`. The file is created even when no link
 * is found.
 *
 * @param filePath saved page to process
 */
function extractTopic(filePath: string) {
  const links = extractLink(loadHtmlDom(filePath));
  const sink = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const link of links) {
    sink.write(link);
    sink.write("\n");
  }
  sink.end();
}
| |
/**
 * Reads `filePath` as UTF-8 text and parses it with Cheerio.
 *
 * @param filePath file to read
 * @returns the Cheerio API bound to the parsed document
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
| |
/**
 * Collects the `data-link` attribute of every `li` in the album list, in
 * document order. Items without the attribute (or with an empty one)
 * contribute an empty string, so the result has one entry per list item.
 *
 * @param $ loaded document
 * @returns one string per matched `li`
 */
function extractLink($: CheerioAPI) {
  const items = $(
    "#js_content_overlay > div.album.js_album_container.album-rich_media_area_primary_full > div > div.album__content.js_album_bd > ul li"
  );

  const urls: string[] = [];
  for (const item of items.toArray()) {
    urls.push($(item).attr("data-link") || "");
  }
  return urls;
}
| |
/**
 * Ensures that the directory `absPath` exists, creating it and any missing
 * parents.
 *
 * `mkdirSync` with `recursive: true` succeeds when the directory is already
 * there, so no prior `statSync` is needed. Unlike the stat-then-create
 * version, a path that exists but is not a directory is no longer silently
 * accepted: the call throws, which callers already catch and log.
 *
 * @param absPath directory to create when missing
 * @throws when the directory cannot be created
 */
function exitsFolder(absPath: string) {
  mkdirSync(absPath, { recursive: true });
}
| |
/**
 * Returns the current local date and time as
 * "<year>年<MM>月<DD>日<h>时<m>分<s>秒". Month and day are zero-padded to two
 * digits; hour, minute and second are not padded.
 */
function getCurDate() {
  const now = new Date();
  const month = `0${now.getMonth() + 1}`.slice(-2);
  const day = `0${now.getDate()}`.slice(-2);

  const datePart = `${now.getFullYear()}年${month}月${day}日`;
  const timePart = `${now.getHours()}时${now.getMinutes()}分${now.getSeconds()}秒`;
  return datePart + timePart;
}
| |
/**
 * Builds the output path for one input file:
 * `<outPath>/<getCurDate()>_目录_<input name without extension>.txt`.
 *
 * Only the final extension is stripped, so "a.b.html" yields "..._a.b.txt";
 * the previous `split(".")[0]` cut such names down to "a".
 *
 * @param filePath path of the input file
 * @returns path of the text file to write inside `outPath`
 */
function getOutFilePath(filePath: string) {
  const stem = basename(filePath, extname(filePath));
  return join(outPath, getCurDate() + "_目录_" + stem + ".txt");
}
| |
E:\公众号文章采集\fi_filter_过滤器\src\topic\02_推送_链接_一行一个.ts
| import { log } from "console"; |
| import { |
| readFileSync, |
| readdirSync, |
| lstatSync, |
| createWriteStream, |
| mkdirSync, |
| statSync, |
| } from "fs"; |
| import { basename, extname, join, resolve } from "path"; |
| import { load, CheerioAPI } from "cheerio"; |
| |
| |
const basePath = process.cwd();
const outPath = join(basePath, "out");

// Make sure the output folder exists; a failure is only logged.
try {
  exitsFolder(outPath);
} catch (e) {
  log(e);
}

const fileList = readdirSync(basePath);

// Absolute paths of the regular files in the working directory that end in
// ".txt" or ".html".
const pureFilePathList = fileList
  .filter((fileName) => lstatSync(join(basePath, fileName)).isFile())
  .filter((fileName) => [".txt", ".html"].includes(extname(fileName)))
  .map((fileName) => join(basePath, fileName));

for (const filePath of pureFilePathList) {
  extractTopic(filePath);
}
| |
/**
 * Extracts the links of one saved page and writes them, one per line, to the
 * file returned by `getOutFilePath`. Pages without links produce no file.
 *
 * @param filePath saved page to process
 */
function extractTopic(filePath: string) {
  const links = extractLink(loadHtmlDom(filePath));
  if (links.length === 0) {
    return;
  }

  const sink = createWriteStream(getOutFilePath(filePath), "utf-8");
  for (const link of links) {
    sink.write(link);
    sink.write("\n");
  }
  sink.end();
}
| |
/**
 * Reads the file at `filePath` as UTF-8 text and parses it with cheerio.
 *
 * @param filePath Path of the file to load.
 * @returns The cheerio handle for the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
| |
/**
 * Collects the `data-jump_url` attribute of every `div` that is a direct
 * child of `#js_articles`, in document order.
 *
 * @param $ Parsed document.
 * @returns The URLs found; elements with a missing or empty attribute are
 *   skipped, and an empty array is returned when nothing matches.
 */
function extractLink($: CheerioAPI) {
  const links: string[] = [];
  for (const node of $("#js_articles > div").toArray()) {
    const jumpUrl = $(node).attr("data-jump_url");
    if (jumpUrl) links.push(jumpUrl);
  }
  return links;
}
| |
/**
 * Ensures that `absPath` exists as a directory, creating missing parent
 * directories along the way.
 *
 * The previous stat-then-create sequence treated any `statSync` error as
 * "directory missing" and did nothing when the path existed as a regular
 * file. A recursive `mkdirSync` is idempotent for existing directories and
 * reports a non-directory at `absPath` as an error instead.
 *
 * @param absPath Directory path to ensure.
 * @throws If the directory cannot be created (for example, `absPath` is an
 *   existing regular file).
 */
function exitsFolder(absPath: string) {
  mkdirSync(absPath, { recursive: true });
}
| |
/**
 * Returns a local-time timestamp in the form `YYYY年MM月DD日HH时mm分ss秒`.
 *
 * Hour, minute and second are now zero-padded like month and day already
 * were, so the result has a fixed width and sorts chronologically when used
 * as a file-name prefix.
 *
 * @param now Moment to format; defaults to the current time.
 * @returns The formatted timestamp.
 */
function getCurDate(now: Date = new Date()): string {
  const twoDigits = (value: number): string => String(value).padStart(2, "0");

  return [
    `${now.getFullYear()}年`,
    `${twoDigits(now.getMonth() + 1)}月`,
    `${twoDigits(now.getDate())}日`,
    `${twoDigits(now.getHours())}时`,
    `${twoDigits(now.getMinutes())}分`,
    `${twoDigits(now.getSeconds())}秒`,
  ].join("");
}
| |
/**
 * Builds the output path for one input file:
 * `<outPath>/<timestamp>_目录_<input name without extension>.txt`.
 *
 * Only the final extension is stripped. Cutting at the first dot, as before,
 * mapped inputs such as `a.b.html` and `a.c.html` to the same output name.
 *
 * @param filePath Path of the input file.
 * @returns Path of the `.txt` file to write inside `outPath`.
 */
function getOutFilePath(filePath: string) {
  const stem = basename(filePath, extname(filePath));
  return join(outPath, `${getCurDate()}_目录_${stem}.txt`);
}
| |
E:\公众号文章采集\fi_filter_过滤器\src\txt\01_合集.ts
| import { log } from "console"; |
| import { |
| readFileSync, |
| readdirSync, |
| lstatSync, |
| createWriteStream, |
| mkdirSync, |
| statSync, |
| } from "fs"; |
| import { basename, extname, join, resolve } from "path"; |
| import { load, CheerioAPI } from "cheerio"; |
| |
| |
| |
| |
// Hard-coded job settings: where the saved articles live, and the base name
// of the merged text file written to the "txt" sub-folder.
const basePath = "E:\\公众号文章采集\\公众号HTML\\灵智宝鬘\\out";
const outFileName = "灵智宝鬘_尼萨迦达塔_我是那";
const outPath = join(basePath, "txt");

try {
  exitsFolder(outPath);
} catch (e) {
  // Failure to prepare the output folder is logged, not fatal.
  log(e);
}

// Absolute paths of the regular .html files directly inside basePath, in the
// order readdirSync reports them.
// NOTE(review): chapter numbering later follows this order — confirm the
// file names sort the way the chapters should appear.
const fileList = readdirSync(basePath);

const pureFilePathList = fileList
  .map((fileName) => join(basePath, fileName))
  .filter(
    (fullPath) =>
      lstatSync(fullPath).isFile() && extname(fullPath) === ".html"
  );
| |
| |
| |
| |
| |
// Merge every article into one text file, one numbered chapter per article.
const outFilePath = getOutFilePath();
const writeStream = createWriteStream(outFilePath, "utf-8");

pureFilePathList.forEach((filePath, index) => {
  const { title, pubDate, content } = extractText(loadHtmlDom(filePath));

  // Chapter layout: blank line, "第N章" heading, title, publish date, body,
  // each terminated by a newline.
  const chapter = ["", `第${index + 1}章`, title, pubDate, content, ""].join(
    "\n"
  );
  writeStream.write(chapter);

  // Progress log uses the zero-based index.
  log(`${index}_${filePath}`);
});
writeStream.end();
| |
| |
| |
/**
 * Loads a saved page: reads `filePath` as UTF-8 and hands the text to
 * cheerio's `load`.
 *
 * @param filePath Path of the HTML file to parse.
 * @returns The cheerio handle for the parsed document.
 */
function loadHtmlDom(filePath: string): CheerioAPI {
  return load(readFileSync(filePath, "utf-8"));
}
| |
/**
 * Creates the directory `absPath` (and any missing parents) if it does not
 * exist yet; does nothing when it is already a directory.
 *
 * Replaces the former `statSync` probe, which swallowed every stat error and
 * silently accepted a regular file sitting at `absPath`. With a recursive
 * `mkdirSync`, that situation is reported as an error instead.
 *
 * @param absPath Directory path to ensure.
 * @throws If the directory cannot be created, e.g. `absPath` is an existing
 *   regular file.
 */
function exitsFolder(absPath: string) {
  mkdirSync(absPath, { recursive: true });
}
| |
/**
 * Path of the merged output file: `<outPath>/<outFileName>.txt`.
 *
 * @returns The output file path.
 */
function getOutFilePath() {
  const mergedFileName = `${outFileName}.txt`;
  return join(outPath, mergedFileName);
}
| |
/**
 * Pulls the text of the title (`#activity-name`), publish date
 * (`#publish_time`) and body (`#js_content`) elements out of a parsed page.
 * The body text is passed through `handleContent` before being returned.
 *
 * @param $ Parsed document.
 * @returns An object with `title`, `pubDate` and the processed `content`.
 */
function extractText($: CheerioAPI) {
  const textOf = (selector: string): string => $(selector).text();

  return {
    title: textOf("#activity-name"),
    pubDate: textOf("#publish_time"),
    content: handleContent(textOf("#js_content")),
  };
}
| |
/**
 * Puts every speaker label in a dialogue transcript on its own line,
 * preceded by a `[p1000]` marker line, and expands the short labels
 * ("尼:", "问:") to their full forms.
 *
 * All occurrences are rewritten. The earlier `String.replace` calls used a
 * string pattern and therefore only touched the first occurrence of each
 * label, leaving the remaining turns of a multi-turn dialogue unmarked.
 *
 * @param content Raw article text.
 * @returns The text with every speaker label rewritten.
 */
function handleContent(content: string) {
  // [label to search for, label to emit]. Full forms come before their short
  // forms, matching the original replacement order.
  const speakerLabels: ReadonlyArray<readonly [string, string]> = [
    ["尼萨迦达塔:", "尼萨迦达塔:"],
    ["尼:", "尼萨迦达塔:"],
    ["提问者:", "提问者:"],
    ["问:", "提问者:"],
  ];

  return speakerLabels.reduce(
    // split/join replaces every literal occurrence and, unlike a replacement
    // string, gives "$" no special meaning.
    (text, [marker, label]) =>
      text.split(marker).join(replaceContent(label, 1000)),
    content
  );
}

/**
 * Builds the replacement block for one speaker label: a newline, a
 * `[p<time>]` marker line, then the label on its own line.
 *
 * @param keyword Speaker label to emit.
 * @param time Number placed inside the `[p…]` marker; defaults to 1000.
 * @returns The replacement text.
 */
function replaceContent(keyword: string, time: number = 1000) {
  return `\n[p${time.toString()}]\n${keyword}\n`;
}
| |
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】凌霞软件回馈社区,博客园 & 1Panel & Halo 联合会员上线
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】博客园社区专享云产品让利特惠,阿里云新客6.5折上折
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· 微软正式发布.NET 10 Preview 1:开启下一代开发框架新篇章
· 没有源码,如何修改代码逻辑?
· PowerShell开发游戏 · 打蜜蜂
· 在鹅厂做java开发是什么体验
· WPF到Web的无缝过渡:英雄联盟客户端的OpenSilver迁移实战