wordsinasentence.com 单词英英翻译 17987个 含获取nodejs代码

wordsinasentence.com 单词英英翻译 17987个
这个网站的英英单词翻译非常不错,汇总成markdown,方便查询

官方在线查询地址
https://wordsinasentence.com/vocabulary-word-list/

由于直接贴上来,页面就崩了,所以改为附件了,本地查阅吧
https://files.cnblogs.com/files/pengchenggang/wordsinasentence-wordsArr.zip?t=1722836509&download=true

wordsinasentence.js

// Scraper for the English-English word definitions on wordsinasentence.com.
//
// The work is split into four steps that are meant to be run one at a time,
// by changing the call at the bottom of this file:
//   step1 - download every vocabulary list page to disk
//   step2 - extract [link, word] pairs from the saved list pages into linkArr.json
//   step3 - download the detail page of every word
//   step4 - extract title + definition from the detail pages into wordsArr.txt
const { getPageListUrl, loadData, getTxt, saveData } = require('../getHtmlUtils.js')
const axios = require("axios")

const startNum = 1 // first list page number
const endNum = 37 // last list page number
const rootPath = 'wordsIn/' // root directory; create it by hand
const folderPath = 'wordsinasentence/' // list pages directory; create it by hand (author's note: call it "list" next time)
const detailPath = 'details/' // detail pages directory; create it by hand

/**
 * Step 1: download all vocabulary list pages into rootPath + folderPath.
 */
const step1 = () => {
  // `$num` is the placeholder that getPageListUrl replaces with the page number.
  const pageListUrl = 'https://wordsinasentence.com/vocabulary-word-list/?_page=$num'
  getPageListUrl(pageListUrl, startNum, endNum, rootPath + folderPath)
}

/**
 * Step 2: read the saved list pages, collect every [link, word] pair found
 * in them and store the combined array as JSON in rootPath + 'linkArr.json'.
 */
const step2 = () => {
  const allArr = []
  for (let i = startNum; i <= endNum; i++) {
    const data = loadData(rootPath + folderPath + i + '.html')
    const txt = dataFilter(data) // keep only the word-list section of the page
    const arr = txtReg(txt) // [link, word] pairs found in that section
    allArr.push(...arr)
  }
  saveData(rootPath + 'linkArr.json', JSON.stringify(allArr))
}

/**
 * Extract every plain `<a href="...">text</a>` anchor from `txt`.
 *
 * @param {string} txt - HTML fragment to scan.
 * @returns {Array<[string, string]>} One [href, trimmed anchor text] pair per anchor.
 */
const txtReg = txt => {
  const regex = /<a href="([^"]+)">([^<]+)<\/a>/gs
  const matches = [...txt.matchAll(regex)]
  return matches.map(match => [match[1], match[2].trim()])
}

/**
 * Cut the word-list section out of a saved list page.
 *
 * @param {string} data - Full HTML of a list page.
 * @returns {string} The text getTxt returns for the list start/end markers.
 */
const dataFilter = data => {
  const mainStartPosition = `<div class="pt-cv-wrapper"><div class="pt-cv-view`
  const mainEndPosition = `<div class="text-left pt-cv-pagination-wrapper"><ul`
  return getTxt(mainStartPosition, mainEndPosition, data)
}

/**
 * Step 3: download the detail pages listed in linkArr.json.
 *
 * @param {number} [lastDoneIndex=499] - Downloading starts at the entry AFTER
 *   this index, so an interrupted run can be resumed. Pass -1 to start from
 *   the first entry.
 */
const step3 = (lastDoneIndex = 499) => {
  const linkArr = JSON.parse(loadData(rootPath + 'linkArr.json'))
  getLinkManager(linkArr, lastDoneIndex)
}

// Highest linkArr index that has been handed to a download chain so far.
// Shared by all chains started in getLinkManager.
let currIndex = 0

/**
 * Start several concurrent download chains over `linkArr`.
 *
 * @param {Array<[string, string]>} linkArr - [link, word] pairs.
 * @param {number} index - Last index that is already done; the first
 *   download is for index + 1.
 * @param {number} [concurrency=20] - Number of chains to start. These are
 *   overlapping asynchronous requests, not threads.
 */
const getLinkManager = (linkArr, index, concurrency = 20) => {
  currIndex = index
  for (let i = 0; i < concurrency; i++) {
    currIndex += 1
    getLink(linkArr, currIndex)
  }
}

/**
 * Download the detail page for linkArr[index], save it as
 * rootPath + detailPath + index + '.html', then continue with the next
 * index that no chain has taken yet. The chain ends once the index runs past
 * the end of `linkArr`.
 *
 * A failed request is logged and the chain moves on, so one bad page no
 * longer kills the chain with an unhandled rejection.
 *
 * @param {Array<[string, string]>} linkArr - [link, word] pairs.
 * @param {number} index - Index of the entry to download.
 */
const getLink = (linkArr, index) => {
  if (index >= linkArr.length) return
  const [link, word] = linkArr[index]
  axios.get(link)
    .then(res => {
      console.info('获取 word:' + word)
      saveData(rootPath + detailPath + index + '.html', res.data)
    })
    .catch(err => {
      console.error('download failed, index ' + index + ' word ' + word + ': ' + err.message)
    })
    .finally(() => {
      // Take the next free index whether this download succeeded or not.
      currIndex += 1
      getLink(linkArr, currIndex)
    })
}

/**
 * Step 4: read the saved detail pages 0..lastIndex, extract each word's
 * title and definition and write them as markdown sections to
 * rootPath + 'wordsArr.txt'.
 *
 * Pages that are missing on disk or that do not contain the expected two
 * paragraphs are reported and skipped instead of aborting the whole export.
 *
 * @param {number} [lastIndex=17986] - Index of the last detail page to read.
 */
const step4 = (lastIndex = 17986) => {
  const sections = []
  for (let i = 0; i <= lastIndex; i++) {
    console.info('i', i)
    let html
    try {
      html = loadData(rootPath + detailPath + i + '.html')
    } catch (err) {
      console.warn('skip index ' + i + ': cannot read detail page: ' + err.message)
      continue
    }
    const parsed = txtRegByStep4(dataFilterByStep4(html))
    if (parsed === null) {
      console.warn('skip index ' + i + ': title/definition paragraphs not found')
      continue
    }
    const [title, content] = parsed
    sections.push(`## ${title} \n${content} \n \n`)
  }
  saveData(rootPath + 'wordsArr.txt', sections.join(''))
}

/**
 * Pull the title and the definition out of a filtered detail page: the inner
 * HTML of the first two `<p>` elements, with 'Definition of ' removed from
 * the first one.
 *
 * @param {string} html2 - Detail page section returned by dataFilterByStep4.
 * @returns {[string, string] | null} [title, definition], or null when fewer
 *   than two `<p>` elements are present.
 */
const txtRegByStep4 = html2 => {
  const regex = /<p[^>]*>(.*?)<\/p>/gs
  const paragraphs = [...html2.matchAll(regex)].map(match => match[1])
  if (paragraphs.length < 2) return null
  return [paragraphs[0].replace('Definition of ', ''), paragraphs[1]]
}

/**
 * Cut the headline/definition section out of a saved detail page.
 *
 * @param {string} data - Full HTML of a detail page.
 * @returns {string} The text getTxt returns for the detail start/end markers.
 */
const dataFilterByStep4 = data => {
  const mainStartPosition = `<!--.headline_area-->`
  const mainEndPosition = `<!–– SENTENCE 1 */ ––>`
  return getTxt(mainStartPosition, mainEndPosition, data)
}

// Currently selected step. Replace with step1(), step2() or step3() to run
// an earlier stage of the pipeline.
step4()

getHtmlUtils.js

const axios = require("axios") const fs = require('fs') exports.getPageListUrl = (url, startNum, endNum, folderPath) => { // let i = startNum for (let i = startNum; i <= endNum; i++) { axios.get(url.replace('$num', i)).then(res => { const html = res.data const filePath = '.\\' + folderPath + i + '.html' saveData(filePath, html) }) } } exports.saveData = (filePath, data) => { fs.writeFile(filePath, data, function () { console.info(filePath + ' 写入完成!') }) } exports.loadData = (filePath) => { return fs.readFileSync(filePath, 'utf8') } exports.getTxt = (mainStartPosition, mainEndPosition, content) => { // const mainStartPosition = '<div class="lacontent">' // const mainEndPosition = '<div class="contextdhall clearfix">' let pos1 = 0, pos2 = 0 pos1 = content.indexOf(mainStartPosition) // + mainStartPosition.length pos2 = content.indexOf(mainEndPosition) // + mainEndPosition.length // console.info('pos1', pos1) // console.info('pos2', pos2) const listContent = content.substring(pos1, pos2) // console.info('listContent', listContent) return listContent }

__EOF__

本文作者:Reciter
本文链接:https://www.cnblogs.com/pengchenggang/p/18343029.html
关于博主:评论和私信会在第一时间回复。或者直接私信我。
版权声明:本博客所有文章除特别声明外,均采用 BY-NC-SA 许可协议。转载请注明出处!
声援博主:如果您觉得文章对您有帮助,可以点击文章右下角推荐一下。您的鼓励是博主的最大动力!
posted @   彭成刚  阅读(123)  评论(0)  编辑  收藏  举报
相关博文:
阅读排行:
· winform 绘制太阳,地球,月球 运作规律
· AI与.NET技术实操系列(五):向量存储与相似性搜索在 .NET 中的实现
· 超详细:普通电脑也行Windows部署deepseek R1训练数据并当服务器共享给他人
· 【硬核科普】Trae如何「偷看」你的代码?零基础破解AI编程运行原理
· 上周热点回顾(3.3-3.9)
历史上的今天:
2022-08-05 vscode 点击 import 的对象 from 带有 @ 不能自动跳转 - 要配置 jsconfig.json
2022-08-05 码上掘金 里面 嵌入自定义组件,可以用做组件demo演示
2021-08-05 腾讯云函数
2020-08-05 AutoTipZen 实时根据文字是否溢出 提示title
点击右上角即可分享
微信分享提示