首图 首都图书馆 国家图书馆 自动查借阅书籍 脚本puppeteer汇总
(15条消息) puppeteer_彭争杰的博客-CSDN博客
api: https://github.com/puppeteer/puppeteer/blob/main/docs/api.md
examples: https://github.com/puppeteer/puppeteer/tree/main/examples
阮一峰:js 异步操作 - JavaScript 教程 - 网道 (wangdoc.com)
Promise 对象 - ECMAScript 6入门 (ruanyifeng.com)
Node: Puppeteer + 图像识别 实现百度指数爬虫 - 掘金 (juejin.cn)
Puppeteer 用来做爬虫太 Low 了!但用在这里很合适! - 掘金 (juejin.cn)
使用puppeteer控制浏览器中视频播放 - 掘金 (juejin.cn)
奶奶都能轻松入门的 Puppeteer 教程 - 掘金 (juejin.cn)
cnpm install puppeteer-core(注意:puppeteer-core 默认不自带浏览器,需要通过 executablePath 指定本机已安装的浏览器)
cnpm install puppeteer -S
国家图书馆 借书查询 国图
const puppeteer = require('puppeteer');

/**
 * National Library of China (opac.nlc.cn) loan checker.
 *
 * Launches a local Chrome build, logs in once per library card and prints
 * the title and due date of every borrowed book.
 */
(async () => {
  // Password typed into the "bor_verification" field for every card below.
  // Declared ahead of visitNationalLibrary so the function can never read it
  // before it is initialised.
  const secret = "808080";

  const OPAC_URL = "http://opac.nlc.cn/F";
  // Link in the "#history" panel whose text is parsed as the loan count.
  const LOAN_COUNT_LINK = '#history > a:nth-child(1) > table > tbody > tr:nth-child(1) > td.td1 > a';
  // Table on the loan page that lists the borrowed books.
  const LOAN_TABLE = '#baseinfo > center > table:nth-child(6)';

  const browser = await puppeteer.launch({
    executablePath: "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe",
    headless: true,
    args: ['--start-maximized'],
    defaultViewport: { width: 1800, height: 1000 },
    slowMo: 0,
  });

  /**
   * Logs in with one library card and prints the books currently on loan.
   *
   * @param {string} card - Library card number typed into the "bor_id" field.
   * @param {string} name - Label printed next to the card number.
   * @returns {Promise<Array<Object>>} One `{ "题名", "应还日期" }` record per
   *   loan; an empty array when there are no loans or when the lookup failed
   *   (the error is logged, not rethrown).
   */
  async function visitNationalLibrary(card, name) {
    const page = await browser.newPage();
    // To see console output produced inside page.evaluate callbacks, attach:
    // page.on('console', msg => console.log('- [Browser Console Log]', msg.text()));
    console.log('---------------------------');
    console.log(card + ' ' + name);

    try {
      // timeout: 0 disables the navigation timeout.
      await page.goto(OPAC_URL, { waitUntil: 'load', timeout: 0 });

      await page.waitForXPath("/html/body/form/center/div/table/tbody/tr/td[2]/input");
      await page.type("input[name=bor_id]", card, { delay: 100 });
      await page.type("input[name=bor_verification]", secret);
      await page.$eval('form[name=form1]', (form) => form.submit());

      await page.waitForXPath('//*[@id="history"]/a[1]/table/tbody/tr[1]/td[2]/a');

      // Only open the loan list when the counter link shows a positive number.
      const hasLoans = await page.evaluate((selector) => {
        const link = document.querySelector(selector);
        return link !== null && Number.parseInt(link.textContent.trim(), 10) > 0;
      }, LOAN_COUNT_LINK);
      if (!hasLoans) {
        console.log("No Book.");
        return [];
      }

      await page.click(LOAN_COUNT_LINK);
      await page.waitForSelector(`${LOAN_TABLE} > tbody > tr.tr1 > th:nth-child(5)`);

      const loans = await page.evaluate((tableSelector) => {
        // tr:nth-child(n+2) skips the first row of the table body.
        const rows = Array.from(
          document.querySelectorAll(`${tableSelector} > tbody > tr:nth-child(n+2)`)
        );
        return rows
          .map((row) => Array.from(row.querySelectorAll('td')))
          .filter((cells) => cells.length > 0)
          .map((cells) => ({
            "题名": cells[3]?.innerText.trim(),
            "应还日期": cells[5]?.innerText.trim(),
          }));
      }, LOAN_TABLE);

      console.log(loans); // the borrowed books
      return loans;
    } catch (err) {
      console.error(err);
      return [];
    } finally {
      // Runs on every path, including the early "No Book." return, so the
      // tab is never left open.
      await page.close();
    }
  }

  try {
    // Card number -> display name.
    // NOTE(review): both placeholder entries use the same key, so the second
    // set() overwrites the first and only "user2" is visited; use distinct
    // card numbers for real data.
    const names = new Map();
    names.set("8888888888888888", "user1");
    names.set("8888888888888888", "user2");

    for (const [card, name] of names) {
      await visitNationalLibrary(card, name);
    }
  } finally {
    // Close the browser even when a visit rejects unexpectedly.
    await browser.close();
  }
})().catch((e) => {
  console.error(e);
});
首都图书馆自动登录查询脚本
需要安装 tesseract.js
cnpm install tesseract.js (用淘宝国内源 cnpm)
const puppeteer = require('puppeteer');
const fs = require('fs');
const Tesseract = require('tesseract.js');

// Alternative browser path: "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
const chrome_path = "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe";

/**
 * Reads the login captcha with OCR and types it into the form, retrying up
 * to five times.
 *
 * @param {object} page - Puppeteer page showing the login form.
 * @returns {Promise<boolean>} true when, after typing the code, no visible
 *   `.error` element is found on the page; false when every attempt failed.
 */
async function handleCaptcha(page) {
  const maxAttempts = 5;
  let loginSuccess = false;

  for (let attempt = 0; attempt < maxAttempts && !loginSuccess; attempt++) {
    // Download the captcha image from inside the page and hand its bytes
    // back to Node as a plain number array.
    const captchaSrc = await page.$eval('#loginform-verifycode-image', (img) => img.src);
    const bytes = await page.evaluate(async (src) => {
      const response = await fetch(src);
      const body = await response.arrayBuffer();
      return Array.from(new Uint8Array(body));
    }, captchaSrc);
    const imageBuffer = Buffer.from(bytes);

    // OCR the image with the English model.
    const { data: { text } } = await Tesseract.recognize(imageBuffer, 'eng', {});
    console.log('识别结果:', text);

    // Whitespace-only output carries no captcha code either, so retry.
    if (text.trim() === '') {
      console.log("识别结果为空,马上重试!");
      continue;
    }

    // Clear whatever a previous attempt typed, then enter the new code.
    await page.waitForXPath("/html/body/div/div[2]/div/form/div[3]/div/div[1]/input");
    await page.$eval("input[id=loginform-verifycode]", (input) => {
      input.value = "";
    });
    // NOTE(review): the raw OCR text is typed unmodified on purpose. Nothing
    // else in this script submits the login form, so a trailing newline in
    // the OCR output presumably acts as the Enter key - confirm before
    // trimming the text here.
    await page.type("input[id=loginform-verifycode]", text, { delay: 10 });

    // Give the page time to react before looking for an error message.
    await page.waitForTimeout(2000);
    const errorElement = await page.$('.error');
    // A missing `.error` element is taken to mean the page has navigated
    // away, i.e. the login went through.
    const rejected = errorElement !== null
      && (await page.evaluate((el) => el.style.display, errorElement)) !== 'none';

    if (!rejected) {
      loginSuccess = true;
      continue;
    }

    console.log('验证码不正确,请刷新后重试');
    // Clicking the image requests a new captcha; wait for it to load.
    await page.click('#loginform-verifycode-image');
    await page.waitForTimeout(2000);
    // Announce the next attempt only when there is one.
    if (attempt + 1 < maxAttempts) {
      console.log(`登录失败,已刷新。正在尝试第 ${attempt + 2} 次...`);
    }
  }

  console.log(`验证码验证结果:`, loginSuccess);
  return loginSuccess;
}

/**
 * Prints the loan table as an array of `{ title, dueDate }` records.
 *
 * @param {object} page - Puppeteer page showing the "#w0" loan table.
 * @returns {Promise<void>}
 */
async function showTable(page) {
  const data = await page.$$eval('#w0 table.table tbody tr', (rows) =>
    rows.map((row) => ({
      // Second column is reported as the title, fourth as the due date.
      title: row.querySelector('td:nth-child(2)')?.innerText ?? null,
      dueDate: row.querySelector('td:nth-child(4)')?.innerText ?? null,
    }))
  );
  console.log(data);
}

//----------------- main ------------------------------------
(async () => {
  const LOGIN_URL = 'https://www.clcn.net.cn/user/auth/login';
  // Password typed for every card.
  const PASSWORD = '123456';
  const VIEW_LOANS_BUTTON = "#borInfo > div.borrows > div > div:nth-child(1) > ul > button";
  const exit_selector = '#container > div.container.user > div > div.userinfo > div > div.col-lg-4.col-md-4.col-sm-4.col-xs-12.userinfo-btn > form > button';

  const browser = await puppeteer.launch({
    executablePath: chrome_path,
    headless: true,
    args: ['--start-maximized', '--no-sandbox', '--disable-setuid-sandbox'],
    defaultViewport: { width: 1800, height: 1000 },
    slowMo: 0,
    timeout: 60000,
  });

  // Set when any card fails so the process can still report a non-zero exit.
  let hadFailure = false;

  /**
   * Logs in with one card, prints its loans and logs out again.
   * Errors are logged and recorded in `hadFailure`; they do not stop the
   * remaining cards from being processed.
   *
   * @param {string} card - Library card number (login user name).
   * @param {string} name - Label printed next to the card number.
   * @returns {Promise<void>}
   */
  async function visit(card, name) {
    const page = await browser.newPage();
    console.log(card + ' ' + name);
    try {
      await page.goto(LOGIN_URL, { waitUntil: 'domcontentloaded', timeout: 10000 });

      await page.waitForXPath("/html/body/div/div[2]/div/form/div[1]/input");
      const userInput = await page.$('input[name="LoginForm[username]"]');
      await userInput.type(card, { delay: 10 });

      await page.waitForXPath("/html/body/div/div[2]/div/form/div[2]/div/input");
      await page.type("input[id=loginform-password]", PASSWORD, { delay: 10 });

      const loginSuccess = await handleCaptcha(page);
      if (!loginSuccess) {
        // Not logged in: there is no loan table to show and no logout button
        // to click.
        hadFailure = true;
        return;
      }

      await page.waitForSelector(VIEW_LOANS_BUTTON);
      await page.click(VIEW_LOANS_BUTTON);
      await page.waitForXPath("/html/body/div/div[2]/div/div[3]/div[2]/div/table");
      await showTable(page);

      // Log out so the next card starts from a clean session.
      await page.click(exit_selector);
    } catch (err) {
      hadFailure = true;
      console.error(err);
    } finally {
      await page.close();
    }
  }

  try {
    // Card number -> display name.
    const names = new Map();
    names.set("88888", "name0");
    names.set("9999", "name1");

    console.log();
    for (const [card, name] of names) {
      await visit(card, name);
    }
  } finally {
    await browser.close();
  }

  if (hadFailure) {
    process.exitCode = 1;
  }
})().catch((err) => {
  console.error(err);
  process.exit(1);
});
https://datacadamia.com/web/dom/innerhtml
首都图书馆的老接口:
const puppeteer = require('puppeteer');
const fs = require('fs');
const Tesseract = require('tesseract.js');

// Alternative browser path: "C:\\Program Files (x86)\\Microsoft\\Edge\\Application\\msedge.exe"
const chrome_path = "C:\\Program Files\\Google\\Chrome Dev\\Application\\chrome.exe";

/**
 * Opens the "my account" view and prints the rows of "#LoansTable".
 *
 * Each printed record is keyed by the text of the 3rd, 5th and 8th header
 * cells and holds the title, due date and location cells of one row.
 *
 * @param {object} page - Puppeteer page of a logged-in session.
 * @returns {Promise<void>}
 */
async function showTable(page) {
  const myprofile = '#exlidMyAccount > a';
  await page.waitForSelector(myprofile);
  await page.click(myprofile);

  const table_fullpath = '#LoansTable > tbody > tr';
  await page.waitForFunction('document.querySelector("#LoansTable > tbody > tr") !== null');

  // IMPORTANT: this callback runs inside the browser page. console.log calls
  // and breakpoints placed in it do not show up on the Node.js side, so
  // problems are reported through the returned object instead.
  const result = await page.$$eval(table_fullpath, (rows) => {
    // Explicit "no data" marker.
    // NOTE(review): this selector matches the markup queried by the newer
    // clcn.net.cn script; confirm it can ever match on this page.
    if (document.querySelector('#w0 table.table tbody tr td div.empty')) {
      return { problem: null, books: [] };
    }

    const headerRow = document.querySelector('#LoansTable > thead > tr');
    if (!headerRow) {
      return { problem: 'Table header not found.', books: [] };
    }
    const headerCells = Array.from(headerRow.querySelectorAll('th'));
    if (headerCells.length === 0) {
      return { problem: 'No header cells found.', books: [] };
    }
    // Header cells 2, 4 and 7 are used as record keys below.
    if (headerCells.length < 8) {
      return { problem: `Expected at least 8 header cells, found ${headerCells.length}.`, books: [] };
    }

    const titleKey = headerCells[2].innerText.trim();
    const dueDateKey = headerCells[4].innerText.trim();
    const locationKey = headerCells[7].innerText.trim();

    const books = rows.map((row) => {
      const title = row.querySelector('[id^="titleSTL"]');
      const location = row.querySelector('[id^="locationSTL"]');
      const dueDate = row.querySelector('[id^="dueDateSTL"]');
      const properties = {};
      properties[titleKey] = title ? title.innerText : null;
      properties[dueDateKey] = dueDate ? dueDate.innerText : null;
      properties[locationKey] = location ? location.innerText : null;
      return properties;
    });
    return { problem: null, books };
  });

  if (result.problem !== null) {
    // A structural problem is not a book; report it on its own.
    console.log(result.problem);
    return;
  }
  if (result.books.length === 0) {
    console.log("=== No table. You don't borrow the book. ===");
    return;
  }
  console.log('total books:' + result.books.length);
  console.log(result.books);
}

//----------------- main ------------------------------------
let browser;
const url = 'https://primo.clcn.net.cn/primo_library/libweb/action/loginpage.do?targetURL=https%3a%2f%2fprimo.clcn.net.cn%2fprimo_library%2flibweb%2faction%2fsearch.do%3fvid%3dST%26amp%3bdscnt%3d0%26amp%3bdstmp%3d1705632515141%26amp%3binitializeIndex%3dtrue&isMobile=false';
// Newer entry point, for reference: https://www.clcn.net.cn/user/my/index

(async () => {
  // Password typed for every card.
  const PASSWORD = '888888';
  const LOGIN_BUTTON = "body > div.EXLPRMLoginCard.layout-align-center-start.layout-row > div.EXLPRMLoginColumn.layout-column.flex-xs-100.flex-sm-100.flex-25 > form > md-card > div.md-actions.layout-align-end-center.layout-row > a.EXLPRMLoginButtonSubmit.md-button.md-ink-ripple";
  const exit_selector = '#exlidSignOut > a';

  // Becomes true when a card fails or the run aborts; decides the exit code.
  let failed = false;

  /**
   * Logs in with one card, prints its loans and signs out.
   * Errors are logged and do not stop the remaining cards.
   *
   * @param {string} card - Library card number (login user name).
   * @param {string} name - Label printed next to the card number.
   * @returns {Promise<void>}
   */
  async function visit(card, name) {
    const page = await browser.newPage();
    try {
      console.log(card + ' ' + name);
      await page.goto(url, { waitUntil: 'domcontentloaded', timeout: 10000 });

      // Log in: user name, password, then the submit link.
      await page.waitForXPath("/html/body/div[2]/div[2]/form/md-card/md-card-content/md-input-container[1]/input");
      const userInput = await page.$('input[name="username"]');
      await userInput.type(card, { delay: 10 });
      await page.type("input[id=input_1]", PASSWORD, { delay: 10 });
      await page.waitForSelector(LOGIN_BUTTON);
      await page.click(LOGIN_BUTTON);

      // Wait for this menu entry before opening the account view.
      await page.waitForXPath("/html/body/div[1]/div[2]/div[2]/ul/li[3]/a");
      await showTable(page);

      await page.click(exit_selector);
    } catch (error) {
      failed = true;
      console.error(error);
    } finally {
      await page.close();
    }
  }

  try {
    browser = await puppeteer.launch({
      executablePath: chrome_path,
      headless: false,
      args: ['--no-sandbox', '--disable-setuid-sandbox'],
      defaultViewport: { width: 1800, height: 1000 },
      slowMo: 0,
      timeout: 60000,
    });

    // Card number -> display name.
    const names = new Map();
    names.set("000000000000", "zhang san");
    names.set("888888888888", "li si");

    for (const [card, name] of names) {
      await visit(card, name);
    }
    console.log("https://www.clcn.net.cn/user/my/index");
  } catch (err) {
    failed = true;
    console.error(err);
  } finally {
    // `browser` is still undefined when launch itself failed.
    if (browser) {
      await browser.close().catch((err) => console.error(err));
    }
    // Exit explicitly, with a non-zero code only when something went wrong.
    process.exit(failed ? 1 : 0);
  }
})();
完全替换元素内容:
// Replace the entire content of <body> with a single paragraph.
// Plain assignment discards the existing children; `+=` would append the
// fragment to the current content instead of replacing it.
const htmlFragment = "<p>Replacing the whole body node content with a paragraph</p>";
document.body.innerHTML = htmlFragment;
通过 innerHTML 插入的 script 元素不会被执行
DOM - insertAdjacentElement:比 appendChild 更灵活(可以指定插入位置),插入用 createElement 创建的 script 元素时脚本会被执行
https://datacadamia.com/web/dom/insertadjacent
// Build a <script> element in code and insert it at the 'beforeend'
// position of <body>.
const bodySibling = Object.assign(document.createElement('script'), {
  text: 'console.log("Hello World !");',
});
document.body.insertAdjacentElement('beforeend', bodySibling);
插入element
// Build a <p> element in code and insert it at the 'afterbegin' position
// of <body>.
const pSibling = Object.assign(document.createElement('p'), {
  innerText: 'A paragraph',
});
document.body.insertAdjacentElement('afterbegin', pSibling);
插入html
// Parse an HTML string and insert the result at the 'afterend' position
// relative to <body>.
const siblingMarkup = '<p>Body Sibling HTML</p>';
document.body.insertAdjacentHTML('afterend', siblingMarkup);
注入js
https://www.tabnine.com/code/javascript/functions/puppeteer/Page/%2524eval
/**
 * Opens WhatsApp Web with a profile stored in ./ChromeSession, waits for the
 * chat icon to appear, then injects WAPI.js and inject.js (both resolved
 * next to this file) into the page. The browser is left open.
 */
async function main() {
  const launchOptions = {
    headless: false,
    userDataDir: path.join(process.cwd(), "ChromeSession"),
  };
  const browser = await puppeteer.launch(launchOptions);
  const page = await browser.newPage();

  // timeout: 0 disables the timeout on both waits.
  await page.goto('https://web.whatsapp.com', { waitUntil: 'networkidle0', timeout: 0 });
  await page.waitForSelector('*[data-icon=chat]', { polling: 1000, timeout: 0 });
  console.log("Logged in!");

  // Inject the scripts one after the other, in this order.
  for (const scriptName of ["WAPI.js", "inject.js"]) {
    const scriptPath = require.resolve(path.join(__dirname, scriptName));
    await page.addScriptTag({ path: scriptPath });
  }
}
获取元素用 page.$ 或 page.evaluate
https://helloworldmaster.com/article/get-a-dom-element-using-puppeteer
【推荐】国内首个AI IDE,深度理解中文开发场景,立即下载体验Trae
【推荐】编程新体验,更懂你的AI,立即体验豆包MarsCode编程助手
【推荐】抖音旗下AI助手豆包,你的智能百科全书,全免费不限次数
【推荐】轻量又高性能的 SSH 工具 IShell:AI 加持,快人一步
· DeepSeek 开源周回顾「GitHub 热点速览」
· 物流快递公司核心技术能力-地址解析分单基础技术分享
· .NET 10首个预览版发布:重大改进与新特性概览!
· AI与.NET技术实操系列(二):开始使用ML.NET
· 单线程的Redis速度为什么快?
2019-04-11 Spring Boot整合Elasticsearch