Puppeteer的入门教程和实践

Puppeteer是什么?

Puppeteer在github上对自己的介绍是:

Headless Chrome Node API

puppeteer是一个nodejs的库,支持调用Chrome的API来操纵Web,相比较Selenium或是PhantomJs,它最大的特点就是它的操作Dom可以完全在内存中进行模拟,即在V8引擎中处理而不打开浏览器(headless无界面)。但要注意的是,它虽然很好用,但一般却不建议用来做测试,因为它是专门针对Chrome处理的,当然你也可以根据业务需要来选择。

Puppeteer能做什么?

Puppeteer官网给了几个例子,分别是:
(1)网页截图。
(2)生成页面的PDF。
(3)分析当前页的脚本。
(4) 写爬虫
(5) ....

安装

Puppeteer 至少需要 Node v6.4.0,如要使用 async / await,只有 Node v7.6.0 或更高版本才支持。
如果项目路径下没有package.json就先执行“npm init”,然后按照提示填写完毕后,生成一个package.json文件,然后执行:

npm i puppeteer

我在安装过程中遇到了错误:

这个错误是在执行install.js下载Chromium时出现的。你也可以通过设置环境变量(set PUPPETEER_SKIP_CHROMIUM_DOWNLOAD=1)阻止下载Chromium,稍后再手动下载,但手动下载后还要配置路径,太麻烦啦,所以我的解决方案是打开代理(FQ,即翻墙)软件后再重新执行一次“npm i puppeteer”。

使用

(1)网页截图

//screenshot.js
const puppeteer = require('puppeteer');
const config = require('./config/config');

/**
 * Opens https://www.baidu.com in a browser launched by Puppeteer and saves a
 * screenshot as `<config.screenshot>/<timestamp>.png`.
 */
(async () => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto('https://www.baidu.com');
        await page.screenshot({
            path: `${config.screenshot}/${Date.now()}.png`,
        });
    } finally {
        // Close the browser even when navigation or the capture fails;
        // otherwise the launched browser is never shut down.
        await browser.close();
    }
})().catch((err) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
});
//config.js
const path = require('path')

module.exports ={
      screenshot:path.resolve(__dirname,'../../screenshot')
}

(2) 将网页生成pdf

const puppeteer = require('puppeteer');
const config = require('./config/config');

/**
 * Opens https://www.baidu.com and renders it to an A4 PDF saved as
 * `<config.pdfroot>/<timestamp>.pdf`.
 *
 * `config.pdfroot` must be defined by ./config/config; if it is missing the
 * output path starts with "undefined/".
 */
(async () => {
    const browser = await puppeteer.launch();
    try {
        const page = await browser.newPage();
        await page.goto('https://www.baidu.com', { waitUntil: 'networkidle2' });
        await page.pdf({ path: `${config.pdfroot}/${Date.now()}.pdf`, format: 'A4' });
    } finally {
        // Close the browser even when navigation or PDF rendering fails;
        // otherwise the launched browser is never shut down.
        await browser.close();
    }
})().catch((err) => {
    // Report the failure instead of leaving an unhandled promise rejection.
    console.error(err);
    process.exitCode = 1;
});

(3)分析网页

const puppeteer = require('puppeteer');

/**
 * Opens https://www.baidu.com, evaluates a function inside the page to read
 * its viewport size and device pixel ratio, and prints the result.
 */
(async () => {
  const browser = await puppeteer.launch();
  try {
    const page = await browser.newPage();
    await page.goto('https://www.baidu.com');

    // Get the "viewport" of the page, as reported by the page itself. The
    // callback runs in the page context, so `document` and `window` are the
    // page's own globals.
    const dimensions = await page.evaluate(() => ({
      width: document.documentElement.clientWidth,
      height: document.documentElement.clientHeight,
      deviceScaleFactor: window.devicePixelRatio,
    }));
    console.log('Dimensions:', dimensions);
  } finally {
    // Close the browser even when navigation or evaluation fails;
    // otherwise the launched browser is never shut down.
    await browser.close();
  }
})().catch((err) => {
  // Report the failure instead of leaving an unhandled promise rejection.
  console.error(err);
  process.exitCode = 1;
});

(4) 写爬虫

//screenshot.js
const puppeteer = require('puppeteer');
const config = require('./config/config');
const srcToImg = require('./helper/srcToImg');
const chalk = require('chalk');

    /**
     * Image crawler: opens Baidu image search, submits a keyword, collects the
     * `src` of every `img.main_img` element on the result page and hands each
     * one to `srcToImg`, which stores it under `config.imgUrl`.
     */
    (async () => {
        const browser = await puppeteer.launch();
        try {
            const page = await browser.newPage();
            await page.goto('https://image.baidu.com/');
            console.log('go to https://image.baidu.com/')
            await page.setViewport({
                width: 1920,
                height: 1080
            });
            console.log("reset viewpoint");
            await page.focus('#kw');
            await page.keyboard.sendCharacter('单身狗');
            // Start waiting for the navigation BEFORE clicking. Attaching a
            // 'load' listener only after the click has resolved can miss the
            // event when the result page loads quickly, leaving the script
            // hanging with the browser open.
            await Promise.all([
                page.waitForNavigation({ waitUntil: 'load' }),
                page.click('.s_search'),
            ]);
            console.log(chalk.red(("reset viewpoint")));
            console.log('go to searchlist');
            console.log('page loading done,start fetch.........')

            // Runs inside the page: NodeList has no map(), hence the
            // Array.prototype.map.call form.
            const srcs = await page.evaluate(() => {
                const images = document.querySelectorAll('img.main_img');
                return Array.prototype.map.call(images, img => img.src);
            });

            // Wait for every save to settle before tearing down. A failure of
            // one image is logged and does not abort the others.
            await Promise.all(srcs.map(async (src) => {
                try {
                    await srcToImg(src, config.imgUrl);
                } catch (err) {
                    // Sources may be long data URLs, so only log a prefix.
                    console.error(`failed to save ${String(src).slice(0, 80)}:`, err);
                }
            }));
        } finally {
            // Close the browser on success and on failure alike.
            await browser.close();
        }
    })().catch((err) => {
        // Report the failure instead of leaving an unhandled promise rejection.
        console.error(err);
        process.exitCode = 1;
    });

//srcToImg.js
const http = require('http');
const https = require('https');
const path = require('path');
const fs = require('fs');
const { promisify } = require('util');
const writeFile = promisify(fs.writeFile)


module.exports = async (src,dir) =>{
   if(/\.(jpg|png|gif)$/.test(src)){
         await urlToImg(src,dir);
   }else{
        await base64ToImg(src,dir);
   }
}

//url => img
/**
 * Downloads the image at `url` and stores it in `dir`.
 *
 * The returned promise settles only once the file has been completely
 * written, so callers can really wait for the download. The file name is the
 * current timestamp plus a random suffix: many images are usually requested
 * within the same millisecond, and a bare timestamp would make them overwrite
 * each other.
 *
 * @param {string} url - Absolute http:// or https:// URL of the image.
 * @param {string} dir - Existing directory the file is written into.
 * @returns {Promise<void>} Resolves after the file is flushed. Rejects on a
 *   request or stream error, or when the server answers with a non-2xx
 *   status (redirects are not followed).
 */
const urlToImg = async (url, dir) => {
   const mod = /^https:/.test(url) ? https : http;
   // Take the extension from the URL path only, so a query string or hash
   // never ends up in the file name.
   const ext = path.extname(new URL(url).pathname);
   const uniqueName = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
   const file = path.join(dir, `${uniqueName}${ext}`);

   await new Promise((resolve, reject) => {
      const request = mod.get(url, (res) => {
         const { statusCode } = res;
         if (statusCode < 200 || statusCode >= 300) {
            // Drain the response so the socket is released, and do not save
            // an error page as if it were an image.
            res.resume();
            reject(new Error(`Request for ${url} failed with status ${statusCode}`));
            return;
         }

         const out = fs.createWriteStream(file);
         out.on('error', reject);
         out.on('finish', () => {
            console.log(file);
            resolve();
         });
         res.on('error', (err) => {
            // pipe() does not forward source errors; stop the writer ourselves.
            out.destroy();
            reject(err);
         });
         res.pipe(out);
      });
      // Connection-level failures (DNS, refused connection, ...) surface here.
      request.on('error', reject);
   });
};



//base64 => img

/**
 * Decodes a base64 data URL ("data:<mime>;base64,<payload>") and writes the
 * bytes to a new file in `dir`.
 *
 * The extension is the MIME subtype ("jpeg" becomes "jpg"). The file name is
 * the current timestamp plus a random suffix so that images saved within the
 * same millisecond do not overwrite each other.
 *
 * This function is best-effort and never rejects: an input that is not a
 * base64 data URL is reported with a log line, and a failed write is reported
 * with the underlying error.
 *
 * @param {string} base64Str - The data URL to decode.
 * @param {string} dir - Existing directory the file is written into.
 * @returns {Promise<void>}
 */
const base64ToImg = async function (base64Str, dir) {
   const matches = typeof base64Str === 'string'
      ? base64Str.match(/^data:(.+?);base64,(.+)$/)
      : null;
   // matches[1] is the MIME type (e.g. "image/png"); its subtype names the extension.
   const subtype = matches === null ? undefined : matches[1].split('/')[1];
   if (!subtype) {
      console.log("非法的base64 字符串")
      return;
   }

   const ext = subtype.replace('jpeg', 'jpg');
   const uniqueName = `${Date.now()}-${Math.random().toString(36).slice(2, 10)}`;
   const file = path.join(dir, `${uniqueName}.${ext}`);

   try {
      await writeFile(file, matches[2], 'base64');
      console.log(file);
   } catch (err) {
      // A write failure (missing directory, permissions, ...) is not an
      // invalid-input problem, so report the real cause.
      console.error(`failed to write ${file}:`, err);
   }
};

//config.js
const path = require('path')
module.exports ={
      imgUrl:path.resolve(__dirname,'../../images'),
     
}
posted @ 2018-08-17 14:28  Julie在进化  阅读(2774)  评论(0编辑  收藏  举报