puppeteerでの要素の取得方法

puppeteerでの要素の取得のための関数は

page.evaluate + (querySelector|querySelectorAll)
page.$
page.$$
page.$eval
page.$$eval

とありますが、実際にそれらを使うためにどう書くかをまとめています。速度等は検証できていませんが、page.$eval / page.$$eval を使うとそれぞれコードがシンプルになるので良いかと思います。

APIの動作の詳細はpuppeteerのAPIを参照ください。
https://github.com/GoogleChrome/puppeteer/blob/master/docs/api.md

前提

以下のようなliの中にアンカータグが入っているページを取得することを想定しています。

<ul>
  <li><a href="some link">some HTML</a></li>
  <li><a href="some link">some HTML</a></li>
  <li><a href="some link">some HTML</a></li>
  ...
</ul>

以下のようにページ読み込みを準備します。

  // Launch the browser. The two flags switch off Chromium's sandbox, which is
  // typically only needed when running as root or inside some containers —
  // avoid them when loading untrusted pages.
  const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
  const page = await browser.newPage();
  await page.goto('url you want to check');

  // The selectors are never reassigned, so they are bound with const.
  // itemSelector matches the anchor in the first <li>; listSelector matches the anchor in every <li>.
  const itemSelector = "some selecter > ul > li:nth-child(1) > a";
  const listSelector = "some selecter > ul > li > a";

一つの要素から一つの属性をとる

page.evaluate + querySelector

  // The callback runs inside the page; itemSelector is passed across as its argument.
  // The call rejects if nothing matches, because querySelector then returns null.
  var data = await page.evaluate(
    (selector) => document.querySelector(selector).textContent,
    itemSelector
  );

page.$

  // page.$ resolves to a handle for the first matching element.
  var item = await page.$(itemSelector);
  // getProperty yields a handle to the property; jsonValue converts it to a plain value.
  const textHandle = await item.getProperty('textContent');
  var data = await textHandle.jsonValue();

page.$eval

  // $eval finds the first match and hands it to the callback inside the page.
  var data = await page.$eval(itemSelector, (anchor) => anchor.textContent);

一つの要素から複数の属性をとる

page.evaluate + querySelector

  // Look the element up once inside the page and return the three properties
  // as a plain object (the original queried the same selector three times).
  // Rejects if nothing matches, since querySelector returns null.
  var data = await page.evaluate((selector) => {
    const { href, textContent, innerHTML } = document.querySelector(selector);
    return { href, textContent, innerHTML };
  }, itemSelector);

page.$

  // Grab a handle to the first match, then read each property one after another.
  var item = await page.$(itemSelector);
  // Helper: resolve a single property of the handle to a plain value.
  const readProp = async (name) => (await item.getProperty(name)).jsonValue();
  var data = {
    href: await readProp('href'),
    textContent: await readProp('textContent'),
    innerHTML: await readProp('innerHTML'),
  };

page.$eval

  // $eval passes the first matching element to the callback; pick the three
  // properties off it and return them as a plain object.
  var data = await page.$eval(itemSelector, ({ href, textContent, innerHTML }) => ({
    href,
    textContent,
    innerHTML,
  }));

複数の要素から一つの属性をとる

page.evaluate + querySelectorAll

  // Inside the page: collect every match and map each node to its text in one pass.
  var datas = await page.evaluate(
    (selector) => Array.from(document.querySelectorAll(selector), (node) => node.textContent),
    listSelector
  );

page.$$

  // page.$$ resolves to an array of element handles, one per match.
  var list = await page.$$(listSelector);
  var datas = [];
  // Read the handles sequentially so results keep document order.
  for (const handle of list) {
    const textHandle = await handle.getProperty('textContent');
    datas.push(await textHandle.jsonValue());
  }

page.$$eval

  // $$eval passes the array of all matching elements to the callback inside the page.
  var datas = await page.$$eval(listSelector, (anchors) => anchors.map((anchor) => anchor.textContent));

複数の要素から複数の属性をとる

page.evaluate + querySelectorAll

  // Inside the page: turn every matching node into a plain { href, textContent, innerHTML } record.
  var datas = await page.evaluate((selector) => {
    const anchors = document.querySelectorAll(selector);
    return Array.from(anchors, ({ href, textContent, innerHTML }) => ({
      href,
      textContent,
      innerHTML,
    }));
  }, listSelector);

page.$$

  // page.$$ resolves to an array of element handles, one per match.
  var list = await page.$$(listSelector);
  var datas = [];
  for (const handle of list) {
    // Helper: resolve a single property of the current handle to a plain value.
    const readProp = async (name) => (await handle.getProperty(name)).jsonValue();
    datas.push({
      href: await readProp('href'),
      textContent: await readProp('textContent'),
      innerHTML: await readProp('innerHTML'),
    });
  }

page.$$eval

  // $$eval passes all matching elements to the callback; map each one to a plain record.
  var datas = await page.$$eval(listSelector, (anchors) =>
    anchors.map(({ href, textContent, innerHTML }) => ({
      href,
      textContent,
      innerHTML,
    }))
  );

確認用コード

const fs = require('fs');
const puppeteer = require('puppeteer');

// Verification script: runs every retrieval pattern against one page and prints
// each result under a label so the variants can be compared.
(async () => {
  // Print a label line followed by the value it describes.
  const report = (label, value) => {
    console.log(label);
    console.log(value);
  };

  // Resolve one property of an ElementHandle to a plain value.
  const readProperty = async (handle, name) => (await handle.getProperty(name)).jsonValue();

  const browser = await puppeteer.launch({ args: ['--no-sandbox', '--disable-setuid-sandbox'] });
  try {
    const page = await browser.newPage();
    await page.goto('url you want to check');

    const itemSelector = "some selecter > ul > li:nth-child(1) > a";
    const listSelector = "some selecter > ul > li > a";

    ////////////////////////////////////////////////////////
    // One element, one attribute.
    {
      const data = await page.evaluate(
        (selector) => document.querySelector(selector).textContent,
        itemSelector
      );
      report("one item one attribute", data);
    }
    {
      const item = await page.$(itemSelector);
      const data = await readProperty(item, 'textContent');
      report("one item one attribute using $", data);
    }
    {
      const data = await page.$eval(itemSelector, (item) => item.textContent);
      // Label corrected: this case reads a single item, not several.
      report("one item one attribute using $eval", data);
    }

    ////////////////////////////////////////////////////////
    // One element, several attributes.
    {
      const data = await page.evaluate((selector) => {
        // Query once instead of once per property.
        const { href, textContent, innerHTML } = document.querySelector(selector);
        return { href, textContent, innerHTML };
      }, itemSelector);
      report("one item some attributes", data);
    }
    {
      const item = await page.$(itemSelector);
      const data = {
        href: await readProperty(item, 'href'),
        textContent: await readProperty(item, 'textContent'),
        innerHTML: await readProperty(item, 'innerHTML'),
      };
      report("one item some attributes using $", data);
    }
    {
      const data = await page.$eval(itemSelector, ({ href, textContent, innerHTML }) => ({
        href,
        textContent,
        innerHTML,
      }));
      // Label corrected: this case reads a single item, not several.
      report("one item some attributes using $eval", data);
    }

    ////////////////////////////////////////////////////////
    // Several elements, one attribute.
    {
      const datas = await page.evaluate(
        (selector) => Array.from(document.querySelectorAll(selector), (node) => node.textContent),
        listSelector
      );
      report("some items one attribute", datas);
    }
    {
      const list = await page.$$(listSelector);
      const datas = [];
      for (const handle of list) {
        datas.push(await readProperty(handle, 'textContent'));
      }
      report("some items one attribute using $$", datas);
    }
    {
      const datas = await page.$$eval(listSelector, (list) => list.map((node) => node.textContent));
      report("some items one attribute using $$eval", datas);
    }

    ////////////////////////////////////////////////////////
    // Several elements, several attributes.
    {
      const datas = await page.evaluate(
        (selector) =>
          Array.from(document.querySelectorAll(selector), ({ href, textContent, innerHTML }) => ({
            href,
            textContent,
            innerHTML,
          })),
        listSelector
      );
      report("some items some attributes", datas);
    }
    {
      const list = await page.$$(listSelector);
      const datas = [];
      for (const handle of list) {
        datas.push({
          href: await readProperty(handle, 'href'),
          textContent: await readProperty(handle, 'textContent'),
          innerHTML: await readProperty(handle, 'innerHTML'),
        });
      }
      // Label corrected: this case reads several attributes, not one.
      report("some items some attributes using $$", datas);
    }
    {
      const datas = await page.$$eval(listSelector, (list) =>
        list.map(({ href, textContent, innerHTML }) => ({ href, textContent, innerHTML }))
      );
      report("some items some attributes using $$eval", datas);
    }

    ////////////////////////////////////////////////////////
  } finally {
    // Close the browser even when a step above rejects, and wait for the
    // shutdown to finish so no browser process is left behind.
    await browser.close();
  }
})().catch((err) => {
  // Surface the failure and signal it through the exit code instead of
  // leaving an unhandled rejection.
  console.error(err);
  process.exitCode = 1;
});
posted @ 2019-08-29 14:50  公众号python学习开发  阅读(82)  评论(0)  编辑  收藏  举报