browserless scrape api 简单说明

之前介绍过 browserless 提供了不少 API 能力，下面简单说明一下 scrape API 的内部处理

参考定义

如下图，browserless 对于不同浏览器进行了不同的处理（内部实现包含了公共的）

内部处理

scrape.http.ts 定义中，browserless 使用了puppeteer-core 包进行处理

参考代码

// Wrapper function handed to puppeteer-core's `page.evaluate`. It only relies
// on the global `document` and its own inner helper, so it carries everything
// it needs with it.
const scrape = async (elements: ScrapeElementSelector[]) => {
  /**
   * Polls the document every 100ms until `selector` matches an element.
   * Rejects with a "Timed out waiting for selector" error once `timeout`
   * milliseconds (default 30000) have elapsed without a match.
   */
  function wait(selector: string, timeout = 30000): Promise<void> {
    return new Promise<void>((resolve, reject) => {
      // The deadline timer is registered before the poller, so on a tie the
      // timeout wins.
      const deadline = setTimeout(() => {
        clearTimeout(deadline);
        clearInterval(poller);
        reject(new Error(`Timed out waiting for selector "${selector}"`));
      }, timeout);

      const poller = setInterval(() => {
        if (!document.querySelector(selector)) {
          return;
        }
        clearTimeout(deadline);
        clearInterval(poller);
        resolve();
      }, 100);
    });
  }

  // Step 1: wait until every requested selector is present (all waits run
  // concurrently; the first one to time out rejects the whole call).
  const pending = elements.map(({ selector, timeout }) =>
    wait(selector, timeout),
  );
  await Promise.all(pending);

  // Step 2: collect attributes, markup, text and geometry for every element
  // matched by each selector.
  return elements.map(({ selector }) => {
    const matches = Array.from(
      document.querySelectorAll(selector),
    ) as HTMLElement[];

    const results = matches.map((node) => {
      const box = node.getBoundingClientRect();
      const attributes = Array.from(node.attributes, (attribute) => ({
        name: attribute.name,
        value: attribute.value,
      }));

      return {
        attributes,
        height: node.offsetHeight,
        html: node.innerHTML,
        left: box.left,
        text: node.innerText,
        top: box.top,
        width: node.offsetWidth,
      };
    });

    return { results, selector };
  });
};

/**
 * ChromiumScrapePostRoute API route.
 *
 * POST route that loads a URL or an HTML string in a Chromium page, applies
 * the page options from the JSON body, evaluates `scrape` against the
 * requested `elements` selectors and responds with the scraped data plus
 * optional debug information.
 */
export default class ChromiumScrapePostRoute extends BrowserHTTPRoute {
  name = BrowserlessRoutes.ChromiumScrapePostRoute;
  accepts = [contentTypes.json];
  auth = true;
  browser = ChromiumCDP;
  concurrency = true;
  contentTypes = [contentTypes.json];
  // NOTE(review): the blank lines inside this template literal are part of the
  // runtime string as it appears in this listing and are kept unchanged.
  description = dedent(`

    A JSON-based API that returns text, html, and meta-data from a given list of selectors.

    Debugging information is available by sending in the appropriate flags in the "debugOpts"

    property. Responds with an array of JSON objects.

  `);
  method = Methods.post;
  path = [HTTPRoutes.scrape, HTTPRoutes.chromiumScrape];
  tags = [APITags.browserAPI];

  /**
   * Handles one scrape request.
   *
   * @param req - Incoming request; `req.body` is read as a `BodySchema`.
   * @param res - Server response; receives the `Content-Type` and
   *   `X-Response-*` headers and the JSON payload.
   * @param logger - Logger used for info/debug messages.
   * @param browser - Browser instance a new page is opened on.
   * @returns The result of `jsonResponse(res, 200, response, false)`.
   * @throws BadRequest when the body is missing or has neither `url` nor `html`.
   * @throws Timeout when the in-page scrape fails with a "Timed out" error.
   */
  handler = async (
    req: Request,
    res: ServerResponse,
    logger: Logger,
    browser: BrowserInstance,
  ) => {
    logger.info('Scrape API invoked with body:', req.body);

    // Falls back to the html content type when no Accept header is sent or
    // when it contains a wildcard.
    const contentType =
      !req.headers.accept || req.headers.accept?.includes('*')
        ? contentTypes.html
        : req.headers.accept;

    if (!req.body) {
      throw new BadRequest(`Couldn't parse JSON body`);
    }

    res.setHeader('Content-Type', contentType);

    const {
      bestAttempt = false,
      url,
      gotoOptions,
      authenticate,
      addScriptTag = [],
      addStyleTag = [],
      cookies = [],
      debugOpts,
      elements,
      emulateMediaType,
      html,
      rejectRequestPattern = [],
      requestInterceptors = [],
      rejectResourceTypes = [],
      setExtraHTTPHeaders,
      setJavaScriptEnabled,
      userAgent,
      viewport,
      waitForTimeout,
      waitForFunction,
      waitForSelector,
      waitForEvent,
    } = req.body as BodySchema;

    const content = url || html;

    if (!content) {
      throw new BadRequest(`One of "url" or "html" properties are required.`);
    }

    const page = (await browser.newPage()) as UnwrapPromise<
      ReturnType<ChromiumCDP['newPage']>
    >;

    // Everything after the page is opened runs inside try/finally so the page
    // is closed on error paths too (e.g. when the scrape times out), not only
    // on success.
    try {
      // `url` navigates with page.goto; otherwise the `html` string is loaded
      // with page.setContent.
      const gotoCall = url ? page.goto.bind(page) : page.setContent.bind(page);
      const messages: string[] = [];
      const outbound: OutBoundRequest[] = [];
      const inbound: InBoundRequest[] = [];

      const recordNetwork = Boolean(debugOpts?.network);
      const filterRequests =
        rejectRequestPattern.length > 0 ||
        requestInterceptors.length > 0 ||
        rejectResourceTypes.length > 0;

      if (debugOpts?.console) {
        page.on('console', (msg) => messages.push(msg.text()));
      }

      if (recordNetwork) {
        page.on('response', (pageResponse) => {
          inbound.push({
            // headers() is called like the sibling status()/url() accessors;
            // storing the bare method reference would record no header data.
            headers: pageResponse.headers(),
            status: pageResponse.status(),
            url: pageResponse.url(),
          });
        });
      }

      if (emulateMediaType) {
        await page.emulateMediaType(emulateMediaType);
      }

      if (cookies.length) {
        await page.setCookie(...cookies);
      }

      if (viewport) {
        await page.setViewport(viewport);
      }

      if (userAgent) {
        await page.setUserAgent(userAgent);
      }

      if (authenticate) {
        await page.authenticate(authenticate);
      }

      if (setExtraHTTPHeaders) {
        await page.setExtraHTTPHeaders(setExtraHTTPHeaders);
      }

      // Compare against the type rather than truthiness so an explicit
      // `false` actually disables JavaScript.
      if (typeof setJavaScriptEnabled === 'boolean') {
        await page.setJavaScriptEnabled(setJavaScriptEnabled);
      }

      // A single `request` listener serves both debug recording and request
      // filtering, so each intercepted request is resolved exactly once
      // (two listeners would both try to continue/abort the same request).
      if (recordNetwork || filterRequests) {
        await page.setRequestInterception(true);

        page.on('request', (request) => {
          if (recordNetwork) {
            outbound.push({
              headers: request.headers(),
              method: request.method(),
              url: request.url(),
            });
          }

          if (filterRequests) {
            if (
              !!rejectRequestPattern.find((pattern) =>
                request.url().match(pattern),
              ) ||
              rejectResourceTypes.includes(request.resourceType())
            ) {
              logger.debug(
                `Aborting request ${request.method()}: ${request.url()}`,
              );
              return request.abort();
            }

            const interceptor = requestInterceptors.find((r) =>
              request.url().match(r.pattern),
            );

            if (interceptor) {
              return request.respond(interceptor.response);
            }
          }

          return request.continue();
        });
      }

      // Navigation and wait errors go through bestAttemptCatch(bestAttempt).
      const gotoResponse = await gotoCall(content, gotoOptions).catch(
        bestAttemptCatch(bestAttempt),
      );

      // for..of iterates the array values directly (for..in walks string keys).
      for (const styleTag of addStyleTag) {
        await page.addStyleTag(styleTag);
      }

      for (const scriptTag of addScriptTag) {
        await page.addScriptTag(scriptTag);
      }

      if (waitForTimeout) {
        await sleep(waitForTimeout).catch(bestAttemptCatch(bestAttempt));
      }

      if (waitForFunction) {
        await waitForFn(page, waitForFunction).catch(
          bestAttemptCatch(bestAttempt),
        );
      }

      if (waitForSelector) {
        const { selector, hidden, timeout, visible } = waitForSelector;
        await page
          .waitForSelector(selector, { hidden, timeout, visible })
          .catch(bestAttemptCatch(bestAttempt));
      }

      if (waitForEvent) {
        await waitForEvt(page, waitForEvent).catch(
          bestAttemptCatch(bestAttempt),
        );
      }

      // Details of the navigation response; entries that are undefined (for
      // instance when there is no navigation response) are skipped below.
      const headers = {
        'X-Response-Code': gotoResponse?.status(),
        'X-Response-IP': gotoResponse?.remoteAddress().ip,
        'X-Response-Port': gotoResponse?.remoteAddress().port,
        'X-Response-Status': gotoResponse?.statusText(),
        'X-Response-URL': gotoResponse?.url().substring(0, 1000),
      };

      for (const [key, value] of Object.entries(headers)) {
        if (value !== undefined) {
          res.setHeader(key, value);
        }
      }

      // Run `scrape` inside the page; its "Timed out" errors are rethrown as
      // Timeout, anything else is rethrown unchanged.
      const data = await page.evaluate(scrape, elements).catch((e) => {
        if (e.message.includes('Timed out')) {
          throw new Timeout(e);
        }
        throw e;
      });

      // Optional debug artifacts are gathered concurrently; disabled ones
      // resolve to null.
      const [debugHTML, screenshot, pageCookies] = await Promise.all([
        debugOpts?.html ? (page.content() as Promise<string>) : null,
        debugOpts?.screenshot
          ? (page.screenshot(debugScreenshotOpts) as unknown as Promise<string>)
          : null,
        debugOpts?.cookies ? page.cookies() : null,
      ]);

      const debugData = debugOpts
        ? {
            console: messages,
            cookies: pageCookies,
            html: debugHTML,
            network: {
              inbound,
              outbound,
            },
            screenshot,
          }
        : null;

      const response: ResponseSchema = {
        data,
        debug: debugData,
      };

      logger.info('Scrape API request completed');

      return jsonResponse(res, 200, response, false);
    } finally {
      // Fire-and-forget close; failures while closing are deliberately ignored.
      page.close().catch(noop);
    }
  };
}

说明

目前 browserless 包装的 REST API 中，有一部分内部直接使用了 puppeteer-core 的 API。以上只是一个简单说明，详细内容可以阅读源码；其中的部分处理方式，对我们自己使用 puppeteer-core 也很值得学习

参考资料

src/shared/scrape.http.ts
https://docs.browserless.io/HTTP-APIs/apis
https://docs.browserless.io/open-api/

posted on 2024-06-29 08:00 荣锋亮阅读(32) 评论(0) 编辑收藏举报

刷新页面返回顶部

rongfengliang-荣锋亮

browserless scrape api 简单说明

参考定义

内部处理

说明

参考资料

导航

公告