browserless scrape api 简单说明

之前介绍过 browserless 提供了不少 API 能力,下面简单说明一下 scrape API 的内部处理

参考定义

如下图,browserless 针对不同的浏览器分别进行了处理(内部实现中包含了公共的部分)

内部处理

scrape.http.ts 定义中,browserless 使用了puppeteer-core 包进行处理

  • 参考代码
/**
 * Collects attribute, geometry, and content details for every element that
 * matches each requested selector.
 *
 * Kept as one self-contained function (it only touches the `document` global
 * and its own argument) so it can be handed to puppeteer-core's
 * `page.evaluate`.
 *
 * @param elements - Selectors to scrape. Each entry's `timeout` (milliseconds)
 *   bounds how long to wait for that selector; 30000 is used when it is
 *   undefined.
 * @returns One `{ selector, results }` object per input entry, in input order,
 *   where `results` holds one record per matching element.
 * @throws Error with message `Timed out waiting for selector "<selector>"`
 *   when a selector does not match within its timeout.
 */
const scrape = async (elements: ScrapeElementSelector[]) => {
  const wait = (selector: string, timeout = 30000) => {
    return new Promise<void>((resolve, reject) => {
      // Check once before arming any timers: an element that is already
      // present resolves right away instead of after the first 100ms poll,
      // and a timeout shorter than the poll interval can no longer reject
      // while the element is sitting in the document.
      if (document.querySelector(selector)) {
        resolve();
        return;
      }
      const timeoutId = setTimeout(() => {
        clearInterval(intervalId);
        reject(new Error(`Timed out waiting for selector "${selector}"`));
      }, timeout);
      const intervalId = setInterval(() => {
        if (document.querySelector(selector)) {
          clearTimeout(timeoutId);
          clearInterval(intervalId);
          resolve();
        }
      }, 100);
    });
  };

  // Wait until every requested selector matches at least one element.
  await Promise.all(
    elements.map(({ selector, timeout }) => wait(selector, timeout)),
  );

  // Gather the HTML, text, attributes, and geometry of each matching element.
  return elements.map(({ selector }) => {
    const $els = Array.from(
      document.querySelectorAll(selector),
    ) as HTMLElement[];
    return {
      results: $els.map(($el) => {
        const rect = $el.getBoundingClientRect();
        return {
          attributes: Array.from($el.attributes).map((attr) => ({
            name: attr.name,
            value: attr.value,
          })),
          height: $el.offsetHeight,
          html: $el.innerHTML,
          left: rect.left,
          text: $el.innerText,
          top: rect.top,
          width: $el.offsetWidth,
        };
      }),
      selector,
    };
  });
};
/**
 * HTTP POST route for the scrape API, bound to the ChromiumCDP browser.
 *
 * The handler opens a page, applies the page-level options from the JSON
 * body, loads the supplied `url` or `html`, runs the configured waits, and
 * then evaluates `scrape` in the page to collect data for `elements`.
 * Optional debug data (console, cookies, html, network, screenshot) is
 * returned alongside the scraped data.
 */
export default class ChromiumScrapePostRoute extends BrowserHTTPRoute {
  name = BrowserlessRoutes.ChromiumScrapePostRoute;
  accepts = [contentTypes.json];
  auth = true;
  browser = ChromiumCDP;
  concurrency = true;
  contentTypes = [contentTypes.json];
  description = dedent(`
    A JSON-based API that returns text, html, and meta-data from a given list of selectors.
    Debugging information is available by sending in the appropriate flags in the "debugOpts"
    property. Responds with an array of JSON objects.
  `);
  method = Methods.post;
  path = [HTTPRoutes.scrape, HTTPRoutes.chromiumScrape];
  tags = [APITags.browserAPI];

  /**
   * Handles one scrape request.
   *
   * @param req - Incoming request; `req.body` is read as `BodySchema`.
   * @param res - Response that receives the headers and the JSON payload.
   * @param logger - Logger used for request-level messages.
   * @param browser - Browser instance used to open the page.
   * @returns The result of `jsonResponse` with status 200.
   * @throws BadRequest when the body is missing or has neither `url` nor `html`.
   * @throws Timeout when the in-page scrape reports a "Timed out" error.
   */
  handler = async (
    req: Request,
    res: ServerResponse,
    logger: Logger,
    browser: BrowserInstance,
  ) => {
    logger.info('Scrape API invoked with body:', req.body);
    const contentType =
      !req.headers.accept || req.headers.accept?.includes('*')
        ? contentTypes.html
        : req.headers.accept;

    if (!req.body) {
      throw new BadRequest(`Couldn't parse JSON body`);
    }

    res.setHeader('Content-Type', contentType);

    const {
      bestAttempt = false,
      url,
      gotoOptions,
      authenticate,
      addScriptTag = [],
      addStyleTag = [],
      cookies = [],
      debugOpts,
      elements,
      emulateMediaType,
      html,
      rejectRequestPattern = [],
      requestInterceptors = [],
      rejectResourceTypes = [],
      setExtraHTTPHeaders,
      setJavaScriptEnabled,
      userAgent,
      viewport,
      waitForTimeout,
      waitForFunction,
      waitForSelector,
      waitForEvent,
    } = req.body as BodySchema;

    const content = url || html;

    if (!content) {
      throw new BadRequest(`One of "url" or "html" properties are required.`);
    }

    const page = (await browser.newPage()) as UnwrapPromise<
      ReturnType<ChromiumCDP['newPage']>
    >;
    // Navigate when a URL was given, otherwise inject the raw HTML.
    const gotoCall = url ? page.goto.bind(page) : page.setContent.bind(page);
    const messages: string[] = [];
    const outbound: OutBoundRequest[] = [];
    const inbound: InBoundRequest[] = [];

    const recordNetwork = Boolean(debugOpts?.network);
    const filterRequests =
      rejectRequestPattern.length > 0 ||
      requestInterceptors.length > 0 ||
      rejectResourceTypes.length > 0;

    if (debugOpts?.console) {
      page.on('console', (msg) => messages.push(msg.text()));
    }

    if (recordNetwork) {
      page.on('response', (pageResponse) => {
        inbound.push({
          // headers() is a method; call it so the header map is recorded.
          headers: pageResponse.headers(),
          status: pageResponse.status(),
          url: pageResponse.url(),
        });
      });
    }

    if (emulateMediaType) {
      await page.emulateMediaType(emulateMediaType);
    }

    if (cookies.length) {
      await page.setCookie(...cookies);
    }

    if (viewport) {
      await page.setViewport(viewport);
    }

    if (userAgent) {
      await page.setUserAgent(userAgent);
    }

    if (authenticate) {
      await page.authenticate(authenticate);
    }

    if (setExtraHTTPHeaders) {
      await page.setExtraHTTPHeaders(setExtraHTTPHeaders);
    }

    // Test the type rather than truthiness so an explicit `false` (disable
    // JavaScript) is applied instead of being skipped.
    if (typeof setJavaScriptEnabled === 'boolean') {
      await page.setJavaScriptEnabled(setJavaScriptEnabled);
    }

    if (recordNetwork || filterRequests) {
      await page.setRequestInterception(true);

      // A single listener both records and resolves each intercepted request.
      // With separate listeners the same request would be continued by one
      // and then aborted/responded/continued again by the other.
      page.on('request', (pageRequest) => {
        if (recordNetwork) {
          outbound.push({
            headers: pageRequest.headers(),
            method: pageRequest.method(),
            url: pageRequest.url(),
          });
        }
        if (
          rejectRequestPattern.some((pattern) =>
            pageRequest.url().match(pattern),
          ) ||
          rejectResourceTypes.includes(pageRequest.resourceType())
        ) {
          logger.debug(
            `Aborting request ${pageRequest.method()}: ${pageRequest.url()}`,
          );
          return pageRequest.abort();
        }
        const interceptor = requestInterceptors.find((r) =>
          pageRequest.url().match(r.pattern),
        );
        if (interceptor) {
          return pageRequest.respond(interceptor.response);
        }
        return pageRequest.continue();
      });
    }

    const gotoResponse = await gotoCall(content, gotoOptions).catch(
      bestAttemptCatch(bestAttempt),
    );

    for (const styleTag of addStyleTag) {
      await page.addStyleTag(styleTag);
    }

    for (const scriptTag of addScriptTag) {
      await page.addScriptTag(scriptTag);
    }

    if (waitForTimeout) {
      await sleep(waitForTimeout).catch(bestAttemptCatch(bestAttempt));
    }

    if (waitForFunction) {
      await waitForFn(page, waitForFunction).catch(
        bestAttemptCatch(bestAttempt),
      );
    }

    if (waitForSelector) {
      const { selector, hidden, timeout, visible } = waitForSelector;
      await page
        .waitForSelector(selector, { hidden, timeout, visible })
        .catch(bestAttemptCatch(bestAttempt));
    }

    if (waitForEvent) {
      await waitForEvt(page, waitForEvent).catch(bestAttemptCatch(bestAttempt));
    }

    // Surface details of the navigation response as X-Response-* headers;
    // entries stay undefined (and are skipped) when there is no response.
    const headers = {
      'X-Response-Code': gotoResponse?.status(),
      'X-Response-IP': gotoResponse?.remoteAddress().ip,
      'X-Response-Port': gotoResponse?.remoteAddress().port,
      'X-Response-Status': gotoResponse?.statusText(),
      'X-Response-URL': gotoResponse?.url().substring(0, 1000),
    };

    for (const [key, value] of Object.entries(headers)) {
      if (value !== undefined) {
        res.setHeader(key, value);
      }
    }

    const data = await page.evaluate(scrape, elements).catch((e) => {
      // Narrow before reading `.message` so a non-Error rejection is
      // rethrown as-is instead of causing a TypeError here.
      if (e instanceof Error && e.message.includes('Timed out')) {
        throw new Timeout(e);
      }
      throw e;
    });

    const [debugHTML, screenshot, pageCookies] = await Promise.all([
      debugOpts?.html ? (page.content() as Promise<string>) : null,
      debugOpts?.screenshot
        ? (page.screenshot(debugScreenshotOpts) as unknown as Promise<string>)
        : null,
      debugOpts?.cookies ? page.cookies() : null,
    ]);

    const debugData = debugOpts
      ? {
          console: messages,
          cookies: pageCookies,
          html: debugHTML,
          network: {
            inbound,
            outbound,
          },
          screenshot,
        }
      : null;

    const response: ResponseSchema = {
      data,
      debug: debugData,
    };

    // Fire-and-forget: the response does not wait for the page to close.
    page.close().catch(noop);

    logger.info('Scrape API request completed');

    return jsonResponse(res, 200, response, false);
  };
}

说明

目前 browserless 包装的 REST API 内部大多直接使用了 puppeteer-core 的 API。以上只是一个简单说明,详细内容可以阅读源码;其中的部分处理方式,对于我们自己使用 puppeteer-core 也很值得学习

参考资料

src/shared/scrape.http.ts
https://docs.browserless.io/HTTP-APIs/apis
https://docs.browserless.io/open-api/

posted on 2024-06-29 08:00  荣锋亮  阅读(7)  评论(0)  编辑  收藏  举报

导航