browserless ws 服务处理简单说明

browserless ws 的处理实际上是一个 proxy:对启动的实际无头浏览器的 ws 服务进行了代理,同时为了安全,browserless 还进行了 token 的处理
以下对于内部实现进行一个简单说明

参考处理

  • ws route 注册

browserless 模块中的start 方法
wsRoutes 是一个数组,其中的元素是 WebSocketRoute 或者 BrowserWebsocketRoute 子类的实例

const wsRoutes: Array<WebSocketRoute | BrowserWebsocketRoute> = [];
...
httpRoutes.forEach((r) => this.router.registerHTTPRoute(r));
wsRoutes.forEach((r) => this.router.registerWebSocketRoute(r));
...

BrowserWebsocketRoute 子类如下

 

  • ChromiumCDPWebSocketRoute ws route 定义

可以看到需要一个 browser 对象,此对象是属于懒加载的,在需要的时候才会进行创建。router 中会有一个 BrowserManager 对象实现浏览器的管理;对于不同浏览器的实现,路由上直接传递了实际的 browser 实现,比如 ChromiumCDPWebSocketRoute 使用的是 ChromiumCDP

/**
 * WebSocket route that hands an incoming connection over to a ChromiumCDP
 * browser by calling its `proxyWebSocket` method.
 *
 * The route only declares metadata plus a thin handler; the browser instance
 * passed to the handler is supplied by the caller.
 */
export default class ChromiumCDPWebSocketRoute extends BrowserWebsocketRoute {
  name = BrowserlessRoutes.ChromiumCDPWebSocketRoute;
  // NOTE(review): presumably enables token checking for this route — confirm
  // against the router implementation.
  auth = true;
  // Browser class this route is bound to.
  browser = ChromiumCDP;
  // NOTE(review): presumably opts this route into the router's concurrency
  // limiting — confirm against the router implementation.
  concurrency = true;
  description = `Launch and connect to Chromium with a library like puppeteer or others that work over chrome-devtools-protocol.`;
  path = [WebsocketRoutes['/'], WebsocketRoutes.chromium];
  tags = [APITags.browserWS];

  /**
   * Proxies the upgraded socket to the given browser.
   *
   * @param request - The incoming upgrade request.
   * @param socket - The underlying duplex stream of the connection.
   * @param head - First bytes of the upgraded stream.
   * @param _logger - Unused by this route.
   * @param chromium - The ChromiumCDP instance to proxy to.
   * @returns Whatever promise `proxyWebSocket` returns.
   */
  handler = async (
    request: Request,
    socket: Duplex,
    head: Buffer,
    _logger: Logger,
    chromium: ChromiumCDP,
  ): Promise<void> => {
    return chromium.proxyWebSocket(request, socket, head);
  };
}
  • ChromiumCDP 实现的功能

ChromiumCDP 实现了实际通过websocket 访问浏览器的能力,同时也包含了对无头浏览器的启动管理,详细的可以查看ChromiumCDP 类

  • 浏览器的启动
    核心是 router 中对 websocket handler 的一个包装方法
 public registerWebSocketRoute(
    route: WebSocketRoute | BrowserWebsocketRoute,
  ): WebSocketRoute | BrowserWebsocketRoute {
    this.log.trace(`Registering WebSocket "${route.path}"`);
 
    const bound = route.handler.bind(route);
    const wrapped = this.wrapWebSocketHandler(route, bound);
   // 此处是一个并发控制
    route.handler = route.concurrency
      ? this.limiter.limit(
          wrapped,
          this.onQueueFullWebSocket,
          this.onWebsocketTimeout,
          this.getTimeout,
        )
      : wrapped;
    ...

wrapWebSocketHandler 的处理

/**
 * Wraps a route's WebSocket handler with connection checks and, for routes
 * that declare a browser, with browser acquisition and release.
 *
 * @param route - The route the handler belongs to.
 * @param handler - The (already bound) handler to invoke.
 * @returns An async function taking the upgrade request, socket and head
 *   buffer. It resolves without calling `handler` when the socket is no
 *   longer connected, and writes a 500 response when no browser could be
 *   obtained.
 */
protected wrapWebSocketHandler =
    (
      route: WebSocketRoute | BrowserWebsocketRoute,
      handler: WebSocketRoute['handler'] | BrowserWebsocketRoute['handler'],
    ) =>
    async (req: Request, socket: stream.Duplex, head: Buffer) => {
      // The client may already be gone by the time this runs.
      if (!isConnected(socket)) {
        this.log.warn(`WebSocket Request has closed prior to running`);
        return Promise.resolve();
      }

      const logger = new this.logger(route.name, req);

      // Routes that do not declare a browser are invoked directly and never
      // touch the browser manager.
      if (!('browser' in route) || !route.browser) {
        return (handler as WebSocketRoute['handler'])(
          req,
          socket,
          head,
          logger,
        );
      }

      // Ask the browser manager for the browser serving this request.
      const instance = await this.browserManager.getBrowserForRequest(
        req,
        route,
        logger,
      );

      // Obtaining the browser is asynchronous, so re-check the socket and
      // hand the browser back if the client disconnected in the meantime.
      if (!isConnected(socket)) {
        this.log.warn(`WebSocket Request has closed prior to running`);
        this.browserManager.complete(instance);
        return Promise.resolve();
      }

      if (!instance) {
        return writeResponse(socket, 500, `Error loading the browser.`);
      }

      try {
        this.log.trace(`Running found WebSocket handler.`);
        await handler(req, socket, head, logger, instance);
      } finally {
        // Always notify the browser manager, even when the handler throws.
        this.log.trace(`WebSocket Request handler has finished.`);
        this.browserManager.complete(instance);
      }
      return;
    };

getBrowserForRequest 处理
整个代码还是比较长的,核心是基于参数进行浏览器的获取或者创建

/**
 * Resolves the browser that should serve `req`.
 *
 * - Paths containing `/devtools/browser` re-connect to an already tracked
 *   browser whose WebSocket endpoint contains the request path.
 * - Paths containing `/devtools/page` (whose id does not contain
 *   `BLESS_PAGE_IDENTIFIER`) are matched against the page lists reported by
 *   every tracked browser's `/json/list` endpoint.
 * - Anything else launches a new browser from the route's default launch
 *   options merged with the `launch` query parameter.
 *
 * @param req - The incoming request; `req.parsed` is read for the path and
 *   query parameters.
 * @param router - The route being served; supplies the browser class,
 *   default launch options, path and optional `onNewPage` callback.
 * @param logger - Logger handed to a newly created browser.
 * @returns The located or newly launched browser.
 * @throws NotFound when a re-connect or page request matches no browser.
 * @throws BadRequest when the `launch` parameter is not valid JSON.
 */
public getBrowserForRequest = async (
    req: Request,
    router: BrowserHTTPRoute | BrowserWebsocketRoute,
    logger: Logger,
  ): Promise<BrowserInstance> => {
    const { browser: Browser } = router;
    const blockAds = parseBooleanParam(
      req.parsed.searchParams,
      'blockAds',
      false,
    );
    const decodedLaunchOptions = convertIfBase64(
      req.parsed.searchParams.get('launch') || '{}',
    );
    let parsedLaunchOptions: BrowserServerOptions | CDPLaunchOptions;

    // Handle browser re-connects here
    if (req.parsed.pathname.includes('/devtools/browser')) {
      const sessions = Array.from(this.browsers);
      const id = req.parsed.pathname.split('/').pop() as string;
      const found = sessions.find(([b]) =>
        b.wsEndpoint()?.includes(req.parsed.pathname),
      );

      if (found) {
        const [browser, session] = found;
        // One more client is now attached to this existing browser.
        ++session.numbConnected;
        this.log.debug(`Located browser with ID ${id}`);
        return browser;
      }

      throw new NotFound(
        `Couldn't locate browser "${id}" for request "${req.parsed.pathname}"`,
      );
    }

    // Handle page connections here
    if (req.parsed.pathname.includes('/devtools/page')) {
      const id = req.parsed.pathname.split('/').pop() as string;
      // Ids containing BLESS_PAGE_IDENTIFIER skip this lookup and fall
      // through to the launch path below.
      if (!id.includes(BLESS_PAGE_IDENTIFIER)) {
        const browsers = Array.from(this.browsers).map(([browser]) => browser);
        // Query every browser that exposes an endpoint for its page list.
        const allPages = await Promise.all(
          browsers
            .filter((b) => !!b.wsEndpoint())
            .map(async (browser) => {
              const { port } = new URL(
                browser.wsEndpoint() as unknown as string,
              );
              const response = await fetch(
                `http://127.0.0.1:${port}/json/list`,
                {
                  headers: {
                    Host: '127.0.0.1',
                  },
                },
              ).catch(() => ({
                // A failed fetch is treated like a non-OK response.
                json: () => Promise.resolve([]),
                ok: false,
              }));
              if (response.ok) {
                const body = await response.json();
                // @ts-ignore
                return body.map((b) => ({ ...b, browser }));
              }
              // Return an empty list (not null): `flat()` keeps null
              // entries, and the `find` below would then throw a TypeError
              // on `b.id` instead of locating the page or raising NotFound.
              return [];
            }),
        );
        const found = allPages.flat().find((b) => b.id === id);

        if (found) {
          const session = this.browsers.get(found.browser)!;
          ++session.numbConnected;
          return found.browser;
        }

        throw new NotFound(
          `Couldn't locate browser "${id}" for request "${req.parsed.pathname}"`,
        );
      }
    }

    try {
      parsedLaunchOptions = JSON.parse(decodedLaunchOptions);
    } catch (err) {
      throw new BadRequest(
        `Error parsing launch-options: ${err}. Launch options must be a JSON or base64-encoded JSON object`,
      );
    }

    const routerOptions =
      typeof router.defaultLaunchOptions === 'function'
        ? router.defaultLaunchOptions(req)
        : router.defaultLaunchOptions;

    // Options from the request override the route's defaults.
    const launchOptions = {
      ...routerOptions,
      ...parsedLaunchOptions,
    };

    // A data dir given by the caller, either as a CLI arg or as an option.
    const manualUserDataDir =
      launchOptions.args
        ?.find((arg) => arg.includes('--user-data-dir='))
        ?.split('=')[1] || (launchOptions as CDPLaunchOptions).userDataDir;

    // Always specify a user-data-dir since plugins can "inject" their own
    // unless it's playwright which takes care of its own data-dirs
    const userDataDir =
      manualUserDataDir ||
      (!this.playwrightBrowserNames.includes(Browser.name)
        ? await generateDataDir(undefined, this.config)
        : null);

    const proxyServerArg = launchOptions.args?.find((arg) =>
      arg.includes('--proxy-server='),
    );

    /**
     * If it is a playwright request, move the `--proxy-server=` argument
     * into the `proxy` launch option and drop it from the args list.
     */
    if (
      launchOptions.args &&
      proxyServerArg &&
      req.parsed.pathname.startsWith('/playwright')
    ) {
      (launchOptions as BrowserServerOptions).proxy = {
        server: proxyServerArg.split('=')[1],
      };
      const argIndex = launchOptions.args.indexOf(proxyServerArg);
      launchOptions.args.splice(argIndex, 1);
    }

    const browser = new Browser({
      blockAds,
      config: this.config,
      logger,
      userDataDir,
    });

    const session: BrowserlessSession = {
      id: null,
      initialConnectURL:
        path.join(req.parsed.pathname, req.parsed.search) || '',
      // The data dir counts as temporary when the caller did not supply one.
      isTempDataDir: !manualUserDataDir,
      launchOptions,
      numbConnected: 1,
      resolver: noop,
      routePath: router.path,
      startedOn: Date.now(),
      ttl: 0,
      userDataDir,
    };

    // The session is tracked before the browser is launched.
    this.browsers.set(browser, session);

    // Pick the playwright version from the user-agent, if it advertises one.
    const match = (req.headers['user-agent'] || '').match(pwVersionRegex);
    const pwVersion = match ? match[1] : 'default';
    // Launch the browser
    await browser.launch(launchOptions as object, pwVersion);
    // Run the browser hooks (to be covered separately)
    await this.hooks.browser({ browser, meta: req.parsed });

    browser.on('newPage', async (page) => {
      await this.onNewPage(req, page);
      (router.onNewPage || noop)(req.parsed || '', page);
    });

    return browser;
  };

说明

browserless 对于 ws 的处理实际上就是 ws proxy,对于浏览器的管理是基于 BrowserManager 的,同时为了确保稳定,基于 queue 队列实现了限速(并发控制)处理。
以上是一个简单说明,通过此可以简单了解内部处理

参考资料

src/browsers/chrome.cdp.ts
src/browsers/chromium.cdp.ts
src/router.ts
src/browsers/index.ts
src/browserless.ts
https://docs.browserless.io/open-api#tag/Browser-WebSocket-APIs
https://github.com/berstend/puppeteer-extra/tree/master/packages/playwright-extra
https://github.com/berstend/puppeteer-extra/tree/master/packages/puppeteer-extra

posted on 2024-06-30 08:00  荣锋亮  阅读(42)  评论(0编辑  收藏  举报

导航