lightdash 对于headless browser的使用
lightdash 在处理 Slack 的 unfurl(链接展开预览)时使用了 headless browser,以下说明其内部处理
参考图
此图来自官方文档,可以看出 headless browser 主要用于调度(scheduler)以及 Slack 的 unfurl 这两个场景
内部实现
实际上就是 unfurl 的图片(截图)处理部分,里边包含了 cookie 处理(安全)以及对请求进行拦截的代码(核心也是为了安全,不过该部分目前已被注释掉,以便 markdown 中可以使用外部图片),同时还支持基于一些
配置(feature flag、图表类型、gridWidth 等)重新设置页面的 viewport 信息。整体使用上都是标准的 puppeteer 操作,还是值得学习参考的
/**
 * Renders a Lightdash page in a remote headless browser and captures it as a PNG.
 *
 * The method connects to the browser configured in `lightdashConfig.headlessBrowser`,
 * authenticates with the `connect.sid` session cookie, loads `url`, waits for the
 * charts to finish loading and screenshots the relevant element. The image is
 * written to `/tmp/${imageId}.png` and also returned.
 *
 * @param imageId - Used to build the output path `/tmp/${imageId}.png`.
 * @param cookie - Cookie header string; must contain a `connect.sid=<value>` pair.
 * @param url - Page to open; its hostname is also used as the cookie domain.
 * @param lightdashPage - Page kind; selects the chart endpoint to monitor and the element to capture.
 * @param chartType - When equal to `ChartType.BIG_NUMBER`, `bigNumberViewport` is used.
 * @param organizationUuid - Passed to feature-flag lookup and attached to metrics/span attributes.
 * @param userUuid - Passed to feature-flag lookup.
 * @param gridWidth - Optional viewport width override (falls back to `viewport.width`).
 * @param resourceUuid - Attached to the span only when an error is recorded.
 * @param resourceName - Attached to the span only when an error is recorded.
 * @returns The screenshot returned by `element.screenshot`, or `undefined` when the
 * target element could not be resolved.
 * @throws Error when the headless browser host is not configured, when the cookie
 * does not contain `connect.sid`, or when any browser step fails (the error is
 * reported to Sentry and the span, logged, then rethrown).
 */
private async saveScreenshot({
    imageId,
    cookie,
    url,
    lightdashPage,
    chartType,
    organizationUuid,
    userUuid,
    gridWidth = undefined,
    resourceUuid = undefined,
    resourceName = undefined,
}: {
    imageId: string;
    cookie: string;
    url: string;
    lightdashPage: LightdashPage;
    chartType?: string;
    organizationUuid?: string;
    userUuid: string;
    gridWidth?: number | undefined;
    resourceUuid?: string;
    resourceName?: string;
}): Promise<Buffer | undefined> {
    // Fail fast: without a configured host there is no browser to connect to.
    if (this.lightdashConfig.headlessBrowser?.host === undefined) {
        this.logger.error(
            `Can't get screenshot if HEADLESS_BROWSER_HOST env variable is not defined`,
        );
        throw new Error(
            `Can't get screenshot if HEADLESS_BROWSER_HOST env variable is not defined`,
        );
    }
    const startTime = Date.now();
    let hasError = false;

    // Feature flags are resolved once, before the span starts.
    const isPuppeteerSetViewportDynamicallyEnabled =
        await isFeatureFlagEnabled(
            FeatureFlags.PuppeteerSetViewportDynamically,
            { userUuid, organizationUuid },
        );
    const isPuppeteerScrollElementIntoViewEnabled =
        await isFeatureFlagEnabled(
            FeatureFlags.PuppeteerScrollElementIntoView,
            { userUuid, organizationUuid },
        );

    return tracer.startActiveSpan(
        'UnfurlService.saveScreenshot',
        async (span) => {
            let browser;
            try {
                // Connect to an already running browser over WebSocket (port defaults to 3001).
                const browserWSEndpoint = `ws://${
                    this.lightdashConfig.headlessBrowser?.host
                }:${this.lightdashConfig.headlessBrowser?.port || 3001}`;
                browser = await puppeteer.connect({
                    browserWSEndpoint,
                });
                const page = await browser.newPage();

                const parsedUrl = new URL(url);
                const cookieMatch = cookie.match(/connect\.sid=([^;]+)/); // Extract cookie value
                if (!cookieMatch)
                    throw new Error('Invalid cookie provided');
                const cookieValue = cookieMatch[1];
                // Set cookie using `setCookie` instead of `setExtraHTTPHeaders` , otherwise this cookie will be leaked into other domains
                await page.setCookie({
                    name: 'connect.sid',
                    value: cookieValue,
                    domain: parsedUrl.hostname, // Don't use ports here, cookies do not provide isolation by port
                    sameSite: 'Strict',
                });

                // Initial viewport: dedicated preset for big-number charts, otherwise
                // the default viewport with an optional width override.
                if (chartType === ChartType.BIG_NUMBER) {
                    await page.setViewport(bigNumberViewport);
                } else {
                    await page.setViewport({
                        ...viewport,
                        width: gridWidth ?? viewport.width,
                    });
                }

                // Surface failed requests and console errors from the page in our logs.
                page.on('requestfailed', (request) => {
                    this.logger.warn(
                        `Headless browser request error - method: ${request.method()}, url: ${request.url()}, text: ${
                            request.failure()?.errorText
                        }`,
                    );
                });
                page.on('console', (msg) => {
                    const type = msg.type();
                    if (type === 'error') {
                        this.logger.warn(
                            `Headless browser console error - file: ${
                                msg.location().url
                            }, text ${msg.text()} `,
                        );
                    }
                });

                /*
                // This code can be used to block requests to external domains
                // We disabled this so people can use images on markdown
                await page.setRequestInterception(true);
                await page.on('request', (request: HTTPRequest) => {
                    const requestUrl = request.url();
                    const cookie = request.headers()['cookie']
                    const parsedUrl = new URL(url);
                    // Only allow request to the same host
                    if (!requestUrl.includes(parsedUrl.origin)) {
                        request.abort();
                        return;
                    }
                    request.continue();
                });
                */

                // Count chart result responses and how many of them failed
                // (HTTP status >= 400 or an unreadable body).
                let chartRequests = 0;
                let chartRequestErrors = 0;
                page.on('response', (response) => {
                    const responseUrl = response.url();
                    const regexUrlToMatch =
                        lightdashPage === LightdashPage.EXPLORE
                            ? /\/saved\/[a-f0-9-]+\/results/
                            : /\/saved\/[a-f0-9-]+\/chart-and-results/; // NOTE: Chart endpoint in Dashboards is different
                    if (responseUrl.match(regexUrlToMatch)) {
                        chartRequests += 1;
                        response.buffer().then(
                            (buffer) => {
                                const status = response.status();
                                if (status >= 400) {
                                    this.logger.error(
                                        `Headless browser response error - url: ${responseUrl}, code: ${response.status()}, text: ${buffer}`,
                                    );
                                    chartRequestErrors += 1;
                                }
                            },
                            (error) => {
                                this.logger.error(
                                    `Headless browser response buffer error: ${error.message}`,
                                );
                                chartRequestErrors += 1;
                            },
                        );
                    }
                });

                // Navigation is best effort: any failure here is treated as a timeout
                // and we carry on with whatever content has been rendered so far.
                let timeout = false;
                try {
                    await page.goto(url, {
                        timeout: 150000, // Wait 2.5 mins for the page to load
                        waitUntil: 'networkidle0',
                    });
                } catch (e) {
                    timeout = true;
                    this.logger.warn(
                        `Got a timeout when waiting for the page to load, returning current content`,
                    );
                }
                // Wait until the page is fully loaded, i.e. no `.loading_chart` element is visible
                await page
                    .waitForSelector('.loading_chart', {
                        hidden: true,
                        timeout: 30000,
                    })
                    .catch(() => {
                        timeout = true;
                        this.logger.warn(
                            `Got a timeout when waiting for all charts to be loaded, returning current content`,
                        );
                    });

                const path = `/tmp/${imageId}.png`;

                // Element to capture: the visualization on explore pages, the whole body
                // otherwise; dashboards use the grid layout when the dynamic viewport flag is on.
                let selector =
                    lightdashPage === LightdashPage.EXPLORE
                        ? `[data-testid="visualization"]`
                        : 'body';
                if (
                    isPuppeteerSetViewportDynamicallyEnabled &&
                    lightdashPage === LightdashPage.DASHBOARD
                ) {
                    selector = '.react-grid-layout';
                }
                const element = await page.waitForSelector(selector, {
                    timeout: 60000,
                });

                // Resize the viewport to the height of the dashboard grid so that the
                // screenshot covers it; falls back to the default height when the
                // bounding box is unavailable.
                if (
                    isPuppeteerSetViewportDynamicallyEnabled &&
                    lightdashPage === LightdashPage.DASHBOARD
                ) {
                    const fullPage = await page.$('.react-grid-layout');
                    const fullPageSize = await fullPage?.boundingBox();
                    await page.setViewport({
                        width: gridWidth ?? viewport.width,
                        height: fullPageSize?.height
                            ? parseInt(fullPageSize.height.toString(), 10) // integer part only
                            : viewport.height,
                    });
                }

                if (!element) {
                    this.logger.warn(`Can't find element on page`);
                    return undefined;
                }

                const box = await element.boundingBox();
                const pageMetrics = await page.metrics();

                // NOTE(review): a new callback is registered on every call; confirm
                // that `chartCounter` does not accumulate callbacks over time.
                chartCounter.addCallback(async (result) => {
                    result.observe(chartRequests, {
                        errors: chartRequestErrors,
                        timeout,
                        organization_uuid: organizationUuid || 'undefined',
                    });
                });

                span.setAttributes({
                    'page.width': box?.width,
                    'page.height': box?.height,
                    'chart.requests.total': chartRequests,
                    'chart.requests.error': chartRequestErrors,
                    'page.metrics.task_duration': pageMetrics.TaskDuration,
                    'page.metrics.heap_size': pageMetrics.JSHeapUsedSize,
                    'page.metrics.total_size': pageMetrics.JSHeapTotalSize,
                    'page.type': lightdashPage,
                    url,
                    chartType: chartType || 'undefined',
                    organization_uuid: organizationUuid || 'undefined',
                    'page.metrics.event_listeners':
                        pageMetrics.JSEventListeners,
                    timeout,
                });

                // Optional extra delay before the capture, configured in
                // `scheduler.screenshotTimeout` (passed to setTimeout as-is).
                if (this.lightdashConfig.scheduler.screenshotTimeout) {
                    await new Promise((resolve) => {
                        setTimeout(
                            resolve,
                            this.lightdashConfig.scheduler
                                .screenshotTimeout,
                        );
                    });
                }

                // `scrollIntoView` is only passed for dashboards when the
                // PuppeteerScrollElementIntoView flag is enabled; in every other case
                // the screenshot options contain just the output path.
                const imageBuffer = await element.screenshot({
                    path,
                    ...(isPuppeteerScrollElementIntoViewEnabled &&
                    lightdashPage === LightdashPage.DASHBOARD
                        ? {
                              scrollIntoView: true,
                          }
                        : {}),
                });
                return imageBuffer;
            } catch (e) {
                // Report the failure everywhere (Sentry, span, logs) and rethrow to the caller.
                Sentry.captureException(e);
                hasError = true;
                span.recordException(e);
                span.setAttributes({
                    'page.type': lightdashPage,
                    url,
                    chartType: chartType || 'undefined',
                    organization_uuid: organizationUuid || 'undefined',
                    uuid: resourceUuid ?? 'undefined',
                    title: resourceName ?? 'undefined',
                    is_viewport_dynamically_enabled: `${isPuppeteerSetViewportDynamicallyEnabled}`,
                    is_scroll_into_view_enabled: `${isPuppeteerScrollElementIntoViewEnabled}`,
                    custom_width: `${gridWidth}`,
                });
                span.setStatus({
                    code: SpanStatusCode.ERROR,
                });
                this.logger.error(
                    `Unable to fetch screenshots for scheduler with url ${url}, of type: ${lightdashPage}. Message: ${e.message}`,
                );
                throw e;
            } finally {
                // Always release the browser, end the span and record the duration,
                // including on the early `return undefined` path above.
                if (browser) await browser.close();
                span.end();
                const executionTime = Date.now() - startTime;
                this.logger.info(
                    `UnfurlService saveScreenshot took ${executionTime} ms`,
                );
                taskDurationHistogram.record(executionTime, {
                    error: hasError,
                });
            }
        },
    );
}
说明
日常中需要进行快照(截图)的场景还是不少的,基于无头浏览器是一个很不错的选择,成本相对较低,开发也比较快速,browserless 是一个很不错的工具
参考资料
https://docs.lightdash.com/self-host/customize-deployment/enable-headless-browser-for-lightdash/
https://pptr.dev/
https://github.com/browserless/browserless