browserless scrape api 简单说明
之前介绍过 browserless 提供了不少 API 能力,下面简单说明一下 scrape API 的内部处理
参考定义
如下图,browserless 对于不同浏览器进行了不同的处理(内部实现包含了公共的)
内部处理
scrape.http.ts 定义中,browserless 使用了puppeteer-core 包进行处理
- 参考代码
// 包装的方法,方便puppeteer-core 的page.evaluate 使用
const scrape = async (elements: ScrapeElementSelector[]) => {
const wait = (selector: string, timeout = 30000) => {
return new Promise<void>((resolve, reject) => {
const timeoutId = setTimeout(() => {
clearTimeout(timeoutId);
clearInterval(intervalId);
reject(new Error(`Timed out waiting for selector "${selector}"`));
}, timeout);
const intervalId = setInterval(() => {
if (document.querySelector(selector)) {
clearTimeout(timeoutId);
clearInterval(intervalId);
return resolve();
}
}, 100);
});
};
// 先进行wait 处理,
await Promise.all(
elements.map(({ selector, timeout }) => wait(selector, timeout)),
);
// 获取elements selector 的额html 信息
return elements.map(({ selector }) => {
const $els = [
return {
results: $els.map(($el) => {
const rect = $el.getBoundingClientRect();
return {
attributes: [
name: attr.name,
value: attr.value,
})),
height: $el.offsetHeight,
html: $el.innerHTML,
left: rect.left,
text: $el.innerText,
top: rect.top,
width: $el.offsetWidth,
};
}),
selector,
};
});
};
// ChromiumScrapePostRoute api route
export default class ChromiumScrapePostRoute extends BrowserHTTPRoute {
name = BrowserlessRoutes.ChromiumScrapePostRoute;
accepts = [contentTypes.json];
auth = true;
browser = ChromiumCDP;
concurrency = true;
contentTypes = [contentTypes.json];
description = dedent(`
A JSON-based API that returns text, html, and meta-data from a given list of selectors.
Debugging information is available by sending in the appropriate flags in the "debugOpts"
property. Responds with an array of JSON objects.
`);
method = Methods.post;
path = [HTTPRoutes.scrape, HTTPRoutes.chromiumScrape];
tags = [APITags.browserAPI];
handler = async (
req: Request,
res: ServerResponse,
logger: Logger,
browser: BrowserInstance,
) => {
logger.info('Scrape API invoked with body:', req.body);
const contentType =
!req.headers.accept || req.headers.accept?.includes('*')
? contentTypes.html
: req.headers.accept;
if (!req.body) {
throw new BadRequest(`Couldn't parse JSON body`);
}
res.setHeader('Content-Type', contentType);
const {
bestAttempt = false,
url,
gotoOptions,
authenticate,
addScriptTag = [],
addStyleTag = [],
cookies = [],
debugOpts,
elements,
emulateMediaType,
html,
rejectRequestPattern = [],
requestInterceptors = [],
rejectResourceTypes = [],
setExtraHTTPHeaders,
setJavaScriptEnabled,
userAgent,
viewport,
waitForTimeout,
waitForFunction,
waitForSelector,
waitForEvent,
} = req.body as BodySchema;
const content = url || html;
if (!content) {
throw new BadRequest(`One of "url" or "html" properties are required.`);
}
const page = (await browser.newPage()) as UnwrapPromise<
ReturnType<ChromiumCDP['newPage']>
>;
const gotoCall = url ? page.goto.bind(page) : page.setContent.bind(page);
const messages: string[] = [];
const outbound: OutBoundRequest[] = [];
const inbound: InBoundRequest[] = [];
if (debugOpts?.console) {
page.on('console', (msg) => messages.push(msg.text()));
}
if (debugOpts?.network) {
page.setRequestInterception(true);
page.on('request', (req) => {
outbound.push({
headers: req.headers,
method: req.method(),
url: req.url(),
});
req.continue();
});
page.on('response', (res) => {
inbound.push({
headers: res.headers,
status: res.status(),
url: res.url(),
});
});
}
if (emulateMediaType) {
await page.emulateMediaType(emulateMediaType);
}
if (cookies.length) {
await page.setCookie(
}
if (viewport) {
await page.setViewport(viewport);
}
if (userAgent) {
await page.setUserAgent(userAgent);
}
if (authenticate) {
await page.authenticate(authenticate);
}
if (setExtraHTTPHeaders) {
await page.setExtraHTTPHeaders(setExtraHTTPHeaders);
}
if (setJavaScriptEnabled) {
await page.setJavaScriptEnabled(setJavaScriptEnabled);
}
if (
rejectRequestPattern.length ||
requestInterceptors.length ||
rejectResourceTypes.length
) {
await page.setRequestInterception(true);
page.on('request', (req) => {
if (
!!rejectRequestPattern.find((pattern) => req.url().match(pattern)) ||
rejectResourceTypes.includes(req.resourceType())
) {
logger.debug(`Aborting request ${req.method()}: ${req.url()}`);
return req.abort();
}
const interceptor = requestInterceptors.find((r) =>
req.url().match(r.pattern),
);
if (interceptor) {
return req.respond(interceptor.response);
}
return req.continue();
});
}
const gotoResponse = await gotoCall(content, gotoOptions).catch(
bestAttemptCatch(bestAttempt),
);
if (addStyleTag.length) {
for (const tag in addStyleTag) {
await page.addStyleTag(addStyleTag[tag]);
}
}
if (addScriptTag.length) {
for (const tag in addScriptTag) {
await page.addScriptTag(addScriptTag[tag]);
}
}
if (waitForTimeout) {
await sleep(waitForTimeout).catch(bestAttemptCatch(bestAttempt));
}
if (waitForFunction) {
await waitForFn(page, waitForFunction).catch(
bestAttemptCatch(bestAttempt),
);
}
if (waitForSelector) {
const { selector, hidden, timeout, visible } = waitForSelector;
await page
.waitForSelector(selector, { hidden, timeout, visible })
.catch(bestAttemptCatch(bestAttempt));
}
if (waitForEvent) {
await waitForEvt(page, waitForEvent).catch(bestAttemptCatch(bestAttempt));
}
const headers = {
'X-Response-Code': gotoResponse?.status(),
'X-Response-IP': gotoResponse?.remoteAddress().ip,
'X-Response-Port': gotoResponse?.remoteAddress().port,
'X-Response-Status': gotoResponse?.statusText(),
'X-Response-URL': gotoResponse?.url().substring(0, 1000),
};
for (const [key, value] of Object.entries(headers)) {
if (value !== undefined) {
res.setHeader(key, value);
}
}
const data = await page.evaluate(scrape, elements).catch((e) => {
if (e.message.includes('Timed out')) {
throw new Timeout(e);
}
throw e;
});
const [debugHTML, screenshot, pageCookies] = await Promise.all([
debugOpts?.html ? (page.content() as Promise<string>) : null,
debugOpts?.screenshot
? (page.screenshot(debugScreenshotOpts) as unknown as Promise<string>)
: null,
debugOpts?.cookies ? page.cookies() : null,
]);
const debugData = debugOpts
? {
console: messages,
cookies: pageCookies,
html: debugHTML,
network: {
inbound,
outbound,
},
screenshot,
}
: null;
const response: ResponseSchema = {
data,
debug: debugData,
};
page.close().catch(noop);
logger.info('Scrape API request completed');
return jsonResponse(res, 200, response, false);
};
}
说明
目前 browserless 封装的 REST API 部分,内部是直接使用 puppeteer-core 的 API。以上只是一个简单说明,详细内容可以阅读源码;其中部分处理方式对我们自己使用 puppeteer-core 也很值得学习
参考资料
src/shared/scrape.http.ts
https://docs.browserless.io/HTTP-APIs/apis
https://docs.browserless.io/open-api/