feat: reuse existing page instance

This commit is contained in:
Rohit Rajan
2025-11-21 13:21:18 +05:30
parent e6451d0972
commit fa961c5f03
4 changed files with 116 additions and 70 deletions

View File

@@ -662,6 +662,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
}; };
} }
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
if (recording.recording_meta.type === 'scrape') { if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for API run ${id}`); logger.log('info', `Executing scrape robot for API run ${id}`);
@@ -690,13 +700,13 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
// Markdown conversion // Markdown conversion
if (formats.includes('markdown')) { if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url); markdown = await convertPageToMarkdown(url, currentPage);
serializableOutput.markdown = [{ content: markdown }]; serializableOutput.markdown = [{ content: markdown }];
} }
// HTML conversion // HTML conversion
if (formats.includes('html')) { if (formats.includes('html')) {
html = await convertPageToHTML(url); html = await convertPageToHTML(url, currentPage);
serializableOutput.html = [{ content: html }]; serializableOutput.html = [{ content: html }];
} }
@@ -824,16 +834,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
plainRun.status = 'running'; plainRun.status = 'running';
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
const workflow = AddGeneratedFlags(recording.recording); const workflow = AddGeneratedFlags(recording.recording);
browser.interpreter.setRunId(plainRun.runId); browser.interpreter.setRunId(plainRun.runId);

View File

@@ -1,17 +1,46 @@
import { chromium } from "playwright"; import { chromium, Page } from "playwright";
import { parseMarkdown } from "./markdown"; import { parseMarkdown } from "./markdown";
import logger from "../logger";
/**
 * Navigates the page to a URL, preferring a fully-settled load
 * ("networkidle") and falling back to "domcontentloaded" when the first
 * attempt fails — JS-heavy or unstable sites may never reach network idle.
 *
 * @param page - A Playwright-like page exposing `goto` (structurally typed
 *               so this helper does not depend on the playwright import).
 * @param url - The URL to navigate to.
 * @returns The navigation response from whichever strategy succeeded.
 * @throws If the fallback "domcontentloaded" navigation also fails.
 */
async function gotoWithFallback(
  page: { goto(url: string, options?: { waitUntil?: string; timeout?: number }): Promise<unknown> },
  url: string,
) {
  try {
    return await page.goto(url, {
      waitUntil: "networkidle",
      timeout: 100000,
    });
  } catch {
    // Fallback for JS-heavy or unstable sites: "domcontentloaded" resolves
    // as soon as the DOM is parsed, without waiting for the network to settle.
    return await page.goto(url, {
      waitUntil: "domcontentloaded",
      timeout: 100000,
    });
  }
}
/** /**
* Fetches a webpage, strips scripts/styles/images/etc, * Fetches a webpage, strips scripts/styles/images/etc,
* returns clean Markdown using parser. * returns clean Markdown using parser.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/ */
export async function convertPageToMarkdown(url: string): Promise<string> { export async function convertPageToMarkdown(url: string, existingPage?: Page): Promise<string> {
const browser = await chromium.launch(); let browser: any = null;
const page = await browser.newPage(); let page: Page;
let shouldCloseBrowser = false;
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); if (existingPage) {
logger.log('info', `[Scrape] Reusing existing Playwright page instance for markdown conversion of ${url}`);
page = existingPage;
} else {
logger.log('info', `[Scrape] Creating new Chromium browser instance for markdown conversion of ${url}`);
browser = await chromium.launch();
page = await browser.newPage();
shouldCloseBrowser = true;
}
await page.addInitScript(() => { await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => {
const selectors = [ const selectors = [
"script", "script",
"style", "style",
@@ -42,14 +71,16 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
} }
}); });
}); });
});
// Re-extract HTML after cleanup
const cleanedHtml = await page.evaluate(() => {
return document.documentElement.outerHTML; return document.documentElement.outerHTML;
}); });
await browser.close(); if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
}
// Convert cleaned HTML → Markdown // Convert cleaned HTML → Markdown
const markdown = await parseMarkdown(cleanedHtml, url); const markdown = await parseMarkdown(cleanedHtml, url);
@@ -59,14 +90,27 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
/** /**
* Fetches a webpage, strips scripts/styles/images/etc, * Fetches a webpage, strips scripts/styles/images/etc,
* returns clean HTML. * returns clean HTML.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/ */
export async function convertPageToHTML(url: string): Promise<string> { export async function convertPageToHTML(url: string, existingPage?: Page): Promise<string> {
const browser = await chromium.launch(); let browser: any = null;
const page = await browser.newPage(); let page: Page;
let shouldCloseBrowser = false;
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); if (existingPage) {
logger.log('info', `[Scrape] Reusing existing Playwright page instance for HTML conversion of ${url}`);
page = existingPage;
} else {
logger.log('info', `[Scrape] Creating new Chromium browser instance for HTML conversion of ${url}`);
browser = await chromium.launch();
page = await browser.newPage();
shouldCloseBrowser = true;
}
await page.addInitScript(() => { await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => {
const selectors = [ const selectors = [
"script", "script",
"style", "style",
@@ -97,14 +141,16 @@ export async function convertPageToHTML(url: string): Promise<string> {
} }
}); });
}); });
});
// Re-extract HTML after cleanup
const cleanedHtml = await page.evaluate(() => {
return document.documentElement.outerHTML; return document.documentElement.outerHTML;
}); });
await browser.close(); if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
}
// Return cleaned HTML directly // Return cleaned HTML directly
return cleanedHtml; return cleanedHtml;

View File

@@ -181,7 +181,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
logger.log('info', `Browser ${browserId} found and ready for execution`); logger.log('info', `Browser ${browserId} found and ready for execution`);
try { try {
// Find the recording // Find the recording
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
@@ -189,6 +189,30 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
throw new Error(`Recording for run ${data.runId} not found`); throw new Error(`Recording for run ${data.runId} not found`);
} }
let currentPage = browser.getCurrentPage();
const pageWaitStart = Date.now();
let lastPageLogTime = 0;
let pageAttempts = 0;
const MAX_PAGE_ATTEMPTS = 15;
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
const currentTime = Date.now();
pageAttempts++;
if (currentTime - lastPageLogTime > 5000) {
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
lastPageLogTime = currentTime;
}
await new Promise(resolve => setTimeout(resolve, 1000));
currentPage = browser.getCurrentPage();
}
if (!currentPage) {
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
}
if (recording.recording_meta.type === 'scrape') { if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for run ${data.runId}`); logger.log('info', `Executing scrape robot for run ${data.runId}`);
@@ -212,13 +236,13 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
// Markdown conversion // Markdown conversion
if (formats.includes('markdown')) { if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url); markdown = await convertPageToMarkdown(url, currentPage);
serializableOutput.markdown = [{ content: markdown }]; serializableOutput.markdown = [{ content: markdown }];
} }
// HTML conversion // HTML conversion
if (formats.includes('html')) { if (formats.includes('html')) {
html = await convertPageToHTML(url); html = await convertPageToHTML(url, currentPage);
serializableOutput.html = [{ content: html }]; serializableOutput.html = [{ content: html }];
} }
@@ -328,30 +352,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
} }
}; };
let currentPage = browser.getCurrentPage();
const pageWaitStart = Date.now();
let lastPageLogTime = 0;
let pageAttempts = 0;
const MAX_PAGE_ATTEMPTS = 15;
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
const currentTime = Date.now();
pageAttempts++;
if (currentTime - lastPageLogTime > 5000) {
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
lastPageLogTime = currentTime;
}
await new Promise(resolve => setTimeout(resolve, 1000));
currentPage = browser.getCurrentPage();
}
if (!currentPage) {
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
}
logger.log('info', `Starting workflow execution for run ${data.runId}`); logger.log('info', `Starting workflow execution for run ${data.runId}`);
await run.update({ await run.update({

View File

@@ -208,6 +208,16 @@ async function executeRun(id: string, userId: string) {
} }
} }
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
if (recording.recording_meta.type === 'scrape') { if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for scheduled run ${id}`); logger.log('info', `Executing scrape robot for scheduled run ${id}`);
@@ -252,13 +262,13 @@ async function executeRun(id: string, userId: string) {
// Markdown conversion // Markdown conversion
if (formats.includes('markdown')) { if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url); markdown = await convertPageToMarkdown(url, currentPage);
serializableOutput.markdown = [{ content: markdown }]; serializableOutput.markdown = [{ content: markdown }];
} }
// HTML conversion // HTML conversion
if (formats.includes('html')) { if (formats.includes('html')) {
html = await convertPageToHTML(url); html = await convertPageToHTML(url, currentPage);
serializableOutput.html = [{ content: html }]; serializableOutput.html = [{ content: html }];
} }
@@ -391,16 +401,6 @@ async function executeRun(id: string, userId: string) {
logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`); logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`);
} }
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
const workflow = AddGeneratedFlags(recording.recording); const workflow = AddGeneratedFlags(recording.recording);
// Set run ID for real-time data persistence // Set run ID for real-time data persistence