Merge pull request #894 from getmaxun/reuse-page

feat: reuse existing page instance
This commit is contained in:
Karishma Shukla
2025-11-30 17:45:36 +05:30
committed by GitHub
4 changed files with 116 additions and 70 deletions

View File

@@ -662,6 +662,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
};
}
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for API run ${id}`);
@@ -690,13 +700,13 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
markdown = await convertPageToMarkdown(url, currentPage);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
html = await convertPageToHTML(url, currentPage);
serializableOutput.html = [{ content: html }];
}
@@ -824,16 +834,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
plainRun.status = 'running';
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
const workflow = AddGeneratedFlags(recording.recording);
browser.interpreter.setRunId(plainRun.runId);

View File

@@ -1,17 +1,46 @@
import { chromium } from "playwright";
import { chromium, Page } from "playwright";
import { parseMarkdown } from "./markdown";
import logger from "../logger";
/**
 * Navigates `page` to `url`, preferring a fully settled network.
 *
 * The first attempt waits for "networkidle". If that attempt throws for any
 * reason, navigation is retried once waiting only for "domcontentloaded".
 * An error from the retry is not caught and propagates to the caller.
 *
 * @param page - Object exposing a `goto(url, options)` method (a Playwright page at the call sites in this file)
 * @param url - Address to navigate to
 * @returns Whatever `page.goto` resolves to for the attempt that succeeded
 */
async function gotoWithFallback(page: any, url: string) {
  // Both attempts share the same timeout; only the readiness condition differs.
  const navigate = (waitUntil: string) =>
    page.goto(url, {
      waitUntil,
      timeout: 100000,
    });

  try {
    return await navigate("networkidle");
  } catch (_networkIdleError) {
    // fallback: JS-heavy or unstable sites
    return await navigate("domcontentloaded");
  }
}
/**
* Fetches a webpage, strips scripts/styles/images/etc., and
* returns clean Markdown produced by parseMarkdown.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/
export async function convertPageToMarkdown(url: string): Promise<string> {
const browser = await chromium.launch();
const page = await browser.newPage();
export async function convertPageToMarkdown(url: string, existingPage?: Page): Promise<string> {
let browser: any = null;
let page: Page;
let shouldCloseBrowser = false;
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
if (existingPage) {
logger.log('info', `[Scrape] Reusing existing Playwright page instance for markdown conversion of ${url}`);
page = existingPage;
} else {
logger.log('info', `[Scrape] Creating new Chromium browser instance for markdown conversion of ${url}`);
browser = await chromium.launch();
page = await browser.newPage();
shouldCloseBrowser = true;
}
await page.addInitScript(() => {
await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
@@ -42,14 +71,16 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
}
});
});
});
// Re-extract HTML after cleanup
const cleanedHtml = await page.evaluate(() => {
return document.documentElement.outerHTML;
});
await browser.close();
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
}
// Convert cleaned HTML → Markdown
const markdown = await parseMarkdown(cleanedHtml, url);
@@ -59,14 +90,27 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
/**
* Fetches a webpage, strips scripts/styles/images/etc,
* returns clean HTML.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/
export async function convertPageToHTML(url: string): Promise<string> {
const browser = await chromium.launch();
const page = await browser.newPage();
export async function convertPageToHTML(url: string, existingPage?: Page): Promise<string> {
let browser: any = null;
let page: Page;
let shouldCloseBrowser = false;
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
if (existingPage) {
logger.log('info', `[Scrape] Reusing existing Playwright page instance for HTML conversion of ${url}`);
page = existingPage;
} else {
logger.log('info', `[Scrape] Creating new Chromium browser instance for HTML conversion of ${url}`);
browser = await chromium.launch();
page = await browser.newPage();
shouldCloseBrowser = true;
}
await page.addInitScript(() => {
await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
@@ -97,14 +141,16 @@ export async function convertPageToHTML(url: string): Promise<string> {
}
});
});
});
// Re-extract HTML after cleanup
const cleanedHtml = await page.evaluate(() => {
return document.documentElement.outerHTML;
});
await browser.close();
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
}
// Return cleaned HTML directly
return cleanedHtml;

View File

@@ -181,7 +181,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
logger.log('info', `Browser ${browserId} found and ready for execution`);
try {
try {
// Find the recording
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
@@ -189,6 +189,30 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
throw new Error(`Recording for run ${data.runId} not found`);
}
let currentPage = browser.getCurrentPage();
const pageWaitStart = Date.now();
let lastPageLogTime = 0;
let pageAttempts = 0;
const MAX_PAGE_ATTEMPTS = 15;
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
const currentTime = Date.now();
pageAttempts++;
if (currentTime - lastPageLogTime > 5000) {
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
lastPageLogTime = currentTime;
}
await new Promise(resolve => setTimeout(resolve, 1000));
currentPage = browser.getCurrentPage();
}
if (!currentPage) {
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for run ${data.runId}`);
@@ -212,13 +236,13 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
markdown = await convertPageToMarkdown(url, currentPage);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
html = await convertPageToHTML(url, currentPage);
serializableOutput.html = [{ content: html }];
}
@@ -328,30 +352,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
}
};
let currentPage = browser.getCurrentPage();
const pageWaitStart = Date.now();
let lastPageLogTime = 0;
let pageAttempts = 0;
const MAX_PAGE_ATTEMPTS = 15;
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
const currentTime = Date.now();
pageAttempts++;
if (currentTime - lastPageLogTime > 5000) {
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
lastPageLogTime = currentTime;
}
await new Promise(resolve => setTimeout(resolve, 1000));
currentPage = browser.getCurrentPage();
}
if (!currentPage) {
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
}
logger.log('info', `Starting workflow execution for run ${data.runId}`);
await run.update({

View File

@@ -208,6 +208,16 @@ async function executeRun(id: string, userId: string) {
}
}
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
@@ -252,13 +262,13 @@ async function executeRun(id: string, userId: string) {
// Markdown conversion
if (formats.includes('markdown')) {
markdown = await convertPageToMarkdown(url);
markdown = await convertPageToMarkdown(url, currentPage);
serializableOutput.markdown = [{ content: markdown }];
}
// HTML conversion
if (formats.includes('html')) {
html = await convertPageToHTML(url);
html = await convertPageToHTML(url, currentPage);
serializableOutput.html = [{ content: html }];
}
@@ -391,16 +401,6 @@ async function executeRun(id: string, userId: string) {
logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`);
}
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
const workflow = AddGeneratedFlags(recording.recording);
// Set run ID for real-time data persistence