From fa961c5f034aef3b4e46db2bc362aca857d0ca07 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Fri, 21 Nov 2025 13:21:18 +0530 Subject: [PATCH] feat: reuse existing page instance --- server/src/api/record.ts | 24 +++--- server/src/markdownify/scrape.ts | 84 ++++++++++++++----- server/src/pgboss-worker.ts | 54 ++++++------ .../workflow-management/scheduler/index.ts | 24 +++--- 4 files changed, 116 insertions(+), 70 deletions(-) diff --git a/server/src/api/record.ts b/server/src/api/record.ts index 7c665001..3d95e1c7 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -662,6 +662,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ }; } + browser = browserPool.getRemoteBrowser(plainRun.browserId); + if (!browser) { + throw new Error('Could not access browser'); + } + + let currentPage = await browser.getCurrentPage(); + if (!currentPage) { + throw new Error('Could not create a new page'); + } + if (recording.recording_meta.type === 'scrape') { logger.log('info', `Executing scrape robot for API run ${id}`); @@ -690,13 +700,13 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ // Markdown conversion if (formats.includes('markdown')) { - markdown = await convertPageToMarkdown(url); + markdown = await convertPageToMarkdown(url, currentPage); serializableOutput.markdown = [{ content: markdown }]; } // HTML conversion if (formats.includes('html')) { - html = await convertPageToHTML(url); + html = await convertPageToHTML(url, currentPage); serializableOutput.html = [{ content: html }]; } @@ -824,16 +834,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ plainRun.status = 'running'; - browser = browserPool.getRemoteBrowser(plainRun.browserId); - if (!browser) { - throw new Error('Could not access browser'); - } - - let currentPage = await browser.getCurrentPage(); - if (!currentPage) { - throw new Error('Could not create a new page'); - } - const workflow = AddGeneratedFlags(recording.recording); browser.interpreter.setRunId(plainRun.runId); diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts index 6821bfdb..f78bba80 100644 --- a/server/src/markdownify/scrape.ts +++ b/server/src/markdownify/scrape.ts @@ -1,17 +1,46 @@ -import { chromium } from "playwright"; +import { chromium, Page } from "playwright"; import { parseMarkdown } from "./markdown"; +import logger from "../logger"; + +async function gotoWithFallback(page: any, url: string) { + try { + return await page.goto(url, { + waitUntil: "networkidle", + timeout: 100000, + }); + } catch (err) { + // fallback: JS-heavy or unstable sites + return await page.goto(url, { + waitUntil: "domcontentloaded", + timeout: 100000, + }); + } +} /** * Fetches a webpage, strips scripts/styles/images/etc, * returns clean Markdown using parser. + * @param url - The URL to convert + * @param existingPage - Optional existing Playwright page instance to reuse */ -export async function convertPageToMarkdown(url: string): Promise { - const browser = await chromium.launch(); - const page = await browser.newPage(); +export async function convertPageToMarkdown(url: string, existingPage?: Page): Promise { + let browser: any = null; + let page: Page; + let shouldCloseBrowser = false; - await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); + if (existingPage) { + logger.log('info', `[Scrape] Reusing existing Playwright page instance for markdown conversion of ${url}`); + page = existingPage; + } else { + logger.log('info', `[Scrape] Creating new Chromium browser instance for markdown conversion of ${url}`); + browser = await chromium.launch(); + page = await browser.newPage(); + shouldCloseBrowser = true; + } - await page.addInitScript(() => { + await gotoWithFallback(page, url); + + const cleanedHtml = await page.evaluate(() => { const selectors = [ "script", "style", @@ -42,14 +71,16 @@ export async function convertPageToMarkdown(url: string): Promise { } }); }); - }); - // Re-extract HTML after cleanup - const cleanedHtml = await page.evaluate(() => { return document.documentElement.outerHTML; }); - await browser.close(); + if (shouldCloseBrowser && browser) { + logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`); + await browser.close(); + } else { + logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`); + } // Convert cleaned HTML → Markdown const markdown = await parseMarkdown(cleanedHtml, url); @@ -59,14 +90,27 @@ export async function convertPageToMarkdown(url: string): Promise { /** * Fetches a webpage, strips scripts/styles/images/etc, * returns clean HTML. + * @param url - The URL to convert + * @param existingPage - Optional existing Playwright page instance to reuse */ -export async function convertPageToHTML(url: string): Promise { - const browser = await chromium.launch(); - const page = await browser.newPage(); +export async function convertPageToHTML(url: string, existingPage?: Page): Promise { + let browser: any = null; + let page: Page; + let shouldCloseBrowser = false; - await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); + if (existingPage) { + logger.log('info', `[Scrape] Reusing existing Playwright page instance for HTML conversion of ${url}`); + page = existingPage; + } else { + logger.log('info', `[Scrape] Creating new Chromium browser instance for HTML conversion of ${url}`); + browser = await chromium.launch(); + page = await browser.newPage(); + shouldCloseBrowser = true; + } - await page.addInitScript(() => { + await gotoWithFallback(page, url); + + const cleanedHtml = await page.evaluate(() => { const selectors = [ "script", "style", @@ -97,14 +141,16 @@ export async function convertPageToHTML(url: string): Promise { } }); }); - }); - // Re-extract HTML after cleanup - const cleanedHtml = await page.evaluate(() => { return document.documentElement.outerHTML; }); - await browser.close(); + if (shouldCloseBrowser && browser) { + logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`); + await browser.close(); + } else { + logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`); + } // Return cleaned HTML directly return cleanedHtml; diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index f5d719b4..415eea91 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -181,7 +181,7 @@ async function processRunExecution(job: Job) { logger.log('info', `Browser ${browserId} found and ready for execution`); - try { + try { // Find the recording const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); @@ -189,6 +189,30 @@ async function processRunExecution(job: Job) { throw new Error(`Recording for run ${data.runId} not found`); } + let currentPage = browser.getCurrentPage(); + + const pageWaitStart = Date.now(); + let lastPageLogTime = 0; + let pageAttempts = 0; + const MAX_PAGE_ATTEMPTS = 15; + + while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) { + const currentTime = Date.now(); + pageAttempts++; + + if (currentTime - lastPageLogTime > 5000) { + logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`); + lastPageLogTime = currentTime; + } + + await new Promise(resolve => setTimeout(resolve, 1000)); + currentPage = browser.getCurrentPage(); + } + + if (!currentPage) { + throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`); + } + if (recording.recording_meta.type === 'scrape') { logger.log('info', `Executing scrape robot for run ${data.runId}`); @@ -212,13 +236,13 @@ async function processRunExecution(job: Job) { // Markdown conversion if (formats.includes('markdown')) { - markdown = await convertPageToMarkdown(url); + markdown = await convertPageToMarkdown(url, currentPage); serializableOutput.markdown = [{ content: markdown }]; } // HTML conversion if (formats.includes('html')) { - html = await convertPageToHTML(url); + html = await convertPageToHTML(url, currentPage); serializableOutput.html = [{ content: html }]; } @@ -328,30 +352,6 @@ async function processRunExecution(job: Job) { } }; - let currentPage = browser.getCurrentPage(); - - const pageWaitStart = Date.now(); - let lastPageLogTime = 0; - let pageAttempts = 0; - const MAX_PAGE_ATTEMPTS = 15; - - while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) { - const currentTime = Date.now(); - pageAttempts++; - - if (currentTime - lastPageLogTime > 5000) { - logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`); - lastPageLogTime = currentTime; - } - - await new Promise(resolve => setTimeout(resolve, 1000)); - currentPage = browser.getCurrentPage(); - } - - if (!currentPage) { - throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`); - } - logger.log('info', `Starting workflow execution for run ${data.runId}`); await run.update({ diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 470cdacb..95d366ae 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -208,6 +208,16 @@ async function executeRun(id: string, userId: string) { } } + browser = browserPool.getRemoteBrowser(plainRun.browserId); + if (!browser) { + throw new Error('Could not access browser'); + } + + let currentPage = await browser.getCurrentPage(); + if (!currentPage) { + throw new Error('Could not create a new page'); + } + if (recording.recording_meta.type === 'scrape') { logger.log('info', `Executing scrape robot for scheduled run ${id}`); @@ -252,13 +262,13 @@ async function executeRun(id: string, userId: string) { // Markdown conversion if (formats.includes('markdown')) { - markdown = await convertPageToMarkdown(url); + markdown = await convertPageToMarkdown(url, currentPage); serializableOutput.markdown = [{ content: markdown }]; } // HTML conversion if (formats.includes('html')) { - html = await convertPageToHTML(url); + html = await convertPageToHTML(url, currentPage); serializableOutput.html = [{ content: html }]; } @@ -391,16 +401,6 @@ async function executeRun(id: string, userId: string) { logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`); } - browser = browserPool.getRemoteBrowser(plainRun.browserId); - if (!browser) { - throw new Error('Could not access browser'); - } - - let currentPage = await browser.getCurrentPage(); - if (!currentPage) { - throw new Error('Could not create a new page'); - } - const workflow = AddGeneratedFlags(recording.recording); // Set run ID for real-time data persistence