Merge pull request #894 from getmaxun/reuse-page

feat: reuse existing page instance
2025-11-30 17:45:36 +05:30
parent ad8df66ecd fa961c5f03
commit cbb7e3b8e5
4 changed files with 116 additions and 70 deletions
--- a/server/src/api/record.ts
+++ b/server/src/api/record.ts
@@ -662,6 +662,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
            };
        }

+        browser = browserPool.getRemoteBrowser(plainRun.browserId);
+        if (!browser) {
+            throw new Error('Could not access browser');
+        }
+
+        let currentPage = await browser.getCurrentPage();
+        if (!currentPage) {
+            throw new Error('Could not create a new page');
+        }
+
        if (recording.recording_meta.type === 'scrape') {
            logger.log('info', `Executing scrape robot for API run ${id}`);

@@ -690,13 +700,13 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[

                // Markdown conversion
                if (formats.includes('markdown')) {
-                    markdown = await convertPageToMarkdown(url);
+                    markdown = await convertPageToMarkdown(url, currentPage);
                    serializableOutput.markdown = [{ content: markdown }];
                }

                // HTML conversion
                if (formats.includes('html')) {
-                    html = await convertPageToHTML(url);
+                    html = await convertPageToHTML(url, currentPage);
                    serializableOutput.html = [{ content: html }];
                }

@@ -824,16 +834,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[

        plainRun.status = 'running';

-        browser = browserPool.getRemoteBrowser(plainRun.browserId);
-        if (!browser) {
-            throw new Error('Could not access browser');
-        }
-
-        let currentPage = await browser.getCurrentPage();
-        if (!currentPage) {
-            throw new Error('Could not create a new page');
-        }
-
        const workflow = AddGeneratedFlags(recording.recording);

        browser.interpreter.setRunId(plainRun.runId);
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -1,17 +1,46 @@
-import { chromium } from "playwright";
+import { chromium, Page } from "playwright";
 import { parseMarkdown } from "./markdown";
+import logger from "../logger";
+
+// Navigates `page` to `url`, first waiting for "networkidle"; if that attempt
+// throws, logs the reason and retries once waiting only for "domcontentloaded".
+// Resolves to whatever page.goto resolves to; a failure of the retry propagates.
+async function gotoWithFallback(page: Page, url: string) {
+  const timeout = 100000;
+  try {
+    return await page.goto(url, { waitUntil: "networkidle", timeout });
+  } catch (err) {
+    // fallback: JS-heavy or unstable sites
+    const reason = err instanceof Error ? err.message : String(err);
+    logger.log('warn', `[Scrape] networkidle navigation to ${url} failed (${reason}); retrying with domcontentloaded`);
+    return await page.goto(url, { waitUntil: "domcontentloaded", timeout });
+  }
+}

 /**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean Markdown using parser.
+ * @param url - The URL to convert
+ * @param existingPage - Optional existing Playwright page instance to reuse
 */
-export async function convertPageToMarkdown(url: string): Promise<string> {
-  const browser = await chromium.launch();
-  const page = await browser.newPage();
+export async function convertPageToMarkdown(url: string, existingPage?: Page): Promise<string> {
+  let browser: any = null;
+  let page: Page;
+  let shouldCloseBrowser = false;

-  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
+  if (existingPage) {
+    logger.log('info', `[Scrape] Reusing existing Playwright page instance for markdown conversion of ${url}`);
+    page = existingPage;
+  } else {
+    logger.log('info', `[Scrape] Creating new Chromium browser instance for markdown conversion of ${url}`);
+    browser = await chromium.launch();
+    page = await browser.newPage();
+    shouldCloseBrowser = true;
+  }

-  await page.addInitScript(() => {
+  await gotoWithFallback(page, url);
+
+  const cleanedHtml = await page.evaluate(() => {
    const selectors = [
      "script",
      "style",
@@ -42,14 +71,16 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
        }
      });
    });
-  });

-  // Re-extract HTML after cleanup
-  const cleanedHtml = await page.evaluate(() => {
    return document.documentElement.outerHTML;
  });

-  await browser.close();
+  if (shouldCloseBrowser && browser) {
+    logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
+    await browser.close();
+  } else {
+    logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
+  }

  // Convert cleaned HTML → Markdown
  const markdown = await parseMarkdown(cleanedHtml, url);
@@ -59,14 +90,27 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
 /**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean HTML.
+ * @param url - The URL to convert
+ * @param existingPage - Optional existing Playwright page instance to reuse
 */
-export async function convertPageToHTML(url: string): Promise<string> {
-  const browser = await chromium.launch();
-  const page = await browser.newPage();
+export async function convertPageToHTML(url: string, existingPage?: Page): Promise<string> {
+  let browser: any = null;
+  let page: Page;
+  let shouldCloseBrowser = false;

-  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
+  if (existingPage) {
+    logger.log('info', `[Scrape] Reusing existing Playwright page instance for HTML conversion of ${url}`);
+    page = existingPage;
+  } else {
+    logger.log('info', `[Scrape] Creating new Chromium browser instance for HTML conversion of ${url}`);
+    browser = await chromium.launch();
+    page = await browser.newPage();
+    shouldCloseBrowser = true;
+  }

-  await page.addInitScript(() => {
+  await gotoWithFallback(page, url);
+
+  const cleanedHtml = await page.evaluate(() => {
    const selectors = [
      "script",
      "style",
@@ -97,14 +141,16 @@ export async function convertPageToHTML(url: string): Promise<string> {
        }
      });
    });
-  });

-  // Re-extract HTML after cleanup
-  const cleanedHtml = await page.evaluate(() => {
    return document.documentElement.outerHTML;
  });

-  await browser.close();
+  if (shouldCloseBrowser && browser) {
+    logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
+    await browser.close();
+  } else {
+    logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
+  }

  // Return cleaned HTML directly
  return cleanedHtml;
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -181,7 +181,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {

    logger.log('info', `Browser ${browserId} found and ready for execution`);

-    try {  
+    try {
      // Find the recording
      const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });

@@ -189,6 +189,30 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
        throw new Error(`Recording for run ${data.runId} not found`);
      }

+      let currentPage = browser.getCurrentPage();
+
+      const pageWaitStart = Date.now();
+      let lastPageLogTime = 0;
+      let pageAttempts = 0;
+      const MAX_PAGE_ATTEMPTS = 15;
+
+      while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
+        const currentTime = Date.now();
+        pageAttempts++;
+
+        if (currentTime - lastPageLogTime > 5000) {
+          logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
+          lastPageLogTime = currentTime;
+        }
+
+        await new Promise(resolve => setTimeout(resolve, 1000));
+        currentPage = browser.getCurrentPage();
+      }
+
+      if (!currentPage) {
+        throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
+      }
+
      if (recording.recording_meta.type === 'scrape') {
        logger.log('info', `Executing scrape robot for run ${data.runId}`);

@@ -212,13 +236,13 @@ async function processRunExecution(job: Job<ExecuteRunData>) {

          // Markdown conversion
          if (formats.includes('markdown')) {
-            markdown = await convertPageToMarkdown(url);
+            markdown = await convertPageToMarkdown(url, currentPage);
            serializableOutput.markdown = [{ content: markdown }];
          }

          // HTML conversion
          if (formats.includes('html')) {
-            html = await convertPageToHTML(url);
+            html = await convertPageToHTML(url, currentPage);
            serializableOutput.html = [{ content: html }];
          }

@@ -328,30 +352,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
        }
      };

-      let currentPage = browser.getCurrentPage();
-      
-      const pageWaitStart = Date.now();
-      let lastPageLogTime = 0;
-      let pageAttempts = 0;
-      const MAX_PAGE_ATTEMPTS = 15;
-      
-      while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
-        const currentTime = Date.now();
-        pageAttempts++;
-        
-        if (currentTime - lastPageLogTime > 5000) {
-          logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
-          lastPageLogTime = currentTime;
-        }
-        
-        await new Promise(resolve => setTimeout(resolve, 1000));
-        currentPage = browser.getCurrentPage();
-      }
-
-      if (!currentPage) {
-        throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
-      }
-
      logger.log('info', `Starting workflow execution for run ${data.runId}`);

      await run.update({ 
--- a/server/src/workflow-management/scheduler/index.ts
+++ b/server/src/workflow-management/scheduler/index.ts
@@ -208,6 +208,16 @@ async function executeRun(id: string, userId: string) {
      }
    }

+    browser = browserPool.getRemoteBrowser(plainRun.browserId);
+    if (!browser) {
+      throw new Error('Could not access browser');
+    }
+
+    let currentPage = await browser.getCurrentPage();
+    if (!currentPage) {
+      throw new Error('Could not create a new page');
+    }
+
    if (recording.recording_meta.type === 'scrape') {
      logger.log('info', `Executing scrape robot for scheduled run ${id}`);

@@ -252,13 +262,13 @@ async function executeRun(id: string, userId: string) {

        // Markdown conversion
        if (formats.includes('markdown')) {
-          markdown = await convertPageToMarkdown(url);
+          markdown = await convertPageToMarkdown(url, currentPage);
          serializableOutput.markdown = [{ content: markdown }];
        }

        // HTML conversion
        if (formats.includes('html')) {
-          html = await convertPageToHTML(url);
+          html = await convertPageToHTML(url, currentPage);
          serializableOutput.html = [{ content: html }];
        }

@@ -391,16 +401,6 @@ async function executeRun(id: string, userId: string) {
      logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`);
    }

-    browser = browserPool.getRemoteBrowser(plainRun.browserId);
-    if (!browser) {
-      throw new Error('Could not access browser');
-    }
-
-    let currentPage = await browser.getCurrentPage();
-    if (!currentPage) {
-      throw new Error('Could not create a new page');
-    }
-
    const workflow = AddGeneratedFlags(recording.recording);
    
    // Set run ID for real-time data persistence