feat: reuse existing page instance

2025-11-21 13:21:18 +05:30
parent e6451d0972
commit fa961c5f03
4 changed files with 116 additions and 70 deletions
--- a/server/src/api/record.ts
+++ b/server/src/api/record.ts
@@ -662,6 +662,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
            };
        }
        browser = browserPool.getRemoteBrowser(plainRun.browserId);
        if (!browser) {
            throw new Error('Could not access browser');
        }
        let currentPage = await browser.getCurrentPage();
        if (!currentPage) {
            throw new Error('Could not create a new page');
        }
        if (recording.recording_meta.type === 'scrape') {
            logger.log('info', `Executing scrape robot for API run ${id}`);
@@ -690,13 +700,13 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
                // Markdown conversion
                if (formats.includes('markdown')) {
-                    markdown = await convertPageToMarkdown(url);
+                    markdown = await convertPageToMarkdown(url, currentPage);
                    serializableOutput.markdown = [{ content: markdown }];
                }
                // HTML conversion
                if (formats.includes('html')) {
-                    html = await convertPageToHTML(url);
+                    html = await convertPageToHTML(url, currentPage);
                    serializableOutput.html = [{ content: html }];
                }
@@ -824,16 +834,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
        plainRun.status = 'running';
        browser = browserPool.getRemoteBrowser(plainRun.browserId);
        if (!browser) {
            throw new Error('Could not access browser');
        }
        let currentPage = await browser.getCurrentPage();
        if (!currentPage) {
            throw new Error('Could not create a new page');
        }
        const workflow = AddGeneratedFlags(recording.recording);
        browser.interpreter.setRunId(plainRun.runId);
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -1,17 +1,46 @@
-import { chromium } from "playwright";
+import { chromium, Page } from "playwright";
 import { parseMarkdown } from "./markdown";
 import logger from "../logger";
 /**
  * Navigates `page` to `url`, trying the strictest load condition first.
  *
  * The first attempt waits for "networkidle". If that attempt throws for any
  * reason, a second attempt is made that only waits for "domcontentloaded"
  * (intended for JS-heavy or unstable sites). Both attempts use a 100000 ms
  * timeout. An error from the second attempt is not caught here and
  * propagates to the caller.
  *
  * @param page - Object exposing a `goto(url, options)` method
  * @param url - The URL to navigate to
  * @returns Whatever the successful `page.goto` call resolves to
  */
 async function gotoWithFallback(page: any, url: string) {
   // Single place that issues the navigation; only the wait condition varies.
   const navigate = (waitUntil: string) =>
     page.goto(url, { waitUntil, timeout: 100000 });

   try {
     return await navigate("networkidle");
   } catch {
     // fallback: JS-heavy or unstable sites
     return await navigate("domcontentloaded");
   }
 }
 /**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean Markdown using parser.
 * @param url - The URL to convert
 * @param existingPage - Optional existing Playwright page instance to reuse
 */
-export async function convertPageToMarkdown(url: string): Promise<string> {
+export async function convertPageToMarkdown(url: string, existingPage?: Page): Promise<string> {
-  const browser = await chromium.launch();
+  let browser: any = null;
-  const page = await browser.newPage();
+  let page: Page;
  let shouldCloseBrowser = false;
-  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
+  if (existingPage) {
    logger.log('info', `[Scrape] Reusing existing Playwright page instance for markdown conversion of ${url}`);
    page = existingPage;
  } else {
    logger.log('info', `[Scrape] Creating new Chromium browser instance for markdown conversion of ${url}`);
    browser = await chromium.launch();
    page = await browser.newPage();
    shouldCloseBrowser = true;
  }
-  await page.addInitScript(() => {
+  await gotoWithFallback(page, url);
  const cleanedHtml = await page.evaluate(() => {
    const selectors = [
      "script",
      "style",
@@ -42,14 +71,16 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
        }
      });
    });
  });
  // Re-extract HTML after cleanup
  const cleanedHtml = await page.evaluate(() => {
    return document.documentElement.outerHTML;
  });
-  await browser.close();
+  if (shouldCloseBrowser && browser) {
    logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
    await browser.close();
  } else {
    logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
  }
  // Convert cleaned HTML → Markdown
  const markdown = await parseMarkdown(cleanedHtml, url);
@@ -59,14 +90,27 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
 /**
 * Fetches a webpage, strips scripts/styles/images/etc,
 * returns clean HTML.
 * @param url - The URL to convert
 * @param existingPage - Optional existing Playwright page instance to reuse
 */
-export async function convertPageToHTML(url: string): Promise<string> {
+export async function convertPageToHTML(url: string, existingPage?: Page): Promise<string> {
-  const browser = await chromium.launch();
+  let browser: any = null;
-  const page = await browser.newPage();
+  let page: Page;
  let shouldCloseBrowser = false;
-  await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
+  if (existingPage) {
    logger.log('info', `[Scrape] Reusing existing Playwright page instance for HTML conversion of ${url}`);
    page = existingPage;
  } else {
    logger.log('info', `[Scrape] Creating new Chromium browser instance for HTML conversion of ${url}`);
    browser = await chromium.launch();
    page = await browser.newPage();
    shouldCloseBrowser = true;
  }
-  await page.addInitScript(() => {
+  await gotoWithFallback(page, url);
  const cleanedHtml = await page.evaluate(() => {
    const selectors = [
      "script",
      "style",
@@ -97,14 +141,16 @@ export async function convertPageToHTML(url: string): Promise<string> {
        }
      });
    });
  });
  // Re-extract HTML after cleanup
  const cleanedHtml = await page.evaluate(() => {
    return document.documentElement.outerHTML;
  });
-  await browser.close();
+  if (shouldCloseBrowser && browser) {
    logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
    await browser.close();
  } else {
    logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
  }
  // Return cleaned HTML directly
  return cleanedHtml;
--- a/server/src/pgboss-worker.ts
+++ b/server/src/pgboss-worker.ts
@@ -181,7 +181,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
    logger.log('info', `Browser ${browserId} found and ready for execution`);
-    try {  
+    try {
      // Find the recording
      const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
@@ -189,6 +189,30 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
        throw new Error(`Recording for run ${data.runId} not found`);
      }
      let currentPage = browser.getCurrentPage();
      const pageWaitStart = Date.now();
      let lastPageLogTime = 0;
      let pageAttempts = 0;
      const MAX_PAGE_ATTEMPTS = 15;
      while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
        const currentTime = Date.now();
        pageAttempts++;
        if (currentTime - lastPageLogTime > 5000) {
          logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
          lastPageLogTime = currentTime;
        }
        await new Promise(resolve => setTimeout(resolve, 1000));
        currentPage = browser.getCurrentPage();
      }
      if (!currentPage) {
        throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
      }
      if (recording.recording_meta.type === 'scrape') {
        logger.log('info', `Executing scrape robot for run ${data.runId}`);
@@ -212,13 +236,13 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
          // Markdown conversion
          if (formats.includes('markdown')) {
-            markdown = await convertPageToMarkdown(url);
+            markdown = await convertPageToMarkdown(url, currentPage);
            serializableOutput.markdown = [{ content: markdown }];
          }
          // HTML conversion
          if (formats.includes('html')) {
-            html = await convertPageToHTML(url);
+            html = await convertPageToHTML(url, currentPage);
            serializableOutput.html = [{ content: html }];
          }
@@ -328,30 +352,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
        }
      };
      let currentPage = browser.getCurrentPage();
      const pageWaitStart = Date.now();
      let lastPageLogTime = 0;
      let pageAttempts = 0;
      const MAX_PAGE_ATTEMPTS = 15;
      while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
        const currentTime = Date.now();
        pageAttempts++;
        if (currentTime - lastPageLogTime > 5000) {
          logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
          lastPageLogTime = currentTime;
        }
        await new Promise(resolve => setTimeout(resolve, 1000));
        currentPage = browser.getCurrentPage();
      }
      if (!currentPage) {
        throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
      }
      logger.log('info', `Starting workflow execution for run ${data.runId}`);
      await run.update({ 
--- a/server/src/workflow-management/scheduler/index.ts
+++ b/server/src/workflow-management/scheduler/index.ts
@@ -208,6 +208,16 @@ async function executeRun(id: string, userId: string) {
      }
    }
    browser = browserPool.getRemoteBrowser(plainRun.browserId);
    if (!browser) {
      throw new Error('Could not access browser');
    }
    let currentPage = await browser.getCurrentPage();
    if (!currentPage) {
      throw new Error('Could not create a new page');
    }
    if (recording.recording_meta.type === 'scrape') {
      logger.log('info', `Executing scrape robot for scheduled run ${id}`);
@@ -252,13 +262,13 @@ async function executeRun(id: string, userId: string) {
        // Markdown conversion
        if (formats.includes('markdown')) {
-          markdown = await convertPageToMarkdown(url);
+          markdown = await convertPageToMarkdown(url, currentPage);
          serializableOutput.markdown = [{ content: markdown }];
        }
        // HTML conversion
        if (formats.includes('html')) {
-          html = await convertPageToHTML(url);
+          html = await convertPageToHTML(url, currentPage);
          serializableOutput.html = [{ content: html }];
        }
@@ -391,16 +401,6 @@ async function executeRun(id: string, userId: string) {
      logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`);
    }
    browser = browserPool.getRemoteBrowser(plainRun.browserId);
    if (!browser) {
      throw new Error('Could not access browser');
    }
    let currentPage = await browser.getCurrentPage();
    if (!currentPage) {
      throw new Error('Could not create a new page');
    }
    const workflow = AddGeneratedFlags(recording.recording);
    // Set run ID for real-time data persistence