feat: reuse existing page instance
This commit is contained in:
@@ -662,6 +662,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
browser = browserPool.getRemoteBrowser(plainRun.browserId);
|
||||||
|
if (!browser) {
|
||||||
|
throw new Error('Could not access browser');
|
||||||
|
}
|
||||||
|
|
||||||
|
let currentPage = await browser.getCurrentPage();
|
||||||
|
if (!currentPage) {
|
||||||
|
throw new Error('Could not create a new page');
|
||||||
|
}
|
||||||
|
|
||||||
if (recording.recording_meta.type === 'scrape') {
|
if (recording.recording_meta.type === 'scrape') {
|
||||||
logger.log('info', `Executing scrape robot for API run ${id}`);
|
logger.log('info', `Executing scrape robot for API run ${id}`);
|
||||||
|
|
||||||
@@ -690,13 +700,13 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
|||||||
|
|
||||||
// Markdown conversion
|
// Markdown conversion
|
||||||
if (formats.includes('markdown')) {
|
if (formats.includes('markdown')) {
|
||||||
markdown = await convertPageToMarkdown(url);
|
markdown = await convertPageToMarkdown(url, currentPage);
|
||||||
serializableOutput.markdown = [{ content: markdown }];
|
serializableOutput.markdown = [{ content: markdown }];
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTML conversion
|
// HTML conversion
|
||||||
if (formats.includes('html')) {
|
if (formats.includes('html')) {
|
||||||
html = await convertPageToHTML(url);
|
html = await convertPageToHTML(url, currentPage);
|
||||||
serializableOutput.html = [{ content: html }];
|
serializableOutput.html = [{ content: html }];
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -824,16 +834,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
|
|||||||
|
|
||||||
plainRun.status = 'running';
|
plainRun.status = 'running';
|
||||||
|
|
||||||
browser = browserPool.getRemoteBrowser(plainRun.browserId);
|
|
||||||
if (!browser) {
|
|
||||||
throw new Error('Could not access browser');
|
|
||||||
}
|
|
||||||
|
|
||||||
let currentPage = await browser.getCurrentPage();
|
|
||||||
if (!currentPage) {
|
|
||||||
throw new Error('Could not create a new page');
|
|
||||||
}
|
|
||||||
|
|
||||||
const workflow = AddGeneratedFlags(recording.recording);
|
const workflow = AddGeneratedFlags(recording.recording);
|
||||||
|
|
||||||
browser.interpreter.setRunId(plainRun.runId);
|
browser.interpreter.setRunId(plainRun.runId);
|
||||||
|
|||||||
@@ -1,17 +1,46 @@
|
|||||||
import { chromium } from "playwright";
|
import { chromium, Page } from "playwright";
|
||||||
import { parseMarkdown } from "./markdown";
|
import { parseMarkdown } from "./markdown";
|
||||||
|
import logger from "../logger";
|
||||||
|
|
||||||
|
/**
 * Navigates `page` to `url`, preferring a fully settled page and degrading
 * gracefully when that cannot be reached.
 *
 * The first attempt waits for the "networkidle" load state. If that attempt
 * rejects for any reason, a second navigation is issued that only waits for
 * "domcontentloaded". Both attempts use the same timeout value (100000,
 * passed straight through to `page.goto`).
 *
 * The parameter is typed structurally so that any object exposing a
 * compatible `goto` method (such as a Playwright page) is accepted without
 * falling back to `any`.
 *
 * @param page - Object whose `goto` method performs the navigation.
 * @param url - The address to navigate to.
 * @returns Whatever the successful `page.goto` call resolves with.
 * @throws The rejection of the fallback attempt when both attempts fail; the
 *   error from the first attempt is intentionally discarded.
 */
async function gotoWithFallback<T = unknown>(
  page: {
    goto(
      url: string,
      options: { waitUntil: "networkidle" | "domcontentloaded"; timeout: number },
    ): Promise<T>;
  },
  url: string,
): Promise<T> {
  // Shared by both attempts so the two navigations cannot drift apart.
  const timeout = 100000;

  try {
    return await page.goto(url, { waitUntil: "networkidle", timeout });
  } catch {
    // fallback: JS-heavy or unstable sites
    return await page.goto(url, { waitUntil: "domcontentloaded", timeout });
  }
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Fetches a webpage, strips scripts/styles/images/etc,
|
* Fetches a webpage, strips scripts/styles/images/etc,
|
||||||
* returns clean Markdown using parser.
|
* returns clean Markdown using parser.
|
||||||
|
* @param url - The URL to convert
|
||||||
|
* @param existingPage - Optional existing Playwright page instance to reuse
|
||||||
*/
|
*/
|
||||||
export async function convertPageToMarkdown(url: string): Promise<string> {
|
export async function convertPageToMarkdown(url: string, existingPage?: Page): Promise<string> {
|
||||||
const browser = await chromium.launch();
|
let browser: any = null;
|
||||||
const page = await browser.newPage();
|
let page: Page;
|
||||||
|
let shouldCloseBrowser = false;
|
||||||
|
|
||||||
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
|
if (existingPage) {
|
||||||
|
logger.log('info', `[Scrape] Reusing existing Playwright page instance for markdown conversion of ${url}`);
|
||||||
|
page = existingPage;
|
||||||
|
} else {
|
||||||
|
logger.log('info', `[Scrape] Creating new Chromium browser instance for markdown conversion of ${url}`);
|
||||||
|
browser = await chromium.launch();
|
||||||
|
page = await browser.newPage();
|
||||||
|
shouldCloseBrowser = true;
|
||||||
|
}
|
||||||
|
|
||||||
await page.addInitScript(() => {
|
await gotoWithFallback(page, url);
|
||||||
|
|
||||||
|
const cleanedHtml = await page.evaluate(() => {
|
||||||
const selectors = [
|
const selectors = [
|
||||||
"script",
|
"script",
|
||||||
"style",
|
"style",
|
||||||
@@ -42,14 +71,16 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
|
||||||
|
|
||||||
// Re-extract HTML after cleanup
|
|
||||||
const cleanedHtml = await page.evaluate(() => {
|
|
||||||
return document.documentElement.outerHTML;
|
return document.documentElement.outerHTML;
|
||||||
});
|
});
|
||||||
|
|
||||||
await browser.close();
|
if (shouldCloseBrowser && browser) {
|
||||||
|
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
|
||||||
|
await browser.close();
|
||||||
|
} else {
|
||||||
|
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
|
||||||
|
}
|
||||||
|
|
||||||
// Convert cleaned HTML → Markdown
|
// Convert cleaned HTML → Markdown
|
||||||
const markdown = await parseMarkdown(cleanedHtml, url);
|
const markdown = await parseMarkdown(cleanedHtml, url);
|
||||||
@@ -59,14 +90,27 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
|
|||||||
/**
|
/**
|
||||||
* Fetches a webpage, strips scripts/styles/images/etc,
|
* Fetches a webpage, strips scripts/styles/images/etc,
|
||||||
* returns clean HTML.
|
* returns clean HTML.
|
||||||
|
* @param url - The URL to convert
|
||||||
|
* @param existingPage - Optional existing Playwright page instance to reuse
|
||||||
*/
|
*/
|
||||||
export async function convertPageToHTML(url: string): Promise<string> {
|
export async function convertPageToHTML(url: string, existingPage?: Page): Promise<string> {
|
||||||
const browser = await chromium.launch();
|
let browser: any = null;
|
||||||
const page = await browser.newPage();
|
let page: Page;
|
||||||
|
let shouldCloseBrowser = false;
|
||||||
|
|
||||||
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
|
if (existingPage) {
|
||||||
|
logger.log('info', `[Scrape] Reusing existing Playwright page instance for HTML conversion of ${url}`);
|
||||||
|
page = existingPage;
|
||||||
|
} else {
|
||||||
|
logger.log('info', `[Scrape] Creating new Chromium browser instance for HTML conversion of ${url}`);
|
||||||
|
browser = await chromium.launch();
|
||||||
|
page = await browser.newPage();
|
||||||
|
shouldCloseBrowser = true;
|
||||||
|
}
|
||||||
|
|
||||||
await page.addInitScript(() => {
|
await gotoWithFallback(page, url);
|
||||||
|
|
||||||
|
const cleanedHtml = await page.evaluate(() => {
|
||||||
const selectors = [
|
const selectors = [
|
||||||
"script",
|
"script",
|
||||||
"style",
|
"style",
|
||||||
@@ -97,14 +141,16 @@ export async function convertPageToHTML(url: string): Promise<string> {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
});
|
});
|
||||||
});
|
|
||||||
|
|
||||||
// Re-extract HTML after cleanup
|
|
||||||
const cleanedHtml = await page.evaluate(() => {
|
|
||||||
return document.documentElement.outerHTML;
|
return document.documentElement.outerHTML;
|
||||||
});
|
});
|
||||||
|
|
||||||
await browser.close();
|
if (shouldCloseBrowser && browser) {
|
||||||
|
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
|
||||||
|
await browser.close();
|
||||||
|
} else {
|
||||||
|
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
|
||||||
|
}
|
||||||
|
|
||||||
// Return cleaned HTML directly
|
// Return cleaned HTML directly
|
||||||
return cleanedHtml;
|
return cleanedHtml;
|
||||||
|
|||||||
@@ -181,7 +181,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
|
|
||||||
logger.log('info', `Browser ${browserId} found and ready for execution`);
|
logger.log('info', `Browser ${browserId} found and ready for execution`);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
// Find the recording
|
// Find the recording
|
||||||
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
|
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
|
||||||
|
|
||||||
@@ -189,6 +189,30 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
throw new Error(`Recording for run ${data.runId} not found`);
|
throw new Error(`Recording for run ${data.runId} not found`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let currentPage = browser.getCurrentPage();
|
||||||
|
|
||||||
|
const pageWaitStart = Date.now();
|
||||||
|
let lastPageLogTime = 0;
|
||||||
|
let pageAttempts = 0;
|
||||||
|
const MAX_PAGE_ATTEMPTS = 15;
|
||||||
|
|
||||||
|
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
|
||||||
|
const currentTime = Date.now();
|
||||||
|
pageAttempts++;
|
||||||
|
|
||||||
|
if (currentTime - lastPageLogTime > 5000) {
|
||||||
|
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
|
||||||
|
lastPageLogTime = currentTime;
|
||||||
|
}
|
||||||
|
|
||||||
|
await new Promise(resolve => setTimeout(resolve, 1000));
|
||||||
|
currentPage = browser.getCurrentPage();
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!currentPage) {
|
||||||
|
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
|
||||||
|
}
|
||||||
|
|
||||||
if (recording.recording_meta.type === 'scrape') {
|
if (recording.recording_meta.type === 'scrape') {
|
||||||
logger.log('info', `Executing scrape robot for run ${data.runId}`);
|
logger.log('info', `Executing scrape robot for run ${data.runId}`);
|
||||||
|
|
||||||
@@ -212,13 +236,13 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
|
|
||||||
// Markdown conversion
|
// Markdown conversion
|
||||||
if (formats.includes('markdown')) {
|
if (formats.includes('markdown')) {
|
||||||
markdown = await convertPageToMarkdown(url);
|
markdown = await convertPageToMarkdown(url, currentPage);
|
||||||
serializableOutput.markdown = [{ content: markdown }];
|
serializableOutput.markdown = [{ content: markdown }];
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTML conversion
|
// HTML conversion
|
||||||
if (formats.includes('html')) {
|
if (formats.includes('html')) {
|
||||||
html = await convertPageToHTML(url);
|
html = await convertPageToHTML(url, currentPage);
|
||||||
serializableOutput.html = [{ content: html }];
|
serializableOutput.html = [{ content: html }];
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -328,30 +352,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
let currentPage = browser.getCurrentPage();
|
|
||||||
|
|
||||||
const pageWaitStart = Date.now();
|
|
||||||
let lastPageLogTime = 0;
|
|
||||||
let pageAttempts = 0;
|
|
||||||
const MAX_PAGE_ATTEMPTS = 15;
|
|
||||||
|
|
||||||
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
|
|
||||||
const currentTime = Date.now();
|
|
||||||
pageAttempts++;
|
|
||||||
|
|
||||||
if (currentTime - lastPageLogTime > 5000) {
|
|
||||||
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
|
|
||||||
lastPageLogTime = currentTime;
|
|
||||||
}
|
|
||||||
|
|
||||||
await new Promise(resolve => setTimeout(resolve, 1000));
|
|
||||||
currentPage = browser.getCurrentPage();
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!currentPage) {
|
|
||||||
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
|
|
||||||
}
|
|
||||||
|
|
||||||
logger.log('info', `Starting workflow execution for run ${data.runId}`);
|
logger.log('info', `Starting workflow execution for run ${data.runId}`);
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
|
|||||||
@@ -208,6 +208,16 @@ async function executeRun(id: string, userId: string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
browser = browserPool.getRemoteBrowser(plainRun.browserId);
|
||||||
|
if (!browser) {
|
||||||
|
throw new Error('Could not access browser');
|
||||||
|
}
|
||||||
|
|
||||||
|
let currentPage = await browser.getCurrentPage();
|
||||||
|
if (!currentPage) {
|
||||||
|
throw new Error('Could not create a new page');
|
||||||
|
}
|
||||||
|
|
||||||
if (recording.recording_meta.type === 'scrape') {
|
if (recording.recording_meta.type === 'scrape') {
|
||||||
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
|
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
|
||||||
|
|
||||||
@@ -252,13 +262,13 @@ async function executeRun(id: string, userId: string) {
|
|||||||
|
|
||||||
// Markdown conversion
|
// Markdown conversion
|
||||||
if (formats.includes('markdown')) {
|
if (formats.includes('markdown')) {
|
||||||
markdown = await convertPageToMarkdown(url);
|
markdown = await convertPageToMarkdown(url, currentPage);
|
||||||
serializableOutput.markdown = [{ content: markdown }];
|
serializableOutput.markdown = [{ content: markdown }];
|
||||||
}
|
}
|
||||||
|
|
||||||
// HTML conversion
|
// HTML conversion
|
||||||
if (formats.includes('html')) {
|
if (formats.includes('html')) {
|
||||||
html = await convertPageToHTML(url);
|
html = await convertPageToHTML(url, currentPage);
|
||||||
serializableOutput.html = [{ content: html }];
|
serializableOutput.html = [{ content: html }];
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -391,16 +401,6 @@ async function executeRun(id: string, userId: string) {
|
|||||||
logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`);
|
logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
browser = browserPool.getRemoteBrowser(plainRun.browserId);
|
|
||||||
if (!browser) {
|
|
||||||
throw new Error('Could not access browser');
|
|
||||||
}
|
|
||||||
|
|
||||||
let currentPage = await browser.getCurrentPage();
|
|
||||||
if (!currentPage) {
|
|
||||||
throw new Error('Could not create a new page');
|
|
||||||
}
|
|
||||||
|
|
||||||
const workflow = AddGeneratedFlags(recording.recording);
|
const workflow = AddGeneratedFlags(recording.recording);
|
||||||
|
|
||||||
// Set run ID for real-time data persistence
|
// Set run ID for real-time data persistence
|
||||||
|
|||||||
Reference in New Issue
Block a user