Merge pull request #906 from getmaxun/browser-service

chore(infra): add separate browser service
This commit is contained in:
Karishma Shukla
2025-11-30 19:37:10 +05:30
committed by GitHub
27 changed files with 409 additions and 100 deletions

View File

@@ -1,6 +1,4 @@
import { Router, Request, Response } from 'express';
import { chromium } from "playwright-extra";
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { requireAPIKey } from "../middlewares/api";
import Robot from "../models/Robot";
import Run from "../models/Run";
@@ -20,8 +18,6 @@ import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-managem
import { sendWebhook } from "../routes/webhook";
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
chromium.use(stealthPlugin());
const router = Router();
const formatRecording = (recordingData: any) => {

View File

@@ -0,0 +1,156 @@
import { chromium } from 'playwright-core';
import type { Browser } from 'playwright-core';
import logger from '../logger';
/**
* Configuration for connection retry logic
*
* Read by connectToRemoteBrowser when it dials the browser service.
*/
const CONNECTION_CONFIG = {
// Default number of connection attempts, used when the caller passes no `retries` value.
maxRetries: 3,
// Pause between two failed attempts; handed straight to setTimeout, i.e. milliseconds.
retryDelay: 2000,
// Forwarded as the `timeout` option of chromium.connect (milliseconds per Playwright's API — confirm).
connectionTimeout: 30000,
};
/**
 * Resolve the WebSocket endpoint advertised by the browser service health check.
 *
 * Sends `GET http://<BROWSER_WS_HOST>:<BROWSER_HEALTH_PORT>/health` (defaults:
 * `localhost` / `3002`) and expects a JSON body shaped like
 * `{ status: 'healthy', wsEndpoint: string }`.
 *
 * The request is aborted after `CONNECTION_CONFIG.connectionTimeout` ms so an
 * unresponsive service cannot stall the caller indefinitely.
 *
 * @returns The WebSocket endpoint URL reported by the service.
 * @throws Error when the service is unreachable, times out, answers with a
 *   non-2xx status, or does not report a healthy status plus a wsEndpoint.
 */
async function getBrowserServiceEndpoint(): Promise<string> {
  // `||` rather than `??` on purpose: an empty env var should also fall back to the default.
  const healthPort = process.env.BROWSER_HEALTH_PORT || '3002';
  const healthHost = process.env.BROWSER_WS_HOST || 'localhost';
  const healthEndpoint = `http://${healthHost}:${healthPort}/health`;

  // Bound the whole request (headers + body) with the configured timeout.
  const abortController = new AbortController();
  const abortTimer = setTimeout(
    () => abortController.abort(),
    CONNECTION_CONFIG.connectionTimeout
  );

  try {
    logger.debug(`Fetching WebSocket endpoint from: ${healthEndpoint}`);
    const response = await fetch(healthEndpoint, { signal: abortController.signal });

    // A 4xx/5xx answer must not be treated as a usable health report.
    if (!response.ok) {
      throw new Error(`Health check responded with HTTP ${response.status}`);
    }

    const data = (await response.json()) as
      | { status?: unknown; wsEndpoint?: unknown }
      | null;

    if (
      data?.status === 'healthy' &&
      typeof data.wsEndpoint === 'string' &&
      data.wsEndpoint.length > 0
    ) {
      logger.debug(`Got WebSocket endpoint: ${data.wsEndpoint}`);
      return data.wsEndpoint;
    }

    throw new Error('Health check did not return a valid wsEndpoint');
  } catch (error: unknown) {
    // Anything can be thrown; only trust `.message` on real Error instances.
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`Failed to fetch endpoint from health check: ${reason}`);
    throw new Error(
      `Browser service is not accessible at ${healthEndpoint}. ` +
        `Make sure the browser service is running (docker-compose up browser)`
    );
  } finally {
    // Always release the timer so it cannot keep the event loop alive.
    clearTimeout(abortTimer);
  }
}
/**
 * Fallback path: start a headless Chromium on this machine instead of using
 * the browser service.
 *
 * @returns The locally launched browser instance.
 * @throws Error with setup guidance when the launch itself fails.
 */
async function launchLocalBrowser(): Promise<Browser> {
  // Command-line switches handed to Chromium verbatim.
  const chromiumFlags = [
    '--disable-blink-features=AutomationControlled',
    '--disable-web-security',
    '--disable-features=IsolateOrigins,site-per-process',
    '--disable-site-isolation-trials',
    '--disable-extensions',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--force-color-profile=srgb',
    '--force-device-scale-factor=2',
    '--ignore-certificate-errors',
    '--mute-audio',
  ];
  const launchOptions = { headless: true, args: chromiumFlags };

  logger.warn('Attempting to launch local browser');
  logger.warn('Note: This requires Chromium binaries to be installed (npx playwright install chromium)');

  try {
    const localBrowser = await chromium.launch(launchOptions);
    logger.info('Successfully launched local browser');
    return localBrowser;
  } catch (launchError: any) {
    logger.error(`Failed to launch local browser: ${launchError.message}`);

    // The low-level cause is logged above; callers get actionable guidance instead.
    const guidance = [
      'Could not launch local browser. ',
      'Please either:\n',
      ' 1. Start the browser service: docker-compose up browser\n',
      ' 2. Install Chromium binaries: npx playwright@1.57.0 install chromium',
    ].join('');
    throw new Error(guidance);
  }
}
/**
 * Obtain a browser, preferring the remote browser service and falling back to
 * a locally launched one.
 *
 * The service endpoint is looked up once; the connection to it is then tried
 * up to `retries` times with a fixed pause between attempts. Any failure on
 * the remote path (endpoint lookup or exhausted attempts) triggers the local
 * launch.
 *
 * @param retries - Number of connection attempts (defaults to CONNECTION_CONFIG.maxRetries)
 * @returns Connected browser instance (remote or local)
 * @throws Error if the remote path fails and the local launch fails as well
 */
export async function connectToRemoteBrowser(retries?: number): Promise<Browser> {
  const attemptLimit = retries ?? CONNECTION_CONFIG.maxRetries;

  try {
    const endpoint = await getBrowserServiceEndpoint();
    logger.info(`Connecting to browser service at ${endpoint}...`);

    for (let attemptNo = 1; attemptNo <= attemptLimit; attemptNo += 1) {
      try {
        logger.debug(`Connection attempt ${attemptNo}/${attemptLimit}`);
        const connectOptions = { timeout: CONNECTION_CONFIG.connectionTimeout };
        const remoteBrowser = await chromium.connect(endpoint, connectOptions);
        logger.info('Successfully connected to browser service');
        return remoteBrowser;
      } catch (attemptError: any) {
        logger.warn(
          `Connection attempt ${attemptNo}/${attemptLimit} failed: ${attemptError.message}`
        );

        if (attemptNo !== attemptLimit) {
          // More attempts left: pause, then go around again.
          logger.debug(`Waiting ${CONNECTION_CONFIG.retryDelay}ms before retry...`);
          await new Promise((wake) => setTimeout(wake, CONNECTION_CONFIG.retryDelay));
          continue;
        }

        // Final attempt failed: hand over to the outer catch for the fallback.
        logger.error(
          `Failed to connect to browser service after ${attemptLimit} attempts`
        );
        throw new Error(`Remote connection failed: ${attemptError.message}`);
      }
    }

    // Reached when the loop ends without returning or throwing (e.g. a limit of 0).
    throw new Error('Failed to connect to browser service');
  } catch (remoteError: any) {
    logger.warn(`Browser service connection failed: ${remoteError.message}`);
    logger.warn('Falling back to local browser launch...');
    return await launchLocalBrowser();
  }
}
/**
 * Probe the browser service health endpoint.
 *
 * Sends `GET http://<BROWSER_WS_HOST>:<BROWSER_HEALTH_PORT>/health` (defaults:
 * `localhost` / `3002`) and reports whether the JSON body carries
 * `status: 'healthy'`. The request is aborted after
 * `CONNECTION_CONFIG.connectionTimeout` ms.
 *
 * Never throws: network errors, timeouts, non-2xx answers and malformed
 * bodies are logged and reported as `false`.
 *
 * @returns true if the service reports itself healthy, false otherwise
 */
export async function checkBrowserServiceHealth(): Promise<boolean> {
  // `||` rather than `??` on purpose: an empty env var should also fall back to the default.
  const healthPort = process.env.BROWSER_HEALTH_PORT || '3002';
  const healthHost = process.env.BROWSER_WS_HOST || 'localhost';
  const healthEndpoint = `http://${healthHost}:${healthPort}/health`;

  // Bound the whole request so a hung service cannot block the caller.
  const abortController = new AbortController();
  const abortTimer = setTimeout(
    () => abortController.abort(),
    CONNECTION_CONFIG.connectionTimeout
  );

  try {
    const response = await fetch(healthEndpoint, { signal: abortController.signal });

    if (!response.ok) {
      logger.warn(`Browser service health check failed: HTTP ${response.status}`);
      return false;
    }

    const data = (await response.json()) as { status?: unknown } | null;

    if (data?.status === 'healthy') {
      logger.info('Browser service health check passed');
      return true;
    }

    // Log as one pre-formatted string, like the other log calls in this module,
    // so the detail does not depend on how the logger treats extra arguments.
    logger.warn(`Browser service health check failed: ${JSON.stringify(data)}`);
    return false;
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`Browser service health check error: ${reason}`);
    return false;
  } finally {
    // Always release the timer so it cannot keep the event loop alive.
    clearTimeout(abortTimer);
  }
}

View File

@@ -2,11 +2,9 @@ import {
Page,
Browser,
CDPSession,
BrowserContext,
} from 'playwright';
BrowserContext
} from 'playwright-core';
import { Socket } from "socket.io";
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
import fetch from 'cross-fetch';
import sharp from 'sharp';
@@ -16,6 +14,7 @@ import { WorkflowGenerator } from "../../workflow-management/classes/Generator";
import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter";
import { getDecryptedProxyConfig } from '../../routes/proxy';
import { getInjectableScript } from 'idcac-playwright';
import { connectToRemoteBrowser } from '../browserConnection';
declare global {
interface Window {
@@ -83,8 +82,6 @@ interface ProcessedSnapshot {
};
}
chromium.use(stealthPlugin());
const MEMORY_CONFIG = {
gcInterval: 20000, // Check memory more frequently (20s instead of 60s)
maxHeapSize: 1536 * 1024 * 1024, // 1.5GB
@@ -567,23 +564,7 @@ export class RemoteBrowser {
while (!success && retryCount < MAX_RETRIES) {
try {
this.browser = <Browser>(await chromium.launch({
headless: true,
args: [
"--disable-blink-features=AutomationControlled",
"--disable-web-security",
"--disable-features=IsolateOrigins,site-per-process",
"--disable-site-isolation-trials",
"--disable-extensions",
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu",
"--force-color-profile=srgb",
"--force-device-scale-factor=2",
"--ignore-certificate-errors",
"--mute-audio"
],
}));
this.browser = await connectToRemoteBrowser();
if (!this.browser || this.browser.isConnected() === false) {
throw new Error('Browser failed to launch or is not connected');
@@ -683,9 +664,9 @@ export class RemoteBrowser {
try {
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
await blocker.enableBlockingInPage(this.currentPage);
await blocker.enableBlockingInPage(this.currentPage as any);
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage as any);
console.log('Adblocker initialized');
} catch (error: any) {
console.warn('Failed to initialize adblocker, continuing without it:', error.message);

View File

@@ -8,7 +8,7 @@ import logger from "../logger";
import { Coordinates, ScrollDeltas, KeyboardInput, DatePickerEventData } from '../types';
import { browserPool } from "../server";
import { WorkflowGenerator } from "../workflow-management/classes/Generator";
import { Page } from "playwright";
import { Page } from "playwright-core";
import { throttle } from "../../../src/helpers/inputHelpers";
import { CustomActions } from "../../../src/shared/types";
import { WhereWhatPair } from "maxun-core";

View File

@@ -1,4 +1,4 @@
import { chromium, Page } from "playwright";
import { connectToRemoteBrowser } from "../browser-management/browserConnection";
import { parseMarkdown } from "./markdown";
import logger from "../logger";
@@ -23,22 +23,11 @@ async function gotoWithFallback(page: any, url: string) {
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/
export async function convertPageToMarkdown(url: string, existingPage?: Page): Promise<string> {
let browser: any = null;
let page: Page;
let shouldCloseBrowser = false;
export async function convertPageToMarkdown(url: string): Promise<string> {
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
if (existingPage) {
logger.log('info', `[Scrape] Reusing existing Playwright page instance for markdown conversion of ${url}`);
page = existingPage;
} else {
logger.log('info', `[Scrape] Creating new Chromium browser instance for markdown conversion of ${url}`);
browser = await chromium.launch();
page = await browser.newPage();
shouldCloseBrowser = true;
}
await gotoWithFallback(page, url);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
const cleanedHtml = await page.evaluate(() => {
const selectors = [
@@ -93,22 +82,11 @@ export async function convertPageToMarkdown(url: string, existingPage?: Page): P
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/
export async function convertPageToHTML(url: string, existingPage?: Page): Promise<string> {
let browser: any = null;
let page: Page;
let shouldCloseBrowser = false;
export async function convertPageToHTML(url: string): Promise<string> {
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
if (existingPage) {
logger.log('info', `[Scrape] Reusing existing Playwright page instance for HTML conversion of ${url}`);
page = existingPage;
} else {
logger.log('info', `[Scrape] Creating new Chromium browser instance for HTML conversion of ${url}`);
browser = await chromium.launch();
page = await browser.newPage();
shouldCloseBrowser = true;
}
await gotoWithFallback(page, url);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
const cleanedHtml = await page.evaluate(() => {
const selectors = [

View File

@@ -13,7 +13,7 @@ import { WorkflowFile } from 'maxun-core';
import Run from './models/Run';
import Robot from './models/Robot';
import { browserPool } from './server';
import { Page } from 'playwright';
import { Page } from 'playwright-core';
import { capture } from './utils/analytics';
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from './workflow-management/integrations/gsheet';
import { airtableUpdateTasks, processAirtableUpdates } from './workflow-management/integrations/airtable';

View File

@@ -1,10 +1,8 @@
import { Router, Request, Response } from 'express';
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { connectToRemoteBrowser } from '../browser-management/browserConnection';
import User from '../models/User';
import { encrypt, decrypt } from '../utils/auth';
import { requireSignIn } from '../middlewares/auth';
chromium.use(stealthPlugin());
export const router = Router();
@@ -86,11 +84,7 @@ router.get('/test', requireSignIn, async (req: Request, res: Response) => {
}),
};
const browser = await chromium.launch({
headless: true,
proxy: proxyOptions,
args:["--ignore-certificate-errors"]
});
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
await page.goto('https://example.com');
await browser.close();

View File

@@ -13,14 +13,11 @@ import {
destroyRemoteBrowser,
canCreateBrowserInState,
} from '../browser-management/controller';
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import logger from "../logger";
import { requireSignIn } from '../middlewares/auth';
import { pgBoss } from '../pgboss-worker';
export const router = Router();
chromium.use(stealthPlugin());
export interface AuthenticatedRequest extends Request {
user?: any;

View File

@@ -1,8 +1,6 @@
import { Router } from 'express';
import logger from "../logger";
import { createRemoteBrowserForRun, destroyRemoteBrowser, getActiveBrowserIdByState } from "../browser-management/controller";
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { browserPool } from "../server";
import { v4 as uuid } from "uuid";
import moment from 'moment-timezone';
@@ -18,7 +16,6 @@ import { encrypt, decrypt } from '../utils/auth';
import { WorkflowFile } from 'maxun-core';
import { cancelScheduledWorkflow, scheduleWorkflow } from '../schedule-worker';
import { pgBoss, registerWorkerForQueue, registerAbortWorkerForQueue } from '../pgboss-worker';
chromium.use(stealthPlugin());
export const router = Router();

View File

@@ -1,4 +1,4 @@
import {BrowserType, LaunchOptions} from "playwright";
import {BrowserType, LaunchOptions} from "playwright-core";
/**
* Interpreter settings properties including recording parameters.

View File

@@ -2,7 +2,7 @@ import { Action, ActionType, Coordinates, TagName, DatePickerEventData } from ".
import { WhereWhatPair, WorkflowFile } from 'maxun-core';
import logger from "../../logger";
import { Socket } from "socket.io";
import { Page } from "playwright";
import { Page } from "playwright-core";
import {
getElementInformation,
getRect,

View File

@@ -1,7 +1,7 @@
import Interpreter, { WorkflowFile } from "maxun-core";
import logger from "../../logger";
import { Socket } from "socket.io";
import { Page } from "playwright";
import { Page } from "playwright-core";
import { InterpreterSettings } from "../../types";
import { decrypt } from "../../utils/auth";
import Run from "../../models/Run";

View File

@@ -1,6 +1,4 @@
import { v4 as uuid } from "uuid";
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { io, Socket } from "socket.io-client";
import { createRemoteBrowserForRun, destroyRemoteBrowser } from '../../browser-management/controller';
import logger from '../../logger';
@@ -12,11 +10,10 @@ import { getDecryptedProxyConfig } from "../../routes/proxy";
import { BinaryOutputService } from "../../storage/mino";
import { capture } from "../../utils/analytics";
import { WorkflowFile } from "maxun-core";
import { Page } from "playwright";
import { Page } from "playwright-core";
import { sendWebhook } from "../../routes/webhook";
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
chromium.use(stealthPlugin());
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
try {

View File

@@ -1,4 +1,4 @@
import { Page } from "playwright";
import { Page } from "playwright-core";
import { Coordinates } from "../types";
import { WhereWhatPair, WorkflowFile } from "maxun-core";
import logger from "../logger";