Merge branch 'develop' into db-fixes

This commit is contained in:
Rohit
2025-11-30 19:45:54 +05:30
committed by GitHub
49 changed files with 1880 additions and 759 deletions

View File

@@ -1,6 +1,4 @@
import { Router, Request, Response } from 'express';
import { chromium } from "playwright-extra";
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { requireAPIKey } from "../middlewares/api";
import Robot from "../models/Robot";
import Run from "../models/Run";
@@ -20,8 +18,6 @@ import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } fr
import { sendWebhook } from "../routes/webhook";
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
chromium.use(stealthPlugin());
const router = Router();
const formatRecording = (recordingData: any) => {
@@ -676,6 +672,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
};
}
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for API run ${id}`);
@@ -705,7 +711,7 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
const SCRAPE_TIMEOUT = 120000;
if (formats.includes('markdown')) {
const markdownPromise = convertPageToMarkdown(url);
const markdownPromise = convertPageToMarkdown(url, currentPage);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
@@ -714,7 +720,7 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
}
if (formats.includes('html')) {
const htmlPromise = convertPageToHTML(url);
const htmlPromise = convertPageToHTML(url, currentPage);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
@@ -862,16 +868,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[
plainRun.status = 'running';
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
const workflow = AddGeneratedFlags(recording.recording);
browser.interpreter.setRunId(plainRun.runId);

View File

@@ -0,0 +1,156 @@
import { chromium } from 'playwright-core';
import type { Browser } from 'playwright-core';
import logger from '../logger';
/**
 * Configuration for connection retry logic
 */
const CONNECTION_CONFIG = {
  maxRetries: 3,            // connection attempts before giving up on the remote service
  retryDelay: 2000,         // ms to wait between failed attempts
  connectionTimeout: 30000, // ms allowed for a single chromium.connect() call
};
/**
 * Get the WebSocket endpoint from the browser service health check.
 *
 * Builds the health URL from BROWSER_WS_HOST (default "localhost") and
 * BROWSER_HEALTH_PORT (default "3002") and expects a JSON body of the
 * form { status: 'healthy', wsEndpoint: string }.
 *
 * @returns Promise<string> - The WebSocket endpoint URL with browser ID
 * @throws Error when the service is unreachable or returns an invalid payload
 */
async function getBrowserServiceEndpoint(): Promise<string> {
  const healthPort = process.env.BROWSER_HEALTH_PORT || '3002';
  const healthHost = process.env.BROWSER_WS_HOST || 'localhost';
  const healthEndpoint = `http://${healthHost}:${healthPort}/health`;

  try {
    logger.debug(`Fetching WebSocket endpoint from: ${healthEndpoint}`);
    const response = await fetch(healthEndpoint);

    // fetch() resolves (does not reject) on HTTP error statuses such as
    // 4xx/5xx; check explicitly so a failing service surfaces as a clear
    // error instead of a confusing JSON parse failure below.
    if (!response.ok) {
      throw new Error(`Health check returned HTTP ${response.status}`);
    }

    const data = await response.json();
    if (data.status === 'healthy' && data.wsEndpoint) {
      logger.debug(`Got WebSocket endpoint: ${data.wsEndpoint}`);
      return data.wsEndpoint;
    }
    throw new Error('Health check did not return a valid wsEndpoint');
  } catch (error: any) {
    logger.error(`Failed to fetch endpoint from health check: ${error.message}`);
    throw new Error(
      `Browser service is not accessible at ${healthEndpoint}. ` +
      `Make sure the browser service is running (docker-compose up browser)`
    );
  }
}
/**
 * Launch a local browser as fallback when browser service is unavailable.
 * Chromium binaries must be present locally (npx playwright install chromium).
 *
 * @returns Promise<Browser> - Locally launched browser instance
 * @throws Error with remediation steps when Chromium cannot be launched
 */
async function launchLocalBrowser(): Promise<Browser> {
  logger.warn('Attempting to launch local browser');
  logger.warn('Note: This requires Chromium binaries to be installed (npx playwright install chromium)');

  // Hardening / anti-detection flags used for the local fallback instance.
  const launchArgs = [
    '--disable-blink-features=AutomationControlled',
    '--disable-web-security',
    '--disable-features=IsolateOrigins,site-per-process',
    '--disable-site-isolation-trials',
    '--disable-extensions',
    '--no-sandbox',
    '--disable-dev-shm-usage',
    '--disable-gpu',
    '--force-color-profile=srgb',
    '--force-device-scale-factor=2',
    '--ignore-certificate-errors',
    '--mute-audio'
  ];

  try {
    const localBrowser = await chromium.launch({ headless: true, args: launchArgs });
    logger.info('Successfully launched local browser');
    return localBrowser;
  } catch (error: any) {
    logger.error(`Failed to launch local browser: ${error.message}`);
    throw new Error(
      `Could not launch local browser. ` +
      `Please either:\n` +
      `  1. Start the browser service: docker-compose up browser\n` +
      `  2. Install Chromium binaries: npx playwright@1.57.0 install chromium`
    );
  }
}
/**
 * Connect to the remote browser service with retry logic, falling back to a
 * locally launched browser when the remote service cannot be reached.
 *
 * @param retries - Number of connection attempts (default: CONNECTION_CONFIG.maxRetries)
 * @returns Promise<Browser> - Connected browser instance (remote or local)
 * @throws Error if both remote connection and local launch fail
 */
export async function connectToRemoteBrowser(retries?: number): Promise<Browser> {
  const maxRetries = retries ?? CONNECTION_CONFIG.maxRetries;

  try {
    const wsEndpoint = await getBrowserServiceEndpoint();
    logger.info(`Connecting to browser service at ${wsEndpoint}...`);

    let attempt = 0;
    while (attempt < maxRetries) {
      attempt += 1;
      try {
        logger.debug(`Connection attempt ${attempt}/${maxRetries}`);
        const remote = await chromium.connect(wsEndpoint, {
          timeout: CONNECTION_CONFIG.connectionTimeout,
        });
        logger.info('Successfully connected to browser service');
        return remote;
      } catch (error: any) {
        logger.warn(
          `Connection attempt ${attempt}/${maxRetries} failed: ${error.message}`
        );
        // On the final attempt, surface the failure to the outer catch,
        // which triggers the local-browser fallback.
        if (attempt === maxRetries) {
          logger.error(
            `Failed to connect to browser service after ${maxRetries} attempts`
          );
          throw new Error(`Remote connection failed: ${error.message}`);
        }
        logger.debug(`Waiting ${CONNECTION_CONFIG.retryDelay}ms before retry...`);
        await new Promise(resolve => setTimeout(resolve, CONNECTION_CONFIG.retryDelay));
      }
    }

    // Only reachable when maxRetries <= 0 (the loop body never ran).
    throw new Error('Failed to connect to browser service');
  } catch (error: any) {
    logger.warn(`Browser service connection failed: ${error.message}`);
    logger.warn('Falling back to local browser launch...');
    return await launchLocalBrowser();
  }
}
/**
 * Check if browser service is healthy.
 *
 * Queries the same health endpoint as getBrowserServiceEndpoint
 * (BROWSER_WS_HOST / BROWSER_HEALTH_PORT) and reports the result.
 * Never throws; any failure is logged and reported as unhealthy.
 *
 * @returns Promise<boolean> - true if service is healthy
 */
export async function checkBrowserServiceHealth(): Promise<boolean> {
  try {
    const healthPort = process.env.BROWSER_HEALTH_PORT || '3002';
    const healthHost = process.env.BROWSER_WS_HOST || 'localhost';
    const healthEndpoint = `http://${healthHost}:${healthPort}/health`;

    const response = await fetch(healthEndpoint);

    // fetch() does not reject on HTTP error statuses (4xx/5xx); treat any
    // non-OK status as unhealthy rather than attempting to parse the body.
    if (!response.ok) {
      logger.warn(`Browser service health check returned HTTP ${response.status}`);
      return false;
    }

    const data = await response.json();
    if (data.status === 'healthy') {
      logger.info('Browser service health check passed');
      return true;
    }
    logger.warn('Browser service health check failed:', data);
    return false;
  } catch (error: any) {
    logger.error('Browser service health check error:', error.message);
    return false;
  }
}

View File

@@ -2,11 +2,9 @@ import {
Page,
Browser,
CDPSession,
BrowserContext,
} from 'playwright';
BrowserContext
} from 'playwright-core';
import { Socket } from "socket.io";
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
import fetch from 'cross-fetch';
import logger from '../../logger';
@@ -17,6 +15,7 @@ import { getDecryptedProxyConfig } from '../../routes/proxy';
import { getInjectableScript } from 'idcac-playwright';
import { FingerprintInjector } from "fingerprint-injector";
import { FingerprintGenerator } from "fingerprint-generator";
import { connectToRemoteBrowser } from '../browserConnection';
declare global {
interface Window {
@@ -39,8 +38,6 @@ interface ProcessedSnapshot {
baseUrl: string;
}
chromium.use(stealthPlugin());
const MEMORY_CONFIG = {
gcInterval: 20000, // Check memory more frequently (20s instead of 60s)
maxHeapSize: 1536 * 1024 * 1024, // 1.5GB
@@ -460,26 +457,10 @@ export class RemoteBrowser {
const initializationPromise = (async () => {
while (!success && retryCount < MAX_RETRIES) {
try {
this.browser = <Browser>(await chromium.launch({
headless: true,
args: [
"--disable-blink-features=AutomationControlled",
"--disable-web-security",
"--disable-features=IsolateOrigins,site-per-process",
"--disable-site-isolation-trials",
"--disable-extensions",
"--no-sandbox",
"--disable-dev-shm-usage",
"--disable-gpu",
"--force-color-profile=srgb",
"--force-device-scale-factor=2",
"--ignore-certificate-errors",
"--mute-audio"
],
}));
this.browser = await connectToRemoteBrowser();
if (!this.browser || this.browser.isConnected() === false) {
throw new Error('Browser failed to launch or is not connected');
throw new Error('Browser failed to launch or is not connected');
}
this.emitLoadingProgress(20, 0);

View File

@@ -7,7 +7,7 @@ import { Socket } from 'socket.io';
import logger from "../logger";
import { Coordinates, ScrollDeltas, KeyboardInput, DatePickerEventData } from '../types';
import { browserPool } from "../server";
import { Page } from "playwright";
import { Page } from "playwright-core";
import { CustomActions } from "../../../src/shared/types";
import { WhereWhatPair } from "maxun-core";
import { RemoteBrowser } from './classes/RemoteBrowser';

View File

@@ -1,17 +1,35 @@
import { chromium } from "playwright";
import { connectToRemoteBrowser } from "../browser-management/browserConnection";
import { parseMarkdown } from "./markdown";
import logger from "../logger";
async function gotoWithFallback(page: any, url: string) {
try {
return await page.goto(url, {
waitUntil: "networkidle",
timeout: 100000,
});
} catch (err) {
// fallback: JS-heavy or unstable sites
return await page.goto(url, {
waitUntil: "domcontentloaded",
timeout: 100000,
});
}
}
/**
* Fetches a webpage, strips scripts/styles/images/etc,
* returns clean Markdown using parser.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/
export async function convertPageToMarkdown(url: string): Promise<string> {
const browser = await chromium.launch();
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
await page.addInitScript(() => {
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
@@ -42,14 +60,16 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
}
});
});
});
// Re-extract HTML after cleanup
const cleanedHtml = await page.evaluate(() => {
return document.documentElement.outerHTML;
});
await browser.close();
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
}
// Convert cleaned HTML → Markdown
const markdown = await parseMarkdown(cleanedHtml, url);
@@ -59,14 +79,16 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
/**
* Fetches a webpage, strips scripts/styles/images/etc,
* returns clean HTML.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
*/
export async function convertPageToHTML(url: string): Promise<string> {
const browser = await chromium.launch();
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
await page.addInitScript(() => {
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
@@ -97,14 +119,16 @@ export async function convertPageToHTML(url: string): Promise<string> {
}
});
});
});
// Re-extract HTML after cleanup
const cleanedHtml = await page.evaluate(() => {
return document.documentElement.outerHTML;
});
await browser.close();
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
}
// Return cleaned HTML directly
return cleanedHtml;

View File

@@ -13,7 +13,7 @@ import { WorkflowFile } from 'maxun-core';
import Run from './models/Run';
import Robot from './models/Robot';
import { browserPool } from './server';
import { Page } from 'playwright';
import { Page } from 'playwright-core';
import { capture } from './utils/analytics';
import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from './workflow-management/integrations/gsheet';
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from './workflow-management/integrations/airtable';
@@ -192,7 +192,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
logger.log('info', `Browser ${browserId} found and ready for execution`);
try {
try {
// Find the recording
const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true });
@@ -200,6 +200,30 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
throw new Error(`Recording for run ${data.runId} not found`);
}
let currentPage = browser.getCurrentPage();
const pageWaitStart = Date.now();
let lastPageLogTime = 0;
let pageAttempts = 0;
const MAX_PAGE_ATTEMPTS = 15;
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
const currentTime = Date.now();
pageAttempts++;
if (currentTime - lastPageLogTime > 5000) {
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
lastPageLogTime = currentTime;
}
await new Promise(resolve => setTimeout(resolve, 1000));
currentPage = browser.getCurrentPage();
}
if (!currentPage) {
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for run ${data.runId}`);
@@ -224,7 +248,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
const SCRAPE_TIMEOUT = 120000;
if (formats.includes('markdown')) {
const markdownPromise = convertPageToMarkdown(url);
const markdownPromise = convertPageToMarkdown(url, currentPage);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
@@ -233,7 +257,7 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
}
if (formats.includes('html')) {
const htmlPromise = convertPageToHTML(url);
const htmlPromise = convertPageToHTML(url, currentPage);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
@@ -347,30 +371,6 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
}
};
let currentPage = browser.getCurrentPage();
const pageWaitStart = Date.now();
let lastPageLogTime = 0;
let pageAttempts = 0;
const MAX_PAGE_ATTEMPTS = 15;
while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) {
const currentTime = Date.now();
pageAttempts++;
if (currentTime - lastPageLogTime > 5000) {
logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`);
lastPageLogTime = currentTime;
}
await new Promise(resolve => setTimeout(resolve, 1000));
currentPage = browser.getCurrentPage();
}
if (!currentPage) {
throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`);
}
logger.log('info', `Starting workflow execution for run ${data.runId}`);
await run.update({

View File

@@ -1,19 +0,0 @@
import { Router } from 'express';
import logger from "../logger";
// import { loadIntegrations, saveIntegrations } from '../workflow-management/integrations/gsheet';
import { requireSignIn } from '../middlewares/auth';
export const router = Router();
/**
 * POST /upload-credentials — validate and store Google Sheets credentials.
 *
 * Expects { fileName, credentials, spreadsheetId, range } in the body;
 * responds 400 when any field is missing, 500 on unexpected errors.
 */
router.post('/upload-credentials', requireSignIn, async (req, res) => {
  try {
    const { fileName, credentials, spreadsheetId, range } = req.body;
    if (!fileName || !credentials || !spreadsheetId || !range) {
      return res.status(400).json({ message: 'Credentials, Spreadsheet ID, and Range are required.' });
    }
    // *** TEMPORARILY WE STORE CREDENTIALS HERE ***
    // FIX: the original handler never responded on the success path, so the
    // client request hung until timeout. Acknowledge receipt explicitly.
    return res.status(200).json({ message: 'Credentials saved.' });
  } catch (error: any) {
    logger.log('error', `Error saving credentials: ${error.message}`);
    return res.status(500).json({ message: 'Failed to save credentials.', error: error.message });
  }
});

View File

@@ -1,10 +1,8 @@
import { Router, Request, Response } from 'express';
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { connectToRemoteBrowser } from '../browser-management/browserConnection';
import User from '../models/User';
import { encrypt, decrypt } from '../utils/auth';
import { requireSignIn } from '../middlewares/auth';
chromium.use(stealthPlugin());
export const router = Router();
@@ -86,11 +84,7 @@ router.get('/test', requireSignIn, async (req: Request, res: Response) => {
}),
};
const browser = await chromium.launch({
headless: true,
proxy: proxyOptions,
args:["--ignore-certificate-errors"]
});
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
await page.goto('https://example.com');
await browser.close();

View File

@@ -13,14 +13,11 @@ import {
destroyRemoteBrowser,
canCreateBrowserInState,
} from '../browser-management/controller';
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import logger from "../logger";
import { requireSignIn } from '../middlewares/auth';
import { pgBossClient } from '../storage/pgboss';
export const router = Router();
chromium.use(stealthPlugin());
export interface AuthenticatedRequest extends Request {
user?: any;

View File

@@ -1,8 +1,6 @@
import { Router } from 'express';
import logger from "../logger";
import { createRemoteBrowserForRun, destroyRemoteBrowser, getActiveBrowserIdByState } from "../browser-management/controller";
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { browserPool } from "../server";
import { v4 as uuid } from "uuid";
import moment from 'moment-timezone';

View File

@@ -1,4 +1,4 @@
import {BrowserType, LaunchOptions} from "playwright";
import {BrowserType, LaunchOptions} from "playwright-core";
/**
* Interpreter settings properties including recording parameters.

View File

@@ -2,7 +2,7 @@ import { Action, ActionType, Coordinates, TagName, DatePickerEventData } from ".
import { WhereWhatPair, WorkflowFile } from 'maxun-core';
import logger from "../../logger";
import { Socket } from "socket.io";
import { Page } from "playwright";
import { Page } from "playwright-core";
import {
getElementInformation,
getRect,

View File

@@ -1,7 +1,7 @@
import Interpreter, { WorkflowFile } from "maxun-core";
import logger from "../../logger";
import { Socket } from "socket.io";
import { Page } from "playwright";
import { Page } from "playwright-core";
import { InterpreterSettings } from "../../types";
import { decrypt } from "../../utils/auth";
import Run from "../../models/Run";

View File

@@ -1,6 +1,4 @@
import { v4 as uuid } from "uuid";
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
import { io, Socket } from "socket.io-client";
import { createRemoteBrowserForRun, destroyRemoteBrowser } from '../../browser-management/controller';
import logger from '../../logger';
@@ -12,11 +10,10 @@ import { getDecryptedProxyConfig } from "../../routes/proxy";
import { BinaryOutputService } from "../../storage/mino";
import { capture } from "../../utils/analytics";
import { WorkflowFile } from "maxun-core";
import { Page } from "playwright";
import { Page } from "playwright-core";
import { sendWebhook } from "../../routes/webhook";
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
chromium.use(stealthPlugin());
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
try {
@@ -220,6 +217,16 @@ async function executeRun(id: string, userId: string) {
}
}
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
if (recording.recording_meta.type === 'scrape') {
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
@@ -266,7 +273,7 @@ async function executeRun(id: string, userId: string) {
// Markdown conversion
if (formats.includes("markdown")) {
const markdownPromise = convertPageToMarkdown(url);
const markdownPromise = convertPageToMarkdown(url, currentPage);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
@@ -275,7 +282,7 @@ async function executeRun(id: string, userId: string) {
}
if (formats.includes("html")) {
const htmlPromise = convertPageToHTML(url);
const htmlPromise = convertPageToHTML(url, currentPage);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT);
});
@@ -412,16 +419,6 @@ async function executeRun(id: string, userId: string) {
logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`);
}
browser = browserPool.getRemoteBrowser(plainRun.browserId);
if (!browser) {
throw new Error('Could not access browser');
}
let currentPage = await browser.getCurrentPage();
if (!currentPage) {
throw new Error('Could not create a new page');
}
const workflow = AddGeneratedFlags(recording.recording);
// Set run ID for real-time data persistence

View File

@@ -1,4 +1,4 @@
import { Page } from "playwright";
import { Page } from "playwright-core";
import { Coordinates } from "../types";
import { WhereWhatPair, WorkflowFile } from "maxun-core";
import logger from "../logger";