diff --git a/Dockerfile.backend b/Dockerfile.backend index 8a5fc23e..85ee4b83 100644 --- a/Dockerfile.backend +++ b/Dockerfile.backend @@ -18,9 +18,6 @@ COPY server/tsconfig.json ./server/ # Install dependencies RUN npm install --legacy-peer-deps -# Install Playwright browsers and dependencies -RUN npx playwright install --with-deps chromium - # Create the Chromium data directory with necessary permissions RUN mkdir -p /tmp/chromium-data-dir && \ chmod -R 777 /tmp/chromium-data-dir diff --git a/ENVEXAMPLE b/ENVEXAMPLE index db461f55..b8881b36 100644 --- a/ENVEXAMPLE +++ b/ENVEXAMPLE @@ -38,3 +38,8 @@ AIRTABLE_REDIRECT_URI=http://localhost:8080/auth/airtable/callback # Telemetry Settings - Please keep it enabled. Keeping it enabled helps us understand how the product is used and assess the impact of any new changes. MAXUN_TELEMETRY=true + +# WebSocket port for browser CDP connections +BROWSER_WS_PORT=3001 +BROWSER_HEALTH_PORT=3002 +BROWSER_WS_HOST=browser \ No newline at end of file diff --git a/browser/.dockerignore b/browser/.dockerignore new file mode 100644 index 00000000..44f5e86e --- /dev/null +++ b/browser/.dockerignore @@ -0,0 +1,9 @@ +node_modules +npm-debug.log +.env +.git +.gitignore +dist +*.ts +!*.d.ts +tsconfig.json diff --git a/browser/Dockerfile b/browser/Dockerfile new file mode 100644 index 00000000..9f2ea838 --- /dev/null +++ b/browser/Dockerfile @@ -0,0 +1,30 @@ +FROM mcr.microsoft.com/playwright:v1.57.0-jammy + +WORKDIR /app + +# Copy package files +COPY browser/package*.json ./ + +# Install dependencies +RUN npm ci + +# Copy TypeScript source and config +COPY browser/server.ts ./ +COPY browser/tsconfig.json ./ + +# Build TypeScript +RUN npm run build + +# Accept build arguments for ports (with defaults) +ARG BROWSER_WS_PORT=3001 +ARG BROWSER_HEALTH_PORT=3002 + +# Set as environment variables +ENV BROWSER_WS_PORT=${BROWSER_WS_PORT} +ENV BROWSER_HEALTH_PORT=${BROWSER_HEALTH_PORT} + +# Expose ports dynamically based on build args +EXPOSE ${BROWSER_WS_PORT} ${BROWSER_HEALTH_PORT} + +# Start the browser service (run compiled JS) +CMD ["node", "dist/server.js"] diff --git a/browser/package.json b/browser/package.json new file mode 100644 index 00000000..8aaf0d25 --- /dev/null +++ b/browser/package.json @@ -0,0 +1,21 @@ +{ + "name": "maxun-browser-service", + "version": "1.0.0", + "description": "Browser service that exposes Playwright browsers via WebSocket with stealth plugins", + "main": "dist/server.js", + "scripts": { + "build": "tsc", + "start": "node dist/server.js", + "dev": "ts-node server.ts" + }, + "dependencies": { + "playwright": "1.57.0", + "playwright-extra": "^4.3.6", + "puppeteer-extra-plugin-stealth": "^2.11.2" + }, + "devDependencies": { + "@types/node": "^22.7.9", + "typescript": "^5.0.0", + "ts-node": "^10.9.2" + } +} \ No newline at end of file diff --git a/browser/server.ts b/browser/server.ts new file mode 100644 index 00000000..2a70beef --- /dev/null +++ b/browser/server.ts @@ -0,0 +1,92 @@ +import { chromium } from 'playwright-extra'; +import stealthPlugin from 'puppeteer-extra-plugin-stealth'; +import http from 'http'; +import type { BrowserServer } from 'playwright'; + +// Apply stealth plugin to chromium +chromium.use(stealthPlugin()); + +let browserServer: BrowserServer | null = null; + +// Configurable ports with defaults +const BROWSER_WS_PORT = parseInt(process.env.BROWSER_WS_PORT || '3001', 10); +const BROWSER_HEALTH_PORT = parseInt(process.env.BROWSER_HEALTH_PORT || '3002', 10); + +async function start(): Promise { + console.log('Starting Maxun Browser Service...'); + console.log(`WebSocket port: ${BROWSER_WS_PORT}`); + console.log(`Health check port: ${BROWSER_HEALTH_PORT}`); + + try { + // Launch browser server that exposes WebSocket endpoint + browserServer = await chromium.launchServer({ + headless: true, + args: [ + '--disable-blink-features=AutomationControlled', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-site-isolation-trials', + '--disable-extensions', + '--no-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--force-color-profile=srgb', + '--force-device-scale-factor=2', + '--ignore-certificate-errors', + '--mute-audio' + ], + port: BROWSER_WS_PORT, + }); + + console.log(`✅ Browser WebSocket endpoint ready: ${browserServer.wsEndpoint()}`); + console.log(`✅ Stealth plugin enabled`); + + // Health check HTTP server + const healthServer = http.createServer((req, res) => { + if (req.url === '/health') { + res.writeHead(200, { 'Content-Type': 'application/json' }); + res.end(JSON.stringify({ + status: 'healthy', + wsEndpoint: browserServer?.wsEndpoint(), + wsPort: BROWSER_WS_PORT, + healthPort: BROWSER_HEALTH_PORT, + timestamp: new Date().toISOString() + })); + } else if (req.url === '/') { + res.writeHead(200, { 'Content-Type': 'text/plain' }); + res.end(`Maxun Browser Service\nWebSocket: ${browserServer?.wsEndpoint()}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`); + } else { + res.writeHead(404); + res.end('Not Found'); + } + }); + + healthServer.listen(BROWSER_HEALTH_PORT, () => { + console.log(`✅ Health check server running on port ${BROWSER_HEALTH_PORT}`); + console.log('Browser service is ready to accept connections!'); + }); + } catch (error) { + console.error('❌ Failed to start browser service:', error); + process.exit(1); + } +} + +// Graceful shutdown +async function shutdown(): Promise { + console.log('Shutting down browser service...'); + if (browserServer) { + try { + await browserServer.close(); + console.log('Browser server closed'); + } catch (error) { + console.error('Error closing browser server:', error); + } + } + process.exit(0); +} + +process.on('SIGTERM', shutdown); +process.on('SIGINT', shutdown); + +// Start the service +start().catch(console.error); diff --git a/browser/tsconfig.json b/browser/tsconfig.json new file mode 100644 index 00000000..a1a171bf --- /dev/null +++ b/browser/tsconfig.json @@ -0,0 +1,24 @@ +{ + "compilerOptions": { + "target": "ES2020", + "module": "commonjs", + "lib": [ + "ES2020" + ], + "outDir": "./dist", + "rootDir": "./", + "strict": true, + "esModuleInterop": true, + "skipLibCheck": true, + "forceConsistentCasingInFileNames": true, + "resolveJsonModule": true, + "moduleResolution": "node" + }, + "include": [ + "server.ts" + ], + "exclude": [ + "node_modules", + "dist" + ] +} \ No newline at end of file diff --git a/docker-compose.yml b/docker-compose.yml index c1b4302e..dbb147b7 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -74,6 +74,42 @@ services: depends_on: - backend + browser: + build: + context: . + dockerfile: browser/Dockerfile + args: + BROWSER_WS_PORT: ${BROWSER_WS_PORT:-3001} + BROWSER_HEALTH_PORT: ${BROWSER_HEALTH_PORT:-3002} + ports: + - "${BROWSER_WS_PORT:-3001}:${BROWSER_WS_PORT:-3001}" + - "${BROWSER_HEALTH_PORT:-3002}:${BROWSER_HEALTH_PORT:-3002}" + environment: + - NODE_ENV=production + - DEBUG=pw:browser* + - BROWSER_WS_PORT=${BROWSER_WS_PORT:-3001} + - BROWSER_HEALTH_PORT=${BROWSER_HEALTH_PORT:-3002} + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:${BROWSER_HEALTH_PORT:-3002}/health"] + interval: 10s + timeout: 5s + retries: 3 + start_period: 10s + deploy: + resources: + limits: + memory: 2G + cpus: '1.5' + reservations: + memory: 1G + cpus: '1.0' + security_opt: + - seccomp:unconfined + shm_size: 2gb + cap_add: + - SYS_ADMIN + volumes: postgres_data: minio_data: \ No newline at end of file diff --git a/src/components/recorder/AddWhatCondModal.tsx b/legacy/src/AddWhatCondModal.tsx similarity index 94% rename from src/components/recorder/AddWhatCondModal.tsx rename to legacy/src/AddWhatCondModal.tsx index 53fd38b8..e653405a 100644 --- a/src/components/recorder/AddWhatCondModal.tsx +++ b/legacy/src/AddWhatCondModal.tsx @@ -1,11 +1,11 @@ import { WhereWhatPair } from "maxun-core"; -import { GenericModal } from "../ui/GenericModal"; +import { GenericModal } from "../../src/components/ui/GenericModal"; import { modalStyle } from "./AddWhereCondModal"; import { Button, TextField, Typography } from "@mui/material"; import React, { useRef } from "react"; -import { KeyValueForm } from "./KeyValueForm"; -import { ClearButton } from "../ui/buttons/ClearButton"; -import { useSocketStore } from "../../context/socket"; +import { KeyValueForm } from "../../src/components/recorder/KeyValueForm"; +import { ClearButton } from "../../src/components/ui/buttons/ClearButton"; +import { useSocketStore } from "../../src/context/socket"; interface AddWhatCondModalProps { isOpen: boolean; diff --git a/src/components/recorder/AddWhereCondModal.tsx b/legacy/src/AddWhereCondModal.tsx similarity index 95% rename from src/components/recorder/AddWhereCondModal.tsx rename to legacy/src/AddWhereCondModal.tsx index 758c9e75..f53d05a7 100644 --- a/src/components/recorder/AddWhereCondModal.tsx +++ b/legacy/src/AddWhereCondModal.tsx @@ -1,15 +1,15 @@ -import { Dropdown as MuiDropdown } from "../ui/DropdownMui"; +import { Dropdown as MuiDropdown } from "../../src/components/ui/DropdownMui"; import { Button, MenuItem, Typography } from "@mui/material"; import React, { useRef } from "react"; -import { GenericModal } from "../ui/GenericModal"; +import { GenericModal } from "../../src/components/ui/GenericModal"; import { WhereWhatPair } from "maxun-core"; import { SelectChangeEvent } from "@mui/material/Select/Select"; import { DisplayConditionSettings } from "./DisplayWhereConditionSettings"; -import { useSocketStore } from "../../context/socket"; +import { useSocketStore } from "../../src/context/socket"; interface AddWhereCondModalProps { isOpen: boolean; diff --git a/src/components/recorder/DisplayWhereConditionSettings.tsx b/legacy/src/DisplayWhereConditionSettings.tsx similarity index 91% rename from src/components/recorder/DisplayWhereConditionSettings.tsx rename to legacy/src/DisplayWhereConditionSettings.tsx index 784d26c4..b159375f 100644 --- a/src/components/recorder/DisplayWhereConditionSettings.tsx +++ b/legacy/src/DisplayWhereConditionSettings.tsx @@ -1,10 +1,10 @@ import React from "react"; -import { Dropdown as MuiDropdown } from "../ui/DropdownMui"; +import { Dropdown as MuiDropdown } from "../../src/components/ui/DropdownMui"; import { Checkbox, FormControlLabel, FormGroup, MenuItem, Stack, TextField } from "@mui/material"; -import { AddButton } from "../ui/buttons/AddButton"; -import { RemoveButton } from "../ui/buttons/RemoveButton"; -import { KeyValueForm } from "./KeyValueForm"; -import { WarningText } from "../ui/texts"; +import { AddButton } from "../../src/components/ui/buttons/AddButton"; +import { RemoveButton } from "../../src/components/ui/buttons/RemoveButton"; +import { KeyValueForm } from "../../src/components/recorder/KeyValueForm"; +import { WarningText } from "../../src/components/ui/texts"; interface DisplayConditionSettingsProps { whereProp: string; diff --git a/src/components/recorder/LeftSidePanel.tsx b/legacy/src/LeftSidePanel.tsx similarity index 93% rename from src/components/recorder/LeftSidePanel.tsx rename to legacy/src/LeftSidePanel.tsx index 8fddbbb4..c9a6f41d 100644 --- a/src/components/recorder/LeftSidePanel.tsx +++ b/legacy/src/LeftSidePanel.tsx @@ -1,14 +1,14 @@ import { Box, Paper, Tab, Tabs } from "@mui/material"; import React, { useCallback, useEffect, useState } from "react"; -import { getActiveWorkflow, getParamsOfActiveWorkflow } from "../../api/workflow"; -import { useSocketStore } from '../../context/socket'; +import { getActiveWorkflow, getParamsOfActiveWorkflow } from "../../src/api/workflow"; +import { useSocketStore } from '../../src/context/socket'; import { WhereWhatPair, WorkflowFile } from "maxun-core"; -import { emptyWorkflow } from "../../shared/constants"; +import { emptyWorkflow } from "../../src/shared/constants"; import { LeftSidePanelContent } from "./LeftSidePanelContent"; -import { useGlobalInfoStore } from "../../context/globalInfo"; +import { useGlobalInfoStore } from "../../src/context/globalInfo"; import { TabContext, TabPanel } from "@mui/lab"; import { LeftSidePanelSettings } from "./LeftSidePanelSettings"; -import { RunSettings } from "../run/RunSettings"; +import { RunSettings } from "../../src/components/run/RunSettings"; const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => { getActiveWorkflow(id).then( diff --git a/src/components/recorder/LeftSidePanelContent.tsx b/legacy/src/LeftSidePanelContent.tsx similarity index 92% rename from src/components/recorder/LeftSidePanelContent.tsx rename to legacy/src/LeftSidePanelContent.tsx index 745e0e31..5ab5b2bd 100644 --- a/src/components/recorder/LeftSidePanelContent.tsx +++ b/legacy/src/LeftSidePanelContent.tsx @@ -1,11 +1,11 @@ import React, { useCallback, useEffect, useState } from 'react'; import { Pair } from "./Pair"; import { WhereWhatPair, WorkflowFile } from "maxun-core"; -import { useSocketStore } from "../../context/socket"; +import { useSocketStore } from "../../src/context/socket"; import { Socket } from "socket.io-client"; -import { AddButton } from "../ui/buttons/AddButton"; -import { AddPair } from "../../api/workflow"; -import { GenericModal } from "../ui/GenericModal"; +import { AddButton } from "../../src/components/ui/buttons/AddButton"; +import { AddPair } from "../../src/api/workflow"; +import { GenericModal } from "../../src/components/ui/GenericModal"; import { PairEditForm } from "./PairEditForm"; import { Tooltip } from "@mui/material"; diff --git a/src/components/recorder/LeftSidePanelSettings.tsx b/legacy/src/LeftSidePanelSettings.tsx similarity index 92% rename from src/components/recorder/LeftSidePanelSettings.tsx rename to legacy/src/LeftSidePanelSettings.tsx index 87c73ce1..b7c9b095 100644 --- a/src/components/recorder/LeftSidePanelSettings.tsx +++ b/legacy/src/LeftSidePanelSettings.tsx @@ -1,8 +1,8 @@ import React from "react"; import { Button, MenuItem, TextField, Typography } from "@mui/material"; -import { Dropdown } from "../ui/DropdownMui"; -import { RunSettings } from "../run/RunSettings"; -import { useSocketStore } from "../../context/socket"; +import { Dropdown } from "../../src/components/ui/DropdownMui"; +import { RunSettings } from "../../src/components/run/RunSettings"; +import { useSocketStore } from "../../src/context/socket"; interface LeftSidePanelSettingsProps { params: any[] diff --git a/src/components/recorder/Pair.tsx b/legacy/src/Pair.tsx similarity index 93% rename from src/components/recorder/Pair.tsx rename to legacy/src/Pair.tsx index 12d2eca7..c718c775 100644 --- a/src/components/recorder/Pair.tsx +++ b/legacy/src/Pair.tsx @@ -1,13 +1,13 @@ import React, { FC, useState } from 'react'; import { Stack, Button, IconButton, Tooltip, Badge } from "@mui/material"; -import { AddPair, deletePair, UpdatePair } from "../../api/workflow"; +import { AddPair, deletePair, UpdatePair } from "../../src/api/workflow"; import { WorkflowFile } from "maxun-core"; -import { ClearButton } from "../ui/buttons/ClearButton"; -import { GenericModal } from "../ui/GenericModal"; +import { ClearButton } from "../../src/components/ui/buttons/ClearButton"; +import { GenericModal } from "../../src/components/ui/GenericModal"; import { PairEditForm } from "./PairEditForm"; import { PairDisplayDiv } from "./PairDisplayDiv"; -import { EditButton } from "../ui/buttons/EditButton"; -import { BreakpointButton } from "../ui/buttons/BreakpointButton"; +import { EditButton } from "../../src/components/ui/buttons/EditButton"; +import { BreakpointButton } from "../../src/components/ui/buttons/BreakpointButton"; import VisibilityIcon from '@mui/icons-material/Visibility'; import styled from "styled-components"; import { LoadingButton } from "@mui/lab"; diff --git a/src/components/recorder/PairDetail.tsx b/legacy/src/PairDetail.tsx similarity index 97% rename from src/components/recorder/PairDetail.tsx rename to legacy/src/PairDetail.tsx index da9be05a..66098b17 100644 --- a/src/components/recorder/PairDetail.tsx +++ b/legacy/src/PairDetail.tsx @@ -6,12 +6,12 @@ import TreeView from '@mui/lab/TreeView'; import ExpandMoreIcon from '@mui/icons-material/ExpandMore'; import ChevronRightIcon from '@mui/icons-material/ChevronRight'; import TreeItem from '@mui/lab/TreeItem'; -import { AddButton } from "../ui/buttons/AddButton"; -import { WarningText } from "../ui/texts"; +import { AddButton } from "../../src/components/ui/buttons/AddButton"; +import { WarningText } from "../../src/components/ui/texts"; import NotificationImportantIcon from '@mui/icons-material/NotificationImportant'; -import { RemoveButton } from "../ui/buttons/RemoveButton"; +import { RemoveButton } from "../../src/components/ui/buttons/RemoveButton"; import { AddWhereCondModal } from "./AddWhereCondModal"; -import { useSocketStore } from "../../context/socket"; +import { useSocketStore } from "../../src/context/socket"; import { AddWhatCondModal } from "./AddWhatCondModal"; interface PairDetailProps { diff --git a/src/components/recorder/PairDisplayDiv.tsx b/legacy/src/PairDisplayDiv.tsx similarity index 100% rename from src/components/recorder/PairDisplayDiv.tsx rename to legacy/src/PairDisplayDiv.tsx diff --git a/src/components/recorder/PairEditForm.tsx b/legacy/src/PairEditForm.tsx similarity index 100% rename from src/components/recorder/PairEditForm.tsx rename to legacy/src/PairEditForm.tsx diff --git a/src/components/recorder/Renderer.tsx b/legacy/src/Renderer.tsx similarity index 100% rename from src/components/recorder/Renderer.tsx rename to legacy/src/Renderer.tsx diff --git a/maxun-core/package.json b/maxun-core/package.json index 5506f360..21b51e37 100644 --- a/maxun-core/package.json +++ b/maxun-core/package.json @@ -31,10 +31,10 @@ "license": "AGPL-3.0-or-later", "dependencies": { "@cliqz/adblocker-playwright": "^1.31.3", + "@types/node": "22.7.9", "cross-fetch": "^4.0.0", "joi": "^17.6.0", - "playwright": "^1.20.1", - "playwright-extra": "^4.3.6", - "puppeteer-extra-plugin-stealth": "^2.11.2" + "playwright-core": "1.57.0", + "turndown": "^7.2.2" } -} +} \ No newline at end of file diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index a34777d8..b909376a 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -1,5 +1,5 @@ /* eslint-disable no-await-in-loop, no-restricted-syntax */ -import { ElementHandle, Page, PageScreenshotOptions } from 'playwright'; +import { ElementHandle, Page, PageScreenshotOptions } from 'playwright-core'; import { PlaywrightBlocker } from '@cliqz/adblocker-playwright'; import fetch from 'cross-fetch'; import path from 'path'; @@ -144,7 +144,7 @@ export default class Interpreter extends EventEmitter { private async applyAdBlocker(page: Page): Promise { if (this.blocker) { try { - await this.blocker.enableBlockingInPage(page); + await this.blocker.enableBlockingInPage(page as any); } catch (err) { this.log(`Ad-blocker operation failed:`, Level.ERROR); } @@ -154,7 +154,7 @@ export default class Interpreter extends EventEmitter { private async disableAdBlocker(page: Page): Promise { if (this.blocker) { try { - await this.blocker.disableBlockingInPage(page); + await this.blocker.disableBlockingInPage(page as any); } catch (err) { this.log(`Ad-blocker operation failed:`, Level.ERROR); } @@ -460,8 +460,9 @@ export default class Interpreter extends EventEmitter { for (const link of links) { // eslint-disable-next-line this.concurrency.addJob(async () => { + let newPage = null; try { - const newPage = await context.newPage(); + newPage = await context.newPage(); await newPage.goto(link); await newPage.waitForLoadState('networkidle'); await this.runLoop(newPage, this.initializedWorkflow!); @@ -470,6 +471,14 @@ export default class Interpreter extends EventEmitter { // but newPage(), goto() and waitForLoadState() don't (and will kill // the interpreter by throwing). this.log(e, Level.ERROR); + } finally { + if (newPage && !newPage.isClosed()) { + try { + await newPage.close(); + } catch (closeError) { + this.log('Failed to close enqueued page', Level.WARN); + } + } } }); } @@ -1463,41 +1472,57 @@ export default class Interpreter extends EventEmitter { * User-requested concurrency should be entirely managed by the concurrency manager, * e.g. via `enqueueLinks`. */ - p.on('popup', (popup) => { + const popupHandler = (popup) => { this.concurrency.addJob(() => this.runLoop(popup, workflowCopy)); - }); + }; + p.on('popup', popupHandler); /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */ let loopIterations = 0; const MAX_LOOP_ITERATIONS = 1000; // Circuit breaker + + // Cleanup function to remove popup listener + const cleanup = () => { + try { + if (!p.isClosed()) { + p.removeListener('popup', popupHandler); + } + } catch (cleanupError) { + } + }; while (true) { if (this.isAborted) { this.log('Workflow aborted during step execution', Level.WARN); + cleanup(); return; } // Circuit breaker to prevent infinite loops if (++loopIterations > MAX_LOOP_ITERATIONS) { this.log('Maximum loop iterations reached, terminating to prevent infinite loop', Level.ERROR); + cleanup(); return; } // Checks whether the page was closed from outside, // or the workflow execution has been stopped via `interpreter.stop()` if (p.isClosed() || !this.stopper) { + cleanup(); return; } try { await p.waitForLoadState(); } catch (e) { + cleanup(); await p.close(); return; } if (workflowCopy.length === 0) { this.log('All actions completed. Workflow finished.', Level.LOG); + cleanup(); return; } @@ -1589,6 +1614,7 @@ export default class Interpreter extends EventEmitter { } } else { //await this.disableAdBlocker(p); + cleanup(); return; } } @@ -1681,4 +1707,44 @@ export default class Interpreter extends EventEmitter { throw new Error('Cannot stop, there is no running workflow!'); } } + /** + * Cleanup method to release resources and prevent memory leaks + * Call this when the interpreter is no longer needed + */ + public async cleanup(): Promise { + try { + // Stop any running workflows first + if (this.stopper) { + try { + await this.stop(); + } catch (error: any) { + this.log(`Error stopping workflow during cleanup: ${error.message}`, Level.WARN); + } + } + + // Clear ad-blocker resources + if (this.blocker) { + try { + this.blocker = null; + this.log('Ad-blocker resources cleared', Level.DEBUG); + } catch (error: any) { + this.log(`Error cleaning up ad-blocker: ${error.message}`, Level.WARN); + } + } + + // Clear accumulated data to free memory + this.cumulativeResults = []; + this.namedResults = {}; + this.serializableDataByType = { scrapeList: {}, scrapeSchema: {} }; + + // Reset state + this.isAborted = false; + this.initializedWorkflow = null; + + this.log('Interpreter cleanup completed', Level.DEBUG); + } catch (error: any) { + this.log(`Error during interpreter cleanup: ${error.message}`, Level.ERROR); + throw error; + } + } } \ No newline at end of file diff --git a/maxun-core/src/types/workflow.ts b/maxun-core/src/types/workflow.ts index 08b76ef9..91278009 100644 --- a/maxun-core/src/types/workflow.ts +++ b/maxun-core/src/types/workflow.ts @@ -1,4 +1,4 @@ -import { Page } from 'playwright'; +import { Page } from 'playwright-core'; import { naryOperators, unaryOperators, operators, meta, } from './logic'; diff --git a/package.json b/package.json index a7982fcd..79f6f966 100644 --- a/package.json +++ b/package.json @@ -12,8 +12,6 @@ "@mui/material": "^5.6.2", "@react-oauth/google": "^0.12.1", "@tanstack/react-query": "^5.90.2", - "@testing-library/react": "^13.1.1", - "@testing-library/user-event": "^13.5.0", "@types/bcrypt": "^5.0.2", "@types/body-parser": "^1.19.5", "@types/csurf": "^1.11.5", @@ -47,7 +45,6 @@ "i18next-browser-languagedetector": "^8.0.0", "i18next-http-backend": "^3.0.1", "idcac-playwright": "^0.1.3", - "ioredis": "^5.4.1", "joi": "^17.6.0", "joplin-turndown-plugin-gfm": "^1.0.12", "jsonwebtoken": "^9.0.2", @@ -62,19 +59,12 @@ "pg": "^8.13.0", "pg-boss": "^10.1.6", "pkce-challenge": "^4.1.0", - "playwright": "^1.48.2", - "playwright-extra": "^4.3.6", + "playwright-core": "1.57.0", "posthog-node": "^4.2.1", - "prismjs": "^1.28.0", - "puppeteer-extra-plugin-recaptcha": "^3.6.8", - "puppeteer-extra-plugin-stealth": "^2.11.2", "react": "^18.0.0", "react-dom": "^18.0.0", - "react-highlight": "0.15.0", "react-i18next": "^15.1.3", "react-router-dom": "^6.26.1", - "react-simple-code-editor": "^0.11.2", - "react-transition-group": "^4.4.2", "rrweb-snapshot": "^2.0.0-alpha.4", "sequelize": "^6.37.3", "sequelize-typescript": "^2.1.6", @@ -125,9 +115,6 @@ "@types/node": "22.7.9", "@types/node-cron": "^3.0.11", "@types/node-fetch": "^2.6.12", - "@types/prismjs": "^1.26.0", - "@types/react-highlight": "^0.12.5", - "@types/react-transition-group": "^4.4.4", "@types/styled-components": "^5.1.23", "@types/swagger-jsdoc": "^6.0.4", "@types/swagger-ui-express": "^4.1.6", @@ -144,4 +131,4 @@ "vite": "^5.4.10", "zod": "^3.25.62" } -} +} \ No newline at end of file diff --git a/server/src/api/record.ts b/server/src/api/record.ts index 1f567c3e..04f4ab15 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -1,6 +1,4 @@ import { Router, Request, Response } from 'express'; -import { chromium } from "playwright-extra"; -import stealthPlugin from 'puppeteer-extra-plugin-stealth'; import { requireAPIKey } from "../middlewares/api"; import Robot from "../models/Robot"; import Run from "../models/Run"; @@ -20,8 +18,6 @@ import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } fr import { sendWebhook } from "../routes/webhook"; import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape'; -chromium.use(stealthPlugin()); - const router = Router(); const formatRecording = (recordingData: any) => { @@ -676,6 +672,16 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ }; } + browser = browserPool.getRemoteBrowser(plainRun.browserId); + if (!browser) { + throw new Error('Could not access browser'); + } + + let currentPage = await browser.getCurrentPage(); + if (!currentPage) { + throw new Error('Could not create a new page'); + } + if (recording.recording_meta.type === 'scrape') { logger.log('info', `Executing scrape robot for API run ${id}`); @@ -705,7 +711,7 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ const SCRAPE_TIMEOUT = 120000; if (formats.includes('markdown')) { - const markdownPromise = convertPageToMarkdown(url); + const markdownPromise = convertPageToMarkdown(url, currentPage); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); }); @@ -714,7 +720,7 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ } if (formats.includes('html')) { - const htmlPromise = convertPageToHTML(url); + const htmlPromise = convertPageToHTML(url, currentPage); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); }); @@ -862,16 +868,6 @@ async function executeRun(id: string, userId: string, requestedFormats?: string[ plainRun.status = 'running'; - browser = browserPool.getRemoteBrowser(plainRun.browserId); - if (!browser) { - throw new Error('Could not access browser'); - } - - let currentPage = await browser.getCurrentPage(); - if (!currentPage) { - throw new Error('Could not create a new page'); - } - const workflow = AddGeneratedFlags(recording.recording); browser.interpreter.setRunId(plainRun.runId); diff --git a/server/src/browser-management/browserConnection.ts b/server/src/browser-management/browserConnection.ts new file mode 100644 index 00000000..5e43b3f5 --- /dev/null +++ b/server/src/browser-management/browserConnection.ts @@ -0,0 +1,156 @@ +import { chromium } from 'playwright-core'; +import type { Browser } from 'playwright-core'; +import logger from '../logger'; + +/** + * Configuration for connection retry logic + */ +const CONNECTION_CONFIG = { + maxRetries: 3, + retryDelay: 2000, + connectionTimeout: 30000, +}; + +/** + * Get the WebSocket endpoint from the browser service health check + * @returns Promise - The WebSocket endpoint URL with browser ID + */ +async function getBrowserServiceEndpoint(): Promise { + const healthPort = process.env.BROWSER_HEALTH_PORT || '3002'; + const healthHost = process.env.BROWSER_WS_HOST || 'localhost'; + const healthEndpoint = `http://${healthHost}:${healthPort}/health`; + + try { + logger.debug(`Fetching WebSocket endpoint from: ${healthEndpoint}`); + const response = await fetch(healthEndpoint); + const data = await response.json(); + + if (data.status === 'healthy' && data.wsEndpoint) { + logger.debug(`Got WebSocket endpoint: ${data.wsEndpoint}`); + return data.wsEndpoint; + } + + throw new Error('Health check did not return a valid wsEndpoint'); + } catch (error: any) { + logger.error(`Failed to fetch endpoint from health check: ${error.message}`); + throw new Error( + `Browser service is not accessible at ${healthEndpoint}. ` + + `Make sure the browser service is running (docker-compose up browser)` + ); + } +} + +/** + * Launch a local browser as fallback when browser service is unavailable + * @returns Promise - Locally launched browser instance + */ +async function launchLocalBrowser(): Promise { + logger.warn('Attempting to launch local browser'); + logger.warn('Note: This requires Chromium binaries to be installed (npx playwright install chromium)'); + + try { + const browser = await chromium.launch({ + headless: true, + args: [ + '--disable-blink-features=AutomationControlled', + '--disable-web-security', + '--disable-features=IsolateOrigins,site-per-process', + '--disable-site-isolation-trials', + '--disable-extensions', + '--no-sandbox', + '--disable-dev-shm-usage', + '--disable-gpu', + '--force-color-profile=srgb', + '--force-device-scale-factor=2', + '--ignore-certificate-errors', + '--mute-audio' + ], + }); + + logger.info('Successfully launched local browser'); + return browser; + } catch (error: any) { + logger.error(`Failed to launch local browser: ${error.message}`); + throw new Error( + `Could not launch local browser. ` + + `Please either:\n` + + ` 1. Start the browser service: docker-compose up browser\n` + + ` 2. Install Chromium binaries: npx playwright@1.57.0 install chromium` + ); + } +} + +/** + * Connect to the remote browser service with retry logic, with fallback to local browser + * @param retries - Number of connection attempts (default: 3) + * @returns Promise - Connected browser instance (remote or local) + * @throws Error if both remote connection and local launch fail + */ +export async function connectToRemoteBrowser(retries?: number): Promise { + const maxRetries = retries ?? CONNECTION_CONFIG.maxRetries; + + try { + const wsEndpoint = await getBrowserServiceEndpoint(); + logger.info(`Connecting to browser service at ${wsEndpoint}...`); + + for (let attempt = 1; attempt <= maxRetries; attempt++) { + try { + logger.debug(`Connection attempt ${attempt}/${maxRetries}`); + + const browser = await chromium.connect(wsEndpoint, { + timeout: CONNECTION_CONFIG.connectionTimeout, + }); + + logger.info('Successfully connected to browser service'); + return browser; + } catch (error: any) { + logger.warn( + `Connection attempt ${attempt}/${maxRetries} failed: ${error.message}` + ); + + if (attempt === maxRetries) { + logger.error( + `Failed to connect to browser service after ${maxRetries} attempts` + ); + throw new Error(`Remote connection failed: ${error.message}`); + } + + logger.debug(`Waiting ${CONNECTION_CONFIG.retryDelay}ms before retry...`); + await new Promise(resolve => setTimeout(resolve, CONNECTION_CONFIG.retryDelay)); + } + } + + throw new Error('Failed to connect to browser service'); + } catch (error: any) { + logger.warn(`Browser service connection failed: ${error.message}`); + logger.warn('Falling back to local browser launch...'); + + return await launchLocalBrowser(); + } +} + +/** + * Check if browser service is healthy + * @returns Promise - true if service is healthy + */ +export async function checkBrowserServiceHealth(): Promise { + try { + const healthPort = process.env.BROWSER_HEALTH_PORT || '3002'; + const healthHost = process.env.BROWSER_WS_HOST || 'localhost'; + const healthEndpoint = `http://${healthHost}:${healthPort}/health`; + + const response = await fetch(healthEndpoint); + const data = await response.json(); + + if (data.status === 'healthy') { + logger.info('Browser service health check passed'); + return true; + } + + logger.warn('Browser service health check failed:', data); + return false; + } catch (error: any) { + logger.error('Browser service health check error:', error.message); + return false; + } +} diff --git a/server/src/browser-management/classes/RemoteBrowser.ts b/server/src/browser-management/classes/RemoteBrowser.ts index 731b583d..266a0978 100644 --- a/server/src/browser-management/classes/RemoteBrowser.ts +++ b/server/src/browser-management/classes/RemoteBrowser.ts @@ -2,11 +2,9 @@ import { Page, Browser, CDPSession, - BrowserContext, -} from 'playwright'; + BrowserContext +} from 'playwright-core'; import { Socket } from "socket.io"; -import { chromium } from 'playwright-extra'; -import stealthPlugin from 'puppeteer-extra-plugin-stealth'; import { PlaywrightBlocker } from '@cliqz/adblocker-playwright'; import fetch from 'cross-fetch'; import logger from '../../logger'; @@ -17,6 +15,7 @@ import { getDecryptedProxyConfig } from '../../routes/proxy'; import { getInjectableScript } from 'idcac-playwright'; import { FingerprintInjector } from "fingerprint-injector"; import { FingerprintGenerator } from "fingerprint-generator"; +import { connectToRemoteBrowser } from '../browserConnection'; declare global { interface Window { @@ -39,8 +38,6 @@ interface ProcessedSnapshot { baseUrl: string; } -chromium.use(stealthPlugin()); - const MEMORY_CONFIG = { gcInterval: 20000, // Check memory more frequently (20s instead of 60s) maxHeapSize: 1536 * 1024 * 1024, // 1.5GB @@ -460,26 +457,10 @@ export class RemoteBrowser { const initializationPromise = (async () => { while (!success && retryCount < MAX_RETRIES) { try { - this.browser = (await chromium.launch({ - headless: true, - args: [ - "--disable-blink-features=AutomationControlled", - "--disable-web-security", - "--disable-features=IsolateOrigins,site-per-process", - "--disable-site-isolation-trials", - "--disable-extensions", - "--no-sandbox", - "--disable-dev-shm-usage", - "--disable-gpu", - "--force-color-profile=srgb", - "--force-device-scale-factor=2", - "--ignore-certificate-errors", - "--mute-audio" - ], - })); + this.browser = await connectToRemoteBrowser(); if (!this.browser || this.browser.isConnected() === false) { - throw new Error('Browser failed to launch or is not connected'); + throw new Error('Browser failed to launch or is not connected'); } this.emitLoadingProgress(20, 0); diff --git a/server/src/browser-management/inputHandlers.ts b/server/src/browser-management/inputHandlers.ts index 69b2697d..35505f55 100644 --- a/server/src/browser-management/inputHandlers.ts +++ b/server/src/browser-management/inputHandlers.ts @@ -7,7 +7,7 @@ import { Socket } from 'socket.io'; import logger from "../logger"; import { Coordinates, ScrollDeltas, KeyboardInput, DatePickerEventData } from '../types'; import { browserPool } from "../server"; -import { Page } from "playwright"; +import { Page } from "playwright-core"; import { CustomActions } from "../../../src/shared/types"; import { WhereWhatPair } from "maxun-core"; import { RemoteBrowser } from './classes/RemoteBrowser'; diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts index 6821bfdb..52ae19bf 100644 --- a/server/src/markdownify/scrape.ts +++ b/server/src/markdownify/scrape.ts @@ -1,17 +1,35 @@ -import { chromium } from "playwright"; +import { connectToRemoteBrowser } from "../browser-management/browserConnection"; import { parseMarkdown } from "./markdown"; +import logger from "../logger"; + +async function gotoWithFallback(page: any, url: string) { + try { + return await page.goto(url, { + waitUntil: "networkidle", + timeout: 100000, + }); + } catch (err) { + // fallback: JS-heavy or unstable sites + return await page.goto(url, { + waitUntil: "domcontentloaded", + timeout: 100000, + }); + } +} /** * Fetches a webpage, strips scripts/styles/images/etc, * returns clean Markdown using parser. + * @param url - The URL to convert + * @param existingPage - Optional existing Playwright page instance to reuse */ export async function convertPageToMarkdown(url: string): Promise { - const browser = await chromium.launch(); + const browser = await connectToRemoteBrowser(); const page = await browser.newPage(); await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); - await page.addInitScript(() => { + const cleanedHtml = await page.evaluate(() => { const selectors = [ "script", "style", @@ -42,14 +60,16 @@ export async function convertPageToMarkdown(url: string): Promise { } }); }); - }); - // Re-extract HTML after cleanup - const cleanedHtml = await page.evaluate(() => { return document.documentElement.outerHTML; }); - await browser.close(); + if (shouldCloseBrowser && browser) { + logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`); + await browser.close(); + } else { + logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`); + } // Convert cleaned HTML → Markdown const markdown = await parseMarkdown(cleanedHtml, url); @@ -59,14 +79,16 @@ export async function convertPageToMarkdown(url: string): Promise { /** * Fetches a webpage, strips scripts/styles/images/etc, * returns clean HTML. + * @param url - The URL to convert + * @param existingPage - Optional existing Playwright page instance to reuse */ export async function convertPageToHTML(url: string): Promise { - const browser = await chromium.launch(); + const browser = await connectToRemoteBrowser(); const page = await browser.newPage(); await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); - await page.addInitScript(() => { + const cleanedHtml = await page.evaluate(() => { const selectors = [ "script", "style", @@ -97,14 +119,16 @@ export async function convertPageToHTML(url: string): Promise { } }); }); - }); - // Re-extract HTML after cleanup - const cleanedHtml = await page.evaluate(() => { return document.documentElement.outerHTML; }); - await browser.close(); + if (shouldCloseBrowser && browser) { + logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`); + await browser.close(); + } else { + logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`); + } // Return cleaned HTML directly return cleanedHtml; diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index 3545cb3a..e7d4e115 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -13,7 +13,7 @@ import { WorkflowFile } from 'maxun-core'; import Run from './models/Run'; import Robot from './models/Robot'; import { browserPool } from './server'; -import { Page } from 'playwright'; +import { Page } from 'playwright-core'; import { capture } from './utils/analytics'; import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from './workflow-management/integrations/gsheet'; import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from './workflow-management/integrations/airtable'; @@ -192,7 +192,7 @@ async function processRunExecution(job: Job) { logger.log('info', `Browser ${browserId} found and ready for execution`); - try { + try { // Find the recording const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); @@ -200,6 +200,30 @@ async function processRunExecution(job: Job) { throw new Error(`Recording for run ${data.runId} not found`); } + let currentPage = browser.getCurrentPage(); + + const pageWaitStart = Date.now(); + let lastPageLogTime = 0; + let pageAttempts = 0; + const MAX_PAGE_ATTEMPTS = 15; + + while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) { + const currentTime = Date.now(); + pageAttempts++; + + if (currentTime - lastPageLogTime > 5000) { + logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`); + lastPageLogTime = currentTime; + } + + await new Promise(resolve => setTimeout(resolve, 1000)); + currentPage = browser.getCurrentPage(); + } + + if (!currentPage) { + throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`); + } + if (recording.recording_meta.type === 'scrape') { logger.log('info', `Executing scrape robot for run ${data.runId}`); @@ -224,7 +248,7 @@ async function processRunExecution(job: Job) { const SCRAPE_TIMEOUT = 120000; if (formats.includes('markdown')) { - const markdownPromise = convertPageToMarkdown(url); + const markdownPromise = convertPageToMarkdown(url, currentPage); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); }); @@ -233,7 +257,7 @@ async function processRunExecution(job: Job) { } if (formats.includes('html')) { - const htmlPromise = convertPageToHTML(url); + const htmlPromise = convertPageToHTML(url, currentPage); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); }); @@ -347,30 +371,6 @@ async function processRunExecution(job: Job) { } }; - let currentPage = browser.getCurrentPage(); - - const pageWaitStart = Date.now(); - let lastPageLogTime = 0; - let pageAttempts = 0; - const MAX_PAGE_ATTEMPTS = 15; - - while (!currentPage && (Date.now() - pageWaitStart) < BROWSER_PAGE_TIMEOUT && pageAttempts < MAX_PAGE_ATTEMPTS) { - const currentTime = Date.now(); - pageAttempts++; - - if (currentTime - lastPageLogTime > 5000) { - logger.log('info', `Page not ready for browser ${browserId}, waiting... (${Math.round((currentTime - pageWaitStart) / 1000)}s elapsed)`); - lastPageLogTime = currentTime; - } - - await new Promise(resolve => setTimeout(resolve, 1000)); - currentPage = browser.getCurrentPage(); - } - - if (!currentPage) { - throw new Error(`No current page available for browser ${browserId} after ${BROWSER_PAGE_TIMEOUT/1000}s timeout`); - } - logger.log('info', `Starting workflow execution for run ${data.runId}`); await run.update({ diff --git a/server/src/routes/integration.ts b/server/src/routes/integration.ts deleted file mode 100644 index 3c4672c5..00000000 --- a/server/src/routes/integration.ts +++ /dev/null @@ -1,19 +0,0 @@ -import { Router } from 'express'; -import logger from "../logger"; -// import { loadIntegrations, saveIntegrations } from '../workflow-management/integrations/gsheet'; -import { requireSignIn } from '../middlewares/auth'; - -export const router = Router(); - -router.post('/upload-credentials', requireSignIn, async (req, res) => { - try { - const { fileName, credentials, spreadsheetId, range } = req.body; - if (!fileName || !credentials || !spreadsheetId || !range) { - return res.status(400).json({ message: 'Credentials, Spreadsheet ID, and Range are required.' }); - } - // *** TEMPORARILY WE STORE CREDENTIALS HERE *** - } catch (error: any) { - logger.log('error', `Error saving credentials: ${error.message}`); - return res.status(500).json({ message: 'Failed to save credentials.', error: error.message }); - } -}); \ No newline at end of file diff --git a/server/src/routes/proxy.ts b/server/src/routes/proxy.ts index 5cdca0fc..52cced53 100644 --- a/server/src/routes/proxy.ts +++ b/server/src/routes/proxy.ts @@ -1,10 +1,8 @@ import { Router, Request, Response } from 'express'; -import { chromium } from 'playwright-extra'; -import stealthPlugin from 'puppeteer-extra-plugin-stealth'; +import { connectToRemoteBrowser } from '../browser-management/browserConnection'; import User from '../models/User'; import { encrypt, decrypt } from '../utils/auth'; import { requireSignIn } from '../middlewares/auth'; -chromium.use(stealthPlugin()); export const router = Router(); @@ -86,11 +84,7 @@ router.get('/test', requireSignIn, async (req: Request, res: Response) => { }), }; - const browser = await chromium.launch({ - headless: true, - proxy: proxyOptions, - args:["--ignore-certificate-errors"] - }); + const browser = await connectToRemoteBrowser(); const page = await browser.newPage(); await page.goto('https://example.com'); await browser.close(); diff --git a/server/src/routes/record.ts b/server/src/routes/record.ts index 6676fdb3..8992b1e3 100644 --- a/server/src/routes/record.ts +++ b/server/src/routes/record.ts @@ -13,14 +13,11 @@ import { destroyRemoteBrowser, canCreateBrowserInState, } from '../browser-management/controller'; -import { chromium } from 'playwright-extra'; -import stealthPlugin from 'puppeteer-extra-plugin-stealth'; import logger from "../logger"; import { requireSignIn } from '../middlewares/auth'; import { pgBossClient } from '../storage/pgboss'; export const router = Router(); -chromium.use(stealthPlugin()); export interface AuthenticatedRequest extends Request { user?: any; diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 79e94eab..72518c7b 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -1,8 +1,6 @@ import { Router } from 'express'; import logger from "../logger"; import { createRemoteBrowserForRun, destroyRemoteBrowser, getActiveBrowserIdByState } from "../browser-management/controller"; -import { chromium } from 'playwright-extra'; -import stealthPlugin from 'puppeteer-extra-plugin-stealth'; import { browserPool } from "../server"; import { v4 as uuid } from "uuid"; import moment from 'moment-timezone'; diff --git a/server/src/types/index.ts b/server/src/types/index.ts index 75aac802..45b21ca4 100644 --- a/server/src/types/index.ts +++ b/server/src/types/index.ts @@ -1,4 +1,4 @@ -import {BrowserType, LaunchOptions} from "playwright"; +import {BrowserType, LaunchOptions} from "playwright-core"; /** * Interpreter settings properties including recording parameters. diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index bb19b465..63a466d0 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -2,7 +2,7 @@ import { Action, ActionType, Coordinates, TagName, DatePickerEventData } from ". import { WhereWhatPair, WorkflowFile } from 'maxun-core'; import logger from "../../logger"; import { Socket } from "socket.io"; -import { Page } from "playwright"; +import { Page } from "playwright-core"; import { getElementInformation, getRect, diff --git a/server/src/workflow-management/classes/Interpreter.ts b/server/src/workflow-management/classes/Interpreter.ts index 07f99dfe..03e9ef87 100644 --- a/server/src/workflow-management/classes/Interpreter.ts +++ b/server/src/workflow-management/classes/Interpreter.ts @@ -1,7 +1,7 @@ import Interpreter, { WorkflowFile } from "maxun-core"; import logger from "../../logger"; import { Socket } from "socket.io"; -import { Page } from "playwright"; +import { Page } from "playwright-core"; import { InterpreterSettings } from "../../types"; import { decrypt } from "../../utils/auth"; import Run from "../../models/Run"; diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 30ed892b..49237522 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -1,6 +1,4 @@ import { v4 as uuid } from "uuid"; -import { chromium } from 'playwright-extra'; -import stealthPlugin from 'puppeteer-extra-plugin-stealth'; import { io, Socket } from "socket.io-client"; import { createRemoteBrowserForRun, destroyRemoteBrowser } from '../../browser-management/controller'; import logger from '../../logger'; @@ -12,11 +10,10 @@ import { getDecryptedProxyConfig } from "../../routes/proxy"; import { BinaryOutputService } from "../../storage/mino"; import { capture } from "../../utils/analytics"; import { WorkflowFile } from "maxun-core"; -import { Page } from "playwright"; +import { Page } from "playwright-core"; import { sendWebhook } from "../../routes/webhook"; import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable"; import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape"; -chromium.use(stealthPlugin()); async function createWorkflowAndStoreMetadata(id: string, userId: string) { try { @@ -220,6 +217,16 @@ async function executeRun(id: string, userId: string) { } } + browser = browserPool.getRemoteBrowser(plainRun.browserId); + if (!browser) { + throw new Error('Could not access browser'); + } + + let currentPage = await browser.getCurrentPage(); + if (!currentPage) { + throw new Error('Could not create a new page'); + } + if (recording.recording_meta.type === 'scrape') { logger.log('info', `Executing scrape robot for scheduled run ${id}`); @@ -266,7 +273,7 @@ async function executeRun(id: string, userId: string) { // Markdown conversion if (formats.includes("markdown")) { - const markdownPromise = convertPageToMarkdown(url); + const markdownPromise = convertPageToMarkdown(url, currentPage); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error(`Markdown conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); }); @@ -275,7 +282,7 @@ async function executeRun(id: string, userId: string) { } if (formats.includes("html")) { - const htmlPromise = convertPageToHTML(url); + const htmlPromise = convertPageToHTML(url, currentPage); const timeoutPromise = new Promise((_, reject) => { setTimeout(() => reject(new Error(`HTML conversion timed out after ${SCRAPE_TIMEOUT/1000}s`)), SCRAPE_TIMEOUT); }); @@ -412,16 +419,6 @@ async function executeRun(id: string, userId: string) { logger.log('warn', `Failed to send run-started notification for run ${plainRun.runId}: ${socketError.message}`); } - browser = browserPool.getRemoteBrowser(plainRun.browserId); - if (!browser) { - throw new Error('Could not access browser'); - } - - let currentPage = await browser.getCurrentPage(); - if (!currentPage) { - throw new Error('Could not create a new page'); - } - const workflow = AddGeneratedFlags(recording.recording); // Set run ID for real-time data persistence diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 43c1d2f8..b5f8051f 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -1,4 +1,4 @@ -import { Page } from "playwright"; +import { Page } from "playwright-core"; import { Coordinates } from "../types"; import { WhereWhatPair, WorkflowFile } from "maxun-core"; import logger from "../logger"; diff --git a/src/components/browser/BrowserWindow.tsx b/src/components/browser/BrowserWindow.tsx index 8bdabaae..7dd8b2cf 100644 --- a/src/components/browser/BrowserWindow.tsx +++ b/src/components/browser/BrowserWindow.tsx @@ -304,8 +304,6 @@ export const BrowserWindow = () => { const createFieldsFromChildSelectors = useCallback( (childSelectors: string[], listSelector: string) => { - if (!childSelectors.length || !currentSnapshot) return {}; - const iframeElement = document.querySelector( "#dom-browser-iframe" ) as HTMLIFrameElement; @@ -323,7 +321,6 @@ export const BrowserWindow = () => { const uniqueChildSelectors = [...new Set(childSelectors)]; - // Filter child selectors that occur in at least 2 out of first 10 list elements const validateChildSelectors = (selectors: string[]): string[] => { try { // Get first 10 list elements @@ -352,13 +349,10 @@ export const BrowserWindow = () => { // If we can't access the element, it's likely in shadow DOM - include it if (!testElement) { - console.log(`Including potentially shadow DOM selector: ${selector}`); validSelectors.push(selector); continue; } } catch (accessError) { - // If there's an error accessing, assume shadow DOM and include it - console.log(`Including selector due to access error: ${selector}`); validSelectors.push(selector); continue; } @@ -395,7 +389,6 @@ export const BrowserWindow = () => { } }; - // Enhanced XPath evaluation for multiple elements const evaluateXPathAllWithShadowSupport = ( document: Document, xpath: string, @@ -423,8 +416,6 @@ export const BrowserWindow = () => { return elements; } - // If shadow DOM is indicated and regular XPath fails, use shadow DOM traversal - // This is a simplified version - for multiple elements, we'll primarily rely on regular XPath return elements; } catch (err) { console.error("XPath evaluation failed:", xpath, err); @@ -432,7 +423,9 @@ export const BrowserWindow = () => { } }; - const validatedChildSelectors = validateChildSelectors(uniqueChildSelectors); + const isValidData = (text: string | null | undefined): boolean => { + return !!text && text.trim().length > 0; + }; const isElementVisible = (element: HTMLElement): boolean => { try { @@ -443,443 +436,119 @@ export const BrowserWindow = () => { } }; - const isValidData = (data: string): boolean => { - if (!data || data.trim().length === 0) return false; + const createFieldData = (element: HTMLElement, selector: string, forceAttribute?: string) => { + const tagName = element.tagName.toLowerCase(); + let data = ''; + let attribute = forceAttribute || 'innerText'; - const trimmed = data.trim(); - - // Filter out single letters - if (trimmed.length === 1) { - return false; - } - - // Filter out pure symbols/punctuation - if (trimmed.length < 3 && /^[^\w\s]+$/.test(trimmed)) { - return false; - } - - // Filter out whitespace and punctuation only - if (/^[\s\p{P}\p{S}]*$/u.test(trimmed)) return false; - - return trimmed.length > 0; - }; - - // Enhanced shadow DOM-aware element evaluation - const evaluateXPathWithShadowSupport = ( - document: Document, - xpath: string, - isShadow: boolean = false - ): Element | null => { - try { - // First try regular XPath evaluation - const result = document.evaluate( - xpath, - document, - null, - XPathResult.FIRST_ORDERED_NODE_TYPE, - null - ).singleNodeValue as Element | null; - - if (!isShadow || result) { - return result; + if (forceAttribute) { + if (forceAttribute === 'href') { + data = element.getAttribute('href') || ''; + } else if (forceAttribute === 'innerText') { + data = (element.textContent || '').trim(); } - - // If shadow DOM is indicated and regular XPath fails, use shadow DOM traversal - let cleanPath = xpath; - let isIndexed = false; - - const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/); - if (indexedMatch) { - cleanPath = indexedMatch[1] + indexedMatch[3]; - isIndexed = true; + } else if (tagName === 'img') { + data = element.getAttribute('src') || ''; + attribute = 'src'; + } else if (tagName === 'a') { + const href = element.getAttribute('href') || ''; + const text = (element.textContent || '').trim(); + if (href && href !== '#' && !href.startsWith('javascript:')) { + data = href; + attribute = 'href'; + } else if (text) { + data = text; + attribute = 'innerText'; } - - const pathParts = cleanPath - .replace(/^\/\//, "") - .split("/") - .map((p) => p.trim()) - .filter((p) => p.length > 0); - - let currentContexts: (Document | Element | ShadowRoot)[] = [document]; - - for (let i = 0; i < pathParts.length; i++) { - const part = pathParts[i]; - const nextContexts: (Element | ShadowRoot)[] = []; - - for (const ctx of currentContexts) { - const positionalMatch = part.match(/^([^[]+)\[(\d+)\]$/); - let partWithoutPosition = part; - let requestedPosition: number | null = null; - - if (positionalMatch) { - partWithoutPosition = positionalMatch[1]; - requestedPosition = parseInt(positionalMatch[2]); - } - - const matched = queryInsideContext(ctx, partWithoutPosition); - - let elementsToAdd = matched; - if (requestedPosition !== null) { - const index = requestedPosition - 1; - if (index >= 0 && index < matched.length) { - elementsToAdd = [matched[index]]; - } else { - elementsToAdd = []; - } - } - - elementsToAdd.forEach((el) => { - nextContexts.push(el); - if (el.shadowRoot) { - nextContexts.push(el.shadowRoot); - } - }); - } - - if (nextContexts.length === 0) { - return null; - } - - currentContexts = nextContexts; - } - - if (currentContexts.length > 0) { - if (isIndexed && indexedMatch) { - const requestedIndex = parseInt(indexedMatch[2]) - 1; - if (requestedIndex >= 0 && requestedIndex < currentContexts.length) { - return currentContexts[requestedIndex] as Element; - } else { - return null; - } - } - - return currentContexts[0] as Element; - } - - return null; - } catch (err) { - console.error("XPath evaluation failed:", xpath, err); - return null; - } - }; - - const queryInsideContext = ( - context: Document | Element | ShadowRoot, - part: string - ): Element[] => { - try { - const { tagName, conditions } = parseXPathPart(part); - - const candidateElements = Array.from(context.querySelectorAll(tagName)); - if (candidateElements.length === 0) { - return []; - } - - const matchingElements = candidateElements.filter((el) => { - return elementMatchesConditions(el, conditions); - }); - - return matchingElements; - } catch (err) { - console.error("Error in queryInsideContext:", err); - return []; - } - }; - - const parseXPathPart = ( - part: string - ): { tagName: string; conditions: string[] } => { - const tagMatch = part.match(/^([a-zA-Z0-9-]+)/); - const tagName = tagMatch ? tagMatch[1] : "*"; - - const conditionMatches = part.match(/\[([^\]]+)\]/g); - const conditions = conditionMatches - ? conditionMatches.map((c) => c.slice(1, -1)) - : []; - - return { tagName, conditions }; - }; - - const elementMatchesConditions = ( - element: Element, - conditions: string[] - ): boolean => { - for (const condition of conditions) { - if (!elementMatchesCondition(element, condition)) { - return false; - } - } - return true; - }; - - const elementMatchesCondition = ( - element: Element, - condition: string - ): boolean => { - condition = condition.trim(); - - if (/^\d+$/.test(condition)) { - return true; + } else { + data = (element.textContent || '').trim(); + attribute = 'innerText'; } - // Handle @attribute="value" - const attrMatch = condition.match(/^@([^=]+)=["']([^"']+)["']$/); - if (attrMatch) { - const [, attr, value] = attrMatch; - const elementValue = element.getAttribute(attr); - return elementValue === value; - } + if (!data) return null; - // Handle contains(@class, 'value') - const classContainsMatch = condition.match( - /^contains\(@class,\s*["']([^"']+)["']\)$/ - ); - if (classContainsMatch) { - const className = classContainsMatch[1]; - return element.classList.contains(className); - } - - // Handle contains(@attribute, 'value') - const attrContainsMatch = condition.match( - /^contains\(@([^,]+),\s*["']([^"']+)["']\)$/ - ); - if (attrContainsMatch) { - const [, attr, value] = attrContainsMatch; - const elementValue = element.getAttribute(attr) || ""; - return elementValue.includes(value); - } - - // Handle text()="value" - const textMatch = condition.match(/^text\(\)=["']([^"']+)["']$/); - if (textMatch) { - const expectedText = textMatch[1]; - const elementText = element.textContent?.trim() || ""; - return elementText === expectedText; - } - - // Handle contains(text(), 'value') - const textContainsMatch = condition.match( - /^contains\(text\(\),\s*["']([^"']+)["']\)$/ - ); - if (textContainsMatch) { - const expectedText = textContainsMatch[1]; - const elementText = element.textContent?.trim() || ""; - return elementText.includes(expectedText); - } - - // Handle count(*)=0 (element has no children) - if (condition === "count(*)=0") { - return element.children.length === 0; - } - - // Handle other count conditions - const countMatch = condition.match(/^count\(\*\)=(\d+)$/); - if (countMatch) { - const expectedCount = parseInt(countMatch[1]); - return element.children.length === expectedCount; - } - - return true; - }; - - // Enhanced value extraction with shadow DOM support - const extractValueWithShadowSupport = ( - element: Element, - attribute: string - ): string | null => { - if (!element) return null; - - const baseURL = - element.ownerDocument?.location?.href || window.location.origin; - - // Check shadow DOM content first - if (element.shadowRoot) { - const shadowContent = element.shadowRoot.textContent; - if (shadowContent?.trim()) { - return shadowContent.trim(); + return { + data, + selectorObj: { + selector, + attribute, + tag: tagName.toUpperCase(), + isShadow: element.getRootNode() instanceof ShadowRoot } - } - - if (attribute === "innerText") { - let textContent = - (element as HTMLElement).innerText?.trim() || - (element as HTMLElement).textContent?.trim(); - - if (!textContent) { - const dataAttributes = [ - "data-600", - "data-text", - "data-label", - "data-value", - "data-content", - ]; - for (const attr of dataAttributes) { - const dataValue = element.getAttribute(attr); - if (dataValue && dataValue.trim()) { - textContent = dataValue.trim(); - break; - } - } - } - - return textContent || null; - } else if (attribute === "innerHTML") { - return element.innerHTML?.trim() || null; - } else if (attribute === "href") { - let anchorElement = element; - - if (element.tagName !== "A") { - anchorElement = - element.closest("a") || - element.parentElement?.closest("a") || - element; - } - - const hrefValue = anchorElement.getAttribute("href"); - if (!hrefValue || hrefValue.trim() === "") { - return null; - } - - try { - return new URL(hrefValue, baseURL).href; - } catch (e) { - console.warn("Error creating URL from", hrefValue, e); - return hrefValue; - } - } else if (attribute === "src") { - const attrValue = element.getAttribute(attribute); - const dataAttr = attrValue || element.getAttribute("data-" + attribute); - - if (!dataAttr || dataAttr.trim() === "") { - const style = window.getComputedStyle(element as HTMLElement); - const bgImage = style.backgroundImage; - if (bgImage && bgImage !== "none") { - const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); - return matches ? new URL(matches[1], baseURL).href : null; - } - return null; - } - - try { - return new URL(dataAttr, baseURL).href; - } catch (e) { - console.warn("Error creating URL from", dataAttr, e); - return dataAttr; - } - } - return element.getAttribute(attribute); - }; - - // Simple deepest child finder - limit depth to prevent hanging - const findDeepestChild = (element: HTMLElement): HTMLElement => { - let deepest = element; - let maxDepth = 0; - - const traverse = (el: HTMLElement, depth: number) => { - if (depth > 3) return; - - const text = el.textContent?.trim() || ""; - if (isValidData(text) && depth > maxDepth) { - maxDepth = depth; - deepest = el; - } - - const children = Array.from(el.children).slice(0, 3); - children.forEach((child) => { - if (child instanceof HTMLElement) { - traverse(child, depth + 1); - } - }); }; - - traverse(element, 0); - return deepest; }; - validatedChildSelectors.forEach((childSelector, index) => { + const validatedChildSelectors = validateChildSelectors(uniqueChildSelectors); + + validatedChildSelectors.forEach((selector, index) => { try { - // Detect if this selector should use shadow DOM traversal - const isShadowSelector = childSelector.includes('>>') || - childSelector.startsWith('//') && - (listSelector.includes('>>') || currentSnapshot?.snapshot); - - const element = evaluateXPathWithShadowSupport( + const elements = evaluateXPathAllWithShadowSupport( iframeElement.contentDocument!, - childSelector, - isShadowSelector - ) as HTMLElement; + selector, + selector.includes(">>") || selector.startsWith("//") + ); - if (element && isElementVisible(element)) { + if (elements.length === 0) return; + + const element = elements[0] as HTMLElement; + const tagName = element.tagName.toLowerCase(); + const isShadow = element.getRootNode() instanceof ShadowRoot; + + if (isElementVisible(element)) { const rect = element.getBoundingClientRect(); const position = { x: rect.left, y: rect.top }; - const tagName = element.tagName.toLowerCase(); - const isShadow = element.getRootNode() instanceof ShadowRoot; - - if (tagName === "a") { - const anchor = element as HTMLAnchorElement; - const href = extractValueWithShadowSupport(anchor, "href"); - const text = extractValueWithShadowSupport(anchor, "innerText"); - - if ( - href && - href.trim() !== "" && - href !== window.location.href && - !href.startsWith("javascript:") && - !href.startsWith("#") - ) { - const fieldIdHref = Date.now() + index * 1000; - - candidateFields.push({ - id: fieldIdHref, - element: element, - isLeaf: true, - depth: 0, - position: position, - field: { - id: fieldIdHref, - type: "text", - label: `Label ${index * 2 + 1}`, - data: href, - selectorObj: { - selector: childSelector, - tag: element.tagName, - isShadow: isShadow, - attribute: "href", - }, - }, - }); - } - - const fieldIdText = Date.now() + index * 1000 + 1; + if (tagName === 'a') { + const href = element.getAttribute('href'); + const text = (element.textContent || '').trim(); if (text && isValidData(text)) { - candidateFields.push({ - id: fieldIdText, - element: element, - isLeaf: true, - depth: 0, - position: position, - field: { - id: fieldIdText, - type: "text", - label: `Label ${index * 2 + 2}`, - data: text, - selectorObj: { - selector: childSelector, - tag: element.tagName, - isShadow: isShadow, - attribute: "innerText", - }, - }, - }); + const textField = createFieldData(element, selector, 'innerText'); + if (textField && textField.data) { + const fieldId = Date.now() + index * 1000; + + candidateFields.push({ + id: fieldId, + element: element, + isLeaf: true, + depth: 0, + position: position, + field: { + id: fieldId, + type: "text", + label: `Label ${index * 2 + 1}`, + data: textField.data, + selectorObj: textField.selectorObj + } + }); + } + } + + if (href && href !== '#' && !href.startsWith('javascript:')) { + const hrefField = createFieldData(element, selector, 'href'); + if (hrefField && hrefField.data) { + const fieldId = Date.now() + index * 1000 + 1; + + candidateFields.push({ + id: fieldId, + element: element, + isLeaf: true, + depth: 0, + position: position, + field: { + id: fieldId, + type: "text", + label: `Label ${index * 2 + 2}`, + data: hrefField.data, + selectorObj: hrefField.selectorObj + } + }); + } } } else if (tagName === "img") { - const img = element as HTMLImageElement; - const src = extractValueWithShadowSupport(img, "src"); - const alt = extractValueWithShadowSupport(img, "alt"); + const src = element.getAttribute("src"); - if (src && !src.startsWith("data:") && src.length > 10) { + if (src && isValidData(src)) { const fieldId = Date.now() + index * 1000; candidateFields.push({ @@ -894,7 +563,7 @@ export const BrowserWindow = () => { label: `Label ${index + 1}`, data: src, selectorObj: { - selector: childSelector, + selector: selector, tag: element.tagName, isShadow: isShadow, attribute: "src", @@ -902,9 +571,11 @@ export const BrowserWindow = () => { }, }); } + } else { + const fieldData = createFieldData(element, selector); - if (alt && isValidData(alt)) { - const fieldId = Date.now() + index * 1000 + 1; + if (fieldData && fieldData.data && isValidData(fieldData.data)) { + const fieldId = Date.now() + index * 1000; candidateFields.push({ id: fieldId, @@ -912,127 +583,39 @@ export const BrowserWindow = () => { isLeaf: true, depth: 0, position: position, - field: { - id: fieldId, - type: "text", - label: `Label ${index + 2}`, - data: alt, - selectorObj: { - selector: childSelector, - tag: element.tagName, - isShadow: isShadow, - attribute: "alt", - }, - }, - }); - } - } else { - const deepestElement = findDeepestChild(element); - const data = extractValueWithShadowSupport(deepestElement, "innerText"); - - if (data && isValidData(data)) { - const isLeaf = isLeafElement(deepestElement); - const depth = getElementDepthFromList( - deepestElement, - listSelector, - iframeElement.contentDocument! - ); - - const fieldId = Date.now() + index; - - candidateFields.push({ - id: fieldId, - element: deepestElement, - isLeaf: isLeaf, - depth: depth, - position: position, field: { id: fieldId, type: "text", label: `Label ${index + 1}`, - data: data, - selectorObj: { - selector: childSelector, - tag: deepestElement.tagName, - isShadow: deepestElement.getRootNode() instanceof ShadowRoot, - attribute: "innerText", - }, - }, + data: fieldData.data, + selectorObj: fieldData.selectorObj + } }); } } } } catch (error) { - console.warn( - `Failed to process child selector ${childSelector}:`, - error - ); + console.warn(`Failed to process child selector ${selector}:`, error); } }); candidateFields.sort((a, b) => { const yDiff = a.position.y - b.position.y; - + if (Math.abs(yDiff) <= 5) { return a.position.x - b.position.x; } - + return yDiff; }); const filteredCandidates = removeParentChildDuplicates(candidateFields); - const finalFields = removeDuplicateContent(filteredCandidates); return finalFields; }, [currentSnapshot] ); - const isLeafElement = (element: HTMLElement): boolean => { - const children = Array.from(element.children) as HTMLElement[]; - - if (children.length === 0) return true; - - const hasContentfulChildren = children.some((child) => { - const text = child.textContent?.trim() || ""; - return text.length > 0 && text !== element.textContent?.trim(); - }); - - return !hasContentfulChildren; - }; - - const getElementDepthFromList = ( - element: HTMLElement, - listSelector: string, - document: Document - ): number => { - try { - const listResult = document.evaluate( - listSelector, - document, - null, - XPathResult.FIRST_ORDERED_NODE_TYPE, - null - ); - - const listElement = listResult.singleNodeValue as HTMLElement; - if (!listElement) return 0; - - let depth = 0; - let current = element; - - while (current && current !== listElement && current.parentElement) { - depth++; - current = current.parentElement; - if (depth > 20) break; - } - - return current === listElement ? depth : 0; - } catch (error) { - return 0; - } - }; - const removeParentChildDuplicates = ( candidates: Array<{ id: number; @@ -1242,6 +825,29 @@ export const BrowserWindow = () => { } }, [browserSteps, getList, listSelector, initialAutoFieldIds, currentListActionId, manuallyAddedFieldIds]); + useEffect(() => { + if (currentListActionId && browserSteps.length > 0) { + const activeStep = browserSteps.find( + s => s.type === 'list' && s.actionId === currentListActionId + ) as ListStep | undefined; + + if (activeStep) { + if (currentListId !== activeStep.id) { + setCurrentListId(activeStep.id); + } + if (listSelector !== activeStep.listSelector) { + setListSelector(activeStep.listSelector); + } + if (JSON.stringify(fields) !== JSON.stringify(activeStep.fields)) { + setFields(activeStep.fields); + } + if (activeStep.pagination?.selector && paginationSelector !== activeStep.pagination.selector) { + setPaginationSelector(activeStep.pagination.selector); + } + } + } + }, [currentListActionId, browserSteps, currentListId, listSelector, fields, paginationSelector]); + useEffect(() => { if (!isDOMMode) { capturedElementHighlighter.clearHighlights(); @@ -1637,6 +1243,22 @@ export const BrowserWindow = () => { paginationType !== "scrollUp" && paginationType !== "none" ) { + let targetListId = currentListId; + let targetFields = fields; + + if ((!targetListId || targetListId === 0) && currentListActionId) { + const activeStep = browserSteps.find( + s => s.type === 'list' && s.actionId === currentListActionId + ) as ListStep | undefined; + + if (activeStep) { + targetListId = activeStep.id; + if (Object.keys(targetFields).length === 0 && Object.keys(activeStep.fields).length > 0) { + targetFields = activeStep.fields; + } + } + } + setPaginationSelector(highlighterData.selector); notify( `info`, @@ -1646,8 +1268,8 @@ export const BrowserWindow = () => { ); addListStep( listSelector!, - fields, - currentListId || 0, + targetFields, + targetListId || 0, currentListActionId || `list-${crypto.randomUUID()}`, { type: paginationType, @@ -1812,6 +1434,8 @@ export const BrowserWindow = () => { socket, t, paginationSelector, + highlighterData, + browserSteps ] ); @@ -1864,6 +1488,22 @@ export const BrowserWindow = () => { paginationType !== "scrollUp" && paginationType !== "none" ) { + let targetListId = currentListId; + let targetFields = fields; + + if ((!targetListId || targetListId === 0) && currentListActionId) { + const activeStep = browserSteps.find( + s => s.type === 'list' && s.actionId === currentListActionId + ) as ListStep | undefined; + + if (activeStep) { + targetListId = activeStep.id; + if (Object.keys(targetFields).length === 0 && Object.keys(activeStep.fields).length > 0) { + targetFields = activeStep.fields; + } + } + } + setPaginationSelector(highlighterData.selector); notify( `info`, @@ -1873,8 +1513,8 @@ export const BrowserWindow = () => { ); addListStep( listSelector!, - fields, - currentListId || 0, + targetFields, + targetListId || 0, currentListActionId || `list-${crypto.randomUUID()}`, { type: paginationType, selector: highlighterData.selector, isShadow: highlighterData.isShadow }, undefined, @@ -2046,6 +1686,31 @@ export const BrowserWindow = () => { } }, [paginationMode, resetPaginationSelector]); + useEffect(() => { + if (paginationMode && currentListActionId) { + const currentListStep = browserSteps.find( + step => step.type === 'list' && step.actionId === currentListActionId + ) as (ListStep & { type: 'list' }) | undefined; + + const currentSelector = currentListStep?.pagination?.selector; + const currentType = currentListStep?.pagination?.type; + + if (['clickNext', 'clickLoadMore'].includes(paginationType)) { + if (!currentSelector || (currentType && currentType !== paginationType)) { + setPaginationSelector(''); + } + } + + const stepSelector = currentListStep?.pagination?.selector; + + if (stepSelector && !paginationSelector) { + setPaginationSelector(stepSelector); + } else if (!stepSelector && paginationSelector) { + setPaginationSelector(''); + } + } + }, [browserSteps, paginationMode, currentListActionId, paginationSelector]); + return (
{ listSelector={listSelector} cachedChildSelectors={cachedChildSelectors} paginationMode={paginationMode} + paginationSelector={paginationSelector} paginationType={paginationType} limitMode={limitMode} isCachingChildSelectors={isCachingChildSelectors} diff --git a/src/components/recorder/DOMBrowserRenderer.tsx b/src/components/recorder/DOMBrowserRenderer.tsx index 9e818e31..10fa4742 100644 --- a/src/components/recorder/DOMBrowserRenderer.tsx +++ b/src/components/recorder/DOMBrowserRenderer.tsx @@ -100,6 +100,7 @@ interface RRWebDOMBrowserRendererProps { listSelector?: string | null; cachedChildSelectors?: string[]; paginationMode?: boolean; + paginationSelector?: string; paginationType?: string; limitMode?: boolean; isCachingChildSelectors?: boolean; @@ -153,6 +154,7 @@ export const DOMBrowserRenderer: React.FC = ({ listSelector = null, cachedChildSelectors = [], paginationMode = false, + paginationSelector = "", paginationType = "", limitMode = false, isCachingChildSelectors = false, @@ -257,6 +259,13 @@ export const DOMBrowserRenderer: React.FC = ({ else if (listSelector) { if (limitMode) { shouldHighlight = false; + } else if ( + paginationMode && + paginationSelector && + paginationType !== "" && + !["none", "scrollDown", "scrollUp"].includes(paginationType) + ) { + shouldHighlight = false; } else if ( paginationMode && paginationType !== "" && diff --git a/src/components/recorder/RightSidePanel.tsx b/src/components/recorder/RightSidePanel.tsx index d5a7c29c..8159e149 100644 --- a/src/components/recorder/RightSidePanel.tsx +++ b/src/components/recorder/RightSidePanel.tsx @@ -1,4 +1,4 @@ -import React, { useState, useCallback, useEffect, useMemo } from 'react'; +import React, { useState, useCallback, useEffect, useRef, useMemo } from 'react'; import { Button, Paper, Box, TextField, IconButton, Tooltip } from "@mui/material"; import { WorkflowFile } from "maxun-core"; import Typography from "@mui/material/Typography"; @@ -15,9 +15,9 @@ import ActionDescriptionBox from '../action/ActionDescriptionBox'; import { useThemeMode } from '../../context/theme-provider'; import { useTranslation } from 'react-i18next'; import { useBrowserDimensionsStore } from '../../context/browserDimensions'; -import { emptyWorkflow } from '../../shared/constants'; import { clientListExtractor } from '../../helpers/clientListExtractor'; import { clientSelectorGenerator } from '../../helpers/clientSelectorGenerator'; +import { clientPaginationDetector } from '../../helpers/clientPaginationDetector'; const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => { getActiveWorkflow(id).then( @@ -45,6 +45,13 @@ export const RightSidePanel: React.FC = ({ onFinishCapture const [showCaptureText, setShowCaptureText] = useState(true); const { panelHeight } = useBrowserDimensionsStore(); + const [autoDetectedPagination, setAutoDetectedPagination] = useState<{ + type: PaginationType; + selector: string | null; + confidence: 'high' | 'medium' | 'low'; + } | null>(null); + const autoDetectionRunRef = useRef(null); + const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId, isDOMMode, setIsDOMMode, currentSnapshot, setCurrentSnapshot, updateDOMMode, initialUrl, setRecordingUrl, currentTextGroupName } = useGlobalInfoStore(); const { getText, startGetText, stopGetText, @@ -62,7 +69,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture startAction, finishAction } = useActionContext(); - const { browserSteps, updateBrowserTextStepLabel, deleteBrowserStep, addScreenshotStep, updateListTextFieldLabel, removeListTextField, updateListStepLimit, deleteStepsByActionId, updateListStepData, updateScreenshotStepData, emitActionForStep } = useBrowserSteps(); + const { browserSteps, addScreenshotStep, updateListStepLimit, updateListStepPagination, deleteStepsByActionId, updateListStepData, updateScreenshotStepData, emitActionForStep } = useBrowserSteps(); const { id, socket } = useSocketStore(); const { t } = useTranslation(); @@ -72,6 +79,73 @@ export const RightSidePanel: React.FC = ({ onFinishCapture setWorkflow(data); }, [setWorkflow]); + useEffect(() => { + if (!paginationType || !currentListActionId) return; + + const currentListStep = browserSteps.find( + step => step.type === 'list' && step.actionId === currentListActionId + ) as (BrowserStep & { type: 'list' }) | undefined; + + const currentSelector = currentListStep?.pagination?.selector; + const currentType = currentListStep?.pagination?.type; + + if (['clickNext', 'clickLoadMore'].includes(paginationType)) { + const needsSelector = !currentSelector && !currentType; + const typeChanged = currentType && currentType !== paginationType; + + if (typeChanged) { + const iframeElement = document.querySelector('#browser-window iframe') as HTMLIFrameElement; + if (iframeElement?.contentDocument && currentSelector) { + try { + function evaluateSelector(selector: string, doc: Document): Element[] { + if (selector.startsWith('//') || selector.startsWith('(//')) { + try { + const result = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + const elements: Element[] = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node as Element); + } + } + return elements; + } catch (err) { + return []; + } + } else { + try { + return Array.from(doc.querySelectorAll(selector)); + } catch (err) { + return []; + } + } + } + + const elements = evaluateSelector(currentSelector, iframeElement.contentDocument); + elements.forEach((el: Element) => { + (el as HTMLElement).style.outline = ''; + (el as HTMLElement).style.outlineOffset = ''; + (el as HTMLElement).style.zIndex = ''; + }); + } catch (error) { + console.error('Error removing pagination highlight:', error); + } + } + + if (currentListStep) { + updateListStepPagination(currentListStep.id, { + type: paginationType, + selector: null, + }); + } + + startPaginationMode(); + } else if (needsSelector) { + startPaginationMode(); + } + } + }, [paginationType, currentListActionId, browserSteps, updateListStepPagination, startPaginationMode]); + useEffect(() => { if (socket) { const domModeHandler = (data: any) => { @@ -391,7 +465,182 @@ export const RightSidePanel: React.FC = ({ onFinishCapture return; } - startPaginationMode(); + const currentListStepForAutoDetect = browserSteps.find( + step => step.type === 'list' && step.actionId === currentListActionId + ) as (BrowserStep & { type: 'list'; listSelector?: string }) | undefined; + + if (currentListStepForAutoDetect?.listSelector) { + if (autoDetectionRunRef.current !== currentListActionId) { + autoDetectionRunRef.current = currentListActionId; + + notify('info', 'Detecting pagination...'); + + try { + socket?.emit('testPaginationScroll', { + listSelector: currentListStepForAutoDetect.listSelector + }); + + const handleScrollTestResult = (result: any) => { + if (result.success && result.contentLoaded) { + setAutoDetectedPagination({ + type: 'scrollDown', + selector: null, + confidence: 'high' + }); + updatePaginationType('scrollDown'); + + const latestListStep = browserSteps.find( + step => step.type === 'list' && step.actionId === currentListActionId + ); + if (latestListStep) { + updateListStepPagination(latestListStep.id, { + type: 'scrollDown', + selector: null, + isShadow: false + }); + } + } else if (result.success && !result.contentLoaded) { + const iframeElement = document.querySelector('#browser-window iframe') as HTMLIFrameElement; + const iframeDoc = iframeElement?.contentDocument; + + if (iframeDoc) { + const detectionResult = clientPaginationDetector.autoDetectPagination( + iframeDoc, + currentListStepForAutoDetect.listSelector!, + clientSelectorGenerator, + { disableScrollDetection: true } + ); + + if (detectionResult.type) { + setAutoDetectedPagination({ + type: detectionResult.type, + selector: detectionResult.selector, + confidence: detectionResult.confidence + }); + + const latestListStep = browserSteps.find( + step => step.type === 'list' && step.actionId === currentListActionId + ); + if (latestListStep) { + updateListStepPagination(latestListStep.id, { + type: detectionResult.type, + selector: detectionResult.selector, + isShadow: false + }); + } + + updatePaginationType(detectionResult.type); + + if (detectionResult.selector && (detectionResult.type === 'clickNext' || detectionResult.type === 'clickLoadMore')) { + try { + function evaluateSelector(selector: string, doc: Document): Element[] { + try { + const isXPath = selector.startsWith('//') || selector.startsWith('(//'); + if (isXPath) { + const result = doc.evaluate( + selector, + doc, + null, + XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, + null + ); + const elements: Element[] = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node as Element); + } + } + return elements; + } else { + try { + const allElements = Array.from(doc.querySelectorAll(selector)); + if (allElements.length > 0) { + return allElements; + } + } catch (err) { + console.warn('[RightSidePanel] Full chained selector failed, trying individual selectors:', err); + } + + const selectorParts = selector.split(','); + for (const part of selectorParts) { + try { + const elements = Array.from(doc.querySelectorAll(part.trim())); + if (elements.length > 0) { + return elements; + } + } catch (err) { + console.warn('[RightSidePanel] Selector part failed:', part.trim(), err); + continue; + } + } + return []; + } + } catch (err) { + console.error('[RightSidePanel] Selector evaluation failed:', selector, err); + return []; + } + } + + const elements = evaluateSelector(detectionResult.selector, iframeDoc); + if (elements.length > 0) { + elements.forEach((el: Element) => { + (el as HTMLElement).style.outline = '3px dashed #ff00c3'; + (el as HTMLElement).style.outlineOffset = '2px'; + (el as HTMLElement).style.zIndex = '9999'; + }); + + const firstElement = elements[0] as HTMLElement; + const elementRect = firstElement.getBoundingClientRect(); + const iframeWindow = iframeElement.contentWindow; + if (iframeWindow) { + const targetY = elementRect.top + iframeWindow.scrollY - (iframeWindow.innerHeight / 2) + (elementRect.height / 2); + iframeWindow.scrollTo({ top: targetY, behavior: 'smooth' }); + } + + const paginationTypeLabel = detectionResult.type === 'clickNext' ? 'Next Button' : 'Load More Button'; + notify('info', `${paginationTypeLabel} has been auto-detected and highlighted on the page`); + } else { + console.warn(' No elements found for selector:', detectionResult.selector); + } + } catch (error) { + console.error('Error highlighting pagination button:', error); + } + } + } else { + setAutoDetectedPagination(null); + } + } + } else { + console.error('Scroll test failed:', result.error); + setAutoDetectedPagination(null); + } + + socket?.off('paginationScrollTestResult', handleScrollTestResult); + }; + + socket?.on('paginationScrollTestResult', handleScrollTestResult); + + setTimeout(() => { + socket?.off('paginationScrollTestResult', handleScrollTestResult); + }, 5000); + + } catch (error) { + console.error('Scroll test failed:', error); + setAutoDetectedPagination(null); + } + } + } + + const shouldSkipPaginationMode = autoDetectedPagination && ( + ['scrollDown', 'scrollUp'].includes(autoDetectedPagination.type) || + (['clickNext', 'clickLoadMore'].includes(autoDetectedPagination.type) && autoDetectedPagination.selector) + ); + + if (!shouldSkipPaginationMode) { + startPaginationMode(); + } + setShowPaginationOptions(true); setCaptureStage('pagination'); break; @@ -460,6 +709,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture case 'pagination': stopPaginationMode(); setShowPaginationOptions(false); + setAutoDetectedPagination(null); setCaptureStage('initial'); break; } @@ -495,17 +745,58 @@ export const RightSidePanel: React.FC = ({ onFinishCapture socket.emit('removeAction', { actionId: currentListActionId }); } } + + if (autoDetectedPagination?.selector) { + const iframeElement = document.querySelector('#browser-window iframe') as HTMLIFrameElement; + if (iframeElement?.contentDocument) { + try { + function evaluateSelector(selector: string, doc: Document): Element[] { + if (selector.startsWith('//') || selector.startsWith('(//')) { + try { + const result = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null); + const elements: Element[] = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node as Element); + } + } + return elements; + } catch (err) { + return []; + } + } else { + try { + return Array.from(doc.querySelectorAll(selector)); + } catch (err) { + return []; + } + } + } + + const elements = evaluateSelector(autoDetectedPagination.selector, iframeElement.contentDocument); + elements.forEach((el: Element) => { + (el as HTMLElement).style.outline = ''; + (el as HTMLElement).style.outlineOffset = ''; + (el as HTMLElement).style.zIndex = ''; + }); + } catch (error) { + console.error('Error removing pagination highlight on discard:', error); + } + } + } resetListState(); stopPaginationMode(); stopLimitMode(); setShowPaginationOptions(false); setShowLimitOptions(false); + setAutoDetectedPagination(null); setCaptureStage('initial'); setCurrentListActionId(''); clientSelectorGenerator.cleanup(); notify('error', t('right_panel.errors.capture_list_discarded')); - }, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t, stopPaginationMode, stopLimitMode, socket]); + }, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t, stopPaginationMode, stopLimitMode, socket, autoDetectedPagination]); const captureScreenshot = (fullPage: boolean) => { const screenshotCount = browserSteps.filter(s => s.type === 'screenshot').length + 1; @@ -615,6 +906,114 @@ export const RightSidePanel: React.FC = ({ onFinishCapture {showPaginationOptions && ( {t('right_panel.pagination.title')} + + {autoDetectedPagination && autoDetectedPagination.type !== '' && ( + + + ✓ Auto-detected: { + autoDetectedPagination.type === 'clickNext' ? 'Click Next' : + autoDetectedPagination.type === 'clickLoadMore' ? 'Click Load More' : + autoDetectedPagination.type === 'scrollDown' ? 'Scroll Down' : + autoDetectedPagination.type === 'scrollUp' ? 'Scroll Up' : + autoDetectedPagination.type + } + + + You can continue with this or manually select a different pagination type below. + + {autoDetectedPagination.selector && ['clickNext', 'clickLoadMore'].includes(autoDetectedPagination.type) && ( + + )} + + )}