Merge pull request #907 from getmaxun/pre-release-28

chore: pre-release v0.0.28
This commit is contained in:
Karishma Shukla
2025-11-30 23:00:38 +05:30
committed by GitHub
15 changed files with 157 additions and 145 deletions

View File

@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/playwright:v1.46.0-noble FROM --platform=$BUILDPLATFORM node:20-slim
# Set working directory # Set working directory
WORKDIR /app WORKDIR /app
@@ -18,31 +18,6 @@ COPY server/tsconfig.json ./server/
# Install dependencies # Install dependencies
RUN npm install --legacy-peer-deps RUN npm install --legacy-peer-deps
# Create the Chromium data directory with necessary permissions
RUN mkdir -p /tmp/chromium-data-dir && \
chmod -R 777 /tmp/chromium-data-dir
# Install dependencies
RUN apt-get update && apt-get install -y \
libgbm1 \
libnss3 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libdrm2 \
libxkbcommon0 \
libglib2.0-0 \
libdbus-1-3 \
libx11-xcb1 \
libxcb1 \
libxcomposite1 \
libxcursor1 \
libxdamage1 \
libxext6 \
libxi6 \
libxtst6 \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /tmp/.X11-unix && chmod 1777 /tmp/.X11-unix
# Expose backend port # Expose backend port
EXPOSE ${BACKEND_PORT:-8080} EXPOSE ${BACKEND_PORT:-8080}

View File

@@ -6,7 +6,7 @@ WORKDIR /app
COPY browser/package*.json ./ COPY browser/package*.json ./
# Install dependencies # Install dependencies
RUN npm ci RUN npm install
# Copy TypeScript source and config # Copy TypeScript source and config
COPY browser/server.ts ./ COPY browser/server.ts ./

View File

@@ -11,6 +11,7 @@ let browserServer: BrowserServer | null = null;
// Configurable ports with defaults // Configurable ports with defaults
const BROWSER_WS_PORT = parseInt(process.env.BROWSER_WS_PORT || '3001', 10); const BROWSER_WS_PORT = parseInt(process.env.BROWSER_WS_PORT || '3001', 10);
const BROWSER_HEALTH_PORT = parseInt(process.env.BROWSER_HEALTH_PORT || '3002', 10); const BROWSER_HEALTH_PORT = parseInt(process.env.BROWSER_HEALTH_PORT || '3002', 10);
const BROWSER_WS_HOST = process.env.BROWSER_WS_HOST || 'localhost';
async function start(): Promise<void> { async function start(): Promise<void> {
console.log('Starting Maxun Browser Service...'); console.log('Starting Maxun Browser Service...');
@@ -44,17 +45,19 @@ async function start(): Promise<void> {
// Health check HTTP server // Health check HTTP server
const healthServer = http.createServer((req, res) => { const healthServer = http.createServer((req, res) => {
if (req.url === '/health') { if (req.url === '/health') {
const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || '';
res.writeHead(200, { 'Content-Type': 'application/json' }); res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({ res.end(JSON.stringify({
status: 'healthy', status: 'healthy',
wsEndpoint: browserServer?.wsEndpoint(), wsEndpoint,
wsPort: BROWSER_WS_PORT, wsPort: BROWSER_WS_PORT,
healthPort: BROWSER_HEALTH_PORT, healthPort: BROWSER_HEALTH_PORT,
timestamp: new Date().toISOString() timestamp: new Date().toISOString()
})); }));
} else if (req.url === '/') { } else if (req.url === '/') {
res.writeHead(200, { 'Content-Type': 'text/plain' }); res.writeHead(200, { 'Content-Type': 'text/plain' });
res.end(`Maxun Browser Service\nWebSocket: ${browserServer?.wsEndpoint()}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`); const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || '';
res.end(`Maxun Browser Service\nWebSocket: ${wsEndpoint}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`);
} else { } else {
res.writeHead(404); res.writeHead(404);
res.end('Not Found'); res.end('Not Found');

View File

@@ -30,9 +30,9 @@ services:
- minio_data:/data - minio_data:/data
backend: backend:
#build: # build:
#context: . # context: .
#dockerfile: server/Dockerfile # dockerfile: Dockerfile.backend
image: getmaxun/maxun-backend:latest image: getmaxun/maxun-backend:latest
restart: unless-stopped restart: unless-stopped
ports: ports:
@@ -60,9 +60,9 @@ services:
- /var/run/dbus:/var/run/dbus - /var/run/dbus:/var/run/dbus
frontend: frontend:
#build: # build:
#context: . # context: .
#dockerfile: Dockerfile # dockerfile: Dockerfile.frontend
image: getmaxun/maxun-frontend:latest image: getmaxun/maxun-frontend:latest
restart: unless-stopped restart: unless-stopped
ports: ports:
@@ -89,6 +89,8 @@ services:
- DEBUG=pw:browser* - DEBUG=pw:browser*
- BROWSER_WS_PORT=${BROWSER_WS_PORT:-3001} - BROWSER_WS_PORT=${BROWSER_WS_PORT:-3001}
- BROWSER_HEALTH_PORT=${BROWSER_HEALTH_PORT:-3002} - BROWSER_HEALTH_PORT=${BROWSER_HEALTH_PORT:-3002}
- BROWSER_WS_HOST=${BROWSER_WS_HOST:-browser}
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
restart: unless-stopped restart: unless-stopped
healthcheck: healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${BROWSER_HEALTH_PORT:-3002}/health"] test: ["CMD", "curl", "-f", "http://localhost:${BROWSER_HEALTH_PORT:-3002}/health"]

View File

@@ -1,6 +1,6 @@
{ {
"name": "maxun-core", "name": "maxun-core",
"version": "0.0.27", "version": "0.0.28",
"description": "Core package for Maxun, responsible for data extraction", "description": "Core package for Maxun, responsible for data extraction",
"main": "build/index.js", "main": "build/index.js",
"typings": "build/index.d.ts", "typings": "build/index.d.ts",

View File

@@ -1,6 +1,6 @@
{ {
"name": "maxun", "name": "maxun",
"version": "0.0.27", "version": "0.0.28",
"author": "Maxun", "author": "Maxun",
"license": "AGPL-3.0-or-later", "license": "AGPL-3.0-or-later",
"dependencies": { "dependencies": {
@@ -52,7 +52,7 @@
"lodash": "^4.17.21", "lodash": "^4.17.21",
"loglevel": "^1.8.0", "loglevel": "^1.8.0",
"loglevel-plugin-remote": "^0.6.8", "loglevel-plugin-remote": "^0.6.8",
"maxun-core": "^0.0.27", "maxun-core": "^0.0.28",
"minio": "^8.0.1", "minio": "^8.0.1",
"moment-timezone": "^0.5.45", "moment-timezone": "^0.5.45",
"node-cron": "^3.0.3", "node-cron": "^3.0.3",
@@ -131,4 +131,4 @@
"vite": "^5.4.10", "vite": "^5.4.10",
"zod": "^3.25.62" "zod": "^3.25.62"
} }
} }

View File

@@ -11,7 +11,7 @@ import { io, Socket } from "socket.io-client";
import { BinaryOutputService } from "../storage/mino"; import { BinaryOutputService } from "../storage/mino";
import { AuthenticatedRequest } from "../routes/record" import { AuthenticatedRequest } from "../routes/record"
import {capture} from "../utils/analytics"; import {capture} from "../utils/analytics";
import { Page } from "playwright"; import { Page } from "playwright-core";
import { WorkflowFile } from "maxun-core"; import { WorkflowFile } from "maxun-core";
import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";

View File

@@ -550,9 +550,9 @@ export class RemoteBrowser {
try { try {
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']); const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
await blocker.enableBlockingInPage(this.currentPage); await blocker.enableBlockingInPage(this.currentPage as any);
this.client = await this.currentPage.context().newCDPSession(this.currentPage); this.client = await this.currentPage.context().newCDPSession(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage); await blocker.disableBlockingInPage(this.currentPage as any);
console.log('Adblocker initialized'); console.log('Adblocker initialized');
} catch (error: any) { } catch (error: any) {
console.warn('Failed to initialize adblocker, continuing without it:', error.message); console.warn('Failed to initialize adblocker, continuing without it:', error.message);

View File

@@ -1,4 +1,4 @@
import { connectToRemoteBrowser } from "../browser-management/browserConnection"; import { Page } from "playwright-core";
import { parseMarkdown } from "./markdown"; import { parseMarkdown } from "./markdown";
import logger from "../logger"; import logger from "../logger";
@@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) {
* Fetches a webpage, strips scripts/styles/images/etc, * Fetches a webpage, strips scripts/styles/images/etc,
* returns clean Markdown using parser. * returns clean Markdown using parser.
* @param url - The URL to convert * @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse * @param page - Existing Playwright page instance to use
*/ */
export async function convertPageToMarkdown(url: string): Promise<string> { export async function convertPageToMarkdown(url: string, page: Page): Promise<string> {
const browser = await connectToRemoteBrowser(); try {
const page = await browser.newPage(); logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => { const cleanedHtml = await page.evaluate(() => {
const selectors = [ const selectors = [
"script", "script",
"style", "style",
"link[rel='stylesheet']", "link[rel='stylesheet']",
"noscript", "noscript",
"meta", "meta",
"svg", "svg",
"img", "img",
"picture", "picture",
"source", "source",
"video", "video",
"audio", "audio",
"iframe", "iframe",
"object", "object",
"embed" "embed"
]; ];
selectors.forEach(sel => { selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove()); document.querySelectorAll(sel).forEach(e => e.remove());
});
// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
}); });
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
});
});
return document.documentElement.outerHTML;
}); });
return document.documentElement.outerHTML; const markdown = await parseMarkdown(cleanedHtml, url);
}); return markdown;
} catch (error: any) {
if (shouldCloseBrowser && browser) { logger.error(`[Scrape] Error during markdown conversion: ${error.message}`);
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`); throw error;
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
} }
// Convert cleaned HTML → Markdown
const markdown = await parseMarkdown(cleanedHtml, url);
return markdown;
} }
/** /**
* Fetches a webpage, strips scripts/styles/images/etc, * Fetches a webpage, strips scripts/styles/images/etc,
* returns clean HTML. * returns clean HTML.
* @param url - The URL to convert * @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse * @param page - Existing Playwright page instance to use
*/ */
export async function convertPageToHTML(url: string): Promise<string> { export async function convertPageToHTML(url: string, page: Page): Promise<string> {
const browser = await connectToRemoteBrowser(); try {
const page = await browser.newPage(); logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => { const cleanedHtml = await page.evaluate(() => {
const selectors = [ const selectors = [
"script", "script",
"style", "style",
"link[rel='stylesheet']", "link[rel='stylesheet']",
"noscript", "noscript",
"meta", "meta",
"svg", "svg",
"img", "img",
"picture", "picture",
"source", "source",
"video", "video",
"audio", "audio",
"iframe", "iframe",
"object", "object",
"embed" "embed"
]; ];
selectors.forEach(sel => { selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove()); document.querySelectorAll(sel).forEach(e => e.remove());
});
// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
}); });
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
});
});
return document.documentElement.outerHTML;
}); });
return document.documentElement.outerHTML; return cleanedHtml;
}); } catch (error: any) {
logger.error(`[Scrape] Error during HTML conversion: ${error.message}`);
if (shouldCloseBrowser && browser) { throw error;
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
} }
// Return cleaned HTML directly
return cleanedHtml;
} }

View File

@@ -2,7 +2,6 @@ import { router as record } from './record';
import { router as workflow } from './workflow'; import { router as workflow } from './workflow';
import { router as storage } from './storage'; import { router as storage } from './storage';
import { router as auth } from './auth'; import { router as auth } from './auth';
import { router as integration } from './integration';
import { router as proxy } from './proxy'; import { router as proxy } from './proxy';
import { router as webhook } from './webhook'; import { router as webhook } from './webhook';
@@ -11,7 +10,6 @@ export {
workflow, workflow,
storage, storage,
auth, auth,
integration,
proxy, proxy,
webhook webhook
}; };

View File

@@ -15,7 +15,6 @@ import { encrypt, decrypt } from '../utils/auth';
import { WorkflowFile } from 'maxun-core'; import { WorkflowFile } from 'maxun-core';
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule'; import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
import { pgBossClient } from '../storage/pgboss'; import { pgBossClient } from '../storage/pgboss';
chromium.use(stealthPlugin());
export const router = Router(); export const router = Router();

View File

@@ -5,7 +5,7 @@ import { Server } from "socket.io";
import cors from 'cors'; import cors from 'cors';
import dotenv from 'dotenv'; import dotenv from 'dotenv';
dotenv.config(); dotenv.config();
import { record, workflow, storage, auth, integration, proxy, webhook } from './routes'; import { record, workflow, storage, auth, proxy, webhook } from './routes';
import { BrowserPool } from "./browser-management/classes/BrowserPool"; import { BrowserPool } from "./browser-management/classes/BrowserPool";
import logger from './logger'; import logger from './logger';
import sequelize, { connectDB, syncDB } from './storage/db' import sequelize, { connectDB, syncDB } from './storage/db'
@@ -107,7 +107,6 @@ app.use('/record', record);
app.use('/workflow', workflow); app.use('/workflow', workflow);
app.use('/storage', storage); app.use('/storage', storage);
app.use('/auth', auth); app.use('/auth', auth);
app.use('/integration', integration);
app.use('/proxy', proxy); app.use('/proxy', proxy);
app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerSpec)); app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerSpec));
@@ -179,8 +178,6 @@ if (require.main === module) {
await startWorkers(); await startWorkers();
io = new Server(server);
io.of('/queued-run').on('connection', (socket) => { io.of('/queued-run').on('connection', (socket) => {
const userId = socket.handshake.query.userId as string; const userId = socket.handshake.query.userId as string;

View File

@@ -1686,6 +1686,12 @@ export const BrowserWindow = () => {
} }
}, [paginationMode, resetPaginationSelector]); }, [paginationMode, resetPaginationSelector]);
useEffect(() => {
if (!paginationMode || !getList) {
setHighlighterData(null);
}
}, [paginationMode, getList]);
useEffect(() => { useEffect(() => {
if (paginationMode && currentListActionId) { if (paginationMode && currentListActionId) {
const currentListStep = browserSteps.find( const currentListStep = browserSteps.find(
@@ -1841,7 +1847,7 @@ export const BrowserWindow = () => {
> >
{/* Individual element highlight (for non-group or hovered element) */} {/* Individual element highlight (for non-group or hovered element) */}
{((getText && !listSelector) || {((getText && !listSelector) ||
(getList && paginationMode && paginationType !== "" && (getList && paginationMode && !paginationSelector && paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType))) && ( !["none", "scrollDown", "scrollUp"].includes(paginationType))) && (
<div <div
style={{ style={{
@@ -1909,6 +1915,7 @@ export const BrowserWindow = () => {
listSelector && listSelector &&
!paginationMode && !paginationMode &&
!limitMode && !limitMode &&
captureStage === 'initial' &&
highlighterData.similarElements?.rects?.map((rect, index) => ( highlighterData.similarElements?.rects?.map((rect, index) => (
<React.Fragment key={`item-${index}`}> <React.Fragment key={`item-${index}`}>
<div <div

View File

@@ -268,6 +268,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
shouldHighlight = false; shouldHighlight = false;
} else if ( } else if (
paginationMode && paginationMode &&
!paginationSelector &&
paginationType !== "" && paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType) !["none", "scrollDown", "scrollUp"].includes(paginationType)
) { ) {
@@ -353,7 +354,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const options: boolean | AddEventListenerOptions = ['wheel', 'touchstart', 'touchmove'].includes(event) const options: boolean | AddEventListenerOptions = ['wheel', 'touchstart', 'touchmove'].includes(event)
? { passive: false } ? { passive: false }
: false; : false;
iframeDoc.removeEventListener(event, handler as EventListener, options); iframeDoc.removeEventListener(event, handler as EventListener, options);
}); });
} }
@@ -588,7 +589,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const elementRect = element.getBoundingClientRect(); const elementRect = element.getBoundingClientRect();
const relativeX = iframeX - elementRect.left; const relativeX = iframeX - elementRect.left;
const relativeY = iframeY - elementRect.top; const relativeY = iframeY - elementRect.top;
socket.emit("dom:click", { socket.emit("dom:click", {
selector, selector,
url: snapshot.baseUrl, url: snapshot.baseUrl,
@@ -636,7 +637,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
if (iframe) { if (iframe) {
const focusedElement = iframeDoc.activeElement as HTMLElement; const focusedElement = iframeDoc.activeElement as HTMLElement;
let coordinates = { x: 0, y: 0 }; let coordinates = { x: 0, y: 0 };
if (focusedElement && focusedElement !== iframeDoc.body) { if (focusedElement && focusedElement !== iframeDoc.body) {
// Get coordinates from the focused element // Get coordinates from the focused element
const rect = focusedElement.getBoundingClientRect(); const rect = focusedElement.getBoundingClientRect();

View File

@@ -415,6 +415,46 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
}, [stopGetList, resetListState]); }, [stopGetList, resetListState]);
const stopCaptureAndEmitGetListSettings = useCallback(() => { const stopCaptureAndEmitGetListSettings = useCallback(() => {
if (autoDetectedPagination?.selector) {
const iframeElement = document.querySelector('#browser-window iframe') as HTMLIFrameElement;
if (iframeElement?.contentDocument) {
try {
function evaluateSelector(selector: string, doc: Document): Element[] {
if (selector.startsWith('//') || selector.startsWith('(//')) {
try {
const result = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
const elements: Element[] = [];
for (let i = 0; i < result.snapshotLength; i++) {
const node = result.snapshotItem(i);
if (node && node.nodeType === Node.ELEMENT_NODE) {
elements.push(node as Element);
}
}
return elements;
} catch (err) {
return [];
}
} else {
try {
return Array.from(doc.querySelectorAll(selector));
} catch (err) {
return [];
}
}
}
const elements = evaluateSelector(autoDetectedPagination.selector, iframeElement.contentDocument);
elements.forEach((el: Element) => {
(el as HTMLElement).style.outline = '';
(el as HTMLElement).style.outlineOffset = '';
(el as HTMLElement).style.zIndex = '';
});
} catch (error) {
console.error('Error removing pagination highlight on completion:', error);
}
}
}
const latestListStep = getLatestListStep(browserSteps); const latestListStep = getLatestListStep(browserSteps);
if (latestListStep) { if (latestListStep) {
extractDataClientSide(latestListStep.listSelector!, latestListStep.fields, latestListStep.id); extractDataClientSide(latestListStep.listSelector!, latestListStep.fields, latestListStep.id);
@@ -423,7 +463,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
...currentWorkflowActionsState, ...currentWorkflowActionsState,
hasScrapeListAction: true hasScrapeListAction: true
}); });
emitActionForStep(latestListStep); emitActionForStep(latestListStep);
handleStopGetList(); handleStopGetList();
@@ -441,7 +481,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
onFinishCapture(); onFinishCapture();
clientSelectorGenerator.cleanup(); clientSelectorGenerator.cleanup();
} }
}, [socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide, setCurrentWorkflowActionsState, currentWorkflowActionsState, emitActionForStep]); }, [socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide, setCurrentWorkflowActionsState, currentWorkflowActionsState, emitActionForStep, autoDetectedPagination]);
const getLatestListStep = (steps: BrowserStep[]) => { const getLatestListStep = (steps: BrowserStep[]) => {
const listSteps = steps.filter(step => step.type === 'list'); const listSteps = steps.filter(step => step.type === 'list');