Merge pull request #907 from getmaxun/pre-release-28

chore: pre-release v0.0.28
This commit is contained in:
Karishma Shukla
2025-11-30 23:00:38 +05:30
committed by GitHub
15 changed files with 157 additions and 145 deletions

View File

@@ -1,4 +1,4 @@
FROM --platform=$BUILDPLATFORM mcr.microsoft.com/playwright:v1.46.0-noble
FROM --platform=$BUILDPLATFORM node:20-slim
# Set working directory
WORKDIR /app
@@ -18,31 +18,6 @@ COPY server/tsconfig.json ./server/
# Install dependencies
RUN npm install --legacy-peer-deps
# Create the Chromium data directory with necessary permissions
RUN mkdir -p /tmp/chromium-data-dir && \
chmod -R 777 /tmp/chromium-data-dir
# Install dependencies
RUN apt-get update && apt-get install -y \
libgbm1 \
libnss3 \
libatk1.0-0 \
libatk-bridge2.0-0 \
libdrm2 \
libxkbcommon0 \
libglib2.0-0 \
libdbus-1-3 \
libx11-xcb1 \
libxcb1 \
libxcomposite1 \
libxcursor1 \
libxdamage1 \
libxext6 \
libxi6 \
libxtst6 \
&& rm -rf /var/lib/apt/lists/* \
&& mkdir -p /tmp/.X11-unix && chmod 1777 /tmp/.X11-unix
# Expose backend port
EXPOSE ${BACKEND_PORT:-8080}

View File

@@ -6,7 +6,7 @@ WORKDIR /app
COPY browser/package*.json ./
# Install dependencies
RUN npm ci
RUN npm install
# Copy TypeScript source and config
COPY browser/server.ts ./

View File

@@ -11,6 +11,7 @@ let browserServer: BrowserServer | null = null;
// Configurable ports with defaults
const BROWSER_WS_PORT = parseInt(process.env.BROWSER_WS_PORT || '3001', 10);
const BROWSER_HEALTH_PORT = parseInt(process.env.BROWSER_HEALTH_PORT || '3002', 10);
// Hostname substituted for 'localhost' in the WebSocket endpoint reported by
// the health server; falls back to 'localhost' when the env var is unset or empty.
const BROWSER_WS_HOST = process.env.BROWSER_WS_HOST || 'localhost';
async function start(): Promise<void> {
console.log('Starting Maxun Browser Service...');
@@ -44,17 +45,19 @@ async function start(): Promise<void> {
// Health check HTTP server
const healthServer = http.createServer((req, res) => {
if (req.url === '/health') {
const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || '';
res.writeHead(200, { 'Content-Type': 'application/json' });
res.end(JSON.stringify({
status: 'healthy',
wsEndpoint: browserServer?.wsEndpoint(),
wsEndpoint,
wsPort: BROWSER_WS_PORT,
healthPort: BROWSER_HEALTH_PORT,
timestamp: new Date().toISOString()
}));
} else if (req.url === '/') {
res.writeHead(200, { 'Content-Type': 'text/plain' });
res.end(`Maxun Browser Service\nWebSocket: ${browserServer?.wsEndpoint()}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`);
const wsEndpoint = browserServer?.wsEndpoint().replace('localhost', BROWSER_WS_HOST) || '';
res.end(`Maxun Browser Service\nWebSocket: ${wsEndpoint}\nHealth: http://localhost:${BROWSER_HEALTH_PORT}/health`);
} else {
res.writeHead(404);
res.end('Not Found');

View File

@@ -30,9 +30,9 @@ services:
- minio_data:/data
backend:
#build:
#context: .
#dockerfile: server/Dockerfile
# build:
# context: .
# dockerfile: Dockerfile.backend
image: getmaxun/maxun-backend:latest
restart: unless-stopped
ports:
@@ -60,9 +60,9 @@ services:
- /var/run/dbus:/var/run/dbus
frontend:
#build:
#context: .
#dockerfile: Dockerfile
# build:
# context: .
# dockerfile: Dockerfile.frontend
image: getmaxun/maxun-frontend:latest
restart: unless-stopped
ports:
@@ -89,6 +89,8 @@ services:
- DEBUG=pw:browser*
- BROWSER_WS_PORT=${BROWSER_WS_PORT:-3001}
- BROWSER_HEALTH_PORT=${BROWSER_HEALTH_PORT:-3002}
- BROWSER_WS_HOST=${BROWSER_WS_HOST:-browser}
- PLAYWRIGHT_BROWSERS_PATH=/ms-playwright
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:${BROWSER_HEALTH_PORT:-3002}/health"]

View File

@@ -1,6 +1,6 @@
{
"name": "maxun-core",
"version": "0.0.27",
"version": "0.0.28",
"description": "Core package for Maxun, responsible for data extraction",
"main": "build/index.js",
"typings": "build/index.d.ts",

View File

@@ -1,6 +1,6 @@
{
"name": "maxun",
"version": "0.0.27",
"version": "0.0.28",
"author": "Maxun",
"license": "AGPL-3.0-or-later",
"dependencies": {
@@ -52,7 +52,7 @@
"lodash": "^4.17.21",
"loglevel": "^1.8.0",
"loglevel-plugin-remote": "^0.6.8",
"maxun-core": "^0.0.27",
"maxun-core": "^0.0.28",
"minio": "^8.0.1",
"moment-timezone": "^0.5.45",
"node-cron": "^3.0.3",
@@ -131,4 +131,4 @@
"vite": "^5.4.10",
"zod": "^3.25.62"
}
}
}

View File

@@ -11,7 +11,7 @@ import { io, Socket } from "socket.io-client";
import { BinaryOutputService } from "../storage/mino";
import { AuthenticatedRequest } from "../routes/record"
import {capture} from "../utils/analytics";
import { Page } from "playwright";
import { Page } from "playwright-core";
import { WorkflowFile } from "maxun-core";
import { addGoogleSheetUpdateTask, googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
import { addAirtableUpdateTask, airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";

View File

@@ -550,9 +550,9 @@ export class RemoteBrowser {
try {
const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
await blocker.enableBlockingInPage(this.currentPage);
await blocker.enableBlockingInPage(this.currentPage as any);
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage);
await blocker.disableBlockingInPage(this.currentPage as any);
console.log('Adblocker initialized');
} catch (error: any) {
console.warn('Failed to initialize adblocker, continuing without it:', error.message);

View File

@@ -1,4 +1,4 @@
import { connectToRemoteBrowser } from "../browser-management/browserConnection";
import { Page } from "playwright-core";
import { parseMarkdown } from "./markdown";
import logger from "../logger";
@@ -21,115 +21,105 @@ async function gotoWithFallback(page: any, url: string) {
* Fetches a webpage, strips scripts/styles/images/etc,
* returns clean Markdown using parser.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
* @param page - Existing Playwright page instance to use
*/
export async function convertPageToMarkdown(url: string): Promise<string> {
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
export async function convertPageToMarkdown(url: string, page: Page): Promise<string> {
try {
logger.log('info', `[Scrape] Using existing page instance for markdown conversion of ${url}`);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];
selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});
// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
});
});
return document.documentElement.outerHTML;
});
return document.documentElement.outerHTML;
});
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for markdown conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after markdown conversion`);
const markdown = await parseMarkdown(cleanedHtml, url);
return markdown;
} catch (error: any) {
logger.error(`[Scrape] Error during markdown conversion: ${error.message}`);
throw error;
}
// Convert cleaned HTML → Markdown
const markdown = await parseMarkdown(cleanedHtml, url);
return markdown;
}
/**
* Fetches a webpage, strips scripts/styles/images/etc,
* returns clean HTML.
* @param url - The URL to convert
* @param existingPage - Optional existing Playwright page instance to reuse
* @param page - Existing Playwright page instance to use
*/
export async function convertPageToHTML(url: string): Promise<string> {
const browser = await connectToRemoteBrowser();
const page = await browser.newPage();
export async function convertPageToHTML(url: string, page: Page): Promise<string> {
try {
logger.log('info', `[Scrape] Using existing page instance for HTML conversion of ${url}`);
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 });
await gotoWithFallback(page, url);
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];
const cleanedHtml = await page.evaluate(() => {
const selectors = [
"script",
"style",
"link[rel='stylesheet']",
"noscript",
"meta",
"svg",
"img",
"picture",
"source",
"video",
"audio",
"iframe",
"object",
"embed"
];
selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});
// Remove inline event handlers (onclick, onload…)
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
selectors.forEach(sel => {
document.querySelectorAll(sel).forEach(e => e.remove());
});
const all = document.querySelectorAll("*");
all.forEach(el => {
[...el.attributes].forEach(attr => {
if (attr.name.startsWith("on")) {
el.removeAttribute(attr.name);
}
});
});
return document.documentElement.outerHTML;
});
return document.documentElement.outerHTML;
});
if (shouldCloseBrowser && browser) {
logger.log('info', `[Scrape] Closing browser instance created for HTML conversion`);
await browser.close();
} else {
logger.log('info', `[Scrape] Keeping existing browser instance open after HTML conversion`);
return cleanedHtml;
} catch (error: any) {
logger.error(`[Scrape] Error during HTML conversion: ${error.message}`);
throw error;
}
// Return cleaned HTML directly
return cleanedHtml;
}

View File

@@ -2,7 +2,6 @@ import { router as record } from './record';
import { router as workflow } from './workflow';
import { router as storage } from './storage';
import { router as auth } from './auth';
import { router as integration } from './integration';
import { router as proxy } from './proxy';
import { router as webhook } from './webhook';
@@ -11,7 +10,6 @@ export {
workflow,
storage,
auth,
integration,
proxy,
webhook
};

View File

@@ -15,7 +15,6 @@ import { encrypt, decrypt } from '../utils/auth';
import { WorkflowFile } from 'maxun-core';
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
import { pgBossClient } from '../storage/pgboss';
chromium.use(stealthPlugin());
export const router = Router();

View File

@@ -5,7 +5,7 @@ import { Server } from "socket.io";
import cors from 'cors';
import dotenv from 'dotenv';
dotenv.config();
import { record, workflow, storage, auth, integration, proxy, webhook } from './routes';
import { record, workflow, storage, auth, proxy, webhook } from './routes';
import { BrowserPool } from "./browser-management/classes/BrowserPool";
import logger from './logger';
import sequelize, { connectDB, syncDB } from './storage/db'
@@ -107,7 +107,6 @@ app.use('/record', record);
app.use('/workflow', workflow);
app.use('/storage', storage);
app.use('/auth', auth);
app.use('/integration', integration);
app.use('/proxy', proxy);
app.use('/api-docs', swaggerUi.serve, swaggerUi.setup(swaggerSpec));
@@ -179,8 +178,6 @@ if (require.main === module) {
await startWorkers();
io = new Server(server);
io.of('/queued-run').on('connection', (socket) => {
const userId = socket.handshake.query.userId as string;

View File

@@ -1686,6 +1686,12 @@ export const BrowserWindow = () => {
}
}, [paginationMode, resetPaginationSelector]);
// Clear the highlighter data whenever pagination mode or list capture
// (getList) is not active; re-evaluated when either flag changes.
useEffect(() => {
if (!paginationMode || !getList) {
setHighlighterData(null);
}
}, [paginationMode, getList]);
useEffect(() => {
if (paginationMode && currentListActionId) {
const currentListStep = browserSteps.find(
@@ -1841,7 +1847,7 @@ export const BrowserWindow = () => {
>
{/* Individual element highlight (for non-group or hovered element) */}
{((getText && !listSelector) ||
(getList && paginationMode && paginationType !== "" &&
(getList && paginationMode && !paginationSelector && paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType))) && (
<div
style={{
@@ -1909,6 +1915,7 @@ export const BrowserWindow = () => {
listSelector &&
!paginationMode &&
!limitMode &&
captureStage === 'initial' &&
highlighterData.similarElements?.rects?.map((rect, index) => (
<React.Fragment key={`item-${index}`}>
<div

View File

@@ -268,6 +268,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
shouldHighlight = false;
} else if (
paginationMode &&
!paginationSelector &&
paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType)
) {
@@ -353,7 +354,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const options: boolean | AddEventListenerOptions = ['wheel', 'touchstart', 'touchmove'].includes(event)
? { passive: false }
: false;
iframeDoc.removeEventListener(event, handler as EventListener, options);
iframeDoc.removeEventListener(event, handler as EventListener, options);
});
}
@@ -588,7 +589,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const elementRect = element.getBoundingClientRect();
const relativeX = iframeX - elementRect.left;
const relativeY = iframeY - elementRect.top;
socket.emit("dom:click", {
selector,
url: snapshot.baseUrl,
@@ -636,7 +637,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
if (iframe) {
const focusedElement = iframeDoc.activeElement as HTMLElement;
let coordinates = { x: 0, y: 0 };
if (focusedElement && focusedElement !== iframeDoc.body) {
// Get coordinates from the focused element
const rect = focusedElement.getBoundingClientRect();

View File

@@ -415,6 +415,46 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
}, [stopGetList, resetListState]);
const stopCaptureAndEmitGetListSettings = useCallback(() => {
if (autoDetectedPagination?.selector) {
const iframeElement = document.querySelector('#browser-window iframe') as HTMLIFrameElement;
if (iframeElement?.contentDocument) {
try {
/**
 * Resolves a selector string against a document and returns the matching elements.
 *
 * Selectors beginning with `//` or `(//` are evaluated as XPath expressions and
 * only element nodes from the ordered snapshot are kept; every other selector is
 * passed to `querySelectorAll`. Any error raised while evaluating (for example
 * an invalid selector) is swallowed and an empty array is returned instead.
 *
 * @param selector - CSS selector or XPath expression to evaluate
 * @param doc - Document to search in
 * @returns Matching elements in document order, or an empty array on failure
 */
function evaluateSelector(selector: string, doc: Document): Element[] {
  const looksLikeXPath = selector.startsWith('//') || selector.startsWith('(//');

  // CSS path: delegate directly to the DOM query API.
  if (!looksLikeXPath) {
    try {
      return Array.from(doc.querySelectorAll(selector));
    } catch (_cssError) {
      return [];
    }
  }

  // XPath path: walk the ordered snapshot and keep element nodes only.
  try {
    const snapshot = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
    const matched: Element[] = [];
    let index = 0;
    while (index < snapshot.snapshotLength) {
      const candidate = snapshot.snapshotItem(index);
      index += 1;
      if (candidate && candidate.nodeType === Node.ELEMENT_NODE) {
        matched.push(candidate as Element);
      }
    }
    return matched;
  } catch (_xpathError) {
    return [];
  }
}
const elements = evaluateSelector(autoDetectedPagination.selector, iframeElement.contentDocument);
elements.forEach((el: Element) => {
(el as HTMLElement).style.outline = '';
(el as HTMLElement).style.outlineOffset = '';
(el as HTMLElement).style.zIndex = '';
});
} catch (error) {
console.error('Error removing pagination highlight on completion:', error);
}
}
}
const latestListStep = getLatestListStep(browserSteps);
if (latestListStep) {
extractDataClientSide(latestListStep.listSelector!, latestListStep.fields, latestListStep.id);
@@ -423,7 +463,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
...currentWorkflowActionsState,
hasScrapeListAction: true
});
emitActionForStep(latestListStep);
handleStopGetList();
@@ -441,7 +481,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
onFinishCapture();
clientSelectorGenerator.cleanup();
}
}, [socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide, setCurrentWorkflowActionsState, currentWorkflowActionsState, emitActionForStep]);
}, [socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide, setCurrentWorkflowActionsState, currentWorkflowActionsState, emitActionForStep, autoDetectedPagination]);
const getLatestListStep = (steps: BrowserStep[]) => {
const listSteps = steps.filter(step => step.type === 'list');