Merge pull request #920 from getmaxun/sdk

feat: support for sdk
This commit is contained in:
Karishma Shukla
2025-12-11 22:48:45 +05:30
committed by GitHub
9 changed files with 5382 additions and 5 deletions

View File

@@ -4,6 +4,7 @@
"author": "Maxun",
"license": "AGPL-3.0-or-later",
"dependencies": {
"@anthropic-ai/sdk": "^0.71.2",
"@cliqz/adblocker-playwright": "^1.30.0",
"@emotion/react": "^11.9.0",
"@emotion/styled": "^11.8.1",

717
server/src/api/sdk.ts Normal file
View File

@@ -0,0 +1,717 @@
/**
* SDK API Routes
* Separate API endpoints specifically for Maxun SDKs
* All routes require API key authentication
*/
import { Router, Request, Response } from 'express';
import { requireAPIKey } from "../middlewares/api";
import Robot from "../models/Robot";
import Run from "../models/Run";
import { v4 as uuid } from 'uuid';
import { WorkflowFile } from "maxun-core";
import logger from "../logger";
import { capture } from "../utils/analytics";
import { handleRunRecording } from "./record";
import { WorkflowEnricher } from "../sdk/workflowEnricher";
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
import { computeNextRun } from "../utils/schedule";
import moment from 'moment-timezone';
const router = Router();
interface AuthenticatedRequest extends Request {
user?: any;
}
/**
* Create a new robot programmatically
* POST /api/sdk/robots
*/
router.post("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const user = req.user;
const workflowFile: WorkflowFile = req.body;
if (!workflowFile.meta || !workflowFile.workflow) {
return res.status(400).json({
error: "Invalid workflow structure. Expected { meta, workflow }"
});
}
if (!workflowFile.meta.name) {
return res.status(400).json({
error: "Robot name is required in meta.name"
});
}
const type = (workflowFile.meta as any).type || 'extract';
let enrichedWorkflow: any[] = [];
let extractedUrl: string | undefined;
if (type === 'scrape') {
enrichedWorkflow = [];
extractedUrl = (workflowFile.meta as any).url;
if (!extractedUrl) {
return res.status(400).json({
error: "URL is required for scrape robots"
});
}
} else {
const enrichResult = await WorkflowEnricher.enrichWorkflow(workflowFile.workflow, user.id);
if (!enrichResult.success) {
logger.error("[SDK] Error in Selector Validation:\n" + JSON.stringify(enrichResult.errors, null, 2))
return res.status(400).json({
error: "Workflow validation failed",
details: enrichResult.errors
});
}
enrichedWorkflow = enrichResult.workflow!;
extractedUrl = enrichResult.url;
}
const robotId = uuid();
const metaId = uuid();
const robotMeta: any = {
name: workflowFile.meta.name,
id: metaId,
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
pairs: enrichedWorkflow.length,
params: [],
type,
url: extractedUrl,
formats: (workflowFile.meta as any).formats || [],
};
const robot = await Robot.create({
id: robotId,
userId: user.id,
recording_meta: robotMeta,
recording: {
workflow: enrichedWorkflow
}
});
capture("maxun-oss-robot-created", {
robot_meta: robot.recording_meta,
recording: robot.recording,
});
return res.status(201).json({
data: robot,
message: "Robot created successfully"
});
} catch (error: any) {
logger.error("[SDK] Error creating robot:", error);
return res.status(500).json({
error: "Failed to create robot",
message: error.message
});
}
});
/**
* List all robots for the authenticated user
* GET /api/sdk/robots
*/
router.get("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const robots = await Robot.findAll();
return res.status(200).json({
data: robots
});
} catch (error: any) {
logger.error("[SDK] Error listing robots:", error);
return res.status(500).json({
error: "Failed to list robots",
message: error.message
});
}
});
/**
* Get a specific robot by ID
* GET /api/sdk/robots/:id
*/
router.get("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const robotId = req.params.id;
const robot = await Robot.findOne({
where: {
'recording_meta.id': robotId
}
});
if (!robot) {
return res.status(404).json({
error: "Robot not found"
});
}
return res.status(200).json({
data: robot
});
} catch (error: any) {
logger.error("[SDK] Error getting robot:", error);
return res.status(500).json({
error: "Failed to get robot",
message: error.message
});
}
});
/**
* Update a robot
* PUT /api/sdk/robots/:id
*/
router.put("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const robotId = req.params.id;
const updates = req.body;
const robot = await Robot.findOne({
where: {
'recording_meta.id': robotId
}
});
if (!robot) {
return res.status(404).json({
error: "Robot not found"
});
}
const updateData: any = {};
if (updates.workflow) {
updateData.recording = {
workflow: updates.workflow
};
}
if (updates.meta) {
updateData.recording_meta = {
...robot.recording_meta,
...updates.meta,
updatedAt: new Date().toISOString()
};
}
if (updates.google_sheet_email !== undefined) {
updateData.google_sheet_email = updates.google_sheet_email;
}
if (updates.google_sheet_name !== undefined) {
updateData.google_sheet_name = updates.google_sheet_name;
}
if (updates.airtable_base_id !== undefined) {
updateData.airtable_base_id = updates.airtable_base_id;
}
if (updates.airtable_table_name !== undefined) {
updateData.airtable_table_name = updates.airtable_table_name;
}
if (updates.schedule !== undefined) {
if (updates.schedule === null) {
try {
await cancelScheduledWorkflow(robotId);
} catch (cancelError) {
logger.warn(`[SDK] Failed to cancel existing schedule for robot ${robotId}: ${cancelError}`);
}
updateData.schedule = null;
} else {
const {
runEvery,
runEveryUnit,
timezone,
startFrom = 'SUNDAY',
dayOfMonth = 1,
atTimeStart = '00:00',
atTimeEnd = '23:59'
} = updates.schedule;
if (!runEvery || !runEveryUnit || !timezone) {
return res.status(400).json({
error: "Missing required schedule parameters: runEvery, runEveryUnit, timezone"
});
}
if (!moment.tz.zone(timezone)) {
return res.status(400).json({
error: "Invalid timezone"
});
}
const [startHours, startMinutes] = atTimeStart.split(':').map(Number);
const [endHours, endMinutes] = atTimeEnd.split(':').map(Number);
if (isNaN(startHours) || isNaN(startMinutes) || isNaN(endHours) || isNaN(endMinutes) ||
startHours < 0 || startHours > 23 || startMinutes < 0 || startMinutes > 59 ||
endHours < 0 || endHours > 23 || endMinutes < 0 || endMinutes > 59) {
return res.status(400).json({ error: 'Invalid time format. Expected HH:MM (e.g., 09:30)' });
}
const days = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY'];
if (!days.includes(startFrom)) {
return res.status(400).json({ error: 'Invalid startFrom day. Must be one of: SUNDAY, MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY' });
}
let cronExpression;
const dayIndex = days.indexOf(startFrom);
switch (runEveryUnit) {
case 'MINUTES':
cronExpression = `*/${runEvery} * * * *`;
break;
case 'HOURS':
cronExpression = `${startMinutes} */${runEvery} * * *`;
break;
case 'DAYS':
cronExpression = `${startMinutes} ${startHours} */${runEvery} * *`;
break;
case 'WEEKS':
cronExpression = `${startMinutes} ${startHours} * * ${dayIndex}`;
break;
case 'MONTHS':
cronExpression = `${startMinutes} ${startHours} ${dayOfMonth} */${runEvery} *`;
if (startFrom !== 'SUNDAY') {
cronExpression += ` ${dayIndex}`;
}
break;
default:
return res.status(400).json({
error: "Invalid runEveryUnit. Must be one of: MINUTES, HOURS, DAYS, WEEKS, MONTHS"
});
}
try {
await cancelScheduledWorkflow(robotId);
} catch (cancelError) {
logger.warn(`[SDK] Failed to cancel existing schedule for robot ${robotId}: ${cancelError}`);
}
try {
await scheduleWorkflow(robotId, req.user.id, cronExpression, timezone);
} catch (scheduleError: any) {
logger.error(`[SDK] Failed to schedule workflow for robot ${robotId}: ${scheduleError.message}`);
return res.status(500).json({
error: "Failed to schedule workflow",
message: scheduleError.message
});
}
const nextRunAt = computeNextRun(cronExpression, timezone);
updateData.schedule = {
runEvery,
runEveryUnit,
timezone,
startFrom,
dayOfMonth,
atTimeStart,
atTimeEnd,
cronExpression,
lastRunAt: undefined,
nextRunAt: nextRunAt || undefined,
};
logger.info(`[SDK] Scheduled robot ${robotId} with cron: ${cronExpression} in timezone: ${timezone}`);
}
}
if (updates.webhooks !== undefined) {
updateData.webhooks = updates.webhooks;
}
if (updates.proxy_url !== undefined) {
updateData.proxy_url = updates.proxy_url;
}
if (updates.proxy_username !== undefined) {
updateData.proxy_username = updates.proxy_username;
}
if (updates.proxy_password !== undefined) {
updateData.proxy_password = updates.proxy_password;
}
await robot.update(updateData);
logger.info(`[SDK] Robot updated: ${robotId}`);
return res.status(200).json({
data: robot,
message: "Robot updated successfully"
});
} catch (error: any) {
logger.error("[SDK] Error updating robot:", error);
return res.status(500).json({
error: "Failed to update robot",
message: error.message
});
}
});
/**
* Delete a robot
* DELETE /api/sdk/robots/:id
*/
router.delete("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const robotId = req.params.id;
const robot = await Robot.findOne({
where: {
'recording_meta.id': robotId
}
});
if (!robot) {
return res.status(404).json({
error: "Robot not found"
});
}
await Run.destroy({
where: {
robotMetaId: robot.recording_meta.id
}
});
await robot.destroy();
logger.info(`[SDK] Robot deleted: ${robotId}`);
capture(
'maxun-oss-robot-deleted',
{
robotId: robotId,
user_id: req.user?.id,
deleted_at: new Date().toISOString(),
}
)
return res.status(200).json({
message: "Robot deleted successfully"
});
} catch (error: any) {
logger.error("[SDK] Error deleting robot:", error);
return res.status(500).json({
error: "Failed to delete robot",
message: error.message
});
}
});
/**
* Execute a robot
* POST /api/sdk/robots/:id/execute
*/
router.post("/sdk/robots/:id/execute", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const user = req.user;
const robotId = req.params.id;
logger.info(`[SDK] Starting execution for robot ${robotId}`);
const runId = await handleRunRecording(robotId, user.id.toString());
if (!runId) {
throw new Error('Failed to start robot execution');
}
const run = await waitForRunCompletion(runId, user.id.toString());
let listData: any[] = [];
if (run.serializableOutput?.scrapeList) {
const scrapeList: any = run.serializableOutput.scrapeList;
if (scrapeList.scrapeList && Array.isArray(scrapeList.scrapeList)) {
listData = scrapeList.scrapeList;
}
else if (Array.isArray(scrapeList)) {
listData = scrapeList;
}
else if (typeof scrapeList === 'object') {
const listValues = Object.values(scrapeList);
if (listValues.length > 0 && Array.isArray(listValues[0])) {
listData = listValues[0] as any[];
}
}
}
return res.status(200).json({
data: {
runId: run.runId,
status: run.status,
data: {
textData: run.serializableOutput?.scrapeSchema || {},
listData: listData
},
screenshots: Object.values(run.binaryOutput || {})
}
});
} catch (error: any) {
logger.error("[SDK] Error executing robot:", error);
return res.status(500).json({
error: "Failed to execute robot",
message: error.message
});
}
});
/**
* Wait for run completion
*/
async function waitForRunCompletion(runId: string, interval: number = 2000) {
const MAX_WAIT_TIME = 180 * 60 * 1000;
const startTime = Date.now();
while (true) {
if (Date.now() - startTime > MAX_WAIT_TIME) {
throw new Error('Run completion timeout after 3 hours');
}
const run = await Run.findOne({ where: { runId } });
if (!run) throw new Error('Run not found');
if (run.status === 'success') {
return run.toJSON();
} else if (run.status === 'failed') {
throw new Error('Run failed');
} else if (run.status === 'aborted') {
throw new Error('Run was aborted');
}
await new Promise(resolve => setTimeout(resolve, interval));
}
}
/**
* Get all runs for a robot
* GET /api/sdk/robots/:id/runs
*/
router.get("/sdk/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const robotId = req.params.id;
const robot = await Robot.findOne({
where: {
'recording_meta.id': robotId
}
});
if (!robot) {
return res.status(404).json({
error: "Robot not found"
});
}
const runs = await Run.findAll({
where: {
robotMetaId: robot.recording_meta.id
},
order: [['startedAt', 'DESC']]
});
return res.status(200).json({
data: runs
});
} catch (error: any) {
logger.error("[SDK] Error getting runs:", error);
return res.status(500).json({
error: "Failed to get runs",
message: error.message
});
}
});
/**
* Get a specific run
* GET /api/sdk/robots/:id/runs/:runId
*/
router.get("/sdk/robots/:id/runs/:runId", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const robotId = req.params.id;
const runId = req.params.runId;
const robot = await Robot.findOne({
where: {
'recording_meta.id': robotId
}
});
if (!robot) {
return res.status(404).json({
error: "Robot not found"
});
}
const run = await Run.findOne({
where: {
runId: runId,
robotMetaId: robot.recording_meta.id
}
});
if (!run) {
return res.status(404).json({
error: "Run not found"
});
}
return res.status(200).json({
data: run
});
} catch (error: any) {
logger.error("[SDK] Error getting run:", error);
return res.status(500).json({
error: "Failed to get run",
message: error.message
});
}
});
/**
* Abort a running execution
* POST /api/sdk/robots/:id/runs/:runId/abort
*/
router.post("/sdk/robots/:id/runs/:runId/abort", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const robotId = req.params.id;
const runId = req.params.runId;
const robot = await Robot.findOne({
where: {
'recording_meta.id': robotId
}
});
if (!robot) {
return res.status(404).json({
error: "Robot not found"
});
}
const run = await Run.findOne({
where: {
runId: runId,
robotMetaId: robot.recording_meta.id
}
});
if (!run) {
return res.status(404).json({
error: "Run not found"
});
}
if (run.status !== 'running' && run.status !== 'queued') {
return res.status(400).json({
error: "Run is not in a state that can be aborted",
currentStatus: run.status
});
}
await run.update({ status: 'aborted' });
logger.info(`[SDK] Run ${runId} marked for abortion`);
return res.status(200).json({
message: "Run abortion initiated",
data: run
});
} catch (error: any) {
logger.error("[SDK] Error aborting run:", error);
return res.status(500).json({
error: "Failed to abort run",
message: error.message
});
}
});
/**
* LLM-based extraction - generate workflow from natural language prompt
* POST /api/sdk/extract/llm
*/
router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
try {
const user = req.user
const { url, prompt, llmProvider, llmModel, llmApiKey, llmBaseUrl, robotName } = req.body;
if (!url || !prompt) {
return res.status(400).json({
error: "URL and prompt are required"
});
}
const workflowResult = await WorkflowEnricher.generateWorkflowFromPrompt(url, prompt, user.id, {
provider: llmProvider,
model: llmModel,
apiKey: llmApiKey,
baseUrl: llmBaseUrl
});
if (!workflowResult.success || !workflowResult.workflow) {
return res.status(400).json({
error: "Failed to generate workflow from prompt",
details: workflowResult.errors
});
}
const robotId = uuid();
const metaId = uuid();
const robotMeta: any = {
name: robotName || `LLM Extract: ${prompt.substring(0, 50)}`,
id: metaId,
createdAt: new Date().toISOString(),
updatedAt: new Date().toISOString(),
pairs: workflowResult.workflow.length,
params: [],
type: 'extract',
url: workflowResult.url,
};
const robot = await Robot.create({
id: robotId,
userId: user.id,
recording_meta: robotMeta,
recording: {
workflow: workflowResult.workflow
},
});
logger.info(`[SDK] Persistent robot created: ${metaId} for LLM extraction`);
capture("maxun-oss-robot-created", {
robot_meta: robot.recording_meta,
recording: robot.recording,
});
return res.status(200).json({
success: true,
data: {
robotId: metaId,
name: robotMeta.name,
description: prompt,
url: workflowResult.url,
workflow: workflowResult.workflow
}
});
} catch (error: any) {
logger.error("[SDK] Error in LLM extraction:", error);
return res.status(500).json({
error: "Failed to perform LLM extraction",
message: error.message
});
}
});
export default router;

View File

@@ -4,7 +4,7 @@
*/
import { Socket } from "socket.io";
import { v4 as uuid } from "uuid";
import { Page } from "playwright-core";
import { createSocketConnection, createSocketConnectionForRun } from "../socket-connection/connection";
import { io, browserPool } from "../server";
import { RemoteBrowser } from "./classes/RemoteBrowser";
@@ -434,3 +434,63 @@ const initializeBrowserAsync = async (id: string, userId: string) => {
throw error;
}
};
/**
* Creates a RemoteBrowser instance specifically for SDK validation
* Uses dummy socket and returns browser ID and Page for validation tasks
* @param userId User ID for browser ownership
* @returns Promise with browser ID and Page instance
* @category BrowserManagement-Controller
*/
export const createRemoteBrowserForValidation = async (
userId: string
): Promise<{ browserId: string; page: Page }> => {
const id = uuid();
logger.log('info', `Creating validation browser ${id} for user ${userId}`);
try {
const dummySocket = {
emit: (event: string, data?: any) => {
logger.log('debug', `Browser ${id} emitted ${event}`);
},
on: () => {},
off: () => {},
id: `validation-${id}`,
} as any;
const browserSession = new RemoteBrowser(dummySocket, userId, id);
const VALIDATION_INIT_TIMEOUT = 45000;
const initPromise = browserSession.initialize(userId);
const timeoutPromise = new Promise<never>((_, reject) => {
setTimeout(() => reject(new Error('Validation browser initialization timeout')), VALIDATION_INIT_TIMEOUT);
});
await Promise.race([initPromise, timeoutPromise]);
const added = browserPool.addRemoteBrowser(id, browserSession, userId, true, 'run');
if (!added) {
await browserSession.switchOff();
throw new Error('Failed to add validation browser to pool');
}
const page = browserSession.getCurrentPage();
if (!page) {
await destroyRemoteBrowser(id, userId);
throw new Error('Failed to get page from validation browser');
}
logger.log('info', `Browser ${id} initialized successfully`);
return { browserId: id, page };
} catch (error: any) {
logger.log('error', `Failed to create validation browser ${id}: ${error.message}`);
try {
await destroyRemoteBrowser(id, userId);
} catch (cleanupError) {
logger.log('warn', `Failed to cleanup browser ${id}: ${cleanupError}`);
}
throw error;
}
};

View File

@@ -894,7 +894,7 @@ router.put('/schedule/:id/', requireSignIn, async (req: AuthenticatedRequest, re
logger.log('warn', `Failed to cancel existing schedule for robot ${id}: ${cancelError}`);
}
const jobId = await scheduleWorkflow(id, req.user.id, cronExpression, timezone);
await scheduleWorkflow(id, req.user.id, cronExpression, timezone);
const nextRunAt = computeNextRun(cronExpression, timezone);

View File

@@ -13,7 +13,7 @@ if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST ||
const pgBossConnectionString = `postgresql://${process.env.DB_USER}:${encodeURIComponent(process.env.DB_PASSWORD)}@${process.env.DB_HOST}:${process.env.DB_PORT}/${process.env.DB_NAME}`;
const pgBoss = new PgBoss({
export const pgBoss = new PgBoss({
connectionString: pgBossConnectionString,
max: 3,
expireInHours: 23,
@@ -87,8 +87,9 @@ async function registerScheduledWorkflowWorker() {
/**
* Register a worker for a specific queue
* Exported to allow dynamic registration when new schedules are created
*/
async function registerWorkerForQueue(queueName: string) {
export async function registerWorkerForQueue(queueName: string) {
try {
if (registeredQueues.has(queueName)) {
return;

File diff suppressed because it is too large Load Diff

View File

@@ -0,0 +1,585 @@
/**
* Selector Validator
* Validates and enriches selectors with metadata using Playwright page instance
*/
import { Page } from 'playwright-core';
import logger from '../logger';
interface SelectorInput {
selector: string;
attribute?: string;
}
interface EnrichedSelector {
tag: string;
isShadow: boolean;
selector: string;
attribute: string;
}
interface ValidationResult {
valid: boolean;
enriched?: EnrichedSelector;
error?: string;
}
export class SelectorValidator {
private page: Page | null = null;
/**
* Initialize with an existing Page instance and navigate to URL
* @param page Page instance from RemoteBrowser
* @param url URL to navigate to
*/
async initialize(page: Page, url: string): Promise<void> {
this.page = page;
try {
await page.goto(url, {
waitUntil: "networkidle",
timeout: 100000,
});
} catch (err) {
await page.goto(url, {
waitUntil: "domcontentloaded",
timeout: 100000,
});
}
logger.info(`Navigated to ${url} using RemoteBrowser page`);
}
/**
* Validate and enrich a single selector
*/
async validateSelector(input: SelectorInput): Promise<ValidationResult> {
if (!this.page) {
return { valid: false, error: 'Browser not initialized' };
}
const { selector, attribute = 'innerText' } = input;
try {
const isXPath = selector.startsWith('//') || selector.startsWith('(//');
let element;
if (isXPath) {
element = await this.page.locator(`xpath=${selector}`).first();
} else {
element = await this.page.locator(selector).first();
}
const count = await element.count();
if (count === 0) {
return {
valid: false,
error: `Selector "${selector}" did not match any elements`
};
}
const tagName = await element.evaluate((el) => el.tagName);
const isShadow = await element.evaluate((el) => {
let parent = el.parentNode;
while (parent) {
if (parent instanceof ShadowRoot) {
return true;
}
parent = parent.parentNode;
}
return false;
});
return {
valid: true,
enriched: {
tag: tagName,
isShadow,
selector,
attribute
}
};
} catch (error: any) {
logger.error(`Error validating selector "${selector}":`, error.message);
return {
valid: false,
error: `Invalid selector: ${error.message}`
};
}
}
/**
* Validate and enrich multiple selectors
*/
async validateSchemaFields(
fields: Record<string, string | SelectorInput>
): Promise<{ valid: boolean; enriched?: Record<string, EnrichedSelector>; errors?: string[] }> {
const enriched: Record<string, EnrichedSelector> = {};
const errors: string[] = [];
for (const [fieldName, fieldInput] of Object.entries(fields)) {
const input: SelectorInput = typeof fieldInput === 'string'
? { selector: fieldInput }
: fieldInput;
const result = await this.validateSelector(input);
if (result.valid && result.enriched) {
enriched[fieldName] = result.enriched;
} else {
errors.push(`Field "${fieldName}": ${result.error}`);
}
}
if (errors.length > 0) {
return { valid: false, errors };
}
return { valid: true, enriched };
}
/**
* Validate list selector and fields
*/
async validateListFields(config: {
itemSelector: string;
fields: Record<string, string | SelectorInput>;
}): Promise<{
valid: boolean;
enriched?: {
listSelector: string;
listTag: string;
fields: Record<string, EnrichedSelector>;
};
errors?: string[]
}> {
const errors: string[] = [];
const listResult = await this.validateSelector({
selector: config.itemSelector,
attribute: 'innerText'
});
if (!listResult.valid || !listResult.enriched) {
errors.push(`List selector: ${listResult.error}`);
return { valid: false, errors };
}
const fieldsResult = await this.validateSchemaFields(config.fields);
if (!fieldsResult.valid) {
errors.push(...(fieldsResult.errors || []));
return { valid: false, errors };
}
return {
valid: true,
enriched: {
listSelector: config.itemSelector,
listTag: listResult.enriched.tag,
fields: fieldsResult.enriched!
}
};
}
/**
* Detect input type for a given selector
*/
async detectInputType(selector: string): Promise<string> {
if (!this.page) {
throw new Error('Browser not initialized');
}
try {
const isXPath = selector.startsWith('//') || selector.startsWith('(//');
let element;
if (isXPath) {
element = await this.page.locator(`xpath=${selector}`).first();
} else {
element = await this.page.locator(selector).first();
}
const count = await element.count();
if (count === 0) {
throw new Error(`Selector "${selector}" did not match any elements`);
}
const inputType = await element.evaluate((el) => {
if (el instanceof HTMLInputElement) {
return el.type || 'text';
}
if (el instanceof HTMLTextAreaElement) {
return 'textarea';
}
if (el instanceof HTMLSelectElement) {
return 'select';
}
return 'text';
});
return inputType;
} catch (error: any) {
throw new Error(`Failed to detect input type: ${error.message}`);
}
}
/**
* Auto-detect fields from list selector
*/
async autoDetectListFields(listSelector: string): Promise<{
success: boolean;
fields?: Record<string, any>;
listSelector?: string;
error?: string;
}> {
if (!this.page) {
return { success: false, error: 'Browser not initialized' };
}
try {
const fs = require('fs');
const path = require('path');
const scriptPath = path.join(__dirname, 'browserSide/pageAnalyzer.js');
const scriptContent = fs.readFileSync(scriptPath, 'utf8');
await this.page.evaluate((script) => {
eval(script);
}, scriptContent);
const result = await this.page.evaluate((selector) => {
const win = window as any;
if (typeof win.autoDetectListFields === 'function') {
return win.autoDetectListFields(selector);
} else {
return {
fields: {},
error: 'Auto-detection function not loaded'
};
}
}, listSelector);
// Log debug information
if (result.debug) {
logger.info(`Debug info: ${JSON.stringify(result.debug)}`);
}
if (result.error || !result.fields || Object.keys(result.fields).length === 0) {
return {
success: false,
error: result.error || 'No fields detected from list selector'
};
}
const convertedListSelector = result.listSelector || listSelector;
logger.info(`Auto-detected ${Object.keys(result.fields).length} fields from list`);
return {
success: true,
fields: result.fields,
listSelector: convertedListSelector,
};
} catch (error: any) {
logger.error('Field auto-detection error:', error);
return {
success: false,
error: `Field auto-detection failed: ${error.message}`
};
}
}
/**
* Auto-detect pagination type and selector from list selector
*/
async autoDetectPagination(listSelector: string): Promise<{
success: boolean;
type?: string;
selector?: string | null;
error?: string;
}> {
if (!this.page) {
return { success: false, error: 'Browser not initialized' };
}
try {
const fs = require('fs');
const path = require('path');
const scriptPath = path.join(__dirname, 'browserSide/pageAnalyzer.js');
const scriptContent = fs.readFileSync(scriptPath, 'utf8');
await this.page.evaluate((script) => {
eval(script);
}, scriptContent);
const buttonResult = await this.page.evaluate((selector) => {
const win = window as any;
if (typeof win.autoDetectPagination === 'function') {
const result = win.autoDetectPagination(selector);
return result;
} else {
console.error('autoDetectPagination function not found!');
return {
type: '',
selector: null,
error: 'Pagination auto-detection function not loaded'
};
}
}, listSelector);
if (buttonResult.debug) {
logger.info(`Pagination debug info: ${JSON.stringify(buttonResult.debug)}`);
}
if (buttonResult.error) {
logger.error(`Button detection error: ${buttonResult.error}`);
return {
success: false,
error: buttonResult.error
};
}
if (buttonResult.type && buttonResult.type !== '') {
if (buttonResult.type === 'clickLoadMore' && buttonResult.selector) {
logger.info('Testing Load More button by clicking...');
const loadMoreVerified = await this.testLoadMoreButton(buttonResult.selector, listSelector);
if (!loadMoreVerified) {
logger.warn('Load More button did not load content, falling back to scroll detection');
const scrollTestResult = await this.testInfiniteScrollByScrolling(listSelector);
if (scrollTestResult.detected) {
return {
success: true,
type: 'scrollDown',
selector: null
};
}
} else {
logger.info(`Verified Load More button works`);
return {
success: true,
type: buttonResult.type,
selector: buttonResult.selector
};
}
} else {
logger.info(`Detected pagination type: ${buttonResult.type}${buttonResult.selector ? ` with selector: ${buttonResult.selector}` : ''}`);
return {
success: true,
type: buttonResult.type,
selector: buttonResult.selector
};
}
}
return {
success: true,
type: '',
selector: null
};
} catch (error: any) {
logger.error('Pagination auto-detection error:', error);
return {
success: false,
error: `Pagination auto-detection failed: ${error.message}`
};
}
}
/**
* Test Load More button by clicking it and checking if content loads
*/
private async testLoadMoreButton(buttonSelector: string, listSelector: string): Promise<boolean> {
if (!this.page) {
return false;
}
try {
const initialState = await this.page.evaluate((selector) => {
function evaluateSelector(sel: string, doc: Document) {
const isXPath = sel.startsWith('//') || sel.startsWith('(//');
if (isXPath) {
const result = doc.evaluate(sel, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
const elements = [];
for (let i = 0; i < result.snapshotLength; i++) {
elements.push(result.snapshotItem(i));
}
return elements;
} else {
return Array.from(doc.querySelectorAll(sel));
}
}
const listElements = evaluateSelector(selector, document);
return {
itemCount: listElements.length,
scrollHeight: document.documentElement.scrollHeight
};
}, listSelector);
try {
const selectors = buttonSelector.split(',').map(s => s.trim());
let clicked = false;
for (const sel of selectors) {
try {
await this.page.click(sel, { timeout: 1000 });
clicked = true;
break;
} catch (e) {
continue;
}
}
if (!clicked) {
return false;
}
await this.page.waitForTimeout(2000);
} catch (clickError: any) {
logger.warn(`Failed to click button: ${clickError.message}`);
return false;
}
const afterClickState = await this.page.evaluate((selector) => {
function evaluateSelector(sel: string, doc: Document) {
const isXPath = sel.startsWith('//') || sel.startsWith('(//');
if (isXPath) {
const result = doc.evaluate(sel, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
const elements = [];
for (let i = 0; i < result.snapshotLength; i++) {
elements.push(result.snapshotItem(i));
}
return elements;
} else {
return Array.from(doc.querySelectorAll(sel));
}
}
const listElements = evaluateSelector(selector, document);
return {
itemCount: listElements.length,
scrollHeight: document.documentElement.scrollHeight
};
}, listSelector);
logger.info(`After click: ${afterClickState.itemCount} items, scrollHeight: ${afterClickState.scrollHeight}`);
const itemsAdded = afterClickState.itemCount > initialState.itemCount;
const heightIncreased = afterClickState.scrollHeight > initialState.scrollHeight + 100;
if (itemsAdded || heightIncreased) {
const details = `Items: ${initialState.itemCount}${afterClickState.itemCount}, Height: ${initialState.scrollHeight}${afterClickState.scrollHeight}`;
logger.info(`Content loaded after click: ${details}`);
return true;
}
logger.info('No content change detected after clicking');
return false;
} catch (error: any) {
logger.error('Error during Load More test:', error.message);
return false;
}
}
/**
* Test for infinite scroll by actually scrolling and checking if content loads
*/
private async testInfiniteScrollByScrolling(listSelector: string): Promise<{
detected: boolean;
details?: string;
}> {
if (!this.page) {
return { detected: false };
}
try {
const initialState = await this.page.evaluate((selector) => {
function evaluateSelector(sel: string, doc: Document) {
const isXPath = sel.startsWith('//') || sel.startsWith('(//');
if (isXPath) {
const result = doc.evaluate(sel, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
const elements = [];
for (let i = 0; i < result.snapshotLength; i++) {
elements.push(result.snapshotItem(i));
}
return elements;
} else {
return Array.from(doc.querySelectorAll(sel));
}
}
const listElements = evaluateSelector(selector, document);
return {
itemCount: listElements.length,
scrollHeight: document.documentElement.scrollHeight,
scrollY: window.scrollY
};
}, listSelector);
logger.info(`Initial state: ${initialState.itemCount} items, scrollHeight: ${initialState.scrollHeight}`);
await this.page.evaluate(() => {
window.scrollTo(0, document.documentElement.scrollHeight);
});
await this.page.waitForTimeout(2000);
const afterScrollState = await this.page.evaluate((selector) => {
function evaluateSelector(sel: string, doc: Document) {
const isXPath = sel.startsWith('//') || sel.startsWith('(//');
if (isXPath) {
const result = doc.evaluate(sel, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
const elements = [];
for (let i = 0; i < result.snapshotLength; i++) {
elements.push(result.snapshotItem(i));
}
return elements;
} else {
return Array.from(doc.querySelectorAll(sel));
}
}
const listElements = evaluateSelector(selector, document);
return {
itemCount: listElements.length,
scrollHeight: document.documentElement.scrollHeight,
scrollY: window.scrollY
};
}, listSelector);
await this.page.evaluate((originalY) => {
window.scrollTo(0, originalY);
}, initialState.scrollY);
const itemsAdded = afterScrollState.itemCount > initialState.itemCount;
const heightIncreased = afterScrollState.scrollHeight > initialState.scrollHeight + 100;
if (itemsAdded || heightIncreased) {
const details = `Items: ${initialState.itemCount}${afterScrollState.itemCount}, Height: ${initialState.scrollHeight}${afterScrollState.scrollHeight}`;
logger.info(`Content changed: ${details}`);
return { detected: true, details };
}
logger.info('No content change detected');
return { detected: false };
} catch (error: any) {
logger.error('Error during scroll test:', error.message);
return { detected: false };
}
}
/**
* Clear page reference
*/
async close(): Promise<void> {
this.page = null;
logger.info('Page reference cleared');
}
}

File diff suppressed because it is too large Load Diff

View File

@@ -5,6 +5,7 @@
import { v4 as uuid } from 'uuid';
import logger from '../logger';
import { pgBossClient } from './pgboss';
import { registerWorkerForQueue } from '../schedule-worker';
/**
* Utility function to schedule a cron job using PgBoss
@@ -13,7 +14,7 @@ import { pgBossClient } from './pgboss';
* @param cronExpression The cron expression for scheduling
* @param timezone The timezone for the cron expression
*/
export async function scheduleWorkflow(id: string, userId: string, cronExpression: string, timezone: string): Promise<void> {
export async function scheduleWorkflow(id: string, userId: string, cronExpression: string, timezone: string): Promise<void> {
try {
const runId = uuid();
@@ -28,6 +29,8 @@ export async function scheduleWorkflow(id: string, userId: string, cronExpressio
{ tz: timezone }
);
await registerWorkerForQueue(queueName);
logger.log('info', `Scheduled workflow job for robot ${id}`);
} catch (error: unknown) {
const errorMessage = error instanceof Error ? error.message : String(error);