Files
parcer/server/src/workflow-management/classes/Interpreter.ts
2025-10-25 13:09:00 +05:30

792 lines
26 KiB
TypeScript

import Interpreter, { WorkflowFile } from "maxun-core";
import logger from "../../logger";
import { Socket } from "socket.io";
import { Page } from "playwright";
import { InterpreterSettings } from "../../types";
import { decrypt } from "../../utils/auth";
import Run from "../../models/Run";
/**
 * Produces a deep copy of the workflow that is ready to be handed to the interpreter.
 *
 * - `type` / `press` actions with more than one argument get their second argument
 *   decrypted; if it is not a string, or decryption throws, it is replaced by an
 *   empty string and the problem is logged.
 * - When `checkLimit` is true, a numeric `limit` above 5 in the first argument of a
 *   `scrapeList` action is lowered to 5.
 *
 * The input workflow is never modified.
 * @param workflow The workflow to process.
 * @param checkLimit If true, the scrapeList limit cap is applied.
 * @returns The processed copy of the workflow.
 */
function processWorkflow(workflow: WorkflowFile, checkLimit: boolean = false): WorkflowFile {
  const maxListLimit = 5;
  // JSON round-trip gives a detached copy, so the caller's workflow stays untouched.
  const copy = JSON.parse(JSON.stringify(workflow)) as WorkflowFile;

  copy.workflow.forEach(({ what }) => {
    what.forEach((step) => {
      const args = step.args;
      if (!Array.isArray(args)) {
        return;
      }

      switch (step.action) {
        case 'scrapeList': {
          if (!checkLimit || args.length === 0) {
            break;
          }
          const listConfig = args[0];
          const declaresLimit = listConfig && typeof listConfig === 'object' && 'limit' in listConfig;
          if (declaresLimit && typeof listConfig.limit === 'number' && listConfig.limit > maxListLimit) {
            listConfig.limit = maxListLimit;
          }
          break;
        }
        case 'type':
        case 'press': {
          // Only the second argument carries the encrypted value.
          if (args.length <= 1) {
            break;
          }
          try {
            const secret = args[1];
            if (typeof secret === 'string') {
              args[1] = decrypt(secret);
            } else {
              logger.log('error', 'Encrypted value is not a string');
              args[1] = '';
            }
          } catch (error: unknown) {
            const reason = error instanceof Error ? error.message : String(error);
            logger.log('error', `Failed to decrypt input value: ${reason}`);
            args[1] = '';
          }
          break;
        }
        default:
          break;
      }
    });
  });

  return copy;
}
/**
* This class implements the main interpretation functions.
* It holds some information about the current interpretation process and
* registers to some events to allow the client (frontend) to interact with the interpreter.
* It uses the [maxun-core](https://www.npmjs.com/package/maxun-core)
* library to interpret the workflow.
* @category WorkflowManagement
*/
export class WorkflowInterpreter {
/**
* Socket.io socket instance enabling communication with the client (frontend) side.
* @private
*/
private socket: Socket;
/**
* True if the interpretation is paused.
*/
public interpretationIsPaused: boolean = false;
/**
* The instance of the {@link Interpreter} class used to interpret the workflow.
* From maxun-core.
* @private
*/
private interpreter: Interpreter | null = null;
/**
* An id of the currently interpreted pair in the workflow.
* @private
*/
private activeId: number | null = null;
/**
* An array of debug messages emitted by the {@link Interpreter}.
*/
public debugMessages: string[] = [];
/**
* Storage for different types of serializable data
*/
public serializableDataByType: {
scrapeSchema: Record<string, any>;
scrapeList: Record<string, any>;
[key: string]: any;
} = {
scrapeSchema: {},
scrapeList: {},
};
private currentActionName: string | null = null;
/**
* Track the current action type being processed
*/
private currentActionType: string | null = null;
/**
* An array of all the binary data extracted from the run.
*/
public binaryData: { name: string; mimeType: string; data: string }[] = [];
/**
* Track current scrapeList index
*/
private currentScrapeListIndex: number = 0;
/**
* Current run ID for real-time persistence
*/
private currentRunId: string | null = null;
/**
* Batched persistence system for performance optimization
*/
private persistenceBuffer: Array<{
actionType: string;
data: any;
listIndex?: number;
timestamp: number;
creditValidated: boolean;
}> = [];
private persistenceTimer: NodeJS.Timeout | null = null;
private readonly BATCH_SIZE = 5;
private readonly BATCH_TIMEOUT = 3000;
private persistenceInProgress = false;
private persistenceRetryCount = 0;
/**
* Breakpoint flags indexed by workflow pair id: `breakpoints[id]` is true
* when the interpretation should pause on reaching the pair with that id.
* @private
*/
private breakpoints: boolean[] = [];
/**
* Callback to resume the interpretation after a pause.
* @private
*/
private interpretationResume: (() => void) | null = null;
/**
 * Creates a workflow interpreter bound to a client socket.
 * @param socket Socket.io socket instance used to communicate with the client (frontend).
 * @param runId Optional run ID stored as the current run for persistence; a missing or empty value is stored as null.
 * @constructor
 */
constructor(socket: Socket, runId?: string) {
  this.currentRunId = runId ? runId : null;
  this.socket = socket;
}
/**
 * Registers the socket listeners that control a running interpretation:
 * `pause`, `resume`, `step` and `breakpoints`.
 *
 * - `pause` marks the interpretation as paused.
 * - `resume` clears the paused flag and invokes the stored resume callback, if any.
 * - `step` invokes the stored resume callback without clearing the paused flag.
 * - `breakpoints` replaces the breakpoint flags with the received array.
 * @returns void
 */
public subscribeToPausing = () => {
  const socket = this.socket;

  const handlePause = () => {
    this.interpretationIsPaused = true;
  };

  const handleResume = () => {
    this.interpretationIsPaused = false;
    if (!this.interpretationResume) {
      logger.log('debug', "Resume called but no resume function is set");
      return;
    }
    this.interpretationResume();
    this.socket.emit('log', '----- The interpretation has been resumed -----', false);
  };

  const handleStep = () => {
    if (!this.interpretationResume) {
      logger.log('debug', "Step called but no resume function is set");
      return;
    }
    this.interpretationResume();
  };

  const handleBreakpoints = (data: boolean[]) => {
    logger.log('debug', "Setting breakpoints: " + data);
    this.breakpoints = data;
  };

  socket.on('pause', handlePause);
  socket.on('resume', handleResume);
  socket.on('step', handleStep);
  socket.on('breakpoints', handleBreakpoints);
};
/**
 * Runs the workflow from the recording editor and streams progress to the client.
 *
 * Builds a maxun-core {@link Interpreter} in `editor` mode, forwards debug output,
 * extracted data and screenshots over the socket, hands them to the persistence
 * helpers, and honours pause/breakpoint requests through the `flag` event.
 * After the run, buffered data is flushed and the interpreter reference and pause
 * state held by this instance are reset before `finished` is emitted.
 * @param workflow The workflow to interpret.
 * @param page The page instance used to interact with the browser.
 * @param updatePageOnPause A callback to update the page after a pause.
 * @param settings Interpreter settings; `params` is read from and then deleted on this object.
 * @returns {Promise<void>}
 */
public interpretRecordingInEditor = async (
  workflow: WorkflowFile,
  page: Page,
  updatePageOnPause: (page: Page) => void,
  settings: InterpreterSettings,
) => {
  // Run parameters travel separately from the interpreter options.
  // Note: this removes `params` from the caller's settings object.
  const params = settings.params || null;
  delete settings.params;

  // checkLimit = true: scrapeList limits are capped for editor runs.
  const processedWorkflow = processWorkflow(workflow, true);

  const debugChannel = {
    activeId: (id: any) => {
      this.activeId = id;
      this.socket.emit('activePairId', id);
    },
    debugMessage: (msg: any) => {
      const stamped = `[${new Date().toLocaleString()}] ` + msg;
      this.debugMessages.push(stamped);
      this.socket.emit('log', msg);
    },
    setActionType: (type: string) => {
      this.currentActionType = type;
    },
  };

  const serializableCallback = async (data: any) => {
    switch (this.currentActionType) {
      case 'scrapeSchema': {
        // A non-empty array is used as-is; anything else is wrapped in a one-element array.
        const rows = Array.isArray(data) && data.length > 0 ? data : [data];
        await this.persistDataToDatabase('scrapeSchema', rows);
        this.socket.emit('serializableCallback', {
          type: 'captureText',
          data: rows,
        });
        break;
      }
      case 'scrapeList': {
        const hasRows = data && Array.isArray(data) && data.length > 0;
        if (hasRows) {
          // Use the current index for persistence
          await this.persistDataToDatabase('scrapeList', data, this.currentScrapeListIndex);
        }
        this.socket.emit('serializableCallback', {
          type: 'captureList',
          data,
        });
        break;
      }
      default:
        break;
    }
  };

  // NOTE(review): InterpretRecording's binaryCallback takes a single
  // `{ name, data, mimeType }` payload, this one takes `(data, mimetype)` —
  // confirm which signature maxun-core uses in editor mode.
  const binaryCallback = async (data: string, mimetype: string) => {
    // For editor mode, we don't have the name yet, so use a timestamp-based name
    const screenshot = {
      name: `Screenshot ${Date.now()}`,
      mimeType: mimetype,
      data: JSON.stringify(data),
    };
    this.binaryData.push(screenshot);
    // Persist binary data to database
    await this.persistBinaryDataToDatabase(screenshot);
    this.socket.emit('binaryCallback', {
      data,
      mimetype,
      type: 'captureScreenshot',
    });
  };

  const options = {
    ...settings,
    mode: 'editor',
    debugChannel,
    serializableCallback,
    binaryCallback,
  };

  const interpreter = new Interpreter(processedWorkflow, options);
  this.interpreter = interpreter;

  interpreter.on('flag', async (activePage, resume) => {
    const atBreakpoint = this.activeId !== null && this.breakpoints[this.activeId];
    if (atBreakpoint) {
      logger.log('debug', `breakpoint hit id: ${this.activeId}`);
      this.socket.emit('breakpointHit');
      this.interpretationIsPaused = true;
    }

    if (!this.interpretationIsPaused) {
      resume();
      return;
    }

    // Keep the resume callback so the `resume` / `step` socket events can continue the run.
    this.interpretationResume = resume;
    logger.log('debug', `Paused inside of flag: ${activePage.url()}`);
    updatePageOnPause(activePage);
    this.socket.emit('log', '----- The interpretation has been paused -----', false);
  });

  this.socket.emit('log', '----- Starting the interpretation -----', false);
  const status = await interpreter.run(page, params);
  this.socket.emit('log', `----- The interpretation finished with status: ${status} -----`, false);
  logger.log('debug', `Interpretation finished`);

  // Flush any remaining data in persistence buffer before completing
  await this.flushPersistenceBuffer();

  this.interpreter = null;
  this.socket.emit('activePairId', -1);
  this.interpretationIsPaused = false;
  this.interpretationResume = null;
  this.socket.emit('finished');
};
/**
 * Stops the interpretation that is currently running, if there is one.
 *
 * Aborts and stops the maxun-core interpreter, notifies the client through a
 * `log` event and then resets this instance via {@link clearState}.
 * When no interpreter is attached, only an error is logged.
 * @returns {Promise<void>}
 */
public stopInterpretation = async () => {
  if (!this.interpreter) {
    logger.log('error', 'Cannot stop: No active interpretation.');
    return;
  }

  logger.log('info', 'Stopping the interpretation.');
  // abort() is called before the awaited stop().
  this.interpreter.abort();
  logger.log('info', 'maxun-core interpreter aborted - data collection stopped immediately');
  await this.interpreter.stop();

  this.socket.emit('log', '----- The interpretation has been stopped -----', false);
  await this.clearState();
};
/**
 * Tears down the current interpretation and resets this instance to its initial state.
 *
 * Steps, in order: attempt a final flush of buffered data, cancel a pending
 * batch timer, abort/stop the interpreter if one is still attached, then reset
 * every piece of per-run state (including the run ID and the persistence buffer).
 * @returns {Promise<void>}
 */
public clearState = async (): Promise<void> => {
  // 1. Give buffered data one last chance to reach the database.
  const hasPendingData = this.persistenceBuffer.length > 0;
  if (hasPendingData) {
    try {
      await this.flushPersistenceBuffer();
      logger.log('debug', 'Successfully flushed final persistence buffer during cleanup');
    } catch (error: any) {
      logger.log('error', `Failed to flush final persistence buffer: ${error.message}`);
    }
  }

  // 2. No timed flush may fire after the state has been cleared.
  if (this.persistenceTimer) {
    clearTimeout(this.persistenceTimer);
    this.persistenceTimer = null;
  }

  // 3. Shut the interpreter down; failures here are logged, not rethrown.
  const attached = this.interpreter;
  if (attached) {
    try {
      const alreadyAborted = attached.getIsAborted();
      if (!alreadyAborted) {
        attached.abort();
      }
      await attached.stop();
      logger.log('debug', 'mx-cloud interpreter properly stopped during cleanup');
    } catch (error: any) {
      logger.log('warn', `Error stopping mx-cloud interpreter during cleanup: ${error.message}`);
    }
  }

  // 4. Reset interpretation control state.
  this.interpreter = null;
  this.activeId = null;
  this.interpretationIsPaused = false;
  this.interpretationResume = null;
  this.breakpoints = [];
  this.debugMessages = [];

  // 5. Reset collected output and action tracking.
  this.currentActionType = null;
  this.currentActionName = null;
  this.currentScrapeListIndex = 0;
  this.serializableDataByType = {
    scrapeSchema: {},
    scrapeList: {},
  };
  this.binaryData = [];

  // 6. Reset persistence bookkeeping.
  this.currentRunId = null;
  this.persistenceBuffer = [];
  this.persistenceInProgress = false;
  this.persistenceRetryCount = 0;
};
/**
 * Stores the run ID that subsequent persistence calls will write to.
 * @param runId The run ID to use as the current run.
 */
public setRunId = (runId: string): void => {
  this.currentRunId = runId;
  const note = 'Set run ID for real-time persistence: ' + runId;
  logger.log('debug', note);
};
/**
 * Queues extracted data for persistence and decides when it is written.
 *
 * The item is always added to the persistence buffer first. `scrapeSchema`
 * data, or a buffer that has reached `BATCH_SIZE`, triggers an immediate flush;
 * otherwise a timed batch flush is scheduled. Does nothing when no run ID is set.
 * @param actionType The action type the data belongs to.
 * @param data The extracted data.
 * @param listIndex Optional scrapeList index stored with the buffered item.
 * @private
 */
private persistDataToDatabase = async (actionType: string, data: any, listIndex?: number): Promise<void> => {
  if (this.currentRunId) {
    this.addToPersistenceBatch(actionType, data, listIndex, true);

    const bufferIsFull = this.persistenceBuffer.length >= this.BATCH_SIZE;
    const flushNow = actionType === 'scrapeSchema' || bufferIsFull;
    if (!flushNow) {
      this.scheduleBatchFlush();
      return;
    }
    await this.flushPersistenceBuffer();
  } else {
    logger.log('debug', 'No run ID available for persistence');
  }
};
/**
 * Stores one binary item (e.g. a screenshot) in the `binaryOutput` of the current run.
 *
 * The item is saved under its trimmed name, or `Screenshot N` when the name is
 * empty; if that key is already taken, ` (1)`, ` (2)`, … is appended until a
 * free key is found. Failures are logged and never rethrown.
 * @param binaryItem The binary item to persist.
 * @private
 */
private persistBinaryDataToDatabase = async (binaryItem: { name: string; mimeType: string; data: string }): Promise<void> => {
  if (!this.currentRunId) {
    logger.log('debug', 'No run ID available for binary data persistence');
    return;
  }

  try {
    const run = await Run.findOne({ where: { runId: this.currentRunId } });
    if (!run) {
      logger.log('warn', `Run not found for binary data persistence: ${this.currentRunId}`);
      return;
    }

    // Work on a detached copy of whatever is stored already.
    const existing =
      run.binaryOutput && typeof run.binaryOutput === 'object'
        ? JSON.parse(JSON.stringify(run.binaryOutput))
        : {};

    const fallbackName = `Screenshot ${Object.keys(existing).length + 1}`;
    const baseName = binaryItem.name?.trim() || fallbackName;

    // Find the first key that is not occupied yet.
    let uniqueName = baseName;
    for (let suffix = 1; existing[uniqueName]; suffix++) {
      uniqueName = `${baseName} (${suffix})`;
    }

    await run.update({
      binaryOutput: { ...existing, [uniqueName]: binaryItem },
    });
    logger.log('debug', `Persisted binary data for run ${this.currentRunId}: ${binaryItem.name} (${binaryItem.mimeType})`);
  } catch (error: any) {
    logger.log('error', `Failed to persist binary data in real-time for run ${this.currentRunId}: ${error.message}`);
  }
};
/**
 * Interprets the recording as a run and returns the collected output.
 *
 * Extracted data is grouped by action type and action name in
 * `serializableDataByType`, handed to the persistence helpers and emitted to
 * the client; screenshots are base64-encoded, collected in `binaryData`,
 * persisted and emitted. Pause/breakpoint handling goes through the `flag` event.
 * @param workflow The workflow to interpret.
 * @param page The page instance used to interact with the browser.
 * @param updatePageOnPause A callback to update the page after a pause.
 * @param settings The settings to use for the interpretation; `params` is read from and then deleted on this object.
 * @returns The debug log, the interpreter status and the scrapeSchema, scrapeList and binary outputs.
 */
public InterpretRecording = async (
  workflow: WorkflowFile,
  page: Page,
  updatePageOnPause: (page: Page) => void,
  settings: InterpreterSettings
) => {
  // Run parameters are passed to run() separately from the interpreter options.
  // Note: this removes `params` from the caller's settings object.
  const params = settings.params ? settings.params : null;
  delete settings.params;

  const processedWorkflow = processWorkflow(workflow);

  const options = {
    ...settings,
    debugChannel: {
      activeId: (id: any) => {
        this.activeId = id;
        this.socket.emit('activePairId', id);
      },
      debugMessage: (msg: any) => {
        this.debugMessages.push(`[${new Date().toLocaleString()}] ` + msg);
        this.socket.emit('debugMessage', msg)
      },
      setActionType: (type: string) => {
        this.currentActionType = type;
      },
      incrementScrapeListIndex: () => {
        this.currentScrapeListIndex++;
      },
      setActionName: (name: string) => {
        this.currentActionName = name;
      },
    },
    serializableCallback: async (data: any) => {
      try {
        if (!data || typeof data !== "object") return;

        // No action type announced: treat rows whose keys mention "label" or
        // "text" as scrapeSchema output.
        if (!this.currentActionType && Array.isArray(data) && data.length > 0) {
          const first = data[0];
          if (first && Object.keys(first).some(k => k.toLowerCase().includes("label") || k.toLowerCase().includes("text"))) {
            this.currentActionType = "scrapeSchema";
          }
        }

        const typeKey = this.currentActionType || "unknown";

        // Unwrap payloads that nest the data under their own type key.
        if (typeKey === "scrapeList" && data.scrapeList) {
          data = data.scrapeList;
        } else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
          data = data.scrapeSchema;
        }

        // Without an announced name, a single-key object lends its key as the
        // action name and its value as the data.
        let actionName = this.currentActionName || "";
        if (!actionName) {
          if (!Array.isArray(data) && Object.keys(data).length === 1) {
            const soleKey = Object.keys(data)[0];
            const soleValue = data[soleKey];
            if (Array.isArray(soleValue) || typeof soleValue === "object") {
              actionName = soleKey;
              data = soleValue;
            }
          }
        }
        if (!actionName) {
          actionName = "Unnamed Action";
        }

        // Normalise to a flat array: arrays pass through, `{ List: [...] }` is
        // unwrapped, other objects have their values flattened one level.
        const flattened = Array.isArray(data)
          ? data
          : (data?.List ?? (data && typeof data === 'object' ? Object.values(data).flat?.() ?? data : []));

        if (!this.serializableDataByType[typeKey]) {
          this.serializableDataByType[typeKey] = {};
        }
        this.serializableDataByType[typeKey][actionName] = flattened;

        await this.persistDataToDatabase(typeKey, { [actionName]: flattened });

        this.socket.emit("serializableCallback", {
          type: typeKey,
          name: actionName,
          data: flattened,
        });

        // The announced type/name apply to a single callback only.
        this.currentActionType = null;
        this.currentActionName = null;
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        logger.log('error', `serializableCallback handler failed: ${reason}`);
      }
    },
    binaryCallback: async (payload: { name: string; data: Buffer; mimeType: string }) => {
      try {
        const { name, data, mimeType } = payload;
        const base64Data = data.toString("base64");
        const binaryItem = {
          name,
          mimeType,
          data: base64Data
        };
        this.binaryData.push(binaryItem);
        await this.persistBinaryDataToDatabase(binaryItem);
        this.socket.emit("binaryCallback", {
          name,
          data: base64Data,
          mimeType
        });
      } catch (err: unknown) {
        const reason = err instanceof Error ? err.message : String(err);
        logger.log("error", `binaryCallback handler failed: ${reason}`);
      }
    }
  }

  const interpreter = new Interpreter(processedWorkflow, options);
  this.interpreter = interpreter;

  interpreter.on('flag', async (page, resume) => {
    if (this.activeId !== null && this.breakpoints[this.activeId]) {
      logger.log('debug', `breakpoint hit id: ${this.activeId}`);
      this.socket.emit('breakpointHit');
      this.interpretationIsPaused = true;
    }
    if (this.interpretationIsPaused) {
      // Keep the resume callback so the `resume` / `step` socket events can continue the run.
      this.interpretationResume = resume;
      logger.log('debug', `Paused inside of flag: ${page.url()}`);
      updatePageOnPause(page);
      this.socket.emit('log', '----- The interpretation has been paused -----', false);
    } else {
      resume();
    }
  });

  const status = await interpreter.run(page, params);

  // Write out whatever is still buffered before building the result.
  await this.flushPersistenceBuffer();

  // Structure the output to maintain separate data for each action type
  const result = {
    log: this.debugMessages,
    result: status,
    scrapeSchemaOutput: this.serializableDataByType.scrapeSchema,
    scrapeListOutput: this.serializableDataByType.scrapeList,
    binaryOutput: this.binaryData.reduce<Record<string, { data: string; mimeType: string }>>((acc, item) => {
      const key = item.name || `Screenshot ${Object.keys(acc).length + 1}`;
      acc[key] = { data: item.data, mimeType: item.mimeType };
      return acc;
    }, {})
  }

  logger.log('debug', `Interpretation finished`);
  return result;
}
/**
 * Tells whether an interpreter instance is currently attached to this object.
 * @returns {boolean} True while `interpreter` is not null.
 */
public interpretationInProgress = () => this.interpreter !== null;
/**
* Replaces the socket used for communication with the client (frontend)
* and registers the pause/resume/step/breakpoints listeners on it.
* Listeners registered on the previous socket are not removed here.
* @param socket Socket.io socket instance enabling communication with the client (frontend) side.
* @returns void
*/
public updateSocket = (socket: Socket): void => {
// Assign first: subscribeToPausing() attaches its listeners to this.socket.
this.socket = socket;
this.subscribeToPausing();
};
/**
 * Appends one item, stamped with the current time, to the persistence buffer.
 * @param actionType The action type the data belongs to.
 * @param data The extracted data.
 * @param listIndex Optional scrapeList index stored with the item.
 * @param creditValidated Flag stored with the item; defaults to false.
 * @private
 */
private addToPersistenceBatch(actionType: string, data: any, listIndex?: number, creditValidated: boolean = false): void {
  const entry = {
    actionType,
    data,
    listIndex,
    timestamp: Date.now(),
    creditValidated,
  };
  // push() returns the new length of the buffer.
  const bufferedCount = this.persistenceBuffer.push(entry);
  logger.log('debug', `Added ${actionType} to persistence buffer (${bufferedCount} items)`);
}
/**
 * Arms a one-shot timer that flushes the persistence buffer after `BATCH_TIMEOUT` ms.
 * Does nothing when a timer is already pending or a flush is in progress.
 * @private
 */
private scheduleBatchFlush(): void {
  const alreadyHandled = Boolean(this.persistenceTimer) || this.persistenceInProgress;
  if (alreadyHandled) {
    return;
  }

  const flushLater = async () => {
    await this.flushPersistenceBuffer();
  };
  this.persistenceTimer = setTimeout(flushLater, this.BATCH_TIMEOUT);
}
/**
 * Flushes the persistence buffer into the run's `serializableOutput` in a single transaction.
 *
 * Does nothing when the buffer is empty, a flush is already in progress or no
 * run ID is set. Only `scrapeSchema` and `scrapeList` items are merged; each
 * key of an item's data overwrites the same key in the stored output.
 *
 * On failure the batch is put back at the front of the buffer and retried with
 * exponential backoff (10s, 20s, 30s — at most 3 retries); afterwards it is
 * dropped. The retry timer is kept in `persistenceTimer` so that it can be
 * cancelled and so that no second, shorter flush timer is armed next to it.
 * Failures are caught and logged rather than rethrown.
 * @public - Made public to allow external flush before socket emission
 * @returns {Promise<void>}
 */
public async flushPersistenceBuffer(): Promise<void> {
  // Capture the run ID once: the batch taken below belongs to this run even if
  // the current run ID is changed or cleared while the transaction is awaited.
  const runId = this.currentRunId;
  if (this.persistenceBuffer.length === 0 || this.persistenceInProgress || !runId) {
    return;
  }

  if (this.persistenceTimer) {
    clearTimeout(this.persistenceTimer);
    this.persistenceTimer = null;
  }

  this.persistenceInProgress = true;
  const batchToProcess = [...this.persistenceBuffer];
  this.persistenceBuffer = [];

  const maxRetries = 3;

  // Arrays pass through, `{ List: [...] }` is unwrapped, other objects have
  // their values flattened one level, everything else becomes [].
  const toRows = (val: any): any =>
    Array.isArray(val)
      ? val
      : (val?.List ?? (val && typeof val === 'object' ? Object.values(val).flat?.() ?? val : []));

  const mergeLists = (target: Record<string, any>, updates: Record<string, any>): void => {
    // Nothing to merge for null/primitive payloads (Object.entries would throw on null).
    if (!updates || typeof updates !== 'object') {
      return;
    }
    for (const [key, val] of Object.entries(updates)) {
      target[key] = toRows(val);
    }
  };

  try {
    // NOTE(review): required lazily here rather than imported at the top of the
    // file — presumably to avoid a circular import; confirm before changing.
    const sequelize = require('../../storage/db').default;

    await sequelize.transaction(async (transaction: any) => {
      const run = await Run.findOne({
        where: { runId },
        transaction
      });
      if (!run) {
        logger.log('warn', `Run not found for batched persistence: ${runId}`);
        return;
      }

      const currentSerializableOutput = run.serializableOutput ?
        JSON.parse(JSON.stringify(run.serializableOutput)) :
        { scrapeSchema: [], scrapeList: [] };

      // Array-shaped outputs are replaced by empty keyed objects.
      if (Array.isArray(currentSerializableOutput.scrapeList)) {
        currentSerializableOutput.scrapeList = {};
      }
      if (Array.isArray(currentSerializableOutput.scrapeSchema)) {
        currentSerializableOutput.scrapeSchema = {};
      }

      let hasUpdates = false;
      for (const item of batchToProcess) {
        if (item.actionType !== 'scrapeSchema' && item.actionType !== 'scrapeList') {
          // Other action types are not part of serializableOutput.
          continue;
        }
        const bucket = item.actionType;
        if (!currentSerializableOutput[bucket] || typeof currentSerializableOutput[bucket] !== 'object') {
          currentSerializableOutput[bucket] = {};
        }
        mergeLists(currentSerializableOutput[bucket], item.data);
        hasUpdates = true;
      }

      if (hasUpdates) {
        await run.update({
          serializableOutput: currentSerializableOutput
        }, { transaction });
        logger.log('debug', `Batched persistence: Updated run ${runId} with ${batchToProcess.length} items`);
      }
    });

    this.persistenceRetryCount = 0;
  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.log('error', `Failed to flush persistence buffer for run ${runId}: ${reason}`);

    if (this.currentRunId !== runId) {
      // The instance was reset or pointed at another run while this flush was
      // in flight; re-queueing would write this batch to the wrong run.
      logger.log('error', `Run ${runId} is no longer current, dropping ${batchToProcess.length} items`);
      this.persistenceRetryCount = 0;
    } else if (this.persistenceRetryCount < maxRetries) {
      this.persistenceBuffer.unshift(...batchToProcess);
      this.persistenceRetryCount++;
      const backoffDelay = Math.min(5000 * Math.pow(2, this.persistenceRetryCount), 30000);

      // Track the retry in persistenceTimer: the finally block below then does
      // not arm an additional batch timer that would undercut the backoff, and
      // clearState() is able to cancel the retry.
      if (this.persistenceTimer) {
        clearTimeout(this.persistenceTimer);
      }
      this.persistenceTimer = setTimeout(() => {
        this.persistenceTimer = null;
        void this.flushPersistenceBuffer();
      }, backoffDelay);

      logger.log('warn', `Scheduling persistence retry ${this.persistenceRetryCount}/3 in ${backoffDelay}ms`);
    } else {
      logger.log('error', `Max persistence retries exceeded for run ${runId}, dropping ${batchToProcess.length} items`);
      this.persistenceRetryCount = 0;
    }
  } finally {
    this.persistenceInProgress = false;
    // Items that arrived while the flush was running still need a flush of their own.
    if (this.persistenceBuffer.length > 0 && !this.persistenceTimer) {
      this.scheduleBatchFlush();
    }
  }
}
}