Merge pull request #890 from RohitR311/persist-fix

fix(maxun-core): pagination data persistence for multiple actions
This commit is contained in:
Karishma Shukla
2025-11-20 23:37:20 +05:30
committed by GitHub
2 changed files with 99 additions and 59 deletions

View File

@@ -82,6 +82,8 @@ export default class Interpreter extends EventEmitter {
scrapeSchema: {} scrapeSchema: {}
}; };
private scrapeListCounter: number = 0;
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) { constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
super(); super();
this.workflow = workflow.workflow; this.workflow = workflow.workflow;
@@ -484,7 +486,7 @@ export default class Interpreter extends EventEmitter {
await this.options.serializableCallback(scrapeResults); await this.options.serializableCallback(scrapeResults);
}, },
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; shadow: string}>) => { scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; shadow: string}>, actionName: string = "") => {
if (this.isAborted) { if (this.isAborted) {
this.log('Workflow aborted, stopping scrapeSchema', Level.WARN); this.log('Workflow aborted, stopping scrapeSchema', Level.WARN);
return; return;
@@ -540,17 +542,17 @@ export default class Interpreter extends EventEmitter {
} }
const actionType = "scrapeSchema"; const actionType = "scrapeSchema";
const actionName = (schema as any).__name || "Texts"; const name = actionName || "Texts";
if (!this.namedResults[actionType]) this.namedResults[actionType] = {}; if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
this.namedResults[actionType][actionName] = this.cumulativeResults; this.namedResults[actionType][name] = this.cumulativeResults;
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {}; if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
if (!this.serializableDataByType[actionType][actionName]) { if (!this.serializableDataByType[actionType][name]) {
this.serializableDataByType[actionType][actionName] = []; this.serializableDataByType[actionType][name] = [];
} }
this.serializableDataByType[actionType][actionName] = [...this.cumulativeResults]; this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
await this.options.serializableCallback({ await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList, scrapeList: this.serializableDataByType.scrapeList,
@@ -558,7 +560,7 @@ export default class Interpreter extends EventEmitter {
}); });
}, },
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => { scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }, actionName: string = "") => {
if (this.isAborted) { if (this.isAborted) {
this.log('Workflow aborted, stopping scrapeList', Level.WARN); this.log('Workflow aborted, stopping scrapeList', Level.WARN);
return; return;
@@ -581,6 +583,7 @@ export default class Interpreter extends EventEmitter {
} }
let scrapeResults = []; let scrapeResults = [];
let paginationUsed = false;
if (!config.pagination) { if (!config.pagination) {
scrapeResults = await page.evaluate((cfg) => { scrapeResults = await page.evaluate((cfg) => {
@@ -592,38 +595,53 @@ export default class Interpreter extends EventEmitter {
} }
}, config); }, config);
} else { } else {
scrapeResults = await this.handlePagination(page, config); paginationUsed = true;
scrapeResults = await this.handlePagination(page, config, actionName);
} }
if (!Array.isArray(scrapeResults)) { if (!Array.isArray(scrapeResults)) {
scrapeResults = []; scrapeResults = [];
} }
const actionType = "scrapeList"; console.log(`ScrapeList completed with ${scrapeResults.length} results`);
const actionName = (config as any).__name || "List";
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {}; if (!paginationUsed) {
if (!this.serializableDataByType[actionType][actionName]) { const actionType = "scrapeList";
this.serializableDataByType[actionType][actionName] = []; let name = actionName || "";
if (!name || name.trim() === "") {
this.scrapeListCounter++;
name = `List ${this.scrapeListCounter}`;
}
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
if (!this.serializableDataByType[actionType][name]) {
this.serializableDataByType[actionType][name] = [];
}
this.serializableDataByType[actionType][name].push(...scrapeResults);
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList,
scrapeSchema: this.serializableDataByType.scrapeSchema
});
} }
this.serializableDataByType[actionType][actionName].push(...scrapeResults);
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList,
scrapeSchema: this.serializableDataByType.scrapeSchema
});
} catch (error) { } catch (error) {
console.error('ScrapeList action failed completely:', error.message); console.error('ScrapeList action failed completely:', error.message);
const actionType = "scrapeList"; const actionType = "scrapeList";
const actionName = (config as any).__name || "List"; let name = actionName || "";
if (!name || name.trim() === "") {
this.scrapeListCounter++;
name = `List ${this.scrapeListCounter}`;
}
if (!this.namedResults[actionType]) this.namedResults[actionType] = {}; if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
this.namedResults[actionType][actionName] = []; this.namedResults[actionType][name] = [];
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {}; if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
this.serializableDataByType[actionType][actionName] = []; this.serializableDataByType[actionType][name] = [];
await this.options.serializableCallback({ await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList, scrapeList: this.serializableDataByType.scrapeList,
@@ -718,26 +736,7 @@ export default class Interpreter extends EventEmitter {
debug.setActionType(String(step.action)); debug.setActionType(String(step.action));
} }
if ((step as any)?.name) { stepName = (step as any)?.name || String(step.action);
stepName = (step as any).name;
} else if (
Array.isArray((step as any)?.args) &&
(step as any).args.length > 0 &&
typeof (step as any).args[0] === "object" &&
"__name" in (step as any).args[0]
) {
stepName = (step as any).args[0].__name;
} else if (
typeof (step as any)?.args === "object" &&
step?.args !== null &&
"__name" in (step as any).args
) {
stepName = (step as any).args.__name;
}
if (!stepName) {
stepName = String(step.action);
}
if (debug && typeof (debug as any).setActionName === "function") { if (debug && typeof (debug as any).setActionName === "function") {
(debug as any).setActionName(stepName); (debug as any).setActionName(stepName);
@@ -751,6 +750,9 @@ export default class Interpreter extends EventEmitter {
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args]; const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
if (step.action === 'screenshot') { if (step.action === 'screenshot') {
await (wawActions.screenshot as any)(...(params ?? []), stepName ?? undefined); await (wawActions.screenshot as any)(...(params ?? []), stepName ?? undefined);
} else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
const actionName = (step as any).name || "";
await wawActions[step.action as CustomFunctions](...(params ?? []), actionName);
} else { } else {
await wawActions[step.action as CustomFunctions](...(params ?? [])); await wawActions[step.action as CustomFunctions](...(params ?? []));
} }
@@ -812,18 +814,32 @@ export default class Interpreter extends EventEmitter {
fields: any, fields: any,
limit?: number, limit?: number,
pagination: any pagination: any
}) { }, providedActionName: string = "") {
if (this.isAborted) { if (this.isAborted) {
this.log('Workflow aborted, stopping pagination', Level.WARN); this.log('Workflow aborted, stopping pagination', Level.WARN);
return []; return [];
} }
const actionType = "scrapeList";
let actionName = providedActionName || "";
if (!actionName || actionName.trim() === "") {
this.scrapeListCounter++;
actionName = `List ${this.scrapeListCounter}`;
}
if (!this.serializableDataByType[actionType]) {
this.serializableDataByType[actionType] = {};
}
if (!this.serializableDataByType[actionType][actionName]) {
this.serializableDataByType[actionType][actionName] = [];
}
let allResults: Record<string, any>[] = []; let allResults: Record<string, any>[] = [];
let previousHeight = 0; let previousHeight = 0;
let scrapedItems: Set<string> = new Set<string>(); let scrapedItems: Set<string> = new Set<string>();
let visitedUrls: Set<string> = new Set<string>(); let visitedUrls: Set<string> = new Set<string>();
const MAX_RETRIES = 3; const MAX_RETRIES = 3;
const RETRY_DELAY = 1000; // 1 second delay between retries const RETRY_DELAY = 1000;
const MAX_UNCHANGED_RESULTS = 5; const MAX_UNCHANGED_RESULTS = 5;
const debugLog = (message: string, ...args: any[]) => { const debugLog = (message: string, ...args: any[]) => {
@@ -831,7 +847,6 @@ export default class Interpreter extends EventEmitter {
}; };
const scrapeCurrentPage = async () => { const scrapeCurrentPage = async () => {
// Check abort flag before scraping current page
if (this.isAborted) { if (this.isAborted) {
debugLog("Workflow aborted, stopping scrapeCurrentPage"); debugLog("Workflow aborted, stopping scrapeCurrentPage");
return; return;
@@ -849,7 +864,6 @@ export default class Interpreter extends EventEmitter {
debugLog(`Page evaluation failed: ${error.message}`); debugLog(`Page evaluation failed: ${error.message}`);
return; return;
} }
const newResults = results.filter(item => { const newResults = results.filter(item => {
const uniqueKey = JSON.stringify(item); const uniqueKey = JSON.stringify(item);
if (scrapedItems.has(uniqueKey)) return false; if (scrapedItems.has(uniqueKey)) return false;
@@ -859,7 +873,11 @@ export default class Interpreter extends EventEmitter {
allResults = allResults.concat(newResults); allResults = allResults.concat(newResults);
debugLog("Results collected:", allResults.length); debugLog("Results collected:", allResults.length);
await this.options.serializableCallback(allResults); this.serializableDataByType[actionType][actionName] = [...allResults];
await this.options.serializableCallback({
scrapeList: this.serializableDataByType.scrapeList,
scrapeSchema: this.serializableDataByType.scrapeSchema
});
}; };
const checkLimit = () => { const checkLimit = () => {

View File

@@ -562,14 +562,39 @@ export class WorkflowInterpreter {
? data?.scrapeSchema ? data?.scrapeSchema
: null; : null;
if (!subtree) return; if (typeKey === "scrapeList" && data.scrapeList) {
data = data.scrapeList;
if (typeKey === "scrapeList") { } else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
actionName = this.getUniqueActionName(typeKey, actionName); data = data.scrapeSchema;
} }
const values = Object.values(subtree); let actionName = "";
const flattened = values.flat(); if (typeKey === "scrapeList" && data && typeof data === "object" && !Array.isArray(data)) {
const keys = Object.keys(data);
if (keys.length === 1) {
actionName = keys[0];
data = data[actionName];
} else if (keys.length > 1) {
actionName = keys[keys.length - 1];
data = data[actionName];
}
}
if (!actionName) {
actionName = this.currentActionName || "";
if (typeKey === "scrapeList" && !actionName) {
actionName = this.getUniqueActionName(typeKey, "");
}
}
const flattened = Array.isArray(data)
? data
: (
data?.List ??
(data && typeof data === "object"
? Object.values(data).flat?.() ?? data
: [])
);
if (!this.serializableDataByType[typeKey]) { if (!this.serializableDataByType[typeKey]) {
this.serializableDataByType[typeKey] = {}; this.serializableDataByType[typeKey] = {};
@@ -586,9 +611,6 @@ export class WorkflowInterpreter {
name: actionName, name: actionName,
data: flattened, data: flattened,
}); });
this.currentActionType = null;
this.currentActionName = null;
} catch (err: any) { } catch (err: any) {
logger.log("error", `serializableCallback failed: ${err.message}`); logger.log("error", `serializableCallback failed: ${err.message}`);
} }