Merge pull request #890 from RohitR311/persist-fix
fix(maxun-core): pagination data persistence for multiple actions
This commit is contained in:
@@ -82,6 +82,8 @@ export default class Interpreter extends EventEmitter {
|
|||||||
scrapeSchema: {}
|
scrapeSchema: {}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
private scrapeListCounter: number = 0;
|
||||||
|
|
||||||
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
|
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
|
||||||
super();
|
super();
|
||||||
this.workflow = workflow.workflow;
|
this.workflow = workflow.workflow;
|
||||||
@@ -484,7 +486,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
await this.options.serializableCallback(scrapeResults);
|
await this.options.serializableCallback(scrapeResults);
|
||||||
},
|
},
|
||||||
|
|
||||||
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; shadow: string}>) => {
|
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; shadow: string}>, actionName: string = "") => {
|
||||||
if (this.isAborted) {
|
if (this.isAborted) {
|
||||||
this.log('Workflow aborted, stopping scrapeSchema', Level.WARN);
|
this.log('Workflow aborted, stopping scrapeSchema', Level.WARN);
|
||||||
return;
|
return;
|
||||||
@@ -540,17 +542,17 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
const actionType = "scrapeSchema";
|
const actionType = "scrapeSchema";
|
||||||
const actionName = (schema as any).__name || "Texts";
|
const name = actionName || "Texts";
|
||||||
|
|
||||||
if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
|
if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
|
||||||
this.namedResults[actionType][actionName] = this.cumulativeResults;
|
this.namedResults[actionType][name] = this.cumulativeResults;
|
||||||
|
|
||||||
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
|
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
|
||||||
if (!this.serializableDataByType[actionType][actionName]) {
|
if (!this.serializableDataByType[actionType][name]) {
|
||||||
this.serializableDataByType[actionType][actionName] = [];
|
this.serializableDataByType[actionType][name] = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
this.serializableDataByType[actionType][actionName] = [...this.cumulativeResults];
|
this.serializableDataByType[actionType][name] = [...this.cumulativeResults];
|
||||||
|
|
||||||
await this.options.serializableCallback({
|
await this.options.serializableCallback({
|
||||||
scrapeList: this.serializableDataByType.scrapeList,
|
scrapeList: this.serializableDataByType.scrapeList,
|
||||||
@@ -558,7 +560,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
});
|
});
|
||||||
},
|
},
|
||||||
|
|
||||||
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
|
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }, actionName: string = "") => {
|
||||||
if (this.isAborted) {
|
if (this.isAborted) {
|
||||||
this.log('Workflow aborted, stopping scrapeList', Level.WARN);
|
this.log('Workflow aborted, stopping scrapeList', Level.WARN);
|
||||||
return;
|
return;
|
||||||
@@ -581,6 +583,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let scrapeResults = [];
|
let scrapeResults = [];
|
||||||
|
let paginationUsed = false;
|
||||||
|
|
||||||
if (!config.pagination) {
|
if (!config.pagination) {
|
||||||
scrapeResults = await page.evaluate((cfg) => {
|
scrapeResults = await page.evaluate((cfg) => {
|
||||||
@@ -592,38 +595,53 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
}, config);
|
}, config);
|
||||||
} else {
|
} else {
|
||||||
scrapeResults = await this.handlePagination(page, config);
|
paginationUsed = true;
|
||||||
|
scrapeResults = await this.handlePagination(page, config, actionName);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!Array.isArray(scrapeResults)) {
|
if (!Array.isArray(scrapeResults)) {
|
||||||
scrapeResults = [];
|
scrapeResults = [];
|
||||||
}
|
}
|
||||||
|
|
||||||
const actionType = "scrapeList";
|
console.log(`ScrapeList completed with ${scrapeResults.length} results`);
|
||||||
const actionName = (config as any).__name || "List";
|
|
||||||
|
|
||||||
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
|
if (!paginationUsed) {
|
||||||
if (!this.serializableDataByType[actionType][actionName]) {
|
const actionType = "scrapeList";
|
||||||
this.serializableDataByType[actionType][actionName] = [];
|
let name = actionName || "";
|
||||||
|
|
||||||
|
if (!name || name.trim() === "") {
|
||||||
|
this.scrapeListCounter++;
|
||||||
|
name = `List ${this.scrapeListCounter}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
|
||||||
|
if (!this.serializableDataByType[actionType][name]) {
|
||||||
|
this.serializableDataByType[actionType][name] = [];
|
||||||
|
}
|
||||||
|
|
||||||
|
this.serializableDataByType[actionType][name].push(...scrapeResults);
|
||||||
|
|
||||||
|
await this.options.serializableCallback({
|
||||||
|
scrapeList: this.serializableDataByType.scrapeList,
|
||||||
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
this.serializableDataByType[actionType][actionName].push(...scrapeResults);
|
|
||||||
|
|
||||||
await this.options.serializableCallback({
|
|
||||||
scrapeList: this.serializableDataByType.scrapeList,
|
|
||||||
scrapeSchema: this.serializableDataByType.scrapeSchema
|
|
||||||
});
|
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error('ScrapeList action failed completely:', error.message);
|
console.error('ScrapeList action failed completely:', error.message);
|
||||||
|
|
||||||
const actionType = "scrapeList";
|
const actionType = "scrapeList";
|
||||||
const actionName = (config as any).__name || "List";
|
let name = actionName || "";
|
||||||
|
|
||||||
|
if (!name || name.trim() === "") {
|
||||||
|
this.scrapeListCounter++;
|
||||||
|
name = `List ${this.scrapeListCounter}`;
|
||||||
|
}
|
||||||
|
|
||||||
if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
|
if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
|
||||||
this.namedResults[actionType][actionName] = [];
|
this.namedResults[actionType][name] = [];
|
||||||
|
|
||||||
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
|
if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
|
||||||
this.serializableDataByType[actionType][actionName] = [];
|
this.serializableDataByType[actionType][name] = [];
|
||||||
|
|
||||||
await this.options.serializableCallback({
|
await this.options.serializableCallback({
|
||||||
scrapeList: this.serializableDataByType.scrapeList,
|
scrapeList: this.serializableDataByType.scrapeList,
|
||||||
@@ -718,26 +736,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
debug.setActionType(String(step.action));
|
debug.setActionType(String(step.action));
|
||||||
}
|
}
|
||||||
|
|
||||||
if ((step as any)?.name) {
|
stepName = (step as any)?.name || String(step.action);
|
||||||
stepName = (step as any).name;
|
|
||||||
} else if (
|
|
||||||
Array.isArray((step as any)?.args) &&
|
|
||||||
(step as any).args.length > 0 &&
|
|
||||||
typeof (step as any).args[0] === "object" &&
|
|
||||||
"__name" in (step as any).args[0]
|
|
||||||
) {
|
|
||||||
stepName = (step as any).args[0].__name;
|
|
||||||
} else if (
|
|
||||||
typeof (step as any)?.args === "object" &&
|
|
||||||
step?.args !== null &&
|
|
||||||
"__name" in (step as any).args
|
|
||||||
) {
|
|
||||||
stepName = (step as any).args.__name;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!stepName) {
|
|
||||||
stepName = String(step.action);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (debug && typeof (debug as any).setActionName === "function") {
|
if (debug && typeof (debug as any).setActionName === "function") {
|
||||||
(debug as any).setActionName(stepName);
|
(debug as any).setActionName(stepName);
|
||||||
@@ -751,6 +750,9 @@ export default class Interpreter extends EventEmitter {
|
|||||||
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
|
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
|
||||||
if (step.action === 'screenshot') {
|
if (step.action === 'screenshot') {
|
||||||
await (wawActions.screenshot as any)(...(params ?? []), stepName ?? undefined);
|
await (wawActions.screenshot as any)(...(params ?? []), stepName ?? undefined);
|
||||||
|
} else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
|
||||||
|
const actionName = (step as any).name || "";
|
||||||
|
await wawActions[step.action as CustomFunctions](...(params ?? []), actionName);
|
||||||
} else {
|
} else {
|
||||||
await wawActions[step.action as CustomFunctions](...(params ?? []));
|
await wawActions[step.action as CustomFunctions](...(params ?? []));
|
||||||
}
|
}
|
||||||
@@ -812,18 +814,32 @@ export default class Interpreter extends EventEmitter {
|
|||||||
fields: any,
|
fields: any,
|
||||||
limit?: number,
|
limit?: number,
|
||||||
pagination: any
|
pagination: any
|
||||||
}) {
|
}, providedActionName: string = "") {
|
||||||
if (this.isAborted) {
|
if (this.isAborted) {
|
||||||
this.log('Workflow aborted, stopping pagination', Level.WARN);
|
this.log('Workflow aborted, stopping pagination', Level.WARN);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const actionType = "scrapeList";
|
||||||
|
let actionName = providedActionName || "";
|
||||||
|
if (!actionName || actionName.trim() === "") {
|
||||||
|
this.scrapeListCounter++;
|
||||||
|
actionName = `List ${this.scrapeListCounter}`;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!this.serializableDataByType[actionType]) {
|
||||||
|
this.serializableDataByType[actionType] = {};
|
||||||
|
}
|
||||||
|
if (!this.serializableDataByType[actionType][actionName]) {
|
||||||
|
this.serializableDataByType[actionType][actionName] = [];
|
||||||
|
}
|
||||||
|
|
||||||
let allResults: Record<string, any>[] = [];
|
let allResults: Record<string, any>[] = [];
|
||||||
let previousHeight = 0;
|
let previousHeight = 0;
|
||||||
let scrapedItems: Set<string> = new Set<string>();
|
let scrapedItems: Set<string> = new Set<string>();
|
||||||
let visitedUrls: Set<string> = new Set<string>();
|
let visitedUrls: Set<string> = new Set<string>();
|
||||||
const MAX_RETRIES = 3;
|
const MAX_RETRIES = 3;
|
||||||
const RETRY_DELAY = 1000; // 1 second delay between retries
|
const RETRY_DELAY = 1000;
|
||||||
const MAX_UNCHANGED_RESULTS = 5;
|
const MAX_UNCHANGED_RESULTS = 5;
|
||||||
|
|
||||||
const debugLog = (message: string, ...args: any[]) => {
|
const debugLog = (message: string, ...args: any[]) => {
|
||||||
@@ -831,7 +847,6 @@ export default class Interpreter extends EventEmitter {
|
|||||||
};
|
};
|
||||||
|
|
||||||
const scrapeCurrentPage = async () => {
|
const scrapeCurrentPage = async () => {
|
||||||
// Check abort flag before scraping current page
|
|
||||||
if (this.isAborted) {
|
if (this.isAborted) {
|
||||||
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
debugLog("Workflow aborted, stopping scrapeCurrentPage");
|
||||||
return;
|
return;
|
||||||
@@ -849,7 +864,6 @@ export default class Interpreter extends EventEmitter {
|
|||||||
debugLog(`Page evaluation failed: ${error.message}`);
|
debugLog(`Page evaluation failed: ${error.message}`);
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
|
||||||
const newResults = results.filter(item => {
|
const newResults = results.filter(item => {
|
||||||
const uniqueKey = JSON.stringify(item);
|
const uniqueKey = JSON.stringify(item);
|
||||||
if (scrapedItems.has(uniqueKey)) return false;
|
if (scrapedItems.has(uniqueKey)) return false;
|
||||||
@@ -859,7 +873,11 @@ export default class Interpreter extends EventEmitter {
|
|||||||
allResults = allResults.concat(newResults);
|
allResults = allResults.concat(newResults);
|
||||||
debugLog("Results collected:", allResults.length);
|
debugLog("Results collected:", allResults.length);
|
||||||
|
|
||||||
await this.options.serializableCallback(allResults);
|
this.serializableDataByType[actionType][actionName] = [...allResults];
|
||||||
|
await this.options.serializableCallback({
|
||||||
|
scrapeList: this.serializableDataByType.scrapeList,
|
||||||
|
scrapeSchema: this.serializableDataByType.scrapeSchema
|
||||||
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
const checkLimit = () => {
|
const checkLimit = () => {
|
||||||
|
|||||||
@@ -562,14 +562,39 @@ export class WorkflowInterpreter {
|
|||||||
? data?.scrapeSchema
|
? data?.scrapeSchema
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
if (!subtree) return;
|
if (typeKey === "scrapeList" && data.scrapeList) {
|
||||||
|
data = data.scrapeList;
|
||||||
if (typeKey === "scrapeList") {
|
} else if (typeKey === "scrapeSchema" && data.scrapeSchema) {
|
||||||
actionName = this.getUniqueActionName(typeKey, actionName);
|
data = data.scrapeSchema;
|
||||||
}
|
}
|
||||||
|
|
||||||
const values = Object.values(subtree);
|
let actionName = "";
|
||||||
const flattened = values.flat();
|
if (typeKey === "scrapeList" && data && typeof data === "object" && !Array.isArray(data)) {
|
||||||
|
const keys = Object.keys(data);
|
||||||
|
if (keys.length === 1) {
|
||||||
|
actionName = keys[0];
|
||||||
|
data = data[actionName];
|
||||||
|
} else if (keys.length > 1) {
|
||||||
|
actionName = keys[keys.length - 1];
|
||||||
|
data = data[actionName];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (!actionName) {
|
||||||
|
actionName = this.currentActionName || "";
|
||||||
|
if (typeKey === "scrapeList" && !actionName) {
|
||||||
|
actionName = this.getUniqueActionName(typeKey, "");
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const flattened = Array.isArray(data)
|
||||||
|
? data
|
||||||
|
: (
|
||||||
|
data?.List ??
|
||||||
|
(data && typeof data === "object"
|
||||||
|
? Object.values(data).flat?.() ?? data
|
||||||
|
: [])
|
||||||
|
);
|
||||||
|
|
||||||
if (!this.serializableDataByType[typeKey]) {
|
if (!this.serializableDataByType[typeKey]) {
|
||||||
this.serializableDataByType[typeKey] = {};
|
this.serializableDataByType[typeKey] = {};
|
||||||
@@ -586,9 +611,6 @@ export class WorkflowInterpreter {
|
|||||||
name: actionName,
|
name: actionName,
|
||||||
data: flattened,
|
data: flattened,
|
||||||
});
|
});
|
||||||
|
|
||||||
this.currentActionType = null;
|
|
||||||
this.currentActionName = null;
|
|
||||||
} catch (err: any) {
|
} catch (err: any) {
|
||||||
logger.log("error", `serializableCallback failed: ${err.message}`);
|
logger.log("error", `serializableCallback failed: ${err.message}`);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user