From c1af321d8b9bf8af2c2f6214fefcc8c2e09f9263 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 3 Dec 2024 17:51:06 +0530 Subject: [PATCH] feat: add scrapeSchema data accumulation logic --- maxun-core/src/interpret.ts | 79 ++++++++++++++++++++++++++++++------- 1 file changed, 65 insertions(+), 14 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index ecef02db..114605e2 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -16,6 +16,23 @@ import Concurrency from './utils/concurrency'; import Preprocessor from './preprocessor'; import log, { Level } from './utils/logger'; +/** + * Extending the Window interface for custom scraping functions. + */ +declare global { + interface Window { + scrape: (selector: string | null) => Record[]; + scrapeSchema: ( + schema: Record + ) => Record; + scrapeList: (config: { listSelector: string; fields: any; limit?: number; pagination: any }) => Record[]; + scrapeListAuto: (listSelector: string) => { selector: string; innerText: string }[]; + scrollDown: (pages?: number) => void; + scrollUp: (pages?: number) => void; + } +} + + /** * Defines optional intepreter options (passed in constructor) */ @@ -31,7 +48,6 @@ interface InterpreterOptions { }> } - /** * Class for running the Smart Workflows. */ @@ -50,6 +66,8 @@ export default class Interpreter extends EventEmitter { private blocker: PlaywrightBlocker | null = null; + private cumulativeResults: Record[] = []; + constructor(workflow: WorkflowFile, options?: Partial) { super(); this.workflow = workflow.workflow; @@ -57,7 +75,9 @@ export default class Interpreter extends EventEmitter { this.options = { maxRepeats: 5, maxConcurrency: 5, - serializableCallback: (data) => { log(JSON.stringify(data), Level.WARN); }, + serializableCallback: (data) => { + log(JSON.stringify(data), Level.WARN); + }, binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); }, debug: false, debugChannel: {}, @@ -214,11 +234,11 @@ export default class Interpreter extends EventEmitter { // every condition is treated as a single context switch (key as keyof typeof operators) { - case '$and': + case '$and' as keyof typeof operators: return array?.every((x) => this.applicable(x, context)); - case '$or': + case '$or' as keyof typeof operators: return array?.some((x) => this.applicable(x, context)); - case '$not': + case '$not' as keyof typeof operators: return !this.applicable(value, context); // $not should be a unary operator default: throw new Error('Undefined logic operator.'); @@ -233,9 +253,9 @@ export default class Interpreter extends EventEmitter { }; switch (key as keyof typeof meta) { - case '$before': + case '$before' as keyof typeof meta: return !usedActions.find(testRegexString); - case '$after': + case '$after' as keyof typeof meta: return !!usedActions.find(testRegexString); default: throw new Error('Undefined meta operator.'); @@ -308,9 +328,35 @@ export default class Interpreter extends EventEmitter { scrapeSchema: async (schema: Record) => { await this.ensureScriptsLoaded(page); - + + // Scrape data using the schema const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema); - await this.options.serializableCallback(scrapeResult); + + // Log result and accumulate it + console.log("Scrape result:", scrapeResult); + this.cumulativeResults.push(...(Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult])); + + const mergedResult: Record[] = [ + Object.fromEntries( + Object.entries( + this.cumulativeResults.reduce((acc, curr) => { + Object.entries(curr).forEach(([key, value]) => { + // If the key doesn't exist or the current value is not undefined, add/update it + if (value !== undefined) { + acc[key] = value; + } + }); + return acc; + }, {}) + ) + ) + ]; + + // Log cumulative results after each action + console.log("CUMULATIVE results:", this.cumulativeResults); + console.log("MERGED results:", mergedResult); + + await this.options.serializableCallback(mergedResult); }, scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => { @@ -357,7 +403,7 @@ export default class Interpreter extends EventEmitter { }; for (const step of steps) { - this.log(`Launching ${step.action}`, Level.LOG); + this.log(`Launching ${String(step.action)}`, Level.LOG); if (step.action in wawActions) { // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not) @@ -365,7 +411,7 @@ export default class Interpreter extends EventEmitter { await wawActions[step.action as CustomFunctions](...(params ?? [])); } else { // Implements the dot notation for the "method name" in the workflow - const levels = step.action.split('.'); + const levels = String(step.action).split('.'); const methodName = levels[levels.length - 1]; let invokee: any = page; @@ -534,9 +580,14 @@ export default class Interpreter extends EventEmitter { if (this.options.debug) { this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN); } - const actionId = workflow.findIndex( - (step) => this.applicable(step.where, pageState, usedActions), - ); + + const actionId = workflow.findIndex((step) => { + const isApplicable = this.applicable(step.where, pageState, usedActions); + console.log(`Where:`, step.where); + console.log(`Page state:`, pageState); + console.log(`Match result: ${isApplicable}`); + return isApplicable; + }); const action = workflow[actionId];