diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 828a4f84..369a08be 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -249,7 +249,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } }, (key) => key // Use the original key in the output - )); + )) || []; } /** diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index ecef02db..a7a5de47 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -16,6 +16,23 @@ import Concurrency from './utils/concurrency'; import Preprocessor from './preprocessor'; import log, { Level } from './utils/logger'; +/** + * Extending the Window interface for custom scraping functions. + */ +declare global { + interface Window { + scrape: (selector: string | null) => Record[]; + scrapeSchema: ( + schema: Record + ) => Record; + scrapeList: (config: { listSelector: string; fields: any; limit?: number; pagination: any }) => Record[]; + scrapeListAuto: (listSelector: string) => { selector: string; innerText: string }[]; + scrollDown: (pages?: number) => void; + scrollUp: (pages?: number) => void; + } +} + + /** * Defines optional intepreter options (passed in constructor) */ @@ -31,7 +48,6 @@ interface InterpreterOptions { }> } - /** * Class for running the Smart Workflows. 
*/ @@ -50,6 +66,8 @@ export default class Interpreter extends EventEmitter { private blocker: PlaywrightBlocker | null = null; + private cumulativeResults: Record[] = []; + constructor(workflow: WorkflowFile, options?: Partial) { super(); this.workflow = workflow.workflow; @@ -57,7 +75,9 @@ export default class Interpreter extends EventEmitter { this.options = { maxRepeats: 5, maxConcurrency: 5, - serializableCallback: (data) => { log(JSON.stringify(data), Level.WARN); }, + serializableCallback: (data) => { + log(JSON.stringify(data), Level.WARN); + }, binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); }, debug: false, debugChannel: {}, @@ -214,11 +234,11 @@ export default class Interpreter extends EventEmitter { // every condition is treated as a single context switch (key as keyof typeof operators) { - case '$and': + case '$and' as keyof typeof operators: return array?.every((x) => this.applicable(x, context)); - case '$or': + case '$or' as keyof typeof operators: return array?.some((x) => this.applicable(x, context)); - case '$not': + case '$not' as keyof typeof operators: return !this.applicable(value, context); // $not should be a unary operator default: throw new Error('Undefined logic operator.'); @@ -233,9 +253,9 @@ export default class Interpreter extends EventEmitter { }; switch (key as keyof typeof meta) { - case '$before': + case '$before' as keyof typeof meta: return !usedActions.find(testRegexString); - case '$after': + case '$after' as keyof typeof meta: return !!usedActions.find(testRegexString); default: throw new Error('Undefined meta operator.'); @@ -308,9 +328,43 @@ export default class Interpreter extends EventEmitter { scrapeSchema: async (schema: Record) => { await this.ensureScriptsLoaded(page); - + const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema); - await this.options.serializableCallback(scrapeResult); + + const newResults = Array.isArray(scrapeResult) ? 
scrapeResult : [scrapeResult]; + newResults.forEach((result) => { + Object.entries(result).forEach(([key, value]) => { + const keyExists = this.cumulativeResults.some( + (item) => key in item && item[key] !== undefined + ); + + if (!keyExists) { + this.cumulativeResults.push({ [key]: value }); + } + }); + }); + + const mergedResult: Record[] = [ + Object.fromEntries( + Object.entries( + this.cumulativeResults.reduce((acc, curr) => { + Object.entries(curr).forEach(([key, value]) => { + // If the key doesn't exist or the current value is not undefined, add/update it + if (value !== undefined) { + acc[key] = value; + } + }); + return acc; + }, {}) + ) + ) + ]; + + // Log cumulative results after each action + console.log("CUMULATIVE results:", this.cumulativeResults); + console.log("MERGED results:", mergedResult); + + await this.options.serializableCallback(mergedResult); }, scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => { @@ -357,7 +411,7 @@ export default class Interpreter extends EventEmitter { }; for (const step of steps) { - this.log(`Launching ${step.action}`, Level.LOG); + this.log(`Launching ${String(step.action)}`, Level.LOG); if (step.action in wawActions) { // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not) @@ -365,7 +419,7 @@ export default class Interpreter extends EventEmitter { await wawActions[step.action as CustomFunctions](...(params ?? 
[])); } else { // Implements the dot notation for the "method name" in the workflow - const levels = step.action.split('.'); + const levels = String(step.action).split('.'); const methodName = levels[levels.length - 1]; let invokee: any = page; @@ -534,9 +588,14 @@ export default class Interpreter extends EventEmitter { if (this.options.debug) { this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN); } - const actionId = workflow.findIndex( - (step) => this.applicable(step.where, pageState, usedActions), - ); + + const actionId = workflow.findIndex((step) => { + const isApplicable = this.applicable(step.where, pageState, usedActions); + console.log(`Where:`, step.where); + console.log(`Page state:`, pageState); + console.log(`Match result: ${isApplicable}`); + return isApplicable; + }); const action = workflow[actionId]; diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 7801a20e..cfef4a30 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -22,6 +22,7 @@ import { getBestSelectorForAction } from "../utils"; import { browserPool } from "../../server"; import { uuid } from "uuidv4"; import { capture } from "../../utils/analytics" +import { encrypt } from "../../utils/auth"; interface PersistedGeneratedData { lastUsedSelector: string; @@ -159,6 +160,55 @@ export class WorkflowGenerator { }) }; + /** + * New function to handle actionable check for scrapeList + * @param page The current Playwright Page object. + * @param config The scrapeList configuration object. + * @returns {Promise} Array of actionable selectors. 
+ */ + private async getSelectorsForScrapeList(page: Page, config: { + listSelector: string; + fields: any; + limit?: number; + pagination: any; + }): Promise { + const { listSelector } = config; + + // Verify if the selectors are present and actionable on the current page + const actionableSelectors: string[] = []; + if (listSelector) { + const isActionable = await page.isVisible(listSelector).catch(() => false); + if (isActionable) { + actionableSelectors.push(listSelector); + logger.log('debug', `List selector ${listSelector} is actionable.`); + } else { + logger.log('warn', `List selector ${listSelector} is not visible on the page.`); + } + } + + return actionableSelectors; + } + + /** + * Checks which field selectors of a scrapeSchema configuration are visible on the current page. + * @param page The current Playwright Page object. + * @param schema The scrapeSchema configuration object. + * @returns {Promise} Resolves to the array of schema selectors that are currently visible. + */ + private async getSelectorsForSchema(page: Page, schema: Record): Promise { + const selectors = Object.values(schema).map((field) => field.selector); + + // Verify if the selectors are present and actionable on the current page + const actionableSelectors: string[] = []; + for (const selector of selectors) { + const isActionable = await page.isVisible(selector).catch(() => false); + if (isActionable) { + actionableSelectors.push(selector); + } + } + return actionableSelectors; + } + /** * Adds a newly generated pair to the workflow and notifies the client about it by * sending the updated workflow through socket. 
@@ -184,55 +234,67 @@ export class WorkflowGenerator { */ private addPairToWorkflowAndNotifyClient = async (pair: WhereWhatPair, page: Page) => { let matched = false; - // validate if a pair with the same where conditions is already present in the workflow + + // Check for scrapeSchema actions and enhance the where condition + if (pair.what[0].action === 'scrapeSchema') { + const schema = pair.what[0]?.args?.[0]; + if (schema) { + const additionalSelectors = await this.getSelectorsForSchema(page, schema); + pair.where.selectors = [...(pair.where.selectors || []), ...additionalSelectors]; + } + } + + if (pair.what[0].action === 'scrapeList') { + const config = pair.what[0]?.args?.[0]; + if (config) { + const actionableSelectors = await this.getSelectorsForScrapeList(page, config); + pair.where.selectors = [...(pair.where.selectors || []), ...actionableSelectors]; + } + } + + // Validate if the pair is already in the workflow if (pair.where.selectors && pair.where.selectors[0]) { const match = selectorAlreadyInWorkflow(pair.where.selectors[0], this.workflowRecord.workflow); if (match) { - // if a match of where conditions is found, the new action is added into the matched rule const matchedIndex = this.workflowRecord.workflow.indexOf(match); if (pair.what[0].action !== 'waitForLoadState' && pair.what[0].action !== 'press') { pair.what.push({ action: 'waitForLoadState', args: ['networkidle'], - }) + }); } this.workflowRecord.workflow[matchedIndex].what = this.workflowRecord.workflow[matchedIndex].what.concat(pair.what); - logger.log('info', `Pushed ${JSON.stringify(this.workflowRecord.workflow[matchedIndex])} to workflow pair`); matched = true; } } - // is the where conditions of the pair are not already in the workflow, we need to validate the where conditions - // for possible overshadowing of different rules and handle cases according to the recording logic + + // Handle cases where the where condition isn't already present if (!matched) { const handled = await 
this.handleOverShadowing(pair, page, this.generatedData.lastIndex || 0); if (!handled) { - //adding waitForLoadState with networkidle, for better success rate of automatically recorded workflows if (pair.what[0].action !== 'waitForLoadState' && pair.what[0].action !== 'press') { pair.what.push({ action: 'waitForLoadState', args: ['networkidle'], - }) + }); } if (this.generatedData.lastIndex === 0) { this.generatedData.lastIndex = null; - // we want to have the most specific selectors at the beginning of the workflow this.workflowRecord.workflow.unshift(pair); } else { this.workflowRecord.workflow.splice(this.generatedData.lastIndex || 0, 0, pair); if (this.generatedData.lastIndex) { - this.generatedData.lastIndex = this.generatedData.lastIndex - 1; + this.generatedData.lastIndex -= 1; } } - logger.log('info', - `${JSON.stringify(pair)}: Added to workflow file on index: ${this.generatedData.lastIndex || 0}`); - } else { - logger.log('debug', - ` ${JSON.stringify(this.workflowRecord.workflow[this.generatedData.lastIndex || 0])} added action to workflow pair`); } } + + // Emit the updated workflow to the client this.socket.emit('workflow', this.workflowRecord); logger.log('info', `Workflow emitted`); }; + /** * Generates a pair for the click event. 
@@ -300,7 +362,7 @@ export class WorkflowGenerator { where, what: [{ action: 'press', - args: [selector, key], + args: [selector, encrypt(key)], }], } if (selector) { @@ -797,7 +859,7 @@ export class WorkflowGenerator { // when more than one press action is present, add a type action pair.what.splice(index - input.actionCounter, input.actionCounter, { action: 'type', - args: [input.selector, input.value], + args: [input.selector, encrypt(input.value)], }, { action: 'waitForLoadState', args: ['networkidle'], diff --git a/server/src/workflow-management/classes/Interpreter.ts b/server/src/workflow-management/classes/Interpreter.ts index fa5e9332..d53259b7 100644 --- a/server/src/workflow-management/classes/Interpreter.ts +++ b/server/src/workflow-management/classes/Interpreter.ts @@ -3,6 +3,38 @@ import logger from "../../logger"; import { Socket } from "socket.io"; import { Page } from "playwright"; import { InterpreterSettings } from "../../types"; +import { decrypt } from "../../utils/auth"; + +/** + * Decrypts any encrypted inputs in the workflow. + * @param workflow The workflow to decrypt. + */ +function decryptWorkflow(workflow: WorkflowFile): WorkflowFile { + const decryptedWorkflow = JSON.parse(JSON.stringify(workflow)) as WorkflowFile; + + decryptedWorkflow.workflow.forEach((pair) => { + pair.what.forEach((action) => { + if ((action.action === 'type' || action.action === 'press') && Array.isArray(action.args) && action.args.length > 1) { + try { + const encryptedValue = action.args[1]; + if (typeof encryptedValue === 'string') { + const decryptedValue = decrypt(encryptedValue); + action.args[1] = decryptedValue; + } else { + logger.log('error', 'Encrypted value is not a string'); + action.args[1] = ''; + } + } catch (error: unknown) { + const errorMessage = error instanceof Error ? 
error.message : String(error); + logger.log('error', `Failed to decrypt input value: ${errorMessage}`); + action.args[1] = ''; + } + } + }); + }); + + return decryptedWorkflow; +} /** * This class implements the main interpretation functions. @@ -123,6 +155,9 @@ export class WorkflowInterpreter { ) => { const params = settings.params ? settings.params : null; delete settings.params; + + const decryptedWorkflow = decryptWorkflow(workflow); + const options = { ...settings, debugChannel: { @@ -143,7 +178,7 @@ export class WorkflowInterpreter { } } - const interpreter = new Interpreter(workflow, options); + const interpreter = new Interpreter(decryptedWorkflow, options); this.interpreter = interpreter; interpreter.on('flag', async (page, resume) => { @@ -212,6 +247,9 @@ export class WorkflowInterpreter { public InterpretRecording = async (workflow: WorkflowFile, page: Page, settings: InterpreterSettings) => { const params = settings.params ? settings.params : null; delete settings.params; + + const decryptedWorkflow = decryptWorkflow(workflow); + const options = { ...settings, debugChannel: { @@ -234,15 +272,19 @@ export class WorkflowInterpreter { } } - const interpreter = new Interpreter(workflow, options); + const interpreter = new Interpreter(decryptedWorkflow, options); this.interpreter = interpreter; const status = await interpreter.run(page, params); + const lastArray = this.serializableData.length > 1 + ? 
[this.serializableData[this.serializableData.length - 1]] + : this.serializableData; + const result = { log: this.debugMessages, result: status, - serializableOutput: this.serializableData.reduce((reducedObject, item, index) => { + serializableOutput: lastArray.reduce((reducedObject, item, index) => { return { [`item-${index}`]: item, ...reducedObject, diff --git a/src/components/organisms/RightSidePanel.tsx b/src/components/organisms/RightSidePanel.tsx index a11989bd..4aaf7b21 100644 --- a/src/components/organisms/RightSidePanel.tsx +++ b/src/components/organisms/RightSidePanel.tsx @@ -54,6 +54,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture const [showCaptureScreenshot, setShowCaptureScreenshot] = useState(true); const [showCaptureText, setShowCaptureText] = useState(true); const [hoverStates, setHoverStates] = useState<{ [id: string]: boolean }>({}); + const [browserStepIdList, setBrowserStepIdList] = useState([]); const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState } = useGlobalInfoStore(); const { getText, startGetText, stopGetText, getScreenshot, startGetScreenshot, stopGetScreenshot, getList, startGetList, stopGetList, startPaginationMode, stopPaginationMode, paginationType, updatePaginationType, limitType, customLimit, updateLimitType, updateCustomLimit, stopLimitMode, startLimitMode, captureStage, setCaptureStage } = useActionContext(); @@ -195,12 +196,18 @@ export const RightSidePanel: React.FC = ({ onFinishCapture const getTextSettingsObject = useCallback(() => { const settings: Record = {}; browserSteps.forEach(step => { + if (browserStepIdList.includes(step.id)) { + return; + } + if (step.type === 'text' && step.label && step.selectorObj?.selector) { settings[step.label] = step.selectorObj; } + setBrowserStepIdList(prevList => [...prevList, step.id]); }); + return settings; - }, [browserSteps]); + }, [browserSteps, browserStepIdList]); const stopCaptureAndEmitGetTextSettings = useCallback(() => { @@ 
-211,6 +218,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture } stopGetText(); const settings = getTextSettingsObject(); + console.log("SETTINGS", settings); const hasTextSteps = browserSteps.some(step => step.type === 'text'); if (hasTextSteps) { socket?.emit('action', { action: 'scrapeSchema', settings });