From ef571c4ea09ebc15b12bdbdc7398bef7c5d69d68 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 7 Dec 2024 23:28:31 +0530 Subject: [PATCH 01/10] feat: traverse dom tree for parent element selection --- server/src/workflow-management/selector.ts | 144 +++++++++++---------- 1 file changed, 78 insertions(+), 66 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 193de891..e0cd10c5 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -20,49 +20,6 @@ type Workflow = WorkflowFile["workflow"]; * @category WorkflowManagement-Selectors * @returns {Promise} */ -export const getRect = async (page: Page, coordinates: Coordinates) => { - try { - const rect = await page.evaluate( - async ({ x, y }) => { - const el = document.elementFromPoint(x, y) as HTMLElement; - if (el) { - const { parentElement } = el; - // Match the logic in recorder.ts for link clicks - const element = parentElement?.tagName === 'A' ? parentElement : el; - const rectangle = element?.getBoundingClientRect(); - // @ts-ignore - if (rectangle) { - return { - x: rectangle.x, - y: rectangle.y, - width: rectangle.width, - height: rectangle.height, - top: rectangle.top, - right: rectangle.right, - bottom: rectangle.bottom, - left: rectangle.left, - }; - } - } - }, - { x: coordinates.x, y: coordinates.y }, - ); - return rect; - } catch (error) { - const { message, stack } = error as Error; - logger.log('error', `Error while retrieving selector: ${message}`); - logger.log('error', `Stack: ${stack}`); - } -} - -/** - * Checks the basic info about an element and returns a {@link BaseActionInfo} object. - * If the element is not found, returns undefined. - * @param page The page instance. - * @param coordinates Coordinates of an element. - * @category WorkflowManagement-Selectors - * @returns {Promise} - */ export const getElementInformation = async ( page: Page, coordinates: Coordinates @@ -70,10 +27,15 @@ export const getElementInformation = async ( try { const elementInfo = await page.evaluate( async ({ x, y }) => { - const el = document.elementFromPoint(x, y) as HTMLElement; - if (el) { - const { parentElement } = el; - const element = parentElement?.tagName === 'A' ? parentElement : el; + // Find the initial element at the point + const initialElement = document.elementFromPoint(x, y) as HTMLElement; + + if (initialElement) { + // Simply use the direct parent, no complex logic + const parentElement = initialElement.parentElement; + + // Use the parent if it exists, otherwise use the initial element + const element = parentElement || initialElement; let info: { tagName: string; @@ -84,32 +46,41 @@ export const getElementInformation = async ( attributes?: Record; innerHTML?: string; outerHTML?: string; + parentTagName?: string; + parentClasses?: string[]; } = { - tagName: element?.tagName ?? '', + tagName: element.tagName, + parentTagName: element.parentElement?.tagName, + parentClasses: element.parentElement + ? Array.from(element.parentElement.classList) + : [] }; - if (element) { - info.attributes = Array.from(element.attributes).reduce( - (acc, attr) => { - acc[attr.name] = attr.value; - return acc; - }, - {} as Record - ); - } + // Collect attributes + info.attributes = Array.from(element.attributes).reduce( + (acc, attr) => { + acc[attr.name] = attr.value; + return acc; + }, + {} as Record + ); - // Gather specific information based on the tag - if (element?.tagName === 'A') { - info.url = (element as HTMLAnchorElement).href; - info.innerText = element.innerText ?? ''; - } else if (element?.tagName === 'IMG') { - info.imageUrl = (element as HTMLImageElement).src; + // Specific handling for different element types + if (element.tagName === 'A') { + const anchorElement = element as HTMLAnchorElement; + info.url = anchorElement.href; + info.innerText = anchorElement.innerText ?? ''; + } else if (element.tagName === 'IMG') { + const imgElement = element as HTMLImageElement; + info.imageUrl = imgElement.src; } else { - info.hasOnlyText = element?.children?.length === 0 && - element?.innerText?.length > 0; - info.innerText = element?.innerText ?? ''; + // Check if element contains only text + info.hasOnlyText = element.children.length === 0 && + (element.innerText?.length ?? 0) > 0; + info.innerText = element.innerText ?? ''; } + // HTML content info.innerHTML = element.innerHTML; info.outerHTML = element.outerHTML; @@ -127,6 +98,47 @@ export const getElementInformation = async ( } }; +export const getRect = async (page: Page, coordinates: Coordinates) => { + try { + const rect = await page.evaluate( + async ({ x, y }) => { + // Find the initial element at the point + const initialElement = document.elementFromPoint(x, y) as HTMLElement; + + if (initialElement) { + // Simply use the direct parent, no complex logic + const parentElement = initialElement.parentElement; + + // Use the parent if it exists, otherwise use the initial element + const element = parentElement || initialElement; + + // Get bounding rectangle + const rectangle = element?.getBoundingClientRect(); + + if (rectangle) { + return { + x: rectangle.x, + y: rectangle.y, + width: rectangle.width, + height: rectangle.height, + top: rectangle.top, + right: rectangle.right, + bottom: rectangle.bottom, + left: rectangle.left, + }; + } + } + return null; + }, + { x: coordinates.x, y: coordinates.y }, + ); + return rect; + } catch (error) { + const { message, stack } = error as Error; + console.error('Error while retrieving selector:', message); + console.error('Stack:', stack); + } +}; /** * Returns the best and unique css {@link Selectors} for the element on the page. From 9f24e0018c29efa30a7a49a6cea45ea7bc7a004a Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 7 Dec 2024 23:36:01 +0530 Subject: [PATCH 02/10] feat: !return null --- server/src/workflow-management/selector.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index e0cd10c5..aaa53a5d 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -128,7 +128,6 @@ export const getRect = async (page: Page, coordinates: Coordinates) => { }; } } - return null; }, { x: coordinates.x, y: coordinates.y }, ); From 8c4c0b734d863bf1bc78f00e72fd709a89c626bc Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 7 Dec 2024 23:45:07 +0530 Subject: [PATCH 03/10] feat: handle selector generation if no parent element --- server/src/workflow-management/selector.ts | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index aaa53a5d..7908edc2 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -753,7 +753,6 @@ interface SelectorResult { export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise => { try { const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { - function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); @@ -775,18 +774,25 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let depth = 0; const maxDepth = 2; - while (element && element !== document.body && depth < maxDepth) { - const selector = getNonUniqueSelector(element); + // Ensure we start with a valid element + let currentElement = element; + while (currentElement && currentElement !== document.body && depth < maxDepth) { + const selector = getNonUniqueSelector(currentElement); path.unshift(selector); - element = element.parentElement; + currentElement = currentElement.parentElement; depth++; } return path.join(' > '); } - const element = document.elementFromPoint(x, y) as HTMLElement | null; - if (!element) return null; + // Find the initial element at the point + const initialElement = document.elementFromPoint(x, y) as HTMLElement; + + if (!initialElement) return null; + + // Prefer parent if exists, otherwise use initial element + const element = initialElement.parentElement || initialElement; const generalSelector = getSelectorPath(element); return { From d443503d094071312bad02455f482047cd364d86 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 9 Dec 2024 05:43:51 +0530 Subject: [PATCH 04/10] feat: proper rect and element info --- server/src/workflow-management/selector.ts | 166 +++++++++++++-------- 1 file changed, 103 insertions(+), 63 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 7908edc2..9587e898 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -27,15 +27,33 @@ export const getElementInformation = async ( try { const elementInfo = await page.evaluate( async ({ x, y }) => { - // Find the initial element at the point - const initialElement = document.elementFromPoint(x, y) as HTMLElement; - - if (initialElement) { - // Simply use the direct parent, no complex logic - const parentElement = initialElement.parentElement; + const originalEl = document.elementFromPoint(x, y) as HTMLElement; + if (originalEl) { + let element = originalEl; + + // Generic parent finding logic based on visual containment + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - // Use the parent if it exists, otherwise use the initial element - const element = parentElement || initialElement; + // Check if parent visually contains the child + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + // Additional checks for more comprehensive containment + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } + } let info: { tagName: string; @@ -46,44 +64,34 @@ export const getElementInformation = async ( attributes?: Record; innerHTML?: string; outerHTML?: string; - parentTagName?: string; - parentClasses?: string[]; } = { - tagName: element.tagName, - parentTagName: element.parentElement?.tagName, - parentClasses: element.parentElement - ? Array.from(element.parentElement.classList) - : [] + tagName: element?.tagName ?? '', }; - // Collect attributes - info.attributes = Array.from(element.attributes).reduce( - (acc, attr) => { - acc[attr.name] = attr.value; - return acc; - }, - {} as Record - ); - - // Specific handling for different element types - if (element.tagName === 'A') { - const anchorElement = element as HTMLAnchorElement; - info.url = anchorElement.href; - info.innerText = anchorElement.innerText ?? ''; - } else if (element.tagName === 'IMG') { - const imgElement = element as HTMLImageElement; - info.imageUrl = imgElement.src; - } else { - // Check if element contains only text - info.hasOnlyText = element.children.length === 0 && - (element.innerText?.length ?? 0) > 0; - info.innerText = element.innerText ?? ''; + if (element) { + info.attributes = Array.from(element.attributes).reduce( + (acc, attr) => { + acc[attr.name] = attr.value; + return acc; + }, + {} as Record + ); + } + + // Existing tag-specific logic + if (element?.tagName === 'A') { + info.url = (element as HTMLAnchorElement).href; + info.innerText = element.innerText ?? ''; + } else if (element?.tagName === 'IMG') { + info.imageUrl = (element as HTMLImageElement).src; + } else { + info.hasOnlyText = element?.children?.length === 0 && + element?.innerText?.length > 0; + info.innerText = element?.innerText ?? ''; } - // HTML content info.innerHTML = element.innerHTML; info.outerHTML = element.outerHTML; - return info; } return null; @@ -102,17 +110,32 @@ export const getRect = async (page: Page, coordinates: Coordinates) => { try { const rect = await page.evaluate( async ({ x, y }) => { - // Find the initial element at the point - const initialElement = document.elementFromPoint(x, y) as HTMLElement; - - if (initialElement) { - // Simply use the direct parent, no complex logic - const parentElement = initialElement.parentElement; + const originalEl = document.elementFromPoint(x, y) as HTMLElement; + if (originalEl) { + let element = originalEl; + + // Same parent-finding logic as in getElementInformation + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - // Use the parent if it exists, otherwise use the initial element - const element = parentElement || initialElement; + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } + } - // Get bounding rectangle const rectangle = element?.getBoundingClientRect(); if (rectangle) { @@ -134,10 +157,11 @@ export const getRect = async (page: Page, coordinates: Coordinates) => { return rect; } catch (error) { const { message, stack } = error as Error; - console.error('Error while retrieving selector:', message); - console.error('Stack:', stack); + logger.log('error', `Error while retrieving selector: ${message}`); + logger.log('error', `Stack: ${stack}`); } -}; +} + /** * Returns the best and unique css {@link Selectors} for the element on the page. @@ -774,25 +798,42 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let depth = 0; const maxDepth = 2; - // Ensure we start with a valid element - let currentElement = element; - while (currentElement && currentElement !== document.body && depth < maxDepth) { - const selector = getNonUniqueSelector(currentElement); + while (element && element !== document.body && depth < maxDepth) { + const selector = getNonUniqueSelector(element); path.unshift(selector); - currentElement = currentElement.parentElement; + element = element.parentElement; depth++; } return path.join(' > '); } - // Find the initial element at the point - const initialElement = document.elementFromPoint(x, y) as HTMLElement; - - if (!initialElement) return null; + const originalEl = document.elementFromPoint(x, y) as HTMLElement; + if (!originalEl) return null; - // Prefer parent if exists, otherwise use initial element - const element = initialElement.parentElement || initialElement; + let element = originalEl; + + // Find the most appropriate parent element + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); + + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } + } const generalSelector = getSelectorPath(element); return { @@ -807,7 +848,6 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates } }; - export const getChildSelectors = async (page: Page, parentSelector: string): Promise => { try { const childSelectors = await page.evaluate((parentSelector: string) => { From 560f0ea24f8f214e48867791b64c4b3630cb94b9 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 9 Dec 2024 14:41:59 +0530 Subject: [PATCH 05/10] fix: a tags --- server/src/workflow-management/selector.ts | 16 +++++++++++++--- 1 file changed, 13 insertions(+), 3 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 9587e898..6a80e7a3 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -31,6 +31,11 @@ export const getElementInformation = async ( if (originalEl) { let element = originalEl; + if (originalEl.tagName === 'A') { + element = originalEl; + } else if (originalEl.parentElement?.tagName === 'A') { + element = originalEl.parentElement; + } else { // Generic parent finding logic based on visual containment while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); @@ -53,7 +58,7 @@ export const getElementInformation = async ( } else { break; } - } + } } let info: { tagName: string; @@ -114,7 +119,11 @@ export const getRect = async (page: Page, coordinates: Coordinates) => { if (originalEl) { let element = originalEl; - // Same parent-finding logic as in getElementInformation + if (originalEl.tagName === 'A') { + element = originalEl; + } else if (originalEl.parentElement?.tagName === 'A') { + element = originalEl.parentElement; + } else { while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -134,8 +143,9 @@ export const getRect = async (page: Page, coordinates: Coordinates) => { } else { break; } - } + }} + //element = element?.parentElement?.tagName === 'A' ? element?.parentElement : element; const rectangle = element?.getBoundingClientRect(); if (rectangle) { From 4da462f48bc037ee2eed4db48428b58293c44f1b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 9 Dec 2024 15:04:45 +0530 Subject: [PATCH 06/10] fix: a tags --- server/src/workflow-management/selector.ts | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 6a80e7a3..f3925ec9 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -823,7 +823,11 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let element = originalEl; - // Find the most appropriate parent element + if (originalEl.tagName === 'A') { + element = originalEl; + } else if (originalEl.parentElement?.tagName === 'A') { + element = originalEl.parentElement; + } else { while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -844,6 +848,7 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates break; } } + } const generalSelector = getSelectorPath(element); return { From 3dfe9117b0061d0a265f938f16a551f41ef03dae Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 9 Dec 2024 17:57:28 +0530 Subject: [PATCH 07/10] feat: inject cookie remover script --- server/src/browser-management/classes/RemoteBrowser.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/browser-management/classes/RemoteBrowser.ts b/server/src/browser-management/classes/RemoteBrowser.ts index 769787da..29beaa9b 100644 --- a/server/src/browser-management/classes/RemoteBrowser.ts +++ b/server/src/browser-management/classes/RemoteBrowser.ts @@ -15,6 +15,7 @@ import { InterpreterSettings, RemoteBrowserOptions } from "../../types"; import { WorkflowGenerator } from "../../workflow-management/classes/Generator"; import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter"; import { getDecryptedProxyConfig } from '../../routes/proxy'; +import { getInjectableScript } from 'idcac-playwright'; chromium.use(stealthPlugin()); @@ -168,6 +169,7 @@ export class RemoteBrowser { this.currentPage.on('framenavigated', (frame) => { if (frame === this.currentPage?.mainFrame()) { + this.currentPage.evaluate(getInjectableScript()) this.socket.emit('urlChanged', this.currentPage.url()); } }); From 549a0d35fc4c7f8f7d02b25ab632f6e4f5940b0c Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 9 Dec 2024 18:40:00 +0530 Subject: [PATCH 08/10] chore(deps): install idcac-playwright --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index 977daada..a7d634f3 100644 --- a/package.json +++ b/package.json @@ -36,6 +36,7 @@ "fortawesome": "^0.0.1-security", "google-auth-library": "^9.14.1", "googleapis": "^144.0.0", + "idcac-playwright": "^0.1.3", "ioredis": "^5.4.1", "joi": "^17.6.0", "jsonwebtoken": "^9.0.2", From 117dddc2ff8337320a162af350632ee69633221d Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 9 Dec 2024 18:49:23 +0530 Subject: [PATCH 09/10] feat: inject cookie remover script --- server/src/browser-management/classes/RemoteBrowser.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/server/src/browser-management/classes/RemoteBrowser.ts b/server/src/browser-management/classes/RemoteBrowser.ts index 29beaa9b..0081cb5c 100644 --- a/server/src/browser-management/classes/RemoteBrowser.ts +++ b/server/src/browser-management/classes/RemoteBrowser.ts @@ -374,6 +374,7 @@ export class RemoteBrowser { this.currentPage.on('framenavigated', (frame) => { if (frame === this.currentPage?.mainFrame()) { + this.currentPage.evaluate(getInjectableScript()); this.socket.emit('urlChanged', this.currentPage.url()); } }); @@ -406,6 +407,7 @@ export class RemoteBrowser { if (this.currentPage) { this.currentPage.on('framenavigated', (frame) => { if (frame === this.currentPage?.mainFrame()) { + this.currentPage.evaluate(getInjectableScript()); this.socket.emit('urlChanged', this.currentPage.url()); } }); From 386e7c9a98d3c519e6ad0d628318aaa355ce2134 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 9 Dec 2024 20:13:46 +0530 Subject: [PATCH 10/10] feat: add programmatic click event for clickNext --- maxun-core/src/interpret.ts | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index a7a5de47..f0b10936 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -475,6 +475,8 @@ export default class Interpreter extends EventEmitter { case 'clickNext': const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); + // console.log("Page results:", pageResults); + // Filter out already scraped items const newResults = pageResults.filter(item => { const uniqueKey = JSON.stringify(item); @@ -482,9 +484,9 @@ export default class Interpreter extends EventEmitter { scrapedItems.add(uniqueKey); // Mark as scraped return true; }); - + allResults = allResults.concat(newResults); - + if (config.limit && allResults.length >= config.limit) { return allResults.slice(0, config.limit); } @@ -494,7 +496,7 @@ export default class Interpreter extends EventEmitter { return allResults; // No more pages to scrape } await Promise.all([ - nextButton.click(), + nextButton.dispatchEvent('click'), page.waitForNavigation({ waitUntil: 'networkidle' }) ]);