From a131ce6c04d1d9f5b1982493f4200b972e989555 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 24 Dec 2024 23:44:20 +0530 Subject: [PATCH 01/23] feat: shadow dom selection --- server/src/workflow-management/selector.ts | 105 +++++++++++++++++---- 1 file changed, 89 insertions(+), 16 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index c0fa21f1..5a7273df 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -23,7 +23,28 @@ export const getElementInformation = async ( if (!getList || listSelector !== '') { const elementInfo = await page.evaluate( async ({ x, y }) => { - const el = document.elementFromPoint(x, y) as HTMLElement; + // Helper function to get element from point including shadow DOM + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // Traverse through shadow roots + let current = element; + while (current) { + // Check if element has shadow root + const shadowRoot = current.shadowRoot; + if (!shadowRoot) break; + + // Try to find deeper element in shadow DOM + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement || shadowElement === current) break; + + current = shadowElement; + } + return current; + }; + + const el = getDeepestElementFromPoint(x, y); if (el) { const { parentElement } = el; const element = parentElement?.tagName === 'A' ? parentElement : el; @@ -36,9 +57,12 @@ export const getElementInformation = async ( attributes?: Record; innerHTML?: string; outerHTML?: string; + isShadowRoot?: boolean; } = { tagName: element?.tagName ?? '', + isShadowRoot: !!element?.shadowRoot }; + if (element) { info.attributes = Array.from(element.attributes).reduce( (acc, attr) => { @@ -48,6 +72,7 @@ export const getElementInformation = async ( {} as Record ); } + // Gather specific information based on the tag if (element?.tagName === 'A') { info.url = (element as HTMLAnchorElement).href; @@ -61,7 +86,7 @@ export const getElementInformation = async ( ...info.attributes, selectedValue: selectElement.value, }; - } else if (element?.tagName === 'INPUT' && (element as HTMLInputElement).type === 'time' || (element as HTMLInputElement).type === 'date') { + } else if (element?.tagName === 'INPUT' && ((element as HTMLInputElement).type === 'time' || (element as HTMLInputElement).type === 'date')) { info.innerText = (element as HTMLInputElement).value; } else { info.hasOnlyText = element?.children?.length === 0 && @@ -80,7 +105,26 @@ export const getElementInformation = async ( } else { const elementInfo = await page.evaluate( async ({ x, y }) => { - const originalEl = document.elementFromPoint(x, y) as HTMLElement; + // Helper function to get element from point including shadow DOM + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // Traverse through shadow roots + let current = element; + while (current) { + const shadowRoot = current.shadowRoot; + if (!shadowRoot) break; + + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement || shadowElement === current) break; + + current = shadowElement; + } + return current; + }; + + const originalEl = getDeepestElementFromPoint(x, y); if (originalEl) { let element = originalEl; @@ -114,8 +158,10 @@ export const getElementInformation = async ( attributes?: Record; innerHTML?: string; outerHTML?: string; + isShadowRoot?: boolean; } = { tagName: element?.tagName ?? '', + isShadowRoot: !!element?.shadowRoot }; if (element) { @@ -156,24 +202,33 @@ export const getElementInformation = async ( } }; -/** - * Returns a {@link Rectangle} object representing - * the coordinates, width, height and corner points of the element. - * If an element is not found, returns null. - * @param page The page instance. - * @param coordinates Coordinates of an element. - * @category WorkflowManagement-Selectors - * @returns {Promise} - */ export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => { try { if (!getList || listSelector !== '') { const rect = await page.evaluate( async ({ x, y }) => { - const el = document.elementFromPoint(x, y) as HTMLElement; + // Helper function to get element from point including shadow DOM + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // Traverse through shadow roots + let current = element; + while (current) { + const shadowRoot = current.shadowRoot; + if (!shadowRoot) break; + + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement || shadowElement === current) break; + + current = shadowElement; + } + return current; + }; + + const el = getDeepestElementFromPoint(x, y); if (el) { const { parentElement } = el; - // Match the logic in recorder.ts for link clicks const element = parentElement?.tagName === 'A' ? parentElement : el; const rectangle = element?.getBoundingClientRect(); if (rectangle) { @@ -196,7 +251,26 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector } else { const rect = await page.evaluate( async ({ x, y }) => { - const originalEl = document.elementFromPoint(x, y) as HTMLElement; + // Helper function to get element from point including shadow DOM + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // Traverse through shadow roots + let current = element; + while (current) { + const shadowRoot = current.shadowRoot; + if (!shadowRoot) break; + + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement || shadowElement === current) break; + + current = shadowElement; + } + return current; + }; + + const originalEl = getDeepestElementFromPoint(x, y); if (originalEl) { let element = originalEl; @@ -249,7 +323,6 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector } }; - /** * Returns the best and unique css {@link Selectors} for the element on the page. * Internally uses a finder function from https://github.com/antonmedv/finder/blob/master/finder.ts From a09b03e4a75627d2adc8189f8fdd361b36b8a82b Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sun, 29 Dec 2024 23:36:06 +0530 Subject: [PATCH 02/23] feat: get deepest shadowDOM element selector --- server/src/workflow-management/selector.ts | 243 +++++++++++++++------ 1 file changed, 179 insertions(+), 64 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 5a7273df..9b3af66e 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -23,31 +23,41 @@ export const getElementInformation = async ( if (!getList || listSelector !== '') { const elementInfo = await page.evaluate( async ({ x, y }) => { - // Helper function to get element from point including shadow DOM + // Enhanced helper function to get element from point including shadow DOM const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { let element = document.elementFromPoint(x, y) as HTMLElement; if (!element) return null; // Traverse through shadow roots let current = element; - while (current) { - // Check if element has shadow root - const shadowRoot = current.shadowRoot; - if (!shadowRoot) break; - - // Try to find deeper element in shadow DOM + let shadowRoot = current.shadowRoot; + + // Keep track of the deepest shadow DOM element found + let deepestElement = current; + + while (shadowRoot) { + // Try to find element at same point in shadow DOM const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; if (!shadowElement || shadowElement === current) break; - + + // Update our tracking of the deepest element + deepestElement = shadowElement; current = shadowElement; + shadowRoot = current.shadowRoot; } - return current; + + return deepestElement; }; const el = getDeepestElementFromPoint(x, y); if (el) { const { parentElement } = el; const element = parentElement?.tagName === 'A' ? parentElement : el; + + // Get the containing shadow root if any + const containingShadowRoot = element.getRootNode() as ShadowRoot; + const isShadowRoot = containingShadowRoot instanceof ShadowRoot; + let info: { tagName: string; hasOnlyText?: boolean; @@ -58,11 +68,20 @@ export const getElementInformation = async ( innerHTML?: string; outerHTML?: string; isShadowRoot?: boolean; + shadowRootMode?: string; + shadowRootContent?: string; } = { tagName: element?.tagName ?? '', - isShadowRoot: !!element?.shadowRoot + isShadowRoot: isShadowRoot }; + + if (isShadowRoot) { + // Include shadow root specific information + info.shadowRootMode = containingShadowRoot.mode; + info.shadowRootContent = containingShadowRoot.innerHTML; + } + // Get attributes including those from shadow DOM context if (element) { info.attributes = Array.from(element.attributes).reduce( (acc, attr) => { @@ -71,84 +90,82 @@ export const getElementInformation = async ( }, {} as Record ); + + // Get text content considering shadow DOM context + info.innerText = element.textContent ?? ''; + info.innerHTML = element.innerHTML; + info.outerHTML = element.outerHTML; + info.hasOnlyText = element.children.length === 0 && + (element.textContent !== null && + element.textContent.trim().length > 0); } - // Gather specific information based on the tag - if (element?.tagName === 'A') { - info.url = (element as HTMLAnchorElement).href; - info.innerText = element.innerText ?? ''; - } else if (element?.tagName === 'IMG') { - info.imageUrl = (element as HTMLImageElement).src; - } else if (element?.tagName === 'SELECT') { - const selectElement = element as HTMLSelectElement; - info.innerText = selectElement.options[selectElement.selectedIndex]?.text ?? ''; - info.attributes = { - ...info.attributes, - selectedValue: selectElement.value, - }; - } else if (element?.tagName === 'INPUT' && ((element as HTMLInputElement).type === 'time' || (element as HTMLInputElement).type === 'date')) { - info.innerText = (element as HTMLInputElement).value; - } else { - info.hasOnlyText = element?.children?.length === 0 && - element?.innerText?.length > 0; - info.innerText = element?.innerText ?? ''; - } - info.innerHTML = element.innerHTML; - info.outerHTML = element.outerHTML; return info; } return null; }, - { x: coordinates.x, y: coordinates.y }, + { x: coordinates.x, y: coordinates.y } ); return elementInfo; } else { const elementInfo = await page.evaluate( async ({ x, y }) => { - // Helper function to get element from point including shadow DOM + // Enhanced helper function to get element from point including shadow DOM const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { let element = document.elementFromPoint(x, y) as HTMLElement; if (!element) return null; - + // Traverse through shadow roots let current = element; - while (current) { - const shadowRoot = current.shadowRoot; - if (!shadowRoot) break; - + let shadowRoot = current.shadowRoot; + + // Keep track of the deepest shadow DOM element found + let deepestElement = current; + + while (shadowRoot) { + // Try to find element at same point in shadow DOM const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; if (!shadowElement || shadowElement === current) break; - + + // Update our tracking of the deepest element + deepestElement = shadowElement; current = shadowElement; + shadowRoot = current.shadowRoot; } - return current; + + return deepestElement; }; - + const originalEl = getDeepestElementFromPoint(x, y); if (originalEl) { let element = originalEl; - + + // Handle element hierarchy traversal for list items while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); - + const fullyContained = parentRect.left <= childRect.left && parentRect.right >= childRect.right && parentRect.top <= childRect.top && parentRect.bottom >= childRect.bottom; - + const significantOverlap = (childRect.width * childRect.height) / (parentRect.width * parentRect.height) > 0.5; - + if (fullyContained && significantOverlap) { element = element.parentElement; } else { break; } } - + + // Get the containing shadow root if any + const containingShadowRoot = element.getRootNode() as ShadowRoot; + const isShadowRoot = containingShadowRoot instanceof ShadowRoot; + let info: { tagName: string; hasOnlyText?: boolean; @@ -159,12 +176,21 @@ export const getElementInformation = async ( innerHTML?: string; outerHTML?: string; isShadowRoot?: boolean; + shadowRootMode?: string; + shadowRootContent?: string; } = { tagName: element?.tagName ?? '', - isShadowRoot: !!element?.shadowRoot + isShadowRoot: isShadowRoot }; - + + if (isShadowRoot) { + // Include shadow root specific information + info.shadowRootMode = containingShadowRoot.mode; + info.shadowRootContent = containingShadowRoot.innerHTML; + } + if (element) { + // Get attributes including those from shadow DOM context info.attributes = Array.from(element.attributes).reduce( (acc, attr) => { acc[attr.name] = attr.value; @@ -172,21 +198,25 @@ export const getElementInformation = async ( }, {} as Record ); + + // Handle specific element types + if (element.tagName === 'A') { + info.url = (element as HTMLAnchorElement).href; + info.innerText = element.textContent ?? ''; + } else if (element.tagName === 'IMG') { + info.imageUrl = (element as HTMLImageElement).src; + } else { + // Handle text content with proper null checking + info.hasOnlyText = element.children.length === 0 && + (element.textContent !== null && + element.textContent.trim().length > 0); + info.innerText = element.textContent ?? ''; + } + + info.innerHTML = element.innerHTML; + info.outerHTML = element.outerHTML; } - - if (element?.tagName === 'A') { - info.url = (element as HTMLAnchorElement).href; - info.innerText = element.innerText ?? ''; - } else if (element?.tagName === 'IMG') { - info.imageUrl = (element as HTMLImageElement).src; - } else { - info.hasOnlyText = element?.children?.length === 0 && - element?.innerText?.length > 0; - info.innerText = element?.innerText ?? ''; - } - - info.innerHTML = element.innerHTML; - info.outerHTML = element.outerHTML; + return info; } return null; @@ -793,6 +823,76 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { return output; } + const MAX_DEPTH = 10; + + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + let current = element; + let deepestElement = current; + let depth = 0; + + while (current && depth < MAX_DEPTH) { + const shadowRoot = current.shadowRoot; + if (shadowRoot) { + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement) break; + + deepestElement = shadowElement; + current = shadowElement; + } else { + break; + } + depth++; + } + + return deepestElement; + }; + + const genSelectorForShadowDOM = (element: HTMLElement) => { + const findShadowContext = (element: HTMLElement): { host: HTMLElement, root: ShadowRoot } | null => { + let current: HTMLElement | null = element; + let depth = 0; + + while (current && depth < MAX_DEPTH) { + // Check if element is inside a shadow root + if (current.parentNode instanceof ShadowRoot) { + return { + host: (current.parentNode as ShadowRoot).host as HTMLElement, + root: current.parentNode as ShadowRoot + }; + } + current = current.parentElement; + depth++; + } + return null; + }; + + const shadowContext = findShadowContext(element); + if (!shadowContext) return null; + + try { + // Generate selector for the shadow host + const hostSelector = finder(shadowContext.host); + + // Generate selector for the element within the shadow DOM + const shadowElementSelector = finder(element, { + root: shadowContext.root as unknown as Element + }); + + return { + fullSelector: `${hostSelector} >>> ${shadowElementSelector}`, + hostSelector, + shadowElementSelector, + mode: shadowContext.root.mode + }; + } catch (e) { + console.warn('Error generating shadow DOM selector:', e); + return null; + } + }; + const genSelectors = (element: HTMLElement | null) => { if (element == null) { return null; @@ -812,6 +912,9 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { } catch (e) { } + // Generate shadow DOM specific selector + const shadowSelector = genSelectorForShadowDOM(element); + const hrefSelector = genSelectorForAttributes(element, ['href']); const formSelector = genSelectorForAttributes(element, [ 'name', @@ -858,9 +961,21 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { hrefSelector, accessibilitySelector, formSelector, + // Shadow DOM selector + shadowSelector: shadowSelector ? { + // Full selector that can traverse shadow DOM + full: shadowSelector.fullSelector, + // Individual parts for more flexible usage + host: shadowSelector.hostSelector, + element: shadowSelector.shadowElementSelector, + // Shadow root mode (open/closed) + mode: shadowSelector.mode + } : null }; } + + function genAttributeSet(element: HTMLElement, attributes: string[]) { return new Set( attributes.filter((attr) => { @@ -900,7 +1015,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { return char.length === 1 && char.match(/[0-9]/); } - const hoveredElement = document.elementFromPoint(x, y) as HTMLElement; + const hoveredElement = getDeepestElementFromPoint(x, y); if ( hoveredElement != null && !hoveredElement.closest('#overlay-controls') != null From 542f4d31fa43359928d052b9edb3caa5f446c1c3 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sun, 29 Dec 2024 23:41:19 +0530 Subject: [PATCH 03/23] feat: change shadowDOM full selector path --- server/src/workflow-management/selector.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 9b3af66e..690fb0b1 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -882,7 +882,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { }); return { - fullSelector: `${hostSelector} >>> ${shadowElementSelector}`, + fullSelector: `${hostSelector} > ${shadowElementSelector}`, hostSelector, shadowElementSelector, mode: shadowContext.root.mode From b60f4b73b8424151f17bfd3389f5698f72c429df Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 01:24:32 +0530 Subject: [PATCH 04/23] feat: add functionality to scrape shadowDOM elements --- maxun-core/src/browserSide/scraper.js | 126 ++++++++++++++++---------- 1 file changed, 80 insertions(+), 46 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index a2009d78..ef979828 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -189,68 +189,102 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} */ window.scrapeSchema = function (lists) { + // These utility functions remain unchanged as they work perfectly function omap(object, f, kf = (x) => x) { - return Object.fromEntries( - Object.entries(object) - .map(([k, v]) => [kf(k), f(v)]), - ); + return Object.fromEntries( + Object.entries(object) + .map(([k, v]) => [kf(k), f(v)]), + ); } function ofilter(object, f) { - return Object.fromEntries( - Object.entries(object) - .filter(([k, v]) => f(k, v)), - ); + return Object.fromEntries( + Object.entries(object) + .filter(([k, v]) => f(k, v)), + ); } - function getSeedKey(listObj) { - const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length))); - return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0]; - } - - function getMBEs(elements) { - return elements.map((element) => { - let candidate = element; - const isUniqueChild = (e) => elements - .filter((elem) => e.parentNode?.contains(elem)) - .length === 1; - - while (candidate && isUniqueChild(candidate)) { - candidate = candidate.parentNode; + function findElement(config) { + // If this is a shadow DOM query + if (config.shadow && config.selector.includes('>>')) { + const [hostSelector, shadowSelector] = config.selector.split('>>').map(s => s.trim()); + const host = document.querySelector(hostSelector); + return host?.shadowRoot?.querySelector(shadowSelector) || null; } + // Otherwise, use regular querySelector + return document.querySelector(config.selector); + } - return candidate; - }); + function findAllElements(config) { + // If this is a shadow DOM query + if (config.shadow && config.selector.includes('>>')) { + const element = findElement(config); + return element ? [element] : []; + } + // Otherwise, use regular querySelectorAll + return Array.from(document.querySelectorAll(config.selector)); + } + + // Modified to use our new element finding functions + function getSeedKey(listObj) { + const maxLength = Math.max(...Object.values( + omap(listObj, (x) => findAllElements(x).length) + )); + return Object.keys( + ofilter(listObj, (_, v) => findAllElements(v).length === maxLength) + )[0]; + } + + // This function remains unchanged as it works with DOM elements + // regardless of how they were found + function getMBEs(elements) { + return elements.map((element) => { + let candidate = element; + const isUniqueChild = (e) => elements + .filter((elem) => e.parentNode?.contains(elem)) + .length === 1; + + while (candidate && isUniqueChild(candidate)) { + candidate = candidate.parentNode; + } + + return candidate; + }); } const seedName = getSeedKey(lists); - const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector)); + const seedElements = findAllElements(lists[seedName]); const MBEs = getMBEs(seedElements); return MBEs.map((mbe) => omap( - lists, - ({ selector, attribute }, key) => { - const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem)); - if (!elem) return undefined; + lists, + (config, key) => { + // Use our new findAllElements function + const elem = findAllElements(config) + .find((elem) => mbe.contains(elem)); - switch (attribute) { - case 'href': - const relativeHref = elem.getAttribute('href'); - return relativeHref ? new URL(relativeHref, window.location.origin).href : null; - case 'src': - const relativeSrc = elem.getAttribute('src'); - return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; - case 'innerText': - return elem.innerText; - case 'textContent': - return elem.textContent; - default: - return elem.innerText; - } - }, - (key) => key // Use the original key in the output + if (!elem) return undefined; + + switch (config.attribute) { + case 'href': { + const relativeHref = elem.getAttribute('href'); + return relativeHref ? new URL(relativeHref, window.location.origin).href : null; + } + case 'src': { + const relativeSrc = elem.getAttribute('src'); + return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; + } + case 'innerText': + return elem.innerText; + case 'textContent': + return elem.textContent; + default: + return elem.getAttribute(config.attribute) || elem.innerText; + } + }, + (key) => key )) || []; - } + }; /** * Scrapes multiple lists of similar items based on a template item. From 9f9dc4e1030ca3819355245765ecadcc1e2c8d6f Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 01:25:45 +0530 Subject: [PATCH 05/23] feat: add shadow optional field in SelectorObject --- src/context/browserSteps.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/context/browserSteps.tsx b/src/context/browserSteps.tsx index dd211199..fd311a35 100644 --- a/src/context/browserSteps.tsx +++ b/src/context/browserSteps.tsx @@ -32,6 +32,7 @@ export interface SelectorObject { selector: string; tag?: string; attribute?: string; + shadow?: boolean; [key: string]: any; } From b696fa568d65a0948edd3b999eb31c4ccf39dad5 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 01:28:22 +0530 Subject: [PATCH 06/23] feat: add shadow param for scrapeSchema config --- maxun-core/src/interpret.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index c581954d..495ba2db 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -403,7 +403,7 @@ export default class Interpreter extends EventEmitter { await this.options.serializableCallback(scrapeResults); }, - scrapeSchema: async (schema: Record) => { + scrapeSchema: async (schema: Record) => { await this.ensureScriptsLoaded(page); const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema); From 415ce02a3d2eb82f7434230239a5c7659b557016 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 02:39:27 +0530 Subject: [PATCH 07/23] feat: add shadow bool field to text step --- src/components/organisms/BrowserWindow.tsx | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/components/organisms/BrowserWindow.tsx b/src/components/organisms/BrowserWindow.tsx index c7e9fc0f..11fe8c55 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -13,6 +13,7 @@ import { useTranslation } from 'react-i18next'; interface ElementInfo { tagName: string; hasOnlyText?: boolean; + isShadowRoot?: boolean; innerText?: string; url?: string; imageUrl?: string; @@ -185,6 +186,7 @@ export const BrowserWindow = () => { addTextStep('', data, { selector: highlighterData.selector, tag: highlighterData.elementInfo?.tagName, + shadow: highlighterData.elementInfo?.isShadowRoot, attribute }); } else { @@ -192,7 +194,7 @@ export const BrowserWindow = () => { setAttributeOptions(options); setSelectedElement({ selector: highlighterData.selector, - info: highlighterData.elementInfo + info: highlighterData.elementInfo, }); setShowAttributeModal(true); } @@ -229,6 +231,7 @@ export const BrowserWindow = () => { selectorObj: { selector: highlighterData.selector, tag: highlighterData.elementInfo?.tagName, + shadow: highlighterData.elementInfo?.isShadowRoot, attribute } }; @@ -276,6 +279,7 @@ export const BrowserWindow = () => { addTextStep('', data, { selector: selectedElement.selector, tag: selectedElement.info?.tagName, + shadow: selectedElement.info?.isShadowRoot, attribute: attribute }); } @@ -288,6 +292,7 @@ export const BrowserWindow = () => { selectorObj: { selector: selectedElement.selector, tag: selectedElement.info?.tagName, + shadow: selectedElement.info?.isShadowRoot, attribute: attribute } }; From 1a6a481b578a7212743ceb199b934585583b5a0e Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 02:46:24 +0530 Subject: [PATCH 08/23] feat: add shadow selectors field type --- server/src/types/index.ts | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/server/src/types/index.ts b/server/src/types/index.ts index f2e327ef..e882f69d 100644 --- a/server/src/types/index.ts +++ b/server/src/types/index.ts @@ -129,6 +129,13 @@ export interface BaseActionInfo { hasOnlyText: boolean; } +interface ShadowSelector { + full: string; + host: string; + element: string; + mode: string; +} + /** * Holds all the possible css selectors that has been found for an element. * @category Types @@ -143,6 +150,7 @@ export interface Selectors { hrefSelector: string|null; accessibilitySelector: string|null; formSelector: string|null; + shadowSelector: ShadowSelector | null; } /** @@ -156,7 +164,7 @@ export interface BaseAction extends BaseActionInfo{ associatedActions: ActionType[]; inputType: string | undefined; value: string | undefined; - selectors: { [key: string]: string | null }; + selectors: Selectors; timestamp: number; isPassword: boolean; /** From c3031811a63d21139c306781a6e64ee09d81b1de Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 02:52:58 +0530 Subject: [PATCH 09/23] feat: prioritize returning shadow selector --- server/src/workflow-management/utils.ts | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/server/src/workflow-management/utils.ts b/server/src/workflow-management/utils.ts index b3dadd60..4f747127 100644 --- a/server/src/workflow-management/utils.ts +++ b/server/src/workflow-management/utils.ts @@ -12,6 +12,11 @@ export const getBestSelectorForAction = (action: Action) => { case ActionType.Hover: case ActionType.DragAndDrop: { const selectors = action.selectors; + + if (selectors?.shadowSelector?.full) { + return selectors.shadowSelector.full; + } + // less than 25 characters, and element only has text inside const textSelector = selectors?.text?.length != null && @@ -75,6 +80,11 @@ export const getBestSelectorForAction = (action: Action) => { case ActionType.Input: case ActionType.Keydown: { const selectors = action.selectors; + + if (selectors?.shadowSelector?.full) { + return selectors.shadowSelector.full; + } + return ( selectors.testIdSelector ?? selectors?.id ?? From cec2397a58256736b60467e40f1cc2e255667394 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 02:55:21 +0530 Subject: [PATCH 10/23] feat: change shadowDOM full selector path --- server/src/workflow-management/selector.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 690fb0b1..164f5220 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -882,7 +882,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { }); return { - fullSelector: `${hostSelector} > ${shadowElementSelector}`, + fullSelector: `${hostSelector} >> ${shadowElementSelector}`, hostSelector, shadowElementSelector, mode: shadowContext.root.mode From 05c7921c9d574d4074b64f56319d6166e84b1dc3 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 03:05:07 +0530 Subject: [PATCH 11/23] feat: add shadowInfo in highlighter data --- .../workflow-management/classes/Generator.ts | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 609541de..d1bccbe4 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -730,15 +730,26 @@ export class WorkflowGenerator { const displaySelector = await this.generateSelector(page, coordinates, ActionType.Click); const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); if (rect) { + const highlighterData = { + rect, + selector: displaySelector, + elementInfo, + // Include shadow DOM specific information + shadowInfo: elementInfo?.isShadowRoot ? { + mode: elementInfo.shadowRootMode, + content: elementInfo.shadowRootContent + } : null + }; + if (this.getList === true) { if (this.listSelector !== '') { const childSelectors = await getChildSelectors(page, this.listSelector || ''); - this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo, childSelectors }) + this.socket.emit('highlighter', { ...highlighterData, childSelectors }) } else { - this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo }); + this.socket.emit('highlighter', { ...highlighterData }); } } else { - this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo }); + this.socket.emit('highlighter', { ...highlighterData }); } } } From d2ab81e22959acc9fccf65f5845d8962170608d7 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 22:59:28 +0530 Subject: [PATCH 12/23] feat: add logic to get deeply nested shadowDOM elements --- server/src/workflow-management/selector.ts | 87 ++++++++++++---------- 1 file changed, 49 insertions(+), 38 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 164f5220..d957b879 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -823,7 +823,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { return output; } - const MAX_DEPTH = 10; + // const MAX_DEPTH = 10; const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { let element = document.elementFromPoint(x, y) as HTMLElement; @@ -832,60 +832,76 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { let current = element; let deepestElement = current; let depth = 0; + const MAX_DEPTH = 4; // Limit to 2 levels of shadow DOM while (current && depth < MAX_DEPTH) { const shadowRoot = current.shadowRoot; - if (shadowRoot) { - const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; - if (!shadowElement) break; - - deepestElement = shadowElement; - current = shadowElement; - } else { - break; - } + if (!shadowRoot) break; + + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement || shadowElement === current) break; + + deepestElement = shadowElement; + current = shadowElement; depth++; } return deepestElement; }; + // Helper function to generate selectors for shadow DOM elements const genSelectorForShadowDOM = (element: HTMLElement) => { - const findShadowContext = (element: HTMLElement): { host: HTMLElement, root: ShadowRoot } | null => { - let current: HTMLElement | null = element; + // Get complete path up to document root + const getShadowPath = (el: HTMLElement) => { + const path = []; + let current = el; let depth = 0; + const MAX_DEPTH = 4; while (current && depth < MAX_DEPTH) { - // Check if element is inside a shadow root - if (current.parentNode instanceof ShadowRoot) { - return { - host: (current.parentNode as ShadowRoot).host as HTMLElement, - root: current.parentNode as ShadowRoot - }; + const rootNode = current.getRootNode(); + if (rootNode instanceof ShadowRoot) { + path.unshift({ + host: rootNode.host as HTMLElement, + root: rootNode, + element: current + }); + current = rootNode.host as HTMLElement; + depth++; + } else { + break; } - current = current.parentElement; - depth++; } - return null; + return path; }; - - const shadowContext = findShadowContext(element); - if (!shadowContext) return null; + + const shadowPath = getShadowPath(element); + if (shadowPath.length === 0) return null; try { - // Generate selector for the shadow host - const hostSelector = finder(shadowContext.host); + const selectorParts: string[] = []; - // Generate selector for the element within the shadow DOM - const shadowElementSelector = finder(element, { - root: shadowContext.root as unknown as Element + // Generate selector for each shadow DOM boundary + shadowPath.forEach((context, index) => { + // Get selector for the host element + const hostSelector = finder(context.host, { + root: index === 0 ? document.body : (shadowPath[index - 1].root as unknown as Element) + }); + + // For the last context, get selector for target element + if (index === shadowPath.length - 1) { + const elementSelector = finder(element, { + root: context.root as unknown as Element + }); + selectorParts.push(`${hostSelector} >> ${elementSelector}`); + } else { + selectorParts.push(hostSelector); + } }); return { - fullSelector: `${hostSelector} >> ${shadowElementSelector}`, - hostSelector, - shadowElementSelector, - mode: shadowContext.root.mode + fullSelector: selectorParts.join(' >> '), + mode: shadowPath[shadowPath.length - 1].root.mode }; } catch (e) { console.warn('Error generating shadow DOM selector:', e); @@ -963,12 +979,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { formSelector, // Shadow DOM selector shadowSelector: shadowSelector ? { - // Full selector that can traverse shadow DOM full: shadowSelector.fullSelector, - // Individual parts for more flexible usage - host: shadowSelector.hostSelector, - element: shadowSelector.shadowElementSelector, - // Shadow root mode (open/closed) mode: shadowSelector.mode } : null }; From 9287c296922478b77391d0c4930f4b478de4614e Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 23:02:21 +0530 Subject: [PATCH 13/23] feat: rm host and element info for shadow selector --- server/src/types/index.ts | 2 -- 1 file changed, 2 deletions(-) diff --git a/server/src/types/index.ts b/server/src/types/index.ts index e882f69d..151e3dd4 100644 --- a/server/src/types/index.ts +++ b/server/src/types/index.ts @@ -131,8 +131,6 @@ export interface BaseActionInfo { interface ShadowSelector { full: string; - host: string; - element: string; mode: string; } From e952d8f202278a67e86350e60542576b09260238 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 23:37:16 +0530 Subject: [PATCH 14/23] feat: add nested shadow-root scraping logic for scrapeSchema --- maxun-core/src/browserSide/scraper.js | 115 ++++++++++++++++---------- 1 file changed, 70 insertions(+), 45 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index ef979828..ad9295b8 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -188,8 +188,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @param {Object.} lists The named lists of HTML elements. * @returns {Array.>} */ - window.scrapeSchema = function (lists) { - // These utility functions remain unchanged as they work perfectly + window.scrapeSchema = function(lists) { function omap(object, f, kf = (x) => x) { return Object.fromEntries( Object.entries(object) @@ -203,29 +202,73 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, .filter(([k, v]) => f(k, v)), ); } - - function findElement(config) { - // If this is a shadow DOM query - if (config.shadow && config.selector.includes('>>')) { - const [hostSelector, shadowSelector] = config.selector.split('>>').map(s => s.trim()); - const host = document.querySelector(hostSelector); - return host?.shadowRoot?.querySelector(shadowSelector) || null; - } - // Otherwise, use regular querySelector - return document.querySelector(config.selector); - } - + function findAllElements(config) { - // If this is a shadow DOM query - if (config.shadow && config.selector.includes('>>')) { - const element = findElement(config); - return element ? [element] : []; - } - // Otherwise, use regular querySelectorAll - return Array.from(document.querySelectorAll(config.selector)); + if (!config.shadow || !config.selector.includes('>>')) { + return Array.from(document.querySelectorAll(config.selector)); + } + + // For shadow DOM, we'll get all possible combinations + const parts = config.selector.split('>>').map(s => s.trim()); + let currentElements = [document]; + + for (let i = 0; i < parts.length; i++) { + const part = parts[i]; + const nextElements = []; + + for (const element of currentElements) { + let targets; + if (i === 0) { + // First selector is queried from document + targets = Array.from(element.querySelectorAll(part)) + .filter(el => { + // Only include elements that either: + // 1. Have an open shadow root + // 2. Don't need shadow root (last part of selector) + if (i === parts.length - 1) return true; + const shadowRoot = el.shadowRoot; + return shadowRoot && shadowRoot.mode === 'open'; + }); + } else { + // For subsequent selectors, only use elements with open shadow roots + const shadowRoot = element.shadowRoot; + if (!shadowRoot || shadowRoot.mode !== 'open') continue; + + targets = Array.from(shadowRoot.querySelectorAll(part)); + } + nextElements.push(...targets); + } + + if (nextElements.length === 0) return []; + currentElements = nextElements; + } + + return currentElements; } + + // Helper function to extract value from element based on attribute + function getElementValue(element, attribute) { + if (!element) return null; + + switch (attribute) { + case 'href': { + const relativeHref = element.getAttribute('href'); + return relativeHref ? new URL(relativeHref, window.location.origin).href : null; + } + case 'src': { + const relativeSrc = element.getAttribute('src'); + return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; + } + case 'innerText': + return element.innerText?.trim(); + case 'textContent': + return element.textContent?.trim(); + default: + return element.getAttribute(attribute) || element.innerText?.trim(); + } + } - // Modified to use our new element finding functions + // Get the seed key based on the maximum number of elements found function getSeedKey(listObj) { const maxLength = Math.max(...Object.values( omap(listObj, (x) => findAllElements(x).length) @@ -235,8 +278,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, )[0]; } - // This function remains unchanged as it works with DOM elements - // regardless of how they were found + // Find minimal bounding elements function getMBEs(elements) { return elements.map((element) => { let candidate = element; @@ -252,35 +294,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, }); } + // Main scraping logic const seedName = getSeedKey(lists); const seedElements = findAllElements(lists[seedName]); const MBEs = getMBEs(seedElements); return MBEs.map((mbe) => omap( lists, - (config, key) => { - // Use our new findAllElements function + (config) => { const elem = findAllElements(config) .find((elem) => mbe.contains(elem)); - - if (!elem) return undefined; - - switch (config.attribute) { - case 'href': { - const relativeHref = elem.getAttribute('href'); - return relativeHref ? new URL(relativeHref, window.location.origin).href : null; - } - case 'src': { - const relativeSrc = elem.getAttribute('src'); - return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; - } - case 'innerText': - return elem.innerText; - case 'textContent': - return elem.textContent; - default: - return elem.getAttribute(config.attribute) || elem.innerText; - } + + return elem ? getElementValue(elem, config.attribute) : undefined; }, (key) => key )) || []; From b757d9c4f8b0ea00d6eb6d6fe6c2e7c37407ed92 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Mon, 30 Dec 2024 23:38:38 +0530 Subject: [PATCH 15/23] feat: add func to rm shadow selectors from workflow --- maxun-core/src/interpret.ts | 19 ++++++++++++++++++- 1 file changed, 18 insertions(+), 1 deletion(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 495ba2db..3cef8c29 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -663,11 +663,28 @@ export default class Interpreter extends EventEmitter { if (isApplicable) { return actionId; } + } } + + private removeShadowSelectors(workflow: Workflow) { + for (let actionId = workflow.length - 1; actionId >= 0; actionId--) { + const step = workflow[actionId]; + + // Check if step has where and selectors + if (step.where && Array.isArray(step.where.selectors)) { + // Filter out selectors that contain ">>" + step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>')); + } + } + + return workflow; } private async runLoop(p: Page, workflow: Workflow) { - const workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow)); + let workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow)); + + // remove shadow selectors + workflowCopy = this.removeShadowSelectors(workflowCopy); // apply ad-blocker to the current page try { From 4b4074b70d352401120bd1fe0b37fbee7838bac5 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 31 Dec 2024 01:52:38 +0530 Subject: [PATCH 16/23] feat: add logic to scrape multiple nested shadow dom elements --- maxun-core/src/browserSide/scraper.js | 172 +++++++++++++++----------- 1 file changed, 99 insertions(+), 73 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index ad9295b8..00f8cef7 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -204,69 +204,68 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } function findAllElements(config) { - if (!config.shadow || !config.selector.includes('>>')) { - return Array.from(document.querySelectorAll(config.selector)); - } - - // For shadow DOM, we'll get all possible combinations - const parts = config.selector.split('>>').map(s => s.trim()); - let currentElements = [document]; - - for (let i = 0; i < parts.length; i++) { - const part = parts[i]; - const nextElements = []; - - for (const element of currentElements) { - let targets; - if (i === 0) { - // First selector is queried from document - targets = Array.from(element.querySelectorAll(part)) - .filter(el => { - // Only include elements that either: - // 1. Have an open shadow root - // 2. Don't need shadow root (last part of selector) - if (i === parts.length - 1) return true; - const shadowRoot = el.shadowRoot; - return shadowRoot && shadowRoot.mode === 'open'; - }); - } else { - // For subsequent selectors, only use elements with open shadow roots - const shadowRoot = element.shadowRoot; - if (!shadowRoot || shadowRoot.mode !== 'open') continue; - - targets = Array.from(shadowRoot.querySelectorAll(part)); + if (!config.shadow || !config.selector.includes('>>')) { + return Array.from(document.querySelectorAll(config.selector)); + } + + // For shadow DOM, we'll get all possible combinations + const parts = config.selector.split('>>').map(s => s.trim()); + let currentElements = [document]; + + for (let i = 0; i < parts.length; i++) { + const part = parts[i]; + const nextElements = []; + + for (const element of currentElements) { + let targets; + if (i === 0) { + // First selector is queried from document + targets = Array.from(element.querySelectorAll(part)) + .filter(el => { + // Only include elements that either: + // 1. Have an open shadow root + // 2. Don't need shadow root (last part of selector) + if (i === parts.length - 1) return true; + const shadowRoot = el.shadowRoot; + return shadowRoot && shadowRoot.mode === 'open'; + }); + } else { + // For subsequent selectors, only use elements with open shadow roots + const shadowRoot = element.shadowRoot; + if (!shadowRoot || shadowRoot.mode !== 'open') continue; + + targets = Array.from(shadowRoot.querySelectorAll(part)); + } + nextElements.push(...targets); } - nextElements.push(...targets); - } - - if (nextElements.length === 0) return []; - currentElements = nextElements; - } - - return currentElements; + + if (nextElements.length === 0) return []; + currentElements = nextElements; + } + + return currentElements; } - // Helper function to extract value from element based on attribute - function getElementValue(element, attribute) { - if (!element) return null; - - switch (attribute) { - case 'href': { - const relativeHref = element.getAttribute('href'); - return relativeHref ? new URL(relativeHref, window.location.origin).href : null; - } - case 'src': { - const relativeSrc = element.getAttribute('src'); - return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; - } - case 'innerText': - return element.innerText?.trim(); - case 'textContent': - return element.textContent?.trim(); - default: - return element.getAttribute(attribute) || element.innerText?.trim(); - } - } + function getElementValue(element, attribute) { + if (!element) return null; + + switch (attribute) { + case 'href': { + const relativeHref = element.getAttribute('href'); + return relativeHref ? new URL(relativeHref, window.location.origin).href : null; + } + case 'src': { + const relativeSrc = element.getAttribute('src'); + return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; + } + case 'innerText': + return element.innerText?.trim(); + case 'textContent': + return element.textContent?.trim(); + default: + return element.getAttribute(attribute) || element.innerText?.trim(); + } + } // Get the seed key based on the maximum number of elements found function getSeedKey(listObj) { @@ -280,26 +279,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, // Find minimal bounding elements function getMBEs(elements) { - return elements.map((element) => { - let candidate = element; - const isUniqueChild = (e) => elements - .filter((elem) => e.parentNode?.contains(elem)) - .length === 1; + return elements.map((element) => { + let candidate = element; + const isUniqueChild = (e) => elements + .filter((elem) => e.parentNode?.contains(elem)) + .length === 1; - while (candidate && isUniqueChild(candidate)) { - candidate = candidate.parentNode; - } + while (candidate && isUniqueChild(candidate)) { + candidate = candidate.parentNode; + } - return candidate; - }); + return candidate; + }); } - // Main scraping logic + // First try the MBE approach const seedName = getSeedKey(lists); const seedElements = findAllElements(lists[seedName]); const MBEs = getMBEs(seedElements); - - return MBEs.map((mbe) => omap( + + const mbeResults = MBEs.map((mbe) => omap( lists, (config) => { const elem = findAllElements(config) @@ -309,6 +308,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, }, (key) => key )) || []; + + // If MBE approach didn't find all elements, try independent scraping + if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) { + // Fall back to independent scraping + const results = []; + const foundElements = new Map(); + + // Find all elements for each selector + Object.entries(lists).forEach(([key, config]) => { + const elements = findAllElements(config); + foundElements.set(key, elements); + }); + + // Create result objects for each found element + foundElements.forEach((elements, key) => { + elements.forEach((element, index) => { + if (!results[index]) { + results[index] = {}; + } + results[index][key] = getElementValue(element, lists[key].attribute); + }); + }); + + return results.filter(result => Object.keys(result).length > 0); + } + + return mbeResults; }; /** From 4a09ea66ff6c3c25c02b7997ed97f0ac4d677cd9 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 31 Dec 2024 12:26:09 +0530 Subject: [PATCH 17/23] feat: get deepest element rect coordinates --- server/src/workflow-management/selector.ts | 53 ++++++++++++++-------- 1 file changed, 34 insertions(+), 19 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index d957b879..910b3134 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -237,23 +237,30 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (!getList || listSelector !== '') { const rect = await page.evaluate( async ({ x, y }) => { - // Helper function to get element from point including shadow DOM + // Enhanced helper function to get element from point including shadow DOM const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { let element = document.elementFromPoint(x, y) as HTMLElement; if (!element) return null; // Traverse through shadow roots let current = element; - while (current) { - const shadowRoot = current.shadowRoot; - if (!shadowRoot) break; - + let shadowRoot = current.shadowRoot; + + // Keep track of the deepest shadow DOM element found + let deepestElement = current; + + while (shadowRoot) { + // Try to find element at same point in shadow DOM const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; if (!shadowElement || shadowElement === current) break; - + + // Update our tracking of the deepest element + deepestElement = shadowElement; current = shadowElement; + shadowRoot = current.shadowRoot; } - return current; + + return deepestElement; }; const el = getDeepestElementFromPoint(x, y); @@ -274,36 +281,45 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector }; } } + return null; }, - { x: coordinates.x, y: coordinates.y }, + { x: coordinates.x, y: coordinates.y } ); return rect; } else { const rect = await page.evaluate( async ({ x, y }) => { - // Helper function to get element from point including shadow DOM + // Enhanced helper function to get element from point including shadow DOM const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { let element = document.elementFromPoint(x, y) as HTMLElement; if (!element) return null; // Traverse through shadow roots let current = element; - while (current) { - const shadowRoot = current.shadowRoot; - if (!shadowRoot) break; - + let shadowRoot = current.shadowRoot; + + // Keep track of the deepest shadow DOM element found + let deepestElement = current; + + while (shadowRoot) { + // Try to find element at same point in shadow DOM const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; if (!shadowElement || shadowElement === current) break; - + + // Update our tracking of the deepest element + deepestElement = shadowElement; current = shadowElement; + shadowRoot = current.shadowRoot; } - return current; + + return deepestElement; }; const originalEl = getDeepestElementFromPoint(x, y); if (originalEl) { let element = originalEl; + // Handle element hierarchy traversal for list items while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -326,7 +342,6 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector } const rectangle = element?.getBoundingClientRect(); - if (rectangle) { return { x: rectangle.x, @@ -342,14 +357,14 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector } return null; }, - { x: coordinates.x, y: coordinates.y }, + { x: coordinates.x, y: coordinates.y } ); return rect; } } catch (error) { const { message, stack } = error as Error; - logger.log('error', `Error while retrieving selector: ${message}`); - logger.log('error', `Stack: ${stack}`); + console.error('Error while retrieving selector:', message); + console.error('Stack:', stack); } }; From 42e13066bd7800043e6952ddaae06d62985c2ee4 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Wed, 1 Jan 2025 16:13:38 +0530 Subject: [PATCH 18/23] feat: add shadowDOM support for capture list selector generation --- server/src/workflow-management/selector.ts | 343 +++++++++++++++++---- 1 file changed, 286 insertions(+), 57 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 910b3134..713c05bc 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -1076,46 +1076,133 @@ interface SelectorResult { */ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise => { + interface ShadowContext { + host: HTMLElement; + root: ShadowRoot; + element: HTMLElement; + } + try { if (!listSelector) { const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { - function getNonUniqueSelector(element: HTMLElement): string { - let selector = element.tagName.toLowerCase(); + // Helper function to get deepest element, traversing shadow DOM + function getDeepestElementFromPoint(x: number, y: number): HTMLElement | null { + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; - if (element.className) { - const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); - if (classes.length > 0) { - const validClasses = classes.filter((cls: string) => !cls.startsWith('!') && !cls.includes(':')); - if (validClasses.length > 0) { - selector += '.' + validClasses.map(cls => CSS.escape(cls)).join('.'); - } - } + let current = element; + let deepestElement = current; + let depth = 0; + const MAX_DEPTH = 4; // Limit shadow DOM traversal depth + + while (current && depth < MAX_DEPTH) { + const shadowRoot = current.shadowRoot; + if (!shadowRoot) break; + + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement || shadowElement === current) break; + + deepestElement = shadowElement; + current = shadowElement; + depth++; } + return deepestElement; + } + + // Generate basic selector from element's tag and classes + function getNonUniqueSelector(element: HTMLElement): string { + let selector = element.tagName.toLowerCase(); + + const className = typeof element.className === 'string' ? element.className : ''; + if (className) { + const classes = className.split(/\s+/) + .filter(cls => Boolean(cls) && !cls.startsWith('!') && !cls.includes(':')); + + if (classes.length > 0) { + selector += '.' + classes.map(cls => CSS.escape(cls)).join('.'); + } + } + return selector; } - function getSelectorPath(element: HTMLElement | null): string { - const path: string[] = []; + // Get complete shadow DOM path for an element + function getShadowPath(element: HTMLElement): ShadowContext[] { + const path: ShadowContext[] = []; + let current = element; let depth = 0; - const maxDepth = 2; + const MAX_DEPTH = 4; + + while (current && depth < MAX_DEPTH) { + const rootNode = current.getRootNode(); + if (rootNode instanceof ShadowRoot) { + path.unshift({ + host: rootNode.host as HTMLElement, + root: rootNode, + element: current + }); + current = rootNode.host as HTMLElement; + depth++; + } else { + break; + } + } + return path; + } - while (element && element !== document.body && depth < maxDepth) { - const selector = getNonUniqueSelector(element); + // Generate complete selector path for any element + function getSelectorPath(element: HTMLElement | null): string { + if (!element) return ''; + + // Check for shadow DOM path first + const shadowPath = getShadowPath(element); + if (shadowPath.length > 0) { + const selectorParts: string[] = []; + + // Build complete shadow DOM path + shadowPath.forEach((context, index) => { + const hostSelector = getNonUniqueSelector(context.host); + + if (index === shadowPath.length - 1) { + // For deepest shadow context, include target element + const elementSelector = getNonUniqueSelector(element); + selectorParts.push(`${hostSelector} >> ${elementSelector}`); + } else { + // For intermediate shadow boundaries + selectorParts.push(hostSelector); + } + }); + + return selectorParts.join(' >> '); + } + + // Regular DOM path generation + const path: string[] = []; + let currentElement = element; + let depth = 0; + const MAX_DEPTH = 2; + + while (currentElement && currentElement !== document.body && depth < MAX_DEPTH) { + const selector = getNonUniqueSelector(currentElement); path.unshift(selector); - element = element.parentElement; + + const parentElement = currentElement.parentElement; + if (!parentElement) break; + currentElement = parentElement; depth++; } return path.join(' > '); } - const originalEl = document.elementFromPoint(x, y) as HTMLElement; + // Main logic to get element and generate selector + const originalEl = getDeepestElementFromPoint(x, y); if (!originalEl) return null; let element = originalEl; - // if (listSelector === '') { + // Handle parent traversal for better element targeting while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -1136,60 +1223,134 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates break; } } - // } const generalSelector = getSelectorPath(element); - return { - generalSelector, - }; + return { generalSelector }; }, coordinates); + return selectors || { generalSelector: '' }; } else { + // When we have a list selector, we need special handling while maintaining shadow DOM support const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { - function getNonUniqueSelector(element: HTMLElement): string { - let selector = element.tagName.toLowerCase(); + // Helper function to get deepest element, traversing shadow DOM + function getDeepestElementFromPoint(x: number, y: number): HTMLElement | null { + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; - if (element.className) { - const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); - if (classes.length > 0) { - const validClasses = classes.filter((cls: string) => !cls.startsWith('!') && !cls.includes(':')); - if (validClasses.length > 0) { - selector += '.' + validClasses.map(cls => CSS.escape(cls)).join('.'); - } - } + let current = element; + let deepestElement = current; + let depth = 0; + const MAX_DEPTH = 4; + + while (current && depth < MAX_DEPTH) { + const shadowRoot = current.shadowRoot; + if (!shadowRoot) break; + + const shadowElement = shadowRoot.elementFromPoint(x, y) as HTMLElement; + if (!shadowElement || shadowElement === current) break; + + deepestElement = shadowElement; + current = shadowElement; + depth++; } + return deepestElement; + } + + // Generate basic selector from element's tag and classes + function getNonUniqueSelector(element: HTMLElement): string { + let selector = element.tagName.toLowerCase(); + + const className = typeof element.className === 'string' ? element.className : ''; + if (className) { + const classes = className.split(/\s+/) + .filter(cls => Boolean(cls) && !cls.startsWith('!') && !cls.includes(':')); + + if (classes.length > 0) { + selector += '.' + classes.map(cls => CSS.escape(cls)).join('.'); + } + } + return selector; } - function getSelectorPath(element: HTMLElement | null): string { - const path: string[] = []; + // Get complete shadow DOM path for an element + function getShadowPath(element: HTMLElement): ShadowContext[] { + const path: ShadowContext[] = []; + let current = element; let depth = 0; - const maxDepth = 2; + const MAX_DEPTH = 4; + + while (current && depth < MAX_DEPTH) { + const rootNode = current.getRootNode(); + if (rootNode instanceof ShadowRoot) { + path.unshift({ + host: rootNode.host as HTMLElement, + root: rootNode, + element: current + }); + current = rootNode.host as HTMLElement; + depth++; + } else { + break; + } + } + return path; + } - while (element && element !== document.body && depth < maxDepth) { - const selector = getNonUniqueSelector(element); + // Generate selector path specifically for list items + function getListItemSelectorPath(element: HTMLElement | null): string { + if (!element) return ''; + + // Check for shadow DOM path first + const shadowPath = getShadowPath(element); + if (shadowPath.length > 0) { + const selectorParts: string[] = []; + + shadowPath.forEach((context, index) => { + const hostSelector = getNonUniqueSelector(context.host); + + if (index === shadowPath.length - 1) { + const elementSelector = getNonUniqueSelector(element); + selectorParts.push(`${hostSelector} >> ${elementSelector}`); + } else { + selectorParts.push(hostSelector); + } + }); + + return selectorParts.join(' >> '); + } + + // For list items, we want a shallower path to better match list patterns + const path: string[] = []; + let currentElement = element; + let depth = 0; + const MAX_LIST_DEPTH = 2; // Keeping shallow depth for list items + + while (currentElement && currentElement !== document.body && depth < MAX_LIST_DEPTH) { + const selector = getNonUniqueSelector(currentElement); path.unshift(selector); - element = element.parentElement; + + if (!currentElement.parentElement) break; + currentElement = currentElement.parentElement; depth++; } return path.join(' > '); } - const originalEl = document.elementFromPoint(x, y) as HTMLElement; - if (!originalEl) return null; + // Main logic for list item selection + const originalEl = getDeepestElementFromPoint(x, y); + if (!originalEl) return { generalSelector: '' }; let element = originalEl; - const generalSelector = getSelectorPath(element); - return { - generalSelector, - }; - }, coordinates); - return selectors || { generalSelector: '' }; - } + const generalSelector = getListItemSelectorPath(element); + return { generalSelector }; + }, coordinates); + return selectors || { generalSelector: '' }; + } } catch (error) { console.error('Error in getNonUniqueSelectors:', error); return { generalSelector: '' }; @@ -1218,42 +1379,110 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro } // Function to generate selector path from an element to its parent - function getSelectorPath(element: HTMLElement | null): string { + function getSelectorPath(element: HTMLElement): string { if (!element || !element.parentElement) return ''; const parentSelector = getNonUniqueSelector(element.parentElement); const elementSelector = getNonUniqueSelector(element); + // Check if element is in shadow DOM + const rootNode = element.getRootNode(); + if (rootNode instanceof ShadowRoot) { + const hostSelector = getNonUniqueSelector(rootNode.host as HTMLElement); + return `${hostSelector} >> ${elementSelector}`; + } + return `${parentSelector} > ${elementSelector}`; } - // Function to recursively get all descendant selectors + // Function to get all shadow DOM children of an element + function getShadowChildren(element: HTMLElement): HTMLElement[] { + const children: HTMLElement[] = []; + + // Check if element has shadow root + const shadowRoot = element.shadowRoot; + if (shadowRoot) { + // Get all elements in the shadow DOM + const shadowElements = Array.from(shadowRoot.querySelectorAll('*')) as HTMLElement[]; + children.push(...shadowElements); + } + + return children; + } + + // Function to recursively get all descendant selectors including shadow DOM function getAllDescendantSelectors(element: HTMLElement): string[] { let selectors: string[] = []; + + // Handle regular DOM children const children = Array.from(element.children) as HTMLElement[]; - for (const child of children) { const childPath = getSelectorPath(child); if (childPath) { - selectors.push(childPath); // Add direct child path - selectors = selectors.concat(getAllDescendantSelectors(child)); // Recursively process descendants + selectors.push(childPath); + // Recursively process regular DOM descendants + selectors = selectors.concat(getAllDescendantSelectors(child)); + + // Check for shadow DOM in this child + const shadowChildren = getShadowChildren(child); + for (const shadowChild of shadowChildren) { + const shadowPath = getSelectorPath(shadowChild); + if (shadowPath) { + selectors.push(shadowPath); + // Recursively process shadow DOM descendants + selectors = selectors.concat(getAllDescendantSelectors(shadowChild)); + } + } + } + } + + // Handle direct shadow DOM children of the current element + const shadowChildren = getShadowChildren(element); + for (const shadowChild of shadowChildren) { + const shadowPath = getSelectorPath(shadowChild); + if (shadowPath) { + selectors.push(shadowPath); + selectors = selectors.concat(getAllDescendantSelectors(shadowChild)); } } return selectors; } - // Find all occurrences of the parent selector in the DOM - const parentElements = Array.from(document.querySelectorAll(parentSelector)) as HTMLElement[]; - const allChildSelectors = new Set(); // Use a set to ensure uniqueness + // Split the parent selector if it contains shadow DOM parts + const selectorParts = parentSelector.split('>>').map(part => part.trim()); + let parentElements: HTMLElement[] = []; + + // Handle shadow DOM traversal if needed + if (selectorParts.length > 1) { + // Start with the host elements + parentElements = Array.from(document.querySelectorAll(selectorParts[0])) as HTMLElement[]; + + // Traverse through shadow DOM parts + for (let i = 1; i < selectorParts.length; i++) { + const newParentElements: HTMLElement[] = []; + for (const element of parentElements) { + if (element.shadowRoot) { + const shadowChildren = Array.from(element.shadowRoot.querySelectorAll(selectorParts[i])) as HTMLElement[]; + newParentElements.push(...shadowChildren); + } + } + parentElements = newParentElements; + } + } else { + // Regular DOM selector + parentElements = Array.from(document.querySelectorAll(parentSelector)) as HTMLElement[]; + } + + const allChildSelectors = new Set(); // Process each parent element and its descendants parentElements.forEach((parentElement) => { const descendantSelectors = getAllDescendantSelectors(parentElement); - descendantSelectors.forEach((selector) => allChildSelectors.add(selector)); // Add selectors to the set + descendantSelectors.forEach((selector) => allChildSelectors.add(selector)); }); - return Array.from(allChildSelectors); // Convert the set back to an array + return Array.from(allChildSelectors); }, parentSelector); return childSelectors || []; From c6105b4ee226a562f80b7054fffb3acac23e9d23 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Wed, 1 Jan 2025 16:15:13 +0530 Subject: [PATCH 19/23] feat: generate highlighter for shadoDOM and mixedDOM elements --- src/components/organisms/BrowserWindow.tsx | 30 +++++++++++++++++++++- 1 file changed, 29 insertions(+), 1 deletion(-) diff --git a/src/components/organisms/BrowserWindow.tsx b/src/components/organisms/BrowserWindow.tsx index 11fe8c55..442b7e50 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -120,7 +120,13 @@ export const BrowserWindow = () => { const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[] }) => { if (getList === true) { if (listSelector) { + console.log("LIST SELEECTORRRRR: ", listSelector); + console.log("DATA SELEECTORRRRR: ", data.selector); + console.log("CHILDREEENN SELECORRRR: ", data.childSelectors); socket?.emit('listSelector', { selector: listSelector }); + + const hasValidChildSelectors = Array.isArray(data.childSelectors) && data.childSelectors.length > 0; + if (limitMode) { setHighlighterData(null); } else if (paginationMode) { @@ -133,7 +139,29 @@ export const BrowserWindow = () => { } else if (data.childSelectors && data.childSelectors.includes(data.selector)) { // highlight only valid child elements within the listSelector setHighlighterData(data); - } else { + } else if (data.elementInfo?.isShadowRoot && data.childSelectors) { + // New case: Handle pure Shadow DOM elements + // Check if the selector matches any shadow root child selectors + const isShadowChild = data.childSelectors.some(childSelector => + data.selector.includes('>>') && // Shadow DOM uses >> for piercing + childSelector.split('>>').some(part => + data.selector.includes(part.trim()) + ) + ); + setHighlighterData(isShadowChild ? data : null); + } else if (data.selector.includes('>>') && hasValidChildSelectors) { + // New case: Handle mixed DOM cases + // Split the selector into parts and check each against child selectors + const selectorParts = data.selector.split('>>').map(part => part.trim()); + const isValidMixedSelector = selectorParts.some(part => + // Now we know data.childSelectors is defined + data.childSelectors!.some(childSelector => + childSelector.includes(part) + ) + ); + setHighlighterData(isValidMixedSelector ? data : null); + } + else { // if !valid child in normal mode, clear the highlighter setHighlighterData(null); } From 8db6279f05c25e671098f959bfe0b79f5d06cb4f Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Wed, 1 Jan 2025 16:39:36 +0530 Subject: [PATCH 20/23] feat: add shadowDOM support for scraping list --- maxun-core/src/browserSide/scraper.js | 146 ++++++++++++++++++++------ 1 file changed, 113 insertions(+), 33 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 00f8cef7..caa783c8 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -349,27 +349,100 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { const scrapedData = []; - while (scrapedData.length < limit) { - let parentElements = Array.from(document.querySelectorAll(listSelector)); + // Helper function to query through Shadow DOM + const queryShadowDOM = (rootElement, selector) => { + // Split the selector by Shadow DOM delimiter + const parts = selector.split('>>').map(part => part.trim()); + let currentElement = rootElement; + + // Traverse through each part of the selector + for (let i = 0; i < parts.length; i++) { + if (!currentElement) return null; + + // If we're at the document level (first part) + if (!currentElement.querySelector && !currentElement.shadowRoot) { + currentElement = document.querySelector(parts[i]); + continue; + } + + // Try to find element in regular DOM first + let nextElement = currentElement.querySelector(parts[i]); + + // If not found, check shadow DOM + if (!nextElement && currentElement.shadowRoot) { + nextElement = currentElement.shadowRoot.querySelector(parts[i]); + } + + // If still not found, try to find in shadow DOM of all child elements + if (!nextElement) { + const allChildren = Array.from(currentElement.children || []); + for (const child of allChildren) { + if (child.shadowRoot) { + nextElement = child.shadowRoot.querySelector(parts[i]); + if (nextElement) break; + } + } + } + + currentElement = nextElement; + } + + return currentElement; + }; + + // Helper function to query all elements through Shadow DOM + const queryShadowDOMAll = (rootElement, selector) => { + const parts = selector.split('>>').map(part => part.trim()); + let currentElements = [rootElement]; - // If we only got one element or none, try a more generic approach + for (const part of parts) { + const nextElements = []; + + for (const element of currentElements) { + // Check regular DOM + if (element.querySelectorAll) { + nextElements.push(...element.querySelectorAll(part)); + } + + // Check shadow DOM + if (element.shadowRoot) { + nextElements.push(...element.shadowRoot.querySelectorAll(part)); + } + + // Check shadow DOM of children + const children = Array.from(element.children || []); + for (const child of children) { + if (child.shadowRoot) { + nextElements.push(...child.shadowRoot.querySelectorAll(part)); + } + } + } + + currentElements = nextElements; + } + + return currentElements; + }; + + while (scrapedData.length < limit) { + // Use our shadow DOM query function to get parent elements + let parentElements = queryShadowDOMAll(document, listSelector); + parentElements = Array.from(parentElements); + + // Handle the case when we don't find enough elements if (limit > 1 && parentElements.length <= 1) { - const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); - const container = document.querySelector(containerSelector); + const [containerSelector, ...rest] = listSelector.split('>>').map(s => s.trim()); + const container = queryShadowDOM(document, containerSelector); if (container) { - const allChildren = Array.from(container.children); + const allChildren = Array.from(container.children || []); + const firstMatch = queryShadowDOM(document, listSelector); - const firstMatch = document.querySelector(listSelector); if (firstMatch) { - // Get classes from the first matching element - const firstMatchClasses = Array.from(firstMatch.classList); + const firstMatchClasses = Array.from(firstMatch.classList || []); - // Find similar elements by matching most of their classes parentElements = allChildren.filter(element => { - const elementClasses = Array.from(element.classList); - - // Element should share at least 70% of classes with the first match + const elementClasses = Array.from(element.classList || []); const commonClasses = firstMatchClasses.filter(cls => elementClasses.includes(cls)); return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); @@ -378,42 +451,49 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } } - // Iterate through each parent element + // Process each parent element for (const parent of parentElements) { if (scrapedData.length >= limit) break; const record = {}; - // For each field, select the corresponding element within the parent + // Process each field using shadow DOM querying for (const [label, { selector, attribute }] of Object.entries(fields)) { - const fieldElement = parent.querySelector(selector); + // Use relative selector from parent + const relativeSelector = selector.split('>>').slice(-1)[0]; + const fieldElement = queryShadowDOM(parent, relativeSelector); if (fieldElement) { - if (attribute === 'innerText') { - record[label] = fieldElement.innerText.trim(); - } else if (attribute === 'innerHTML') { - record[label] = fieldElement.innerHTML.trim(); - } else if (attribute === 'src') { - // Handle relative 'src' URLs - const src = fieldElement.getAttribute('src'); - record[label] = src ? new URL(src, window.location.origin).href : null; - } else if (attribute === 'href') { - // Handle relative 'href' URLs - const href = fieldElement.getAttribute('href'); - record[label] = href ? new URL(href, window.location.origin).href : null; - } else { - record[label] = fieldElement.getAttribute(attribute); + switch (attribute) { + case 'innerText': + record[label] = fieldElement.innerText?.trim() || ''; + break; + case 'innerHTML': + record[label] = fieldElement.innerHTML?.trim() || ''; + break; + case 'src': + const src = fieldElement.getAttribute('src'); + record[label] = src ? new URL(src, window.location.origin).href : null; + break; + case 'href': + const href = fieldElement.getAttribute('href'); + record[label] = href ? new URL(href, window.location.origin).href : null; + break; + default: + record[label] = fieldElement.getAttribute(attribute); } } } - scrapedData.push(record); + + if (Object.keys(record).length > 0) { + scrapedData.push(record); + } } - // If we've processed all available elements and still haven't reached the limit, - // break to avoid infinite loop if (parentElements.length === 0 || scrapedData.length >= parentElements.length) { break; } } + return scrapedData; }; From c287340f845e9429fc5534dc38af6257b4d75826 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 2 Jan 2025 14:17:19 +0530 Subject: [PATCH 21/23] feat: shadowDOM support for table and non table list scraping --- maxun-core/src/browserSide/scraper.js | 347 +++++++++++++++++++++----- 1 file changed, 281 insertions(+), 66 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index caa783c8..ff5a1938 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -347,33 +347,29 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - const scrapedData = []; - - // Helper function to query through Shadow DOM + // Shadow DOM query functions remain unchanged const queryShadowDOM = (rootElement, selector) => { - // Split the selector by Shadow DOM delimiter + if (!selector.includes('>>')) { + return rootElement.querySelector(selector); + } + const parts = selector.split('>>').map(part => part.trim()); let currentElement = rootElement; - // Traverse through each part of the selector for (let i = 0; i < parts.length; i++) { if (!currentElement) return null; - // If we're at the document level (first part) if (!currentElement.querySelector && !currentElement.shadowRoot) { currentElement = document.querySelector(parts[i]); continue; } - // Try to find element in regular DOM first let nextElement = currentElement.querySelector(parts[i]); - // If not found, check shadow DOM if (!nextElement && currentElement.shadowRoot) { nextElement = currentElement.shadowRoot.querySelector(parts[i]); } - // If still not found, try to find in shadow DOM of all child elements if (!nextElement) { const allChildren = Array.from(currentElement.children || []); for (const child of allChildren) { @@ -390,8 +386,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return currentElement; }; - // Helper function to query all elements through Shadow DOM const queryShadowDOMAll = (rootElement, selector) => { + if (!selector.includes('>>')) { + return rootElement.querySelectorAll(selector); + } + const parts = selector.split('>>').map(part => part.trim()); let currentElements = [rootElement]; @@ -399,17 +398,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const nextElements = []; for (const element of currentElements) { - // Check regular DOM if (element.querySelectorAll) { nextElements.push(...element.querySelectorAll(part)); } - // Check shadow DOM if (element.shadowRoot) { nextElements.push(...element.shadowRoot.querySelectorAll(part)); } - // Check shadow DOM of children const children = Array.from(element.children || []); for (const child of children) { if (child.shadowRoot) { @@ -424,76 +420,295 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return currentElements; }; - while (scrapedData.length < limit) { - // Use our shadow DOM query function to get parent elements - let parentElements = queryShadowDOMAll(document, listSelector); - parentElements = Array.from(parentElements); + // Enhanced table processing helper functions with shadow DOM support + function extractValue(element, attribute) { + if (!element) return null; + + // Check for shadow root first + if (element.shadowRoot) { + const shadowContent = element.shadowRoot.textContent; + if (shadowContent && shadowContent.trim()) { + return shadowContent.trim(); + } + } + + if (attribute === 'innerText') { + return element.innerText.trim(); + } else if (attribute === 'innerHTML') { + return element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + return attrValue ? new URL(attrValue, window.location.origin).href : null; + } + return element.getAttribute(attribute); + } - // Handle the case when we don't find enough elements - if (limit > 1 && parentElements.length <= 1) { - const [containerSelector, ...rest] = listSelector.split('>>').map(s => s.trim()); - const container = queryShadowDOM(document, containerSelector); + function findTableAncestor(element) { + let currentElement = element; + const MAX_DEPTH = 5; + let depth = 0; + + while (currentElement && depth < MAX_DEPTH) { + // Check if current element is in shadow DOM + if (currentElement.getRootNode() instanceof ShadowRoot) { + currentElement = currentElement.getRootNode().host; + continue; + } - if (container) { - const allChildren = Array.from(container.children || []); - const firstMatch = queryShadowDOM(document, listSelector); - - if (firstMatch) { - const firstMatchClasses = Array.from(firstMatch.classList || []); + if (currentElement.tagName === 'TD') { + return { type: 'TD', element: currentElement }; + } else if (currentElement.tagName === 'TR') { + return { type: 'TR', element: currentElement }; + } + currentElement = currentElement.parentElement; + depth++; + } + return null; + } + + function getCellIndex(td) { + let index = 0; + let sibling = td; + + // Handle shadow DOM case + if (td.getRootNode() instanceof ShadowRoot) { + const shadowRoot = td.getRootNode(); + const allCells = Array.from(shadowRoot.querySelectorAll('td')); + return allCells.indexOf(td); + } + + while (sibling = sibling.previousElementSibling) { + index++; + } + return index; + } + + function hasThElement(row, tableFields) { + for (const [label, { selector }] of Object.entries(tableFields)) { + const element = queryShadowDOM(row, selector); + if (element) { + let current = element; + while (current && current !== row) { + // Check if we're in shadow DOM + if (current.getRootNode() instanceof ShadowRoot) { + current = current.getRootNode().host; + continue; + } - parentElements = allChildren.filter(element => { - const elementClasses = Array.from(element.classList || []); - const commonClasses = firstMatchClasses.filter(cls => - elementClasses.includes(cls)); - return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); - }); + if (current.tagName === 'TH') { + return true; + } + current = current.parentElement; } } } + return false; + } - // Process each parent element - for (const parent of parentElements) { - if (scrapedData.length >= limit) break; - const record = {}; + function filterRowsBasedOnTag(rows, tableFields) { + for (const row of rows) { + if (hasThElement(row, tableFields)) { + return rows; + } + } + // Include shadow DOM in TH search + return rows.filter(row => { + const directTH = row.getElementsByTagName('TH').length === 0; + const shadowTH = row.shadowRoot ? + row.shadowRoot.querySelector('th') === null : true; + return directTH && shadowTH; + }); + } - // Process each field using shadow DOM querying - for (const [label, { selector, attribute }] of Object.entries(fields)) { - // Use relative selector from parent - const relativeSelector = selector.split('>>').slice(-1)[0]; - const fieldElement = queryShadowDOM(parent, relativeSelector); + // Class similarity functions remain unchanged + function calculateClassSimilarity(classList1, classList2) { + const set1 = new Set(classList1); + const set2 = new Set(classList2); + const intersection = new Set([...set1].filter(x => set2.has(x))); + const union = new Set([...set1, ...set2]); + return intersection.size / union.size; + } - if (fieldElement) { - switch (attribute) { - case 'innerText': - record[label] = fieldElement.innerText?.trim() || ''; - break; - case 'innerHTML': - record[label] = fieldElement.innerHTML?.trim() || ''; - break; - case 'src': - const src = fieldElement.getAttribute('src'); - record[label] = src ? new URL(src, window.location.origin).href : null; - break; - case 'href': - const href = fieldElement.getAttribute('href'); - record[label] = href ? new URL(href, window.location.origin).href : null; - break; - default: - record[label] = fieldElement.getAttribute(attribute); + function findSimilarElements(baseElement, similarityThreshold = 0.7) { + const baseClasses = Array.from(baseElement.classList); + if (baseClasses.length === 0) return []; + const potentialElements = document.getElementsByTagName(baseElement.tagName); + return Array.from(potentialElements).filter(element => { + if (element === baseElement) return false; + const similarity = calculateClassSimilarity( + baseClasses, + Array.from(element.classList) + ); + return similarity >= similarityThreshold; + }); + } + + // Main scraping logic with shadow DOM support + let containers = queryShadowDOMAll(document, listSelector); + containers = Array.from(containers); + + if (containers.length === 0) return []; + + if (limit > 1 && containers.length === 1) { + const baseContainer = containers[0]; + const similarContainers = findSimilarElements(baseContainer); + + if (similarContainers.length > 0) { + const newContainers = similarContainers.filter(container => + !container.matches(listSelector) + ); + containers = [...containers, ...newContainers]; + } + } + + const containerFields = containers.map(() => ({ + tableFields: {}, + nonTableFields: {} + })); + + // Classify fields + containers.forEach((container, containerIndex) => { + for (const [label, field] of Object.entries(fields)) { + const sampleElement = queryShadowDOM(container, field.selector); + + if (sampleElement) { + const ancestor = findTableAncestor(sampleElement); + if (ancestor) { + containerFields[containerIndex].tableFields[label] = { + ...field, + tableContext: ancestor.type, + cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 + }; + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } + }); + + const tableData = []; + const nonTableData = []; + + // Process table data with shadow DOM support + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + const container = containers[containerIndex]; + const { tableFields } = containerFields[containerIndex]; + + if (Object.keys(tableFields).length > 0) { + const firstField = Object.values(tableFields)[0]; + const firstElement = queryShadowDOM(container, firstField.selector); + let tableContext = firstElement; + + // Find table context including shadow DOM + while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { + if (tableContext.getRootNode() instanceof ShadowRoot) { + tableContext = tableContext.getRootNode().host; + } else { + tableContext = tableContext.parentElement; + } + } + + if (tableContext) { + // Get rows from both regular DOM and shadow DOM + const rows = []; + if (tableContext.shadowRoot) { + rows.push(...tableContext.shadowRoot.getElementsByTagName('TR')); + } + rows.push(...tableContext.getElementsByTagName('TR')); + + const processedRows = filterRowsBasedOnTag(rows, tableFields); + + for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { + const record = {}; + const currentRow = processedRows[rowIndex]; + + for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { + let element = null; + + if (cellIndex >= 0) { + let td = currentRow.children[cellIndex]; + + // Check shadow DOM for td + if (!td && currentRow.shadowRoot) { + const shadowCells = currentRow.shadowRoot.children; + if (shadowCells && shadowCells.length > cellIndex) { + td = shadowCells[cellIndex]; + } + } + + if (td) { + element = queryShadowDOM(td, selector); + + if (!element && selector.split(">").pop().includes('td:nth-child')) { + element = td; + } + + if (!element) { + const tagOnlySelector = selector.split('.')[0]; + element = queryShadowDOM(td, tagOnlySelector); + } + + if (!element) { + let currentElement = td; + while (currentElement && currentElement.children.length > 0) { + let foundContentChild = false; + for (const child of currentElement.children) { + if (extractValue(child, attribute)) { + currentElement = child; + foundContentChild = true; + break; + } + } + if (!foundContentChild) break; + } + element = currentElement; + } + } + } else { + element = queryShadowDOM(currentRow, selector); + } + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + tableData.push(record); } } } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } } + } - if (parentElements.length === 0 || scrapedData.length >= parentElements.length) { - break; + // Non-table data scraping remains unchanged + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + if (nonTableData.length >= limit) break; + + const container = containers[containerIndex]; + const { nonTableFields } = containerFields[containerIndex]; + + if (Object.keys(nonTableFields).length > 0) { + const record = {}; + + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const relativeSelector = selector.split('>>').slice(-1)[0]; + const element = queryShadowDOM(container, relativeSelector); + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + nonTableData.push(record); + } } } + const scrapedData = [...tableData, ...nonTableData]; return scrapedData; }; From 8323593bb09d0b9a869afb825aa58be3944199b9 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 2 Jan 2025 21:18:49 +0530 Subject: [PATCH 22/23] chore: format --- src/components/organisms/BrowserWindow.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/components/organisms/BrowserWindow.tsx b/src/components/organisms/BrowserWindow.tsx index 442b7e50..2a5f7758 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -9,7 +9,6 @@ import { useBrowserSteps, TextStep } from '../../context/browserSteps'; import { useGlobalInfoStore } from '../../context/globalInfo'; import { useTranslation } from 'react-i18next'; - interface ElementInfo { tagName: string; hasOnlyText?: boolean; From e91a3916a0513af15d306adc71fdd68bb9250e7d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 2 Jan 2025 21:19:36 +0530 Subject: [PATCH 23/23] chore: cleanup console logs --- src/components/organisms/BrowserWindow.tsx | 3 --- 1 file changed, 3 deletions(-) diff --git a/src/components/organisms/BrowserWindow.tsx b/src/components/organisms/BrowserWindow.tsx index 2f66e906..ad58a309 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -119,9 +119,6 @@ export const BrowserWindow = () => { const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[] }) => { if (getList === true) { if (listSelector) { - console.log("LIST SELEECTORRRRR: ", listSelector); - console.log("DATA SELEECTORRRRR: ", data.selector); - console.log("CHILDREEENN SELECORRRR: ", data.childSelectors); socket?.emit('listSelector', { selector: listSelector }); const hasValidChildSelectors = Array.isArray(data.childSelectors) && data.childSelectors.length > 0;