From 0d763f78213671d5809102b9b501920921418ffd Mon Sep 17 00:00:00 2001 From: amhsirak Date: Fri, 20 Dec 2024 18:32:08 +0530 Subject: [PATCH 01/15] feat: iframe support for get element info --- server/src/workflow-management/selector.ts | 214 ++++++++++++++++----- 1 file changed, 168 insertions(+), 46 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 240f8921..16979487 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -23,10 +23,8 @@ export const getElementInformation = async ( if (!getList || listSelector !== '') { const elementInfo = await page.evaluate( async ({ x, y }) => { - const el = document.elementFromPoint(x, y) as HTMLElement; - if (el) { - const { parentElement } = el; - const element = parentElement?.tagName === 'A' ? parentElement : el; + // Helper function to get element info + const getElementInfo = (element: HTMLElement) => { let info: { tagName: string; hasOnlyText?: boolean; @@ -36,9 +34,12 @@ export const getElementInformation = async ( attributes?: Record; innerHTML?: string; outerHTML?: string; + fromIframe?: boolean; + iframePath?: string[]; } = { tagName: element?.tagName ?? '', }; + if (element) { info.attributes = Array.from(element.attributes).reduce( (acc, attr) => { @@ -48,7 +49,7 @@ export const getElementInformation = async ( {} as Record ); } - // Gather specific information based on the tag + if (element?.tagName === 'A') { info.url = (element as HTMLAnchorElement).href; info.innerText = element.innerText ?? 
''; @@ -61,29 +62,186 @@ export const getElementInformation = async ( ...info.attributes, selectedValue: selectElement.value, }; - } else if (element?.tagName === 'INPUT' && (element as HTMLInputElement).type === 'time' || (element as HTMLInputElement).type === 'date') { + } else if (element?.tagName === 'INPUT' && + ((element as HTMLInputElement).type === 'time' || + (element as HTMLInputElement).type === 'date')) { info.innerText = (element as HTMLInputElement).value; } else { info.hasOnlyText = element?.children?.length === 0 && element?.innerText?.length > 0; info.innerText = element?.innerText ?? ''; } + info.innerHTML = element.innerHTML; info.outerHTML = element.outerHTML; return info; + }; + + // Helper function to search in iframe + const searchInIframe = ( + iframe: HTMLIFrameElement, + relativeX: number, + relativeY: number, + iframePath: string[] + ) => { + try { + if (!iframe.contentDocument) return null; + + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; + if (!el) return null; + + const { parentElement } = el; + const element = parentElement?.tagName === 'A' ? parentElement : el; + + const info = getElementInfo(element); + info.fromIframe = true; + info.iframePath = iframePath; + + return info; + } catch (e) { + console.warn('Cannot access iframe content:', e); + return null; + } + }; + + const el = document.elementFromPoint(x, y) as HTMLElement; + if (el) { + // Check if the element is an iframe + if (el.tagName === 'IFRAME') { + const iframe = el as HTMLIFrameElement; + const rect = iframe.getBoundingClientRect(); + const relativeX = x - rect.left; + const relativeY = y - rect.top; + + const iframeResult = searchInIframe( + iframe, + relativeX, + relativeY, + [iframe.id || 'unnamed-iframe'] + ); + if (iframeResult) return iframeResult; + } + + const { parentElement } = el; + const element = parentElement?.tagName === 'A' ? 
parentElement : el; + return getElementInfo(element); } return null; }, - { x: coordinates.x, y: coordinates.y }, + { x: coordinates.x, y: coordinates.y } ); return elementInfo; } else { const elementInfo = await page.evaluate( async ({ x, y }) => { + // Helper function to get element info (same as above) + const getElementInfo = (element: HTMLElement) => { + let info: { + tagName: string; + hasOnlyText?: boolean; + innerText?: string; + url?: string; + imageUrl?: string; + attributes?: Record; + innerHTML?: string; + outerHTML?: string; + fromIframe?: boolean; + iframePath?: string[]; + } = { + tagName: element?.tagName ?? '', + }; + + if (element) { + info.attributes = Array.from(element.attributes).reduce( + (acc, attr) => { + acc[attr.name] = attr.value; + return acc; + }, + {} as Record + ); + } + + if (element?.tagName === 'A') { + info.url = (element as HTMLAnchorElement).href; + info.innerText = element.innerText ?? ''; + } else if (element?.tagName === 'IMG') { + info.imageUrl = (element as HTMLImageElement).src; + } else { + info.hasOnlyText = element?.children?.length === 0 && + element?.innerText?.length > 0; + info.innerText = element?.innerText ?? 
''; + } + + info.innerHTML = element.innerHTML; + info.outerHTML = element.outerHTML; + return info; + }; + + // Helper function to search in iframe (same as above) + const searchInIframe = ( + iframe: HTMLIFrameElement, + relativeX: number, + relativeY: number, + iframePath: string[] + ) => { + try { + if (!iframe.contentDocument) return null; + + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; + if (!el) return null; + + let element = el; + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); + + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } + } + + const info = getElementInfo(element); + info.fromIframe = true; + info.iframePath = iframePath; + + return info; + } catch (e) { + console.warn('Cannot access iframe content:', e); + return null; + } + }; + const originalEl = document.elementFromPoint(x, y) as HTMLElement; if (originalEl) { - let element = originalEl; + // Check if the element is an iframe + if (originalEl.tagName === 'IFRAME') { + const iframe = originalEl as HTMLIFrameElement; + const rect = iframe.getBoundingClientRect(); + const relativeX = x - rect.left; + const relativeY = y - rect.top; + + const iframeResult = searchInIframe( + iframe, + relativeX, + relativeY, + [iframe.id || 'unnamed-iframe'] + ); + if (iframeResult) return iframeResult; + } + let element = originalEl; while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -105,47 +263,11 @@ export const 
getElementInformation = async ( } } - let info: { - tagName: string; - hasOnlyText?: boolean; - innerText?: string; - url?: string; - imageUrl?: string; - attributes?: Record; - innerHTML?: string; - outerHTML?: string; - } = { - tagName: element?.tagName ?? '', - }; - - if (element) { - info.attributes = Array.from(element.attributes).reduce( - (acc, attr) => { - acc[attr.name] = attr.value; - return acc; - }, - {} as Record - ); - } - - if (element?.tagName === 'A') { - info.url = (element as HTMLAnchorElement).href; - info.innerText = element.innerText ?? ''; - } else if (element?.tagName === 'IMG') { - info.imageUrl = (element as HTMLImageElement).src; - } else { - info.hasOnlyText = element?.children?.length === 0 && - element?.innerText?.length > 0; - info.innerText = element?.innerText ?? ''; - } - - info.innerHTML = element.innerHTML; - info.outerHTML = element.outerHTML; - return info; + return getElementInfo(element); } return null; }, - { x: coordinates.x, y: coordinates.y }, + { x: coordinates.x, y: coordinates.y } ); return elementInfo; } From 6904933036bc48bc09fc331479efbfe174181c78 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Fri, 20 Dec 2024 20:28:11 +0530 Subject: [PATCH 02/15] feat: iframe support for getRect --- server/src/workflow-management/selector.ts | 189 ++++++++++++++++++--- 1 file changed, 166 insertions(+), 23 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 16979487..83491042 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -292,25 +292,90 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (!getList || listSelector !== '') { const rect = await page.evaluate( async ({ x, y }) => { + // Helper function to convert rectangle to plain object + const getRectangleInfo = (rectangle: DOMRect) => { + const info = { + x: rectangle.x, + y: rectangle.y, + width: rectangle.width, + 
height: rectangle.height, + top: rectangle.top, + right: rectangle.right, + bottom: rectangle.bottom, + left: rectangle.left, + fromIframe: false, + iframePath: [] as string[] + }; + return info; + }; + + // Helper function to search in iframe + const searchInIframe = ( + iframe: HTMLIFrameElement, + relativeX: number, + relativeY: number, + iframePath: string[] + ) => { + try { + if (!iframe.contentDocument) return null; + + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; + if (!el) return null; + + const { parentElement } = el; + const element = parentElement?.tagName === 'A' ? parentElement : el; + const rectangle = element?.getBoundingClientRect(); + + if (rectangle) { + const iframeRect = iframe.getBoundingClientRect(); + const rectInfo = getRectangleInfo(rectangle); + + // Adjust coordinates relative to the main document + rectInfo.x += iframeRect.x; + rectInfo.y += iframeRect.y; + rectInfo.top += iframeRect.top; + rectInfo.right += iframeRect.left; + rectInfo.bottom += iframeRect.top; + rectInfo.left += iframeRect.left; + rectInfo.fromIframe = true; + rectInfo.iframePath = iframePath; + + return rectInfo; + } + return null; + } catch (e) { + console.warn('Cannot access iframe content:', e); + return null; + } + }; + const el = document.elementFromPoint(x, y) as HTMLElement; if (el) { + // Check if the element is an iframe + if (el.tagName === 'IFRAME') { + const iframe = el as HTMLIFrameElement; + const rect = iframe.getBoundingClientRect(); + const relativeX = x - rect.left; + const relativeY = y - rect.top; + + const iframeResult = searchInIframe( + iframe, + relativeX, + relativeY, + [iframe.id || 'unnamed-iframe'] + ); + if (iframeResult) return iframeResult; + } + const { parentElement } = el; - // Match the logic in recorder.ts for link clicks const element = parentElement?.tagName === 'A' ? 
parentElement : el; const rectangle = element?.getBoundingClientRect(); + if (rectangle) { - return { - x: rectangle.x, - y: rectangle.y, - width: rectangle.width, - height: rectangle.height, - top: rectangle.top, - right: rectangle.right, - bottom: rectangle.bottom, - left: rectangle.left, - }; + return getRectangleInfo(rectangle); } } + return null; }, { x: coordinates.x, y: coordinates.y }, ); @@ -318,10 +383,98 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector } else { const rect = await page.evaluate( async ({ x, y }) => { + // Helper function to convert rectangle to plain object (same as above) + const getRectangleInfo = (rectangle: DOMRect) => ({ + x: rectangle.x, + y: rectangle.y, + width: rectangle.width, + height: rectangle.height, + top: rectangle.top, + right: rectangle.right, + bottom: rectangle.bottom, + left: rectangle.left, + fromIframe: false, + iframePath: [] as string[] + }); + + // Helper function to search in iframe (same as above) + const searchInIframe = ( + iframe: HTMLIFrameElement, + relativeX: number, + relativeY: number, + iframePath: string[] + ) => { + try { + if (!iframe.contentDocument) return null; + + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; + if (!el) return null; + + let element = el; + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); + + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } + } + + const rectangle = element?.getBoundingClientRect(); + if (rectangle) { + const iframeRect = 
iframe.getBoundingClientRect(); + const rectInfo = getRectangleInfo(rectangle); + + // Adjust coordinates relative to the main document + rectInfo.x += iframeRect.x; + rectInfo.y += iframeRect.y; + rectInfo.top += iframeRect.top; + rectInfo.right += iframeRect.left; + rectInfo.bottom += iframeRect.top; + rectInfo.left += iframeRect.left; + rectInfo.fromIframe = true; + rectInfo.iframePath = iframePath; + + return rectInfo; + } + return null; + } catch (e) { + console.warn('Cannot access iframe content:', e); + return null; + } + }; + const originalEl = document.elementFromPoint(x, y) as HTMLElement; if (originalEl) { - let element = originalEl; + // Check if the element is an iframe + if (originalEl.tagName === 'IFRAME') { + const iframe = originalEl as HTMLIFrameElement; + const rect = iframe.getBoundingClientRect(); + const relativeX = x - rect.left; + const relativeY = y - rect.top; + + const iframeResult = searchInIframe( + iframe, + relativeX, + relativeY, + [iframe.id || 'unnamed-iframe'] + ); + if (iframeResult) return iframeResult; + } + let element = originalEl; while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -344,18 +497,8 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector } const rectangle = element?.getBoundingClientRect(); - if (rectangle) { - return { - x: rectangle.x, - y: rectangle.y, - width: rectangle.width, - height: rectangle.height, - top: rectangle.top, - right: rectangle.right, - bottom: rectangle.bottom, - left: rectangle.left, - }; + return getRectangleInfo(rectangle); } } return null; From 8ba928dae6cac4d7e5924bcc799e792068e6734d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Fri, 20 Dec 2024 20:28:24 +0530 Subject: [PATCH 03/15] chore: fix format --- server/src/workflow-management/selector.ts | 76 +++++++++++----------- 1 file changed, 38 insertions(+), 38 deletions(-) diff --git 
a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 83491042..dd869f3d 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -62,9 +62,9 @@ export const getElementInformation = async ( ...info.attributes, selectedValue: selectElement.value, }; - } else if (element?.tagName === 'INPUT' && - ((element as HTMLInputElement).type === 'time' || - (element as HTMLInputElement).type === 'date')) { + } else if (element?.tagName === 'INPUT' && + ((element as HTMLInputElement).type === 'time' || + (element as HTMLInputElement).type === 'date')) { info.innerText = (element as HTMLInputElement).value; } else { info.hasOnlyText = element?.children?.length === 0 && @@ -79,20 +79,20 @@ export const getElementInformation = async ( // Helper function to search in iframe const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, + iframe: HTMLIFrameElement, + relativeX: number, relativeY: number, iframePath: string[] ) => { try { if (!iframe.contentDocument) return null; - + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; if (!el) return null; const { parentElement } = el; const element = parentElement?.tagName === 'A' ? 
parentElement : el; - + const info = getElementInfo(element); info.fromIframe = true; info.iframePath = iframePath; @@ -112,11 +112,11 @@ export const getElementInformation = async ( const rect = iframe.getBoundingClientRect(); const relativeX = x - rect.left; const relativeY = y - rect.top; - + const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, + iframe, + relativeX, + relativeY, [iframe.id || 'unnamed-iframe'] ); if (iframeResult) return iframeResult; @@ -179,14 +179,14 @@ export const getElementInformation = async ( // Helper function to search in iframe (same as above) const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, + iframe: HTMLIFrameElement, + relativeX: number, relativeY: number, iframePath: string[] ) => { try { if (!iframe.contentDocument) return null; - + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; if (!el) return null; @@ -231,11 +231,11 @@ export const getElementInformation = async ( const rect = iframe.getBoundingClientRect(); const relativeX = x - rect.left; const relativeY = y - rect.top; - + const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, + iframe, + relativeX, + relativeY, [iframe.id || 'unnamed-iframe'] ); if (iframeResult) return iframeResult; @@ -311,25 +311,25 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector // Helper function to search in iframe const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, + iframe: HTMLIFrameElement, + relativeX: number, relativeY: number, iframePath: string[] ) => { try { if (!iframe.contentDocument) return null; - + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; if (!el) return null; const { parentElement } = el; const element = parentElement?.tagName === 'A' ? 
parentElement : el; const rectangle = element?.getBoundingClientRect(); - + if (rectangle) { const iframeRect = iframe.getBoundingClientRect(); const rectInfo = getRectangleInfo(rectangle); - + // Adjust coordinates relative to the main document rectInfo.x += iframeRect.x; rectInfo.y += iframeRect.y; @@ -339,7 +339,7 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector rectInfo.left += iframeRect.left; rectInfo.fromIframe = true; rectInfo.iframePath = iframePath; - + return rectInfo; } return null; @@ -357,11 +357,11 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector const rect = iframe.getBoundingClientRect(); const relativeX = x - rect.left; const relativeY = y - rect.top; - + const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, + iframe, + relativeX, + relativeY, [iframe.id || 'unnamed-iframe'] ); if (iframeResult) return iframeResult; @@ -370,7 +370,7 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector const { parentElement } = el; const element = parentElement?.tagName === 'A' ? 
parentElement : el; const rectangle = element?.getBoundingClientRect(); - + if (rectangle) { return getRectangleInfo(rectangle); } @@ -399,14 +399,14 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector // Helper function to search in iframe (same as above) const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, + iframe: HTMLIFrameElement, + relativeX: number, relativeY: number, iframePath: string[] ) => { try { if (!iframe.contentDocument) return null; - + const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; if (!el) return null; @@ -436,7 +436,7 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (rectangle) { const iframeRect = iframe.getBoundingClientRect(); const rectInfo = getRectangleInfo(rectangle); - + // Adjust coordinates relative to the main document rectInfo.x += iframeRect.x; rectInfo.y += iframeRect.y; @@ -446,7 +446,7 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector rectInfo.left += iframeRect.left; rectInfo.fromIframe = true; rectInfo.iframePath = iframePath; - + return rectInfo; } return null; @@ -464,11 +464,11 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector const rect = iframe.getBoundingClientRect(); const relativeX = x - rect.left; const relativeY = y - rect.top; - + const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, + iframe, + relativeX, + relativeY, [iframe.id || 'unnamed-iframe'] ); if (iframeResult) return iframeResult; From b6faf5cf17736dcc99ffa0b146031f23ccc55f80 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 2 Jan 2025 19:35:03 +0530 Subject: [PATCH 04/15] feat: add iframeSelector generation logic for capture text --- server/src/workflow-management/selector.ts | 471 +++++++++++---------- 1 file changed, 242 insertions(+), 229 deletions(-) diff --git a/server/src/workflow-management/selector.ts 
b/server/src/workflow-management/selector.ts index dd869f3d..6ed6a997 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -23,247 +23,110 @@ export const getElementInformation = async ( if (!getList || listSelector !== '') { const elementInfo = await page.evaluate( async ({ x, y }) => { - // Helper function to get element info - const getElementInfo = (element: HTMLElement) => { - let info: { - tagName: string; - hasOnlyText?: boolean; - innerText?: string; - url?: string; - imageUrl?: string; - attributes?: Record; - innerHTML?: string; - outerHTML?: string; - fromIframe?: boolean; - iframePath?: string[]; - } = { - tagName: element?.tagName ?? '', - }; + // Helper function to find elements within iframes, handling nested cases + const getElementFromIframePoint = ( + x: number, + y: number, + context: Document = document, + iframePath: string[] = [] + ): { element: HTMLElement | null; iframePath: string[] } => { + // First try to get element at the given coordinates + let element = context.elementFromPoint(x, y) as HTMLElement; + if (!element) return { element: null, iframePath }; - if (element) { - info.attributes = Array.from(element.attributes).reduce( - (acc, attr) => { - acc[attr.name] = attr.value; - return acc; - }, - {} as Record - ); - } - - if (element?.tagName === 'A') { - info.url = (element as HTMLAnchorElement).href; - info.innerText = element.innerText ?? ''; - } else if (element?.tagName === 'IMG') { - info.imageUrl = (element as HTMLImageElement).src; - } else if (element?.tagName === 'SELECT') { - const selectElement = element as HTMLSelectElement; - info.innerText = selectElement.options[selectElement.selectedIndex]?.text ?? 
''; - info.attributes = { - ...info.attributes, - selectedValue: selectElement.value, - }; - } else if (element?.tagName === 'INPUT' && - ((element as HTMLInputElement).type === 'time' || - (element as HTMLInputElement).type === 'date')) { - info.innerText = (element as HTMLInputElement).value; - } else { - info.hasOnlyText = element?.children?.length === 0 && - element?.innerText?.length > 0; - info.innerText = element?.innerText ?? ''; - } - - info.innerHTML = element.innerHTML; - info.outerHTML = element.outerHTML; - return info; - }; - - // Helper function to search in iframe - const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, - relativeY: number, - iframePath: string[] - ) => { - try { - if (!iframe.contentDocument) return null; - - const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; - if (!el) return null; - - const { parentElement } = el; - const element = parentElement?.tagName === 'A' ? parentElement : el; - - const info = getElementInfo(element); - info.fromIframe = true; - info.iframePath = iframePath; - - return info; - } catch (e) { - console.warn('Cannot access iframe content:', e); - return null; - } - }; - - const el = document.elementFromPoint(x, y) as HTMLElement; - if (el) { - // Check if the element is an iframe - if (el.tagName === 'IFRAME') { - const iframe = el as HTMLIFrameElement; - const rect = iframe.getBoundingClientRect(); - const relativeX = x - rect.left; - const relativeY = y - rect.top; - - const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, - [iframe.id || 'unnamed-iframe'] - ); - if (iframeResult) return iframeResult; - } - - const { parentElement } = el; - const element = parentElement?.tagName === 'A' ? 
parentElement : el; - return getElementInfo(element); - } - return null; - }, - { x: coordinates.x, y: coordinates.y } - ); - return elementInfo; - } else { - const elementInfo = await page.evaluate( - async ({ x, y }) => { - // Helper function to get element info (same as above) - const getElementInfo = (element: HTMLElement) => { - let info: { - tagName: string; - hasOnlyText?: boolean; - innerText?: string; - url?: string; - imageUrl?: string; - attributes?: Record; - innerHTML?: string; - outerHTML?: string; - fromIframe?: boolean; - iframePath?: string[]; - } = { - tagName: element?.tagName ?? '', - }; - - if (element) { - info.attributes = Array.from(element.attributes).reduce( - (acc, attr) => { - acc[attr.name] = attr.value; - return acc; - }, - {} as Record - ); - } - - if (element?.tagName === 'A') { - info.url = (element as HTMLAnchorElement).href; - info.innerText = element.innerText ?? ''; - } else if (element?.tagName === 'IMG') { - info.imageUrl = (element as HTMLImageElement).src; - } else { - info.hasOnlyText = element?.children?.length === 0 && - element?.innerText?.length > 0; - info.innerText = element?.innerText ?? 
''; - } - - info.innerHTML = element.innerHTML; - info.outerHTML = element.outerHTML; - return info; - }; - - // Helper function to search in iframe (same as above) - const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, - relativeY: number, - iframePath: string[] - ) => { - try { - if (!iframe.contentDocument) return null; - - const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; - if (!el) return null; - - let element = el; - while (element.parentElement) { - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); - - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; - - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; - - if (fullyContained && significantOverlap) { - element = element.parentElement; - } else { - break; + // Check if we found an iframe + if (element.tagName === 'IFRAME') { + const iframe = element as HTMLIFrameElement; + try { + // Make sure we can access the iframe's content + if (!iframe.contentDocument) { + return { element, iframePath }; } + + // Transform coordinates to iframe's space + const rect = iframe.getBoundingClientRect(); + const relativeX = x - rect.left; + const relativeY = y - rect.top; + + // Add this iframe to the path + const updatedPath = [...iframePath, iframe.id || 'unnamed-iframe']; + + // Recursively search within the iframe + const iframeResult = getElementFromIframePoint( + relativeX, + relativeY, + iframe.contentDocument, + updatedPath + ); + + // If we found an element in the iframe, return it + if (iframeResult.element) { + return iframeResult; + } + } catch (e) { + console.warn('Cannot access iframe content:', e); } - - const info = getElementInfo(element); - info.fromIframe = true; - info.iframePath = 
iframePath; - - return info; - } catch (e) { - console.warn('Cannot access iframe content:', e); - return null; } + + // Return the element we found (either in main document or iframe) + return { element, iframePath }; }; - const originalEl = document.elementFromPoint(x, y) as HTMLElement; - if (originalEl) { - // Check if the element is an iframe - if (originalEl.tagName === 'IFRAME') { - const iframe = originalEl as HTMLIFrameElement; - const rect = iframe.getBoundingClientRect(); - const relativeX = x - rect.left; - const relativeY = y - rect.top; + // Get the element and its iframe path + const { element: el, iframePath } = getElementFromIframePoint(x, y); + + if (el) { + // Handle potential anchor parent + const { parentElement } = el; + const targetElement = parentElement?.tagName === 'A' ? parentElement : el; - const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, - [iframe.id || 'unnamed-iframe'] + // Build the element information object + let info: { + tagName: string; + hasOnlyText?: boolean; + innerText?: string; + url?: string; + imageUrl?: string; + attributes?: Record; + innerHTML?: string; + outerHTML?: string; + fromIframe?: boolean; + iframePath?: string[]; + } = { + tagName: targetElement?.tagName ?? '', + fromIframe: iframePath.length > 0, + iframePath: iframePath.length > 0 ? 
iframePath : undefined + }; + + // Collect element attributes and properties + if (targetElement) { + // Get all attributes + info.attributes = Array.from(targetElement.attributes).reduce( + (acc, attr) => { + acc[attr.name] = attr.value; + return acc; + }, + {} as Record ); - if (iframeResult) return iframeResult; - } - let element = originalEl; - while (element.parentElement) { - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); - - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; - - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; - - if (fullyContained && significantOverlap) { - element = element.parentElement; + // Handle specific element types + if (targetElement.tagName === 'A') { + info.url = (targetElement as HTMLAnchorElement).href; + info.innerText = targetElement.textContent ?? ''; + } else if (targetElement.tagName === 'IMG') { + info.imageUrl = (targetElement as HTMLImageElement).src; } else { - break; + info.hasOnlyText = targetElement.children.length === 0 && + (targetElement.textContent !== null && + targetElement.textContent.trim().length > 0); + info.innerText = targetElement.textContent ?? ''; } + + info.innerHTML = targetElement.innerHTML; + info.outerHTML = targetElement.outerHTML; } - return getElementInfo(element); + return info; } return null; }, @@ -271,6 +134,7 @@ export const getElementInformation = async ( ); return elementInfo; } + // ... 
rest of the code remains same } catch (error) { const { message, stack } = error as Error; console.error('Error while retrieving selector:', message); @@ -984,6 +848,148 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { } return output; } + + const getIframeOffset = (iframe: HTMLIFrameElement): { x: number; y: number } => { + const rect = iframe.getBoundingClientRect(); + return { + x: rect.left, + y: rect.top + }; + }; + + const isAccessibleIframe = (iframe: HTMLIFrameElement): boolean => { + try { + return !!iframe.contentDocument; + } catch (e) { + return false; + } + }; + + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + // Get the initial element at the specified coordinates + let currentElement = document.elementFromPoint(x, y) as HTMLElement; + if (!currentElement) return null; + + let deepestElement = currentElement; + let current = currentElement; + let currentX = x; + let currentY = y; + let depth = 0; + const MAX_DEPTH = 20; // Prevent infinite loops with deeply nested iframes + + // Continue traversing while we find nested iframes + while (current && depth < MAX_DEPTH) { + // Check if the current element is an iframe and if we can access it + if (current instanceof HTMLIFrameElement && isAccessibleIframe(current)) { + // Calculate the offset of the iframe + const iframeOffset = getIframeOffset(current); + + // Transform coordinates to be relative to the iframe's content window + const relativeX = currentX - iframeOffset.x; + const relativeY = currentY - iframeOffset.y; + + // Find the element at these coordinates within the iframe + const iframeElement = current.contentDocument?.elementFromPoint(relativeX, relativeY) as HTMLElement; + + // If we don't find an element or we get the same element, stop traversing + if (!iframeElement || iframeElement === current) break; + + // Update our tracking variables + deepestElement = iframeElement; + current = iframeElement; + currentX = relativeX; 
+ currentY = relativeY; + depth++; + } else { + // If the current element is not an iframe, we're done traversing + break; + } + } + + return deepestElement; + }; + + interface IframeContext { + frame: HTMLIFrameElement; + document: Document; + element: HTMLElement; + } + + const genSelectorForIframe = (element: HTMLElement) => { + // Helper function to check if we can access an iframe's content + const isAccessibleIframe = (iframe: HTMLIFrameElement): boolean => { + try { + return !!iframe.contentDocument; + } catch (e) { + return false; + } + }; + + // Get complete path up through nested iframes to document root + const getIframePath = (el: HTMLElement) => { + const path: IframeContext[] = []; + let current = el; + let currentDoc = el.ownerDocument; + let depth = 0; + const MAX_DEPTH = 20; // Limit depth to prevent infinite loops + + while (current && depth < MAX_DEPTH) { + // If we're in an iframe, get its parent document + const frameElement = currentDoc.defaultView?.frameElement as HTMLIFrameElement; + if (frameElement && isAccessibleIframe(frameElement)) { + path.unshift({ + frame: frameElement, + document: currentDoc, + element: current + }); + current = frameElement; + currentDoc = frameElement.ownerDocument; + depth++; + } else { + break; + } + } + return path; + }; + + // Get the iframe path for our target element + const iframePath = getIframePath(element); + if (iframePath.length === 0) return null; + + try { + const selectorParts: string[] = []; + + // Generate selector for each iframe boundary + iframePath.forEach((context, index) => { + // Get selector for the iframe element in its parent document + const frameSelector = finder(context.frame, { + root: index === 0 ? 
document.body : (iframePath[index - 1].document.body as Element) + }); + + // For the last context, get selector for target element + if (index === iframePath.length - 1) { + const elementSelector = finder(element, { + root: context.document.body as Element + }); + // Use :>> for iframe traversal in the selector + selectorParts.push(`${frameSelector} :>> ${elementSelector}`); + } else { + selectorParts.push(frameSelector); + } + }); + + return { + // Join all parts with :>> to indicate iframe traversal + fullSelector: selectorParts.join(' :>> '), + // Include additional metadata about the frames if needed + frameCount: iframePath.length, + isAccessible: true + }; + } catch (e) { + console.warn('Error generating iframe selector:', e); + return null; + } + }; const genSelectors = (element: HTMLElement | null) => { if (element == null) { @@ -1004,6 +1010,8 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { } catch (e) { } + const iframeSelector = genSelectorForIframe(element); + const hrefSelector = genSelectorForAttributes(element, ['href']); const formSelector = genSelectorForAttributes(element, [ 'name', @@ -1050,6 +1058,11 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { hrefSelector, accessibilitySelector, formSelector, + iframeSelector: iframeSelector ? 
{ + full: iframeSelector.fullSelector, + frame: iframeSelector.frameCount, + accesible: iframeSelector.isAccessible + } : null }; } @@ -1092,7 +1105,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { return char.length === 1 && char.match(/[0-9]/); } - const hoveredElement = document.elementFromPoint(x, y) as HTMLElement; + const hoveredElement = getDeepestElementFromPoint(x, y) as HTMLElement; if ( hoveredElement != null && !hoveredElement.closest('#overlay-controls') != null From 3f73a48c31eb72a398690210a2d0767e206ad204 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 09:01:13 +0530 Subject: [PATCH 05/15] feat: add nested iframe selector generation func for capture text --- server/src/workflow-management/selector.ts | 221 +++++++++------------ 1 file changed, 95 insertions(+), 126 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 6ed6a997..6c955934 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -849,145 +849,115 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { return output; } - const getIframeOffset = (iframe: HTMLIFrameElement): { x: number; y: number } => { - const rect = iframe.getBoundingClientRect(); - return { - x: rect.left, - y: rect.top - }; - }; - - const isAccessibleIframe = (iframe: HTMLIFrameElement): boolean => { - try { - return !!iframe.contentDocument; - } catch (e) { - return false; - } - }; - const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { - // Get the initial element at the specified coordinates - let currentElement = document.elementFromPoint(x, y) as HTMLElement; - if (!currentElement) return null; - - let deepestElement = currentElement; - let current = currentElement; - let currentX = x; - let currentY = y; + // First, get the element at the specified coordinates in the main document + let element = 
document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // Check if the element is an iframe + if (element.tagName !== 'IFRAME') return element; + + let currentIframe = element as HTMLIFrameElement; + let deepestElement = element; let depth = 0; - const MAX_DEPTH = 20; // Prevent infinite loops with deeply nested iframes - - // Continue traversing while we find nested iframes - while (current && depth < MAX_DEPTH) { - // Check if the current element is an iframe and if we can access it - if (current instanceof HTMLIFrameElement && isAccessibleIframe(current)) { - // Calculate the offset of the iframe - const iframeOffset = getIframeOffset(current); - - // Transform coordinates to be relative to the iframe's content window - const relativeX = currentX - iframeOffset.x; - const relativeY = currentY - iframeOffset.y; - - // Find the element at these coordinates within the iframe - const iframeElement = current.contentDocument?.elementFromPoint(relativeX, relativeY) as HTMLElement; - - // If we don't find an element or we get the same element, stop traversing - if (!iframeElement || iframeElement === current) break; - - // Update our tracking variables - deepestElement = iframeElement; - current = iframeElement; - currentX = relativeX; - currentY = relativeY; - depth++; - } else { - // If the current element is not an iframe, we're done traversing - break; - } + const MAX_DEPTH = 4; // Limit the depth of nested iframes to prevent infinite loops + + while (currentIframe && depth < MAX_DEPTH) { + try { + // Convert coordinates from main document to iframe's coordinate system + const iframeRect = currentIframe.getBoundingClientRect(); + const iframeX = x - iframeRect.left; + const iframeY = y - iframeRect.top; + + // Access the iframe's content document and get the element at the transformed coordinates + const iframeDoc = currentIframe.contentDocument || currentIframe.contentWindow?.document; + if (!iframeDoc) break; + + const iframeElement = 
iframeDoc.elementFromPoint(iframeX, iframeY) as HTMLElement; + if (!iframeElement) break; + + // If the element found is another iframe, continue traversing + if (iframeElement.tagName === 'IFRAME') { + deepestElement = iframeElement; + currentIframe = iframeElement as HTMLIFrameElement; + depth++; + } else { + // If it's not an iframe, we've found our deepest element + deepestElement = iframeElement; + break; + } + } catch (error) { + // Handle potential security errors when accessing cross-origin iframes + console.warn('Cannot access iframe content:', error); + break; + } } - return deepestElement; }; - - interface IframeContext { - frame: HTMLIFrameElement; - document: Document; - element: HTMLElement; - } const genSelectorForIframe = (element: HTMLElement) => { - // Helper function to check if we can access an iframe's content - const isAccessibleIframe = (iframe: HTMLIFrameElement): boolean => { - try { - return !!iframe.contentDocument; - } catch (e) { - return false; - } - }; - - // Get complete path up through nested iframes to document root + // Helper function to get the complete iframe path up to document root const getIframePath = (el: HTMLElement) => { - const path: IframeContext[] = []; - let current = el; - let currentDoc = el.ownerDocument; - let depth = 0; - const MAX_DEPTH = 20; // Limit depth to prevent infinite loops - - while (current && depth < MAX_DEPTH) { - // If we're in an iframe, get its parent document - const frameElement = currentDoc.defaultView?.frameElement as HTMLIFrameElement; - if (frameElement && isAccessibleIframe(frameElement)) { - path.unshift({ - frame: frameElement, - document: currentDoc, - element: current - }); - current = frameElement; - currentDoc = frameElement.ownerDocument; - depth++; - } else { - break; + const path = []; + let current = el; + let depth = 0; + const MAX_DEPTH = 4; + + while (current && depth < MAX_DEPTH) { + // Get the owner document of the current element + const ownerDocument = 
current.ownerDocument; + + // Check if this document belongs to an iframe + const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement; + + if (frameElement) { + path.unshift({ + frame: frameElement, + document: ownerDocument, + element: current + }); + // Move up to the parent document's element (the iframe) + current = frameElement; + depth++; + } else { + break; + } } - } - return path; + return path; }; - - // Get the iframe path for our target element + const iframePath = getIframePath(element); if (iframePath.length === 0) return null; - + try { - const selectorParts: string[] = []; - - // Generate selector for each iframe boundary - iframePath.forEach((context, index) => { - // Get selector for the iframe element in its parent document - const frameSelector = finder(context.frame, { - root: index === 0 ? document.body : (iframePath[index - 1].document.body as Element) - }); + const selectorParts: string[] = []; - // For the last context, get selector for target element - if (index === iframePath.length - 1) { - const elementSelector = finder(element, { - root: context.document.body as Element - }); - // Use :>> for iframe traversal in the selector - selectorParts.push(`${frameSelector} :>> ${elementSelector}`); - } else { - selectorParts.push(frameSelector); - } - }); - - return { - // Join all parts with :>> to indicate iframe traversal - fullSelector: selectorParts.join(' :>> '), - // Include additional metadata about the frames if needed - frameCount: iframePath.length, - isAccessible: true - }; + // Generate selector for each iframe boundary + iframePath.forEach((context, index) => { + // Get selector for the iframe element + const frameSelector = finder(context.frame, { + root: index === 0 ? 
document.body : + (iframePath[index - 1].document.body as Element) + }); + + // For the last context, get selector for target element + if (index === iframePath.length - 1) { + const elementSelector = finder(element, { + root: context.document.body as Element + }); + selectorParts.push(`${frameSelector} :>> ${elementSelector}`); + } else { + selectorParts.push(frameSelector); + } + }); + + return { + fullSelector: selectorParts.join(' :>> '), + isFrameContent: true + }; } catch (e) { - console.warn('Error generating iframe selector:', e); - return null; + console.warn('Error generating iframe selector:', e); + return null; } }; @@ -1060,8 +1030,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => { formSelector, iframeSelector: iframeSelector ? { full: iframeSelector.fullSelector, - frame: iframeSelector.frameCount, - accesible: iframeSelector.isAccessible + isIframe: iframeSelector.isFrameContent, } : null }; } From 9eb4ec398aee764d539b117c2b956c16904d645a Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 09:13:17 +0530 Subject: [PATCH 06/15] feat: add iframe selector type for selectors --- server/src/types/index.ts | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/server/src/types/index.ts b/server/src/types/index.ts index f2e327ef..a2673d1d 100644 --- a/server/src/types/index.ts +++ b/server/src/types/index.ts @@ -129,6 +129,11 @@ export interface BaseActionInfo { hasOnlyText: boolean; } +interface IframeSelector { + full: string; + isIframe: boolean; +} + /** * Holds all the possible css selectors that has been found for an element. 
* @category Types @@ -143,6 +148,7 @@ export interface Selectors { hrefSelector: string|null; accessibilitySelector: string|null; formSelector: string|null; + iframeSelector: IframeSelector|null; } /** @@ -156,7 +162,7 @@ export interface BaseAction extends BaseActionInfo{ associatedActions: ActionType[]; inputType: string | undefined; value: string | undefined; - selectors: { [key: string]: string | null }; + selectors: Selectors; timestamp: number; isPassword: boolean; /** From 2943681c9bb53dea43b1a915850a0ea1a5f7ea51 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 09:14:05 +0530 Subject: [PATCH 07/15] feat: prioritize returning iframe selector if exists --- server/src/workflow-management/utils.ts | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/server/src/workflow-management/utils.ts b/server/src/workflow-management/utils.ts index b3dadd60..53006c78 100644 --- a/server/src/workflow-management/utils.ts +++ b/server/src/workflow-management/utils.ts @@ -12,6 +12,11 @@ export const getBestSelectorForAction = (action: Action) => { case ActionType.Hover: case ActionType.DragAndDrop: { const selectors = action.selectors; + + if (selectors?.iframeSelector?.full) { + return selectors.iframeSelector.full; + } + // less than 25 characters, and element only has text inside const textSelector = selectors?.text?.length != null && From 360fe63938ad61892886e52c60f8f3189f7cce0a Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 09:37:44 +0530 Subject: [PATCH 08/15] feat: get deeply nested iframe element info --- server/src/workflow-management/selector.ts | 297 +++++++++++++++++---- 1 file changed, 246 insertions(+), 51 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 6c955934..48229da7 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -24,62 +24,67 @@ export const getElementInformation = async ( const elementInfo = 
await page.evaluate( async ({ x, y }) => { // Helper function to find elements within iframes, handling nested cases - const getElementFromIframePoint = ( - x: number, - y: number, - context: Document = document, - iframePath: string[] = [] - ): { element: HTMLElement | null; iframePath: string[] } => { - // First try to get element at the given coordinates - let element = context.elementFromPoint(x, y) as HTMLElement; - if (!element) return { element: null, iframePath }; - - // Check if we found an iframe - if (element.tagName === 'IFRAME') { - const iframe = element as HTMLIFrameElement; - try { - // Make sure we can access the iframe's content - if (!iframe.contentDocument) { - return { element, iframePath }; + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + // First, get the element at the clicked coordinates in the main document + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // If it's not an iframe, return the element as is + if (element.tagName !== 'IFRAME') return element; + + // Initialize tracking variables for iframe traversal + let currentIframe = element as HTMLIFrameElement; + let deepestElement = element; + + // Continue traversing while we have a valid iframe + while (currentIframe) { + try { + // Convert the coordinates from main document space to iframe's local space + const iframeRect = currentIframe.getBoundingClientRect(); + const iframeX = x - iframeRect.left; + const iframeY = y - iframeRect.top; + + // Get the iframe's document object - this gives us access to the iframe's content + const iframeDocument = currentIframe.contentDocument || currentIframe.contentWindow?.document; + if (!iframeDocument) break; + + // Find the element at the transformed coordinates within the iframe + const iframeElement = iframeDocument.elementFromPoint(iframeX, iframeY) as HTMLElement; + + // If no element found or it's the same as current, stop traversing + if 
(!iframeElement) break; + + // Update our tracking of the deepest element + deepestElement = iframeElement; + + // If we found another iframe, continue traversing through it + if (iframeElement.tagName === 'IFRAME') { + currentIframe = iframeElement as HTMLIFrameElement; + } else { + // If it's not an iframe, we've reached the deepest level + break; + } + } catch (error) { + // Handle potential cross-origin security restrictions + console.warn('Cannot access iframe content:', error); + break; } - - // Transform coordinates to iframe's space - const rect = iframe.getBoundingClientRect(); - const relativeX = x - rect.left; - const relativeY = y - rect.top; - - // Add this iframe to the path - const updatedPath = [...iframePath, iframe.id || 'unnamed-iframe']; - - // Recursively search within the iframe - const iframeResult = getElementFromIframePoint( - relativeX, - relativeY, - iframe.contentDocument, - updatedPath - ); - - // If we found an element in the iframe, return it - if (iframeResult.element) { - return iframeResult; - } - } catch (e) { - console.warn('Cannot access iframe content:', e); - } } - - // Return the element we found (either in main document or iframe) - return { element, iframePath }; + return deepestElement; }; // Get the element and its iframe path - const { element: el, iframePath } = getElementFromIframePoint(x, y); + const el = getDeepestElementFromPoint(x, y); if (el) { // Handle potential anchor parent const { parentElement } = el; const targetElement = parentElement?.tagName === 'A' ? 
parentElement : el; + const ownerDocument = targetElement.ownerDocument; + const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement; + const isIframeContent = Boolean(frameElement); + // Build the element information object let info: { tagName: string; @@ -90,14 +95,42 @@ export const getElementInformation = async ( attributes?: Record; innerHTML?: string; outerHTML?: string; - fromIframe?: boolean; - iframePath?: string[]; + isIframeContent?: boolean; + iframeURL?: string; + iframeIndex?: number; + frameHierarchy?: string[]; } = { - tagName: targetElement?.tagName ?? '', - fromIframe: iframePath.length > 0, - iframePath: iframePath.length > 0 ? iframePath : undefined + tagName: targetElement?.tagName ?? '', + isIframeContent: isIframeContent }; + if (isIframeContent) { + // Include iframe specific information + info.iframeURL = frameElement.src; + + // Calculate the frame's position in the hierarchy + let currentFrame = frameElement; + const frameHierarchy: string[] = []; + let frameIndex = 0; + + while (currentFrame) { + // Store the frame's identifier (src, id, or index) + frameHierarchy.unshift( + currentFrame.id || + currentFrame.src || + `iframe[${frameIndex}]` + ); + + // Move up to parent frame if it exists + const parentDoc = currentFrame.ownerDocument; + currentFrame = parentDoc?.defaultView?.frameElement as HTMLIFrameElement; + frameIndex++; + } + + info.frameHierarchy = frameHierarchy; + info.iframeIndex = frameIndex - 1; // Adjust for 0-based index + } + // Collect element attributes and properties if (targetElement) { // Get all attributes @@ -133,8 +166,170 @@ export const getElementInformation = async ( { x: coordinates.x, y: coordinates.y } ); return elementInfo; + } else { + const elementInfo = await page.evaluate( + async ({ x, y }) => { + // Enhanced helper function to get element from point including shadow DOM + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + // First, get the 
element at the clicked coordinates in the main document + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // If it's not an iframe, return the element as is + if (element.tagName !== 'IFRAME') return element; + + // Initialize tracking variables for iframe traversal + let currentIframe = element as HTMLIFrameElement; + let deepestElement = element; + + // Continue traversing while we have a valid iframe + while (currentIframe) { + try { + // Convert the coordinates from main document space to iframe's local space + const iframeRect = currentIframe.getBoundingClientRect(); + const iframeX = x - iframeRect.left; + const iframeY = y - iframeRect.top; + + // Get the iframe's document object - this gives us access to the iframe's content + const iframeDocument = currentIframe.contentDocument || currentIframe.contentWindow?.document; + if (!iframeDocument) break; + + // Find the element at the transformed coordinates within the iframe + const iframeElement = iframeDocument.elementFromPoint(iframeX, iframeY) as HTMLElement; + + // If no element found or it's the same as current, stop traversing + if (!iframeElement) break; + + // Update our tracking of the deepest element + deepestElement = iframeElement; + + // If we found another iframe, continue traversing through it + if (iframeElement.tagName === 'IFRAME') { + currentIframe = iframeElement as HTMLIFrameElement; + } else { + // If it's not an iframe, we've reached the deepest level + break; + } + } catch (error) { + // Handle potential cross-origin security restrictions + console.warn('Cannot access iframe content:', error); + break; + } + } + return deepestElement; + }; + + const originalEl = getDeepestElementFromPoint(x, y); + if (originalEl) { + let element = originalEl; + + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); + + const fullyContained = + parentRect.left 
<= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } + } + + const ownerDocument = element.ownerDocument; + const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement; + const isIframeContent = Boolean(frameElement); + + // Build the element information object + let info: { + tagName: string; + hasOnlyText?: boolean; + innerText?: string; + url?: string; + imageUrl?: string; + attributes?: Record; + innerHTML?: string; + outerHTML?: string; + isIframeContent?: boolean; + iframeURL?: string; + iframeIndex?: number; + frameHierarchy?: string[]; + } = { + tagName: element?.tagName ?? '', + isIframeContent: isIframeContent + }; + + if (isIframeContent) { + // Include iframe specific information + info.iframeURL = frameElement.src; + + // Calculate the frame's position in the hierarchy + let currentFrame = frameElement; + const frameHierarchy: string[] = []; + let frameIndex = 0; + + while (currentFrame) { + // Store the frame's identifier (src, id, or index) + frameHierarchy.unshift( + currentFrame.id || + currentFrame.src || + `iframe[${frameIndex}]` + ); + + // Move up to parent frame if it exists + const parentDoc = currentFrame.ownerDocument; + currentFrame = parentDoc?.defaultView?.frameElement as HTMLIFrameElement; + frameIndex++; + } + + info.frameHierarchy = frameHierarchy; + info.iframeIndex = frameIndex - 1; // Adjust for 0-based index + } + + if (element) { + // Get attributes including those from shadow DOM context + info.attributes = Array.from(element.attributes).reduce( + (acc, attr) => { + acc[attr.name] = attr.value; + return acc; + }, + {} as Record + ); + + // Handle specific element types + if (element.tagName === 
'A') { + info.url = (element as HTMLAnchorElement).href; + info.innerText = element.textContent ?? ''; + } else if (element.tagName === 'IMG') { + info.imageUrl = (element as HTMLImageElement).src; + } else { + // Handle text content with proper null checking + info.hasOnlyText = element.children.length === 0 && + (element.textContent !== null && + element.textContent.trim().length > 0); + info.innerText = element.textContent ?? ''; + } + + info.innerHTML = element.innerHTML; + info.outerHTML = element.outerHTML; + } + + return info; + } + return null; + }, + { x: coordinates.x, y: coordinates.y }, + ); + return elementInfo; } - // ... rest of the code remains same } catch (error) { const { message, stack } = error as Error; console.error('Error while retrieving selector:', message); From 0eb9a8f0a858cc2dfa3a3ec0466abfd866802045 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 10:10:40 +0530 Subject: [PATCH 09/15] feat: improve rect generation for iframe elements --- server/src/workflow-management/selector.ts | 320 +++++++++++---------- 1 file changed, 172 insertions(+), 148 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 48229da7..d992e740 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -351,189 +351,167 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (!getList || listSelector !== '') { const rect = await page.evaluate( async ({ x, y }) => { - // Helper function to convert rectangle to plain object - const getRectangleInfo = (rectangle: DOMRect) => { - const info = { - x: rectangle.x, - y: rectangle.y, - width: rectangle.width, - height: rectangle.height, - top: rectangle.top, - right: rectangle.right, - bottom: rectangle.bottom, - left: rectangle.left, - fromIframe: false, - iframePath: [] as string[] - }; - return info; - }; + // Enhanced helper function to get element from point 
including iframes + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + // First, get the element at the clicked coordinates in the main document + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; - // Helper function to search in iframe - const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, - relativeY: number, - iframePath: string[] - ) => { - try { - if (!iframe.contentDocument) return null; + // If it's not an iframe, return the element as is + if (element.tagName !== 'IFRAME') return element; - const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; - if (!el) return null; + // Initialize tracking variables for iframe traversal + let currentIframe = element as HTMLIFrameElement; + let deepestElement = element; - const { parentElement } = el; - const element = parentElement?.tagName === 'A' ? parentElement : el; - const rectangle = element?.getBoundingClientRect(); + // Continue traversing while we have a valid iframe + while (currentIframe) { + try { + // Convert coordinates from main document space to iframe's local space + const iframeRect = currentIframe.getBoundingClientRect(); + const iframeX = x - iframeRect.left; + const iframeY = y - iframeRect.top; - if (rectangle) { - const iframeRect = iframe.getBoundingClientRect(); - const rectInfo = getRectangleInfo(rectangle); + // Get the iframe's document object + const iframeDocument = currentIframe.contentDocument || currentIframe.contentWindow?.document; + if (!iframeDocument) break; - // Adjust coordinates relative to the main document - rectInfo.x += iframeRect.x; - rectInfo.y += iframeRect.y; - rectInfo.top += iframeRect.top; - rectInfo.right += iframeRect.left; - rectInfo.bottom += iframeRect.top; - rectInfo.left += iframeRect.left; - rectInfo.fromIframe = true; - rectInfo.iframePath = iframePath; + // Find the element at the transformed coordinates within the iframe + const 
iframeElement = iframeDocument.elementFromPoint(iframeX, iframeY) as HTMLElement; + if (!iframeElement) break; - return rectInfo; + // Update our tracking of the deepest element + deepestElement = iframeElement; + + // If we found another iframe, continue traversing through it + if (iframeElement.tagName === 'IFRAME') { + currentIframe = iframeElement as HTMLIFrameElement; + } else { + break; + } + } catch (error) { + // Handle potential cross-origin security restrictions + console.warn('Cannot access iframe content:', error); + break; } - return null; - } catch (e) { - console.warn('Cannot access iframe content:', e); - return null; } + return deepestElement; }; - const el = document.elementFromPoint(x, y) as HTMLElement; + const el = getDeepestElementFromPoint(x, y); if (el) { - // Check if the element is an iframe - if (el.tagName === 'IFRAME') { - const iframe = el as HTMLIFrameElement; - const rect = iframe.getBoundingClientRect(); - const relativeX = x - rect.left; - const relativeY = y - rect.top; - - const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, - [iframe.id || 'unnamed-iframe'] - ); - if (iframeResult) return iframeResult; - } - const { parentElement } = el; const element = parentElement?.tagName === 'A' ? 
parentElement : el; const rectangle = element?.getBoundingClientRect(); - if (rectangle) { - return getRectangleInfo(rectangle); + const createRectObject = (rect: DOMRect) => ({ + x: rect.x, + y: rect.y, + width: rect.width, + height: rect.height, + top: rect.top, + right: rect.right, + bottom: rect.bottom, + left: rect.left, + toJSON() { + return { + x: this.x, + y: this.y, + width: this.width, + height: this.height, + top: this.top, + right: this.right, + bottom: this.bottom, + left: this.left + }; + } + }); + + // For elements inside iframes, adjust coordinates relative to the top window + let adjustedRect = createRectObject(rectangle); + let currentWindow = element.ownerDocument.defaultView; + + while (currentWindow !== window.top) { + const frameElement = currentWindow?.frameElement as HTMLIFrameElement; + if (!frameElement) break; + + const frameRect = frameElement.getBoundingClientRect(); + adjustedRect = createRectObject({ + x: adjustedRect.x + frameRect.x, + y: adjustedRect.y + frameRect.y, + width: adjustedRect.width, + height: adjustedRect.height, + top: adjustedRect.top + frameRect.top, + right: adjustedRect.right + frameRect.left, + bottom: adjustedRect.bottom + frameRect.top, + left: adjustedRect.left + frameRect.left, + } as DOMRect); + + currentWindow = frameElement.ownerDocument.defaultView; + } + + return adjustedRect; } } return null; }, - { x: coordinates.x, y: coordinates.y }, + { x: coordinates.x, y: coordinates.y } ); return rect; } else { const rect = await page.evaluate( async ({ x, y }) => { - // Helper function to convert rectangle to plain object (same as above) - const getRectangleInfo = (rectangle: DOMRect) => ({ - x: rectangle.x, - y: rectangle.y, - width: rectangle.width, - height: rectangle.height, - top: rectangle.top, - right: rectangle.right, - bottom: rectangle.bottom, - left: rectangle.left, - fromIframe: false, - iframePath: [] as string[] - }); + // Same getDeepestElementFromPoint function as above + const 
getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + // First, get the element at the clicked coordinates in the main document + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; - // Helper function to search in iframe (same as above) - const searchInIframe = ( - iframe: HTMLIFrameElement, - relativeX: number, - relativeY: number, - iframePath: string[] - ) => { - try { - if (!iframe.contentDocument) return null; + // If it's not an iframe, return the element as is + if (element.tagName !== 'IFRAME') return element; - const el = iframe.contentDocument.elementFromPoint(relativeX, relativeY) as HTMLElement; - if (!el) return null; + // Initialize tracking variables for iframe traversal + let currentIframe = element as HTMLIFrameElement; + let deepestElement = element; - let element = el; - while (element.parentElement) { - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); + // Continue traversing while we have a valid iframe + while (currentIframe) { + try { + // Convert coordinates from main document space to iframe's local space + const iframeRect = currentIframe.getBoundingClientRect(); + const iframeX = x - iframeRect.left; + const iframeY = y - iframeRect.top; - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; + // Get the iframe's document object + const iframeDocument = currentIframe.contentDocument || currentIframe.contentWindow?.document; + if (!iframeDocument) break; - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; + // Find the element at the transformed coordinates within the iframe + const iframeElement = iframeDocument.elementFromPoint(iframeX, iframeY) as HTMLElement; + if (!iframeElement) break; - if (fullyContained && 
significantOverlap) { - element = element.parentElement; + // Update our tracking of the deepest element + deepestElement = iframeElement; + + // If we found another iframe, continue traversing through it + if (iframeElement.tagName === 'IFRAME') { + currentIframe = iframeElement as HTMLIFrameElement; } else { break; } + } catch (error) { + // Handle potential cross-origin security restrictions + console.warn('Cannot access iframe content:', error); + break; } - - const rectangle = element?.getBoundingClientRect(); - if (rectangle) { - const iframeRect = iframe.getBoundingClientRect(); - const rectInfo = getRectangleInfo(rectangle); - - // Adjust coordinates relative to the main document - rectInfo.x += iframeRect.x; - rectInfo.y += iframeRect.y; - rectInfo.top += iframeRect.top; - rectInfo.right += iframeRect.left; - rectInfo.bottom += iframeRect.top; - rectInfo.left += iframeRect.left; - rectInfo.fromIframe = true; - rectInfo.iframePath = iframePath; - - return rectInfo; - } - return null; - } catch (e) { - console.warn('Cannot access iframe content:', e); - return null; } + return deepestElement; }; - const originalEl = document.elementFromPoint(x, y) as HTMLElement; + const originalEl = getDeepestElementFromPoint(x, y); if (originalEl) { - // Check if the element is an iframe - if (originalEl.tagName === 'IFRAME') { - const iframe = originalEl as HTMLIFrameElement; - const rect = iframe.getBoundingClientRect(); - const relativeX = x - rect.left; - const relativeY = y - rect.top; - - const iframeResult = searchInIframe( - iframe, - relativeX, - relativeY, - [iframe.id || 'unnamed-iframe'] - ); - if (iframeResult) return iframeResult; - } - let element = originalEl; + while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -557,19 +535,65 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector const rectangle = 
element?.getBoundingClientRect(); if (rectangle) { - return getRectangleInfo(rectangle); + const createRectObject = (rect: DOMRect) => ({ + x: rect.x, + y: rect.y, + width: rect.width, + height: rect.height, + top: rect.top, + right: rect.right, + bottom: rect.bottom, + left: rect.left, + toJSON() { + return { + x: this.x, + y: this.y, + width: this.width, + height: this.height, + top: this.top, + right: this.right, + bottom: this.bottom, + left: this.left + }; + } + }); + + // Same coordinate adjustment for iframe elements as above + let adjustedRect = createRectObject(rectangle); + let currentWindow = element.ownerDocument.defaultView; + + while (currentWindow !== window.top) { + const frameElement = currentWindow?.frameElement as HTMLIFrameElement; + if (!frameElement) break; + + const frameRect = frameElement.getBoundingClientRect(); + adjustedRect = createRectObject({ + x: adjustedRect.x + frameRect.x, + y: adjustedRect.y + frameRect.y, + width: adjustedRect.width, + height: adjustedRect.height, + top: adjustedRect.top + frameRect.top, + right: adjustedRect.right + frameRect.left, + bottom: adjustedRect.bottom + frameRect.top, + left: adjustedRect.left + frameRect.left, + } as DOMRect); + + currentWindow = frameElement.ownerDocument.defaultView; + } + + return adjustedRect; } } return null; }, - { x: coordinates.x, y: coordinates.y }, + { x: coordinates.x, y: coordinates.y } ); return rect; } } catch (error) { const { message, stack } = error as Error; - logger.log('error', `Error while retrieving selector: ${message}`); - logger.log('error', `Stack: ${stack}`); + console.error('Error while retrieving selector:', message); + console.error('Stack:', stack); } }; From a845509a73ff923d863b34b0f2d205306f22cc6a Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 10:51:01 +0530 Subject: [PATCH 10/15] feat: add iframe support for scrapeSchema action --- maxun-core/src/browserSide/scraper.js | 178 ++++++++++++++++++++------ 1 file changed, 140 
insertions(+), 38 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index a2009d78..c3610bb5 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -188,69 +188,171 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @param {Object.} lists The named lists of HTML elements. * @returns {Array.>} */ - window.scrapeSchema = function (lists) { + window.scrapeSchema = function(lists) { + // Utility functions remain the same function omap(object, f, kf = (x) => x) { return Object.fromEntries( Object.entries(object) - .map(([k, v]) => [kf(k), f(v)]), + .map(([k, v]) => [kf(k), f(v)]), ); } function ofilter(object, f) { return Object.fromEntries( Object.entries(object) - .filter(([k, v]) => f(k, v)), + .filter(([k, v]) => f(k, v)), ); } + function findAllElements(config) { + // Check if selector contains iframe notation (:>>) + if (!config.selector.includes(':>>')) { + return Array.from(document.querySelectorAll(config.selector)); + } + + // For iframe traversal, split by iframe boundary marker + const parts = config.selector.split(':>>').map(s => s.trim()); + let currentElements = [document]; + + for (let i = 0; i < parts.length; i++) { + const part = parts[i]; + const nextElements = []; + + for (const element of currentElements) { + try { + let targets; + if (i === 0) { + // First selector is queried from main document + targets = Array.from(element.querySelectorAll(part)) + .filter(el => { + // Only include iframes if not the last part + if (i === parts.length - 1) return true; + return el.tagName === 'IFRAME'; + }); + } else { + // For subsequent selectors, we need to look inside iframes + const iframeDocument = element.contentDocument || element.contentWindow?.document; + if (!iframeDocument) continue; + + targets = Array.from(iframeDocument.querySelectorAll(part)); + + // If this isn't the last part, filter for iframes only + if (i < 
parts.length - 1) { + targets = targets.filter(el => el.tagName === 'IFRAME'); + } + } + nextElements.push(...targets); + } catch (error) { + // Handle cross-origin iframe access errors + console.warn('Cannot access iframe content:', error); + continue; + } + } + + if (nextElements.length === 0) return []; + currentElements = nextElements; + } + + return currentElements; + } + + // Modified to handle iframe context for URL resolution + function getElementValue(element, attribute) { + if (!element) return null; + + // Get the base URL for resolving relative URLs + const baseURL = element.ownerDocument?.location?.href || window.location.origin; + + switch (attribute) { + case 'href': { + const relativeHref = element.getAttribute('href'); + return relativeHref ? new URL(relativeHref, baseURL).href : null; + } + case 'src': { + const relativeSrc = element.getAttribute('src'); + return relativeSrc ? new URL(relativeSrc, baseURL).href : null; + } + case 'innerText': + return element.innerText?.trim(); + case 'textContent': + return element.textContent?.trim(); + default: + return element.getAttribute(attribute) || element.innerText?.trim(); + } + } + + // Rest of the functions remain largely the same function getSeedKey(listObj) { - const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length))); - return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0]; + const maxLength = Math.max(...Object.values( + omap(listObj, (x) => findAllElements(x).length) + )); + return Object.keys( + ofilter(listObj, (_, v) => findAllElements(v).length === maxLength) + )[0]; } function getMBEs(elements) { return elements.map((element) => { - let candidate = element; - const isUniqueChild = (e) => elements - .filter((elem) => e.parentNode?.contains(elem)) - .length === 1; + let candidate = element; + const isUniqueChild = (e) => elements + .filter((elem) => { + // Handle iframe boundaries 
when checking containment + const sameDocument = elem.ownerDocument === e.ownerDocument; + return sameDocument && e.parentNode?.contains(elem); + }) + .length === 1; - while (candidate && isUniqueChild(candidate)) { - candidate = candidate.parentNode; - } + while (candidate && isUniqueChild(candidate)) { + candidate = candidate.parentNode; + } - return candidate; + return candidate; }); } + // Main scraping logic remains the same const seedName = getSeedKey(lists); - const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector)); + const seedElements = findAllElements(lists[seedName]); const MBEs = getMBEs(seedElements); - - return MBEs.map((mbe) => omap( - lists, - ({ selector, attribute }, key) => { - const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem)); - if (!elem) return undefined; - - switch (attribute) { - case 'href': - const relativeHref = elem.getAttribute('href'); - return relativeHref ? new URL(relativeHref, window.location.origin).href : null; - case 'src': - const relativeSrc = elem.getAttribute('src'); - return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; - case 'innerText': - return elem.innerText; - case 'textContent': - return elem.textContent; - default: - return elem.innerText; - } - }, - (key) => key // Use the original key in the output + + const mbeResults = MBEs.map((mbe) => omap( + lists, + (config) => { + const elem = findAllElements(config) + .find((elem) => mbe.contains(elem)); + + return elem ? 
getElementValue(elem, config.attribute) : undefined; + }, + (key) => key )) || []; - } + + // If MBE approach didn't find all elements, try independent scraping + if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) { + // Fall back to independent scraping + const results = []; + const foundElements = new Map(); + + // Find all elements for each selector + Object.entries(lists).forEach(([key, config]) => { + const elements = findAllElements(config); + foundElements.set(key, elements); + }); + + // Create result objects for each found element + foundElements.forEach((elements, key) => { + elements.forEach((element, index) => { + if (!results[index]) { + results[index] = {}; + } + results[index][key] = getElementValue(element, lists[key].attribute); + }); + }); + + return results.filter(result => Object.keys(result).length > 0); + } + + return mbeResults; + }; /** * Scrapes multiple lists of similar items based on a template item. From 7018ba64fae2871c6600e80f7d2eef999910c751 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 14:27:47 +0530 Subject: [PATCH 11/15] feat: improve non unique selector generation for capture list --- server/src/workflow-management/selector.ts | 237 +++++++++++++++++++-- 1 file changed, 222 insertions(+), 15 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index d992e740..d297d774 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -1328,10 +1328,61 @@ interface SelectorResult { */ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise => { + interface IframeContext { + frame: HTMLIFrameElement; + document: Document; + element: HTMLElement; + } + try { if (!listSelector) { - console.log(`NON UNIQUE: MODE 1`) const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { + const getDeepestElementFromPoint = 
(x: number, y: number): HTMLElement | null => { + // First, get the element at the specified coordinates in the main document + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // Check if the element is an iframe + if (element.tagName !== 'IFRAME') return element; + + let currentIframe = element as HTMLIFrameElement; + let deepestElement = element; + let depth = 0; + const MAX_DEPTH = 4; // Limit the depth of nested iframes to prevent infinite loops + + while (currentIframe && depth < MAX_DEPTH) { + try { + // Convert coordinates from main document to iframe's coordinate system + const iframeRect = currentIframe.getBoundingClientRect(); + const iframeX = x - iframeRect.left; + const iframeY = y - iframeRect.top; + + // Access the iframe's content document and get the element at the transformed coordinates + const iframeDoc = currentIframe.contentDocument || currentIframe.contentWindow?.document; + if (!iframeDoc) break; + + const iframeElement = iframeDoc.elementFromPoint(iframeX, iframeY) as HTMLElement; + if (!iframeElement) break; + + // If the element found is another iframe, continue traversing + if (iframeElement.tagName === 'IFRAME') { + deepestElement = iframeElement; + currentIframe = iframeElement as HTMLIFrameElement; + depth++; + } else { + // If it's not an iframe, we've found our deepest element + deepestElement = iframeElement; + break; + } + } catch (error) { + // Handle potential security errors when accessing cross-origin iframes + console.warn('Cannot access iframe content:', error); + break; + } + } + return deepestElement; + }; + function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); @@ -1348,22 +1399,77 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates return selector; } - function getSelectorPath(element: HTMLElement | null): string { - const path: string[] = []; + function getIframePath(element: HTMLElement): 
IframeContext[] { + const path: IframeContext[] = []; + let current = element; let depth = 0; - const maxDepth = 2; + const MAX_DEPTH = 4; + + while (current && depth < MAX_DEPTH) { + // Get the owning document and its frame element + const ownerDocument = current.ownerDocument; + const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement; + + if (frameElement) { + path.unshift({ + frame: frameElement, + document: ownerDocument, + element: current + }); + current = frameElement; + depth++; + } else { + break; + } + } + return path; + } - while (element && element !== document.body && depth < maxDepth) { - const selector = getNonUniqueSelector(element); + function getSelectorPath(element: HTMLElement | null): string { + if (!element) return ''; + + // Check for iframe path first + const iframePath = getIframePath(element); + if (iframePath.length > 0) { + const selectorParts: string[] = []; + + // Build complete iframe path + iframePath.forEach((context, index) => { + const frameSelector = getNonUniqueSelector(context.frame); + + if (index === iframePath.length - 1) { + // For deepest iframe context, include target element + const elementSelector = getNonUniqueSelector(element); + selectorParts.push(`${frameSelector} :>> ${elementSelector}`); + } else { + // For intermediate iframe boundaries + selectorParts.push(frameSelector); + } + }); + + return selectorParts.join(' :>> '); + } + + // Regular DOM path generation remains the same + const path: string[] = []; + let currentElement = element; + let depth = 0; + const MAX_DEPTH = 2; + + while (currentElement && currentElement !== document.body && depth < MAX_DEPTH) { + const selector = getNonUniqueSelector(currentElement); path.unshift(selector); - element = element.parentElement; + + const parentElement = currentElement.parentElement; + if (!parentElement) break; + currentElement = parentElement; depth++; } return path.join(' > '); } - const originalEl = document.elementFromPoint(x, y) as 
HTMLElement; + const originalEl = getDeepestElementFromPoint(x, y); if (!originalEl) return null; let element = originalEl; @@ -1400,6 +1506,52 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates } else { console.log(`NON UNIQUE: MODE 2`) const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { + const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => { + // First, get the element at the specified coordinates in the main document + let element = document.elementFromPoint(x, y) as HTMLElement; + if (!element) return null; + + // Check if the element is an iframe + if (element.tagName !== 'IFRAME') return element; + + let currentIframe = element as HTMLIFrameElement; + let deepestElement = element; + let depth = 0; + const MAX_DEPTH = 4; // Limit the depth of nested iframes to prevent infinite loops + + while (currentIframe && depth < MAX_DEPTH) { + try { + // Convert coordinates from main document to iframe's coordinate system + const iframeRect = currentIframe.getBoundingClientRect(); + const iframeX = x - iframeRect.left; + const iframeY = y - iframeRect.top; + + // Access the iframe's content document and get the element at the transformed coordinates + const iframeDoc = currentIframe.contentDocument || currentIframe.contentWindow?.document; + if (!iframeDoc) break; + + const iframeElement = iframeDoc.elementFromPoint(iframeX, iframeY) as HTMLElement; + if (!iframeElement) break; + + // If the element found is another iframe, continue traversing + if (iframeElement.tagName === 'IFRAME') { + deepestElement = iframeElement; + currentIframe = iframeElement as HTMLIFrameElement; + depth++; + } else { + // If it's not an iframe, we've found our deepest element + deepestElement = iframeElement; + break; + } + } catch (error) { + // Handle potential security errors when accessing cross-origin iframes + console.warn('Cannot access iframe content:', error); + break; + } + } + return 
deepestElement; + }; + function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); @@ -1416,22 +1568,77 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates return selector; } - function getSelectorPath(element: HTMLElement | null): string { - const path: string[] = []; + function getIframePath(element: HTMLElement): IframeContext[] { + const path: IframeContext[] = []; + let current = element; let depth = 0; - const maxDepth = 2; + const MAX_DEPTH = 4; + + while (current && depth < MAX_DEPTH) { + // Get the owning document and its frame element + const ownerDocument = current.ownerDocument; + const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement; + + if (frameElement) { + path.unshift({ + frame: frameElement, + document: ownerDocument, + element: current + }); + current = frameElement; + depth++; + } else { + break; + } + } + return path; + } - while (element && element !== document.body && depth < maxDepth) { - const selector = getNonUniqueSelector(element); + function getSelectorPath(element: HTMLElement | null): string { + if (!element) return ''; + + // Check for iframe path first + const iframePath = getIframePath(element); + if (iframePath.length > 0) { + const selectorParts: string[] = []; + + // Build complete iframe path + iframePath.forEach((context, index) => { + const frameSelector = getNonUniqueSelector(context.frame); + + if (index === iframePath.length - 1) { + // For deepest iframe context, include target element + const elementSelector = getNonUniqueSelector(element); + selectorParts.push(`${frameSelector} :>> ${elementSelector}`); + } else { + // For intermediate iframe boundaries + selectorParts.push(frameSelector); + } + }); + + return selectorParts.join(' :>> '); + } + + // Regular DOM path generation remains the same + const path: string[] = []; + let currentElement = element; + let depth = 0; + const MAX_DEPTH = 2; + + while 
(currentElement && currentElement !== document.body && depth < MAX_DEPTH) { + const selector = getNonUniqueSelector(currentElement); path.unshift(selector); - element = element.parentElement; + + const parentElement = currentElement.parentElement; + if (!parentElement) break; + currentElement = parentElement; depth++; } return path.join(' > '); } - const originalEl = document.elementFromPoint(x, y) as HTMLElement; + const originalEl = getDeepestElementFromPoint(x, y); if (!originalEl) return null; let element = originalEl; From ed1ea41c4e683b7a94add148e66f3de4f96fc3f8 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 14:45:25 +0530 Subject: [PATCH 12/15] feat: add iframe support for child selector generation --- server/src/workflow-management/selector.ts | 109 +++++++++++++++++++-- 1 file changed, 103 insertions(+), 6 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index d297d774..9367631f 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -1679,33 +1679,130 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro } // Function to generate selector path from an element to its parent - function getSelectorPath(element: HTMLElement | null): string { + function getSelectorPath(element: HTMLElement): string { if (!element || !element.parentElement) return ''; const parentSelector = getNonUniqueSelector(element.parentElement); const elementSelector = getNonUniqueSelector(element); + // Check if element is in an iframe + const ownerDocument = element.ownerDocument; + const frameElement = ownerDocument?.defaultView?.frameElement as HTMLIFrameElement; + + if (frameElement) { + const frameSelector = getNonUniqueSelector(frameElement); + return `${frameSelector} :>> ${elementSelector}`; + } + return `${parentSelector} > ${elementSelector}`; } + function getIframeChildren(element: HTMLElement): HTMLElement[] { + 
const children: HTMLElement[] = []; + + // Find all iframe elements + const iframes = Array.from(element.querySelectorAll('iframe')) as HTMLIFrameElement[]; + + for (const iframe of iframes) { + try { + // Access iframe's document + const iframeDoc = iframe.contentDocument || iframe.contentWindow?.document; + if (iframeDoc) { + // Get all elements in the iframe + const iframeElements = Array.from(iframeDoc.querySelectorAll('*')) as HTMLElement[]; + children.push(...iframeElements); + } + } catch (error) { + console.warn('Cannot access iframe content:', error); + continue; + } + } + + return children; + } + // Function to recursively get all descendant selectors function getAllDescendantSelectors(element: HTMLElement): string[] { let selectors: string[] = []; + + // Handle regular DOM children const children = Array.from(element.children) as HTMLElement[]; - for (const child of children) { const childPath = getSelectorPath(child); if (childPath) { - selectors.push(childPath); // Add direct child path - selectors = selectors.concat(getAllDescendantSelectors(child)); // Recursively process descendants + selectors.push(childPath); + // Recursively process regular DOM descendants + selectors = selectors.concat(getAllDescendantSelectors(child)); + + // Check for iframes in this child + const iframeChildren = getIframeChildren(child); + for (const iframeChild of iframeChildren) { + try { + const iframePath = getSelectorPath(iframeChild); + if (iframePath) { + selectors.push(iframePath); + // Recursively process iframe descendants + selectors = selectors.concat(getAllDescendantSelectors(iframeChild)); + } + } catch (error) { + console.warn('Error processing iframe child:', error); + continue; + } + } + } + } + + // Handle direct iframe children of the current element + const iframeChildren = getIframeChildren(element); + for (const iframeChild of iframeChildren) { + try { + const iframePath = getSelectorPath(iframeChild); + if (iframePath) { + selectors.push(iframePath); + 
selectors = selectors.concat(getAllDescendantSelectors(iframeChild)); + } + } catch (error) { + console.warn('Error processing direct iframe child:', error); + continue; } } return selectors; } - // Find all occurrences of the parent selector in the DOM - const parentElements = Array.from(document.querySelectorAll(parentSelector)) as HTMLElement[]; + const selectorParts = parentSelector.split(':>>').map(part => part.trim()); + let parentElements: HTMLElement[] = []; + + // Handle iframe traversal if needed + if (selectorParts.length > 1) { + // Start with the initial iframe elements + parentElements = Array.from(document.querySelectorAll(selectorParts[0])) as HTMLElement[]; + + // Traverse through iframe parts + for (let i = 1; i < selectorParts.length; i++) { + const newParentElements: HTMLElement[] = []; + for (const element of parentElements) { + if (element.tagName === 'IFRAME') { + try { + const iframeDoc = (element as HTMLIFrameElement).contentDocument || + (element as HTMLIFrameElement).contentWindow?.document; + if (iframeDoc) { + const iframeChildren = Array.from(iframeDoc.querySelectorAll(selectorParts[i])) as HTMLElement[]; + newParentElements.push(...iframeChildren); + } + } catch (error) { + console.warn('Cannot access iframe content during traversal:', error); + continue; + } + } + } + parentElements = newParentElements; + } + } else { + // Regular DOM selector + parentElements = Array.from(document.querySelectorAll(parentSelector)) as HTMLElement[]; + } + const allChildSelectors = new Set(); // Use a set to ensure uniqueness // Process each parent element and its descendants From 1c13a230f5fb230a7664e37b65850a3da40357a4 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 14:47:51 +0530 Subject: [PATCH 13/15] feat: add iframe support for highlighter logic --- src/components/organisms/BrowserWindow.tsx | 37 +++++++++++++++++++--- 1 file changed, 32 insertions(+), 5 deletions(-) diff --git a/src/components/organisms/BrowserWindow.tsx 
b/src/components/organisms/BrowserWindow.tsx index 697b4adb..95ca5e3b 100644 --- a/src/components/organisms/BrowserWindow.tsx +++ b/src/components/organisms/BrowserWindow.tsx @@ -12,6 +12,7 @@ import { useGlobalInfoStore } from '../../context/globalInfo'; interface ElementInfo { tagName: string; hasOnlyText?: boolean; + isIframeContent?: boolean; innerText?: string; url?: string; imageUrl?: string; @@ -115,31 +116,57 @@ export const BrowserWindow = () => { }, [screenShot, canvasRef, socket, screencastHandler]); const highlighterHandler = useCallback((data: { rect: DOMRect, selector: string, elementInfo: ElementInfo | null, childSelectors?: string[] }) => { + console.log("LIST SELECTOR", listSelector); + console.log("DATA SELECTOR", data.selector); + console.log("CHILD SELECTORS", data.childSelectors); if (getList === true) { if (listSelector) { socket?.emit('listSelector', { selector: listSelector }); + const hasValidChildSelectors = Array.isArray(data.childSelectors) && data.childSelectors.length > 0; + if (limitMode) { setHighlighterData(null); } else if (paginationMode) { - // only set highlighterData if type is not empty, 'none', 'scrollDown', or 'scrollUp' + // Only set highlighterData if type is not empty, 'none', 'scrollDown', or 'scrollUp' if (paginationType !== '' && !['none', 'scrollDown', 'scrollUp'].includes(paginationType)) { setHighlighterData(data); } else { setHighlighterData(null); } } else if (data.childSelectors && data.childSelectors.includes(data.selector)) { - // highlight only valid child elements within the listSelector + // Highlight only valid child elements within the listSelector setHighlighterData(data); + } else if (data.elementInfo?.isIframeContent && data.childSelectors) { + // Handle pure iframe elements - similar to previous shadow DOM logic but using iframe syntax + // Check if the selector matches any iframe child selectors + const isIframeChild = data.childSelectors.some(childSelector => + data.selector.includes(':>>') && // 
Iframe uses :>> for traversal + childSelector.split(':>>').some(part => + data.selector.includes(part.trim()) + ) + ); + setHighlighterData(isIframeChild ? data : null); + } else if (data.selector.includes(':>>') && hasValidChildSelectors) { + // Handle mixed DOM cases with iframes + // Split the selector into parts and check each against child selectors + const selectorParts = data.selector.split(':>>').map(part => part.trim()); + const isValidMixedSelector = selectorParts.some(part => + // We know data.childSelectors is defined due to hasValidChildSelectors check + data.childSelectors!.some(childSelector => + childSelector.includes(part) + ) + ); + setHighlighterData(isValidMixedSelector ? data : null); } else { - // if !valid child in normal mode, clear the highlighter + // If no valid child in normal mode, clear the highlighter setHighlighterData(null); } } else { - // set highlighterData for the initial listSelector selection + // Set highlighterData for the initial listSelector selection setHighlighterData(data); } } else { - // for non-list steps + // For non-list steps setHighlighterData(data); } }, [highlighterData, getList, socket, listSelector, paginationMode, paginationType]); From 96049e361bbb77fbc3293e910f92bd71dc36d87a Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 15:36:54 +0530 Subject: [PATCH 14/15] feat: add iframe support for table and non table scraping --- maxun-core/src/browserSide/scraper.js | 390 ++++++++++++++++++++++---- 1 file changed, 334 insertions(+), 56 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index c3610bb5..bb169104 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -364,75 +364,353 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, 
limit = 10 }) { - const scrapedData = []; - - while (scrapedData.length < limit) { - let parentElements = Array.from(document.querySelectorAll(listSelector)); - - // If we only got one element or none, try a more generic approach - if (limit > 1 && parentElements.length <= 1) { - const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); - const container = document.querySelector(containerSelector); - - if (container) { - const allChildren = Array.from(container.children); - - const firstMatch = document.querySelector(listSelector); - if (firstMatch) { - // Get classes from the first matching element - const firstMatchClasses = Array.from(firstMatch.classList); - - // Find similar elements by matching most of their classes - parentElements = allChildren.filter(element => { - const elementClasses = Array.from(element.classList); - - // Element should share at least 70% of classes with the first match - const commonClasses = firstMatchClasses.filter(cls => - elementClasses.includes(cls)); - return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); - }); - } - } + // Helper function to query elements within an iframe + const queryIframe = (rootElement, selector) => { + if (!selector.includes(':>>')) { + return rootElement.querySelector(selector); } - // Iterate through each parent element - for (const parent of parentElements) { - if (scrapedData.length >= limit) break; - const record = {}; + const parts = selector.split(':>>').map(part => part.trim()); + let currentElement = rootElement; - // For each field, select the corresponding element within the parent - for (const [label, { selector, attribute }] of Object.entries(fields)) { - const fieldElement = parent.querySelector(selector); + for (let i = 0; i < parts.length; i++) { + if (!currentElement) return null; - if (fieldElement) { - if (attribute === 'innerText') { - record[label] = fieldElement.innerText.trim(); - } else if (attribute === 'innerHTML') { - record[label] = 
fieldElement.innerHTML.trim(); - } else if (attribute === 'src') { - // Handle relative 'src' URLs - const src = fieldElement.getAttribute('src'); - record[label] = src ? new URL(src, window.location.origin).href : null; - } else if (attribute === 'href') { - // Handle relative 'href' URLs - const href = fieldElement.getAttribute('href'); - record[label] = href ? new URL(href, window.location.origin).href : null; + // Handle iframe content document + if (currentElement.tagName === 'IFRAME') { + try { + const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document; + currentElement = iframeDoc.querySelector(parts[i]); + continue; + } catch (e) { + console.error('Cannot access iframe content:', e); + return null; + } + } + + currentElement = currentElement.querySelector(parts[i]); + } + + return currentElement; + }; + + // Helper function to query all matching elements within iframes + const queryIframeAll = (rootElement, selector) => { + if (!selector.includes(':>>')) { + return rootElement.querySelectorAll(selector); + } + + const parts = selector.split(':>>').map(part => part.trim()); + let currentElements = [rootElement]; + + for (const part of parts) { + const nextElements = []; + + for (const element of currentElements) { + if (element.tagName === 'IFRAME') { + try { + const iframeDoc = element.contentDocument || element.contentWindow.document; + nextElements.push(...iframeDoc.querySelectorAll(part)); + } catch (e) { + console.error('Cannot access iframe content:', e); + continue; + } + } else { + nextElements.push(...element.querySelectorAll(part)); + } + } + + currentElements = nextElements; + } + + return currentElements; + }; + + // Helper function to extract values from elements + function extractValue(element, attribute) { + if (!element) return null; + + if (attribute === 'innerText') { + return element.innerText.trim(); + } else if (attribute === 'innerHTML') { + return element.innerHTML.trim(); + } else if (attribute === 
'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + return attrValue ? new URL(attrValue, window.location.origin).href : null; + } + return element.getAttribute(attribute); + } + + // Helper function to find table ancestor elements + function findTableAncestor(element) { + let currentElement = element; + const MAX_DEPTH = 5; + let depth = 0; + + while (currentElement && depth < MAX_DEPTH) { + if (currentElement.tagName === 'TD') { + return { type: 'TD', element: currentElement }; + } else if (currentElement.tagName === 'TR') { + return { type: 'TR', element: currentElement }; + } + + // Handle iframe boundary crossing + if (currentElement.tagName === 'IFRAME') { + try { + currentElement = currentElement.contentDocument.body; + } catch (e) { + return null; + } + } else { + currentElement = currentElement.parentElement; + } + depth++; + } + return null; + } + + // Helper function to get cell index + function getCellIndex(td) { + let index = 0; + let sibling = td; + while (sibling = sibling.previousElementSibling) { + index++; + } + return index; + } + + // Helper function to check for TH elements + function hasThElement(row, tableFields) { + for (const [label, { selector }] of Object.entries(tableFields)) { + const element = queryIframe(row, selector); + if (element) { + let current = element; + while (current && current !== row) { + if (current.tagName === 'TH') { + return true; + } + if (current.tagName === 'IFRAME') { + try { + current = current.contentDocument.body; + } catch (e) { + break; + } } else { - record[label] = fieldElement.getAttribute(attribute); + current = current.parentElement; } } } - scrapedData.push(record); } + return false; + } - // If we've processed all available elements and still haven't reached the limit, - // break to avoid infinite loop - if (parentElements.length === 0 || scrapedData.length >= parentElements.length) { - break; + // Helper function to filter rows + function filterRowsBasedOnTag(rows, 
tableFields) { + for (const row of rows) { + if (hasThElement(row, tableFields)) { + return rows; + } + } + return rows.filter(row => row.getElementsByTagName('TH').length === 0); + } + + // Class similarity comparison functions + function calculateClassSimilarity(classList1, classList2) { + const set1 = new Set(classList1); + const set2 = new Set(classList2); + const intersection = new Set([...set1].filter(x => set2.has(x))); + const union = new Set([...set1, ...set2]); + return intersection.size / union.size; + } + + function findSimilarElements(baseElement, similarityThreshold = 0.7) { + const baseClasses = Array.from(baseElement.classList); + if (baseClasses.length === 0) return []; + + // Include elements from all iframes + const allElements = []; + const iframes = document.getElementsByTagName('iframe'); + + // Add elements from main document + allElements.push(...document.getElementsByTagName(baseElement.tagName)); + + // Add elements from each iframe + for (const iframe of iframes) { + try { + const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; + allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName)); + } catch (e) { + console.error('Cannot access iframe content:', e); + } + } + + return allElements.filter(element => { + if (element === baseElement) return false; + const similarity = calculateClassSimilarity( + baseClasses, + Array.from(element.classList) + ); + return similarity >= similarityThreshold; + }); + } + + // Main scraping logic + let containers = queryIframeAll(document, listSelector); + containers = Array.from(containers); + + if (containers.length === 0) return []; + + if (limit > 1 && containers.length === 1) { + const baseContainer = containers[0]; + const similarContainers = findSimilarElements(baseContainer); + + if (similarContainers.length > 0) { + const newContainers = similarContainers.filter(container => + !container.matches(listSelector) + ); + containers = [...containers, ...newContainers]; } 
} + + const containerFields = containers.map(() => ({ + tableFields: {}, + nonTableFields: {} + })); + + // Classify fields per container: a field whose sample element has a TD or TR reported by findTableAncestor becomes a table field that also records tableContext and cellIndex (the cell position for TD, otherwise the value is negative one); every other field, including one with no match in the container, becomes a non-table field + containers.forEach((container, containerIndex) => { + for (const [label, field] of Object.entries(fields)) { + const sampleElement = queryIframe(container, field.selector); + + if (sampleElement) { + const ancestor = findTableAncestor(sampleElement); + if (ancestor) { + containerFields[containerIndex].tableFields[label] = { + ...field, + tableContext: ancestor.type, + cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 + }; + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } + }); + + const tableData = []; + const nonTableData = []; + + // Process table data: for each container, climb from the element of its first table field until a TABLE or the container itself is reached, collect the TR elements below that node and read at most limit of the filtered rows + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + const container = containers[containerIndex]; + const { tableFields } = containerFields[containerIndex]; + + if (Object.keys(tableFields).length > 0) { + const firstField = Object.values(tableFields)[0]; + const firstElement = queryIframe(container, firstField.selector); + let tableContext = firstElement; + + while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { + if (tableContext.tagName === 'IFRAME') { + try { + tableContext = tableContext.contentDocument.body; + } catch (e) { + break; + } + } else { + tableContext = tableContext.parentElement; + } + } + + if (tableContext) { + const rows = Array.from(tableContext.getElementsByTagName('TR')); + const processedRows = filterRowsBasedOnTag(rows, tableFields); + + for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { + const record = {}; + const currentRow = processedRows[rowIndex]; + + for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { + let element = null; + + if (cellIndex >= 0) { + const td =
currentRow.children[cellIndex]; + if (td) { + element = queryIframe(td, selector); + + if (!element && selector.split(">").pop().includes('td:nth-child')) { + element = td; + } + + if (!element) { + const tagOnlySelector = selector.split('.')[0]; + element = queryIframe(td, tagOnlySelector); + } + + if (!element) { + let currentElement = td; + while (currentElement && currentElement.children.length > 0) { + let foundContentChild = false; + for (const child of currentElement.children) { + if (extractValue(child, attribute)) { + currentElement = child; + foundContentChild = true; + break; + } + } + if (!foundContentChild) break; + } + element = currentElement; + } + } + } else { + element = queryIframe(currentRow, selector); + } + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + tableData.push(record); + } + } + } + } + } + + // Process non-table data + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + if (nonTableData.length >= limit) break; + + const container = containers[containerIndex]; + const { nonTableFields } = containerFields[containerIndex]; + + if (Object.keys(nonTableFields).length > 0) { + const record = {}; + + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const relativeSelector = selector.split(':>>').slice(-1)[0]; + const element = queryIframe(container, relativeSelector); + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + nonTableData.push(record); + } + } + } + + // Merge and limit the results + const scrapedData = [...tableData, ...nonTableData]; return scrapedData; -}; + }; /** From 7b08471ba1cbb06198457c8a3ac92ee7b4201bd3 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 4 Jan 2025 15:37:35 +0530 Subject: [PATCH 15/15] feat: add func to rm iframe selector from workflow --- maxun-core/src/interpret.ts | 18 +++++++++++++++++- 1 
file changed, 17 insertions(+), 1 deletion(-) diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 14d8f46e..e11ea517 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -658,8 +658,24 @@ export default class Interpreter extends EventEmitter { } } + private removeIframeSelectors(workflow: Workflow) { + for (let actionId = workflow.length - 1; actionId >= 0; actionId--) { + const step = workflow[actionId]; + + // Only steps whose where clause carries a selectors array are touched + if (step.where && Array.isArray(step.where.selectors)) { + // Drop iframe selectors, i.e. those containing the ":>>" separator; the step is updated in place and the same workflow object is returned + step.where.selectors = step.where.selectors.filter(selector => !selector.includes(':>>')); + } + } + + return workflow; + } + private async runLoop(p: Page, workflow: Workflow) { - const workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow)); + let workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow)); + + workflowCopy = this.removeIframeSelectors(workflowCopy); // apply ad-blocker to the current page await this.applyAdBlocker(p);