From 5f1084c503c0a970f7953caa17a545097fc5e533 Mon Sep 17 00:00:00 2001 From: Rohit Date: Sun, 6 Jul 2025 16:20:06 +0530 Subject: [PATCH] feat: revamp client list extractor with xpath logic --- src/helpers/clientListExtractor.ts | 812 ++++++++++++----------------- 1 file changed, 324 insertions(+), 488 deletions(-) diff --git a/src/helpers/clientListExtractor.ts b/src/helpers/clientListExtractor.ts index c7b21fd3..790abdea 100644 --- a/src/helpers/clientListExtractor.ts +++ b/src/helpers/clientListExtractor.ts @@ -15,30 +15,89 @@ interface ExtractedListData { [key: string]: string; } -interface TableField { +interface Field { selector: string; attribute: string; - tableContext?: string; - cellIndex?: number; -} - -interface NonTableField { - selector: string; - attribute: string; -} - -interface ContainerFields { - tableFields: Record; - nonTableFields: Record; } class ClientListExtractor { + private evaluateXPath = ( + rootElement: Element | Document, + xpath: string + ): Element | null => { + try { + const ownerDoc = + rootElement.nodeType === Node.DOCUMENT_NODE + ? (rootElement as Document) + : rootElement.ownerDocument; + + if (!ownerDoc) return null; + + const result = ownerDoc.evaluate( + xpath, + rootElement, + null, + XPathResult.FIRST_ORDERED_NODE_TYPE, + null + ); + + return result.singleNodeValue as Element | null; + } catch (error) { + console.warn("XPath evaluation failed:", xpath, error); + return null; + } + }; + + private evaluateXPathAll = ( + rootElement: Element | Document, + xpath: string + ): Element[] => { + try { + const ownerDoc = + rootElement.nodeType === Node.DOCUMENT_NODE + ? (rootElement as Document) + : rootElement.ownerDocument; + + if (!ownerDoc) return []; + + const result = ownerDoc.evaluate( + xpath, + rootElement, + null, + XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, + null + ); + + const elements: Element[] = []; + for (let i = 0; i < result.snapshotLength; i++) { + const node = result.snapshotItem(i); + if (node && node.nodeType === Node.ELEMENT_NODE) { + elements.push(node as Element); + } + } + + return elements; + } catch (error) { + console.warn("XPath evaluation failed:", xpath, error); + return []; + } + }; + private queryElement = ( rootElement: Element | Document, selector: string ): Element | null => { if (!selector.includes(">>") && !selector.includes(":>>")) { - return rootElement.querySelector(selector); + // Check if it's an XPath selector (starts with // or / or ./) + if ( + selector.startsWith("//") || + selector.startsWith("/") || + selector.startsWith("./") + ) { + return this.evaluateXPath(rootElement, selector); + } else { + return rootElement.querySelector(selector); + } } const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); @@ -59,7 +118,17 @@ class ClientListExtractor { frameElement.contentDocument || frameElement.contentWindow?.document; if (!frameDoc) return null; - currentElement = frameDoc.querySelector(parts[i]); + + // Handle XPath in iframe context + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + currentElement = this.evaluateXPath(frameDoc, parts[i]); + } else { + currentElement = frameDoc.querySelector(parts[i]); + } continue; } catch (e) { console.warn( @@ -75,7 +144,16 @@ class ClientListExtractor { let nextElement: Element | null = null; if ("querySelector" in currentElement) { - nextElement = currentElement.querySelector(parts[i]); + // Handle XPath vs CSS selector + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + nextElement = this.evaluateXPath(currentElement, parts[i]); + } else { + nextElement = currentElement.querySelector(parts[i]); + } } if ( @@ -83,9 +161,20 @@ class ClientListExtractor { "shadowRoot" in currentElement && (currentElement as Element).shadowRoot ) { - nextElement = (currentElement as Element).shadowRoot!.querySelector( - parts[i] - ); + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + nextElement = this.evaluateXPath( + (currentElement as Element).shadowRoot as unknown as Document, + parts[i] + ); + } else { + nextElement = (currentElement as Element).shadowRoot!.querySelector( + parts[i] + ); + } } if (!nextElement && "children" in currentElement) { @@ -94,7 +183,18 @@ class ClientListExtractor { ); for (const child of children) { if (child.shadowRoot) { - nextElement = child.shadowRoot.querySelector(parts[i]); + if ( + parts[i].startsWith("//") || + parts[i].startsWith("/") || + parts[i].startsWith("./") + ) { + nextElement = this.evaluateXPath( + child.shadowRoot as unknown as Document, + parts[i] + ); + } else { + nextElement = child.shadowRoot.querySelector(parts[i]); + } if (nextElement) break; } } @@ -111,7 +211,12 @@ class ClientListExtractor { selector: string ): Element[] => { if (!selector.includes(">>") && !selector.includes(":>>")) { - return Array.from(rootElement.querySelectorAll(selector)); + // Check if it's an XPath selector (starts with // or /) + if (selector.startsWith("//") || selector.startsWith("/")) { + return this.evaluateXPathAll(rootElement, selector); + } else { + return Array.from(rootElement.querySelectorAll(selector)); + } } const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); @@ -133,7 +238,14 @@ class ClientListExtractor { frameElement.contentDocument || frameElement.contentWindow?.document; if (frameDoc) { - nextElements.push(...Array.from(frameDoc.querySelectorAll(part))); + // Handle XPath in iframe context + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push(...this.evaluateXPathAll(frameDoc, part)); + } else { + nextElements.push( + ...Array.from(frameDoc.querySelectorAll(part)) + ); + } } } catch (e) { console.warn( @@ -146,24 +258,47 @@ class ClientListExtractor { } } else { if ("querySelectorAll" in element) { - nextElements.push(...Array.from(element.querySelectorAll(part))); + // Handle XPath vs CSS selector + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push(...this.evaluateXPathAll(element, part)); + } else { + nextElements.push(...Array.from(element.querySelectorAll(part))); + } } if ("shadowRoot" in element && (element as Element).shadowRoot) { - nextElements.push( - ...Array.from( - (element as Element).shadowRoot!.querySelectorAll(part) - ) - ); + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push( + ...this.evaluateXPathAll( + (element as Element).shadowRoot as unknown as Document, + part + ) + ); + } else { + nextElements.push( + ...Array.from( + (element as Element).shadowRoot!.querySelectorAll(part) + ) + ); + } } if ("children" in element) { const children = Array.from((element as Element).children || []); for (const child of children) { if (child.shadowRoot) { - nextElements.push( - ...Array.from(child.shadowRoot.querySelectorAll(part)) - ); + if (part.startsWith("//") || part.startsWith("/")) { + nextElements.push( + ...this.evaluateXPathAll( + child.shadowRoot as unknown as Document, + part + ) + ); + } else { + nextElements.push( + ...Array.from(child.shadowRoot.querySelectorAll(part)) + ); + } } } } @@ -193,35 +328,66 @@ class ClientListExtractor { } if (attribute === "innerText") { - return (element as HTMLElement).innerText?.trim() || null; - } else if (attribute === "innerHTML") { - return element.innerHTML?.trim() || null; - } else if (attribute === "src" || attribute === "href") { - if (attribute === "href" && element.tagName !== "A") { - const parentElement = element.parentElement; - if (parentElement && parentElement.tagName === "A") { - const parentHref = parentElement.getAttribute("href"); - if (parentHref) { - try { - return new URL(parentHref, baseURL).href; - } catch (e) { - return parentHref; - } + // First try standard innerText/textContent + let textContent = + (element as HTMLElement).innerText?.trim() || + (element as HTMLElement).textContent?.trim(); + + // If empty, check for common data attributes that might contain the text + if (!textContent) { + // Check for data-* attributes that commonly contain text values + const dataAttributes = [ + "data-600", + "data-text", + "data-label", + "data-value", + "data-content", + ]; + for (const attr of dataAttributes) { + const dataValue = element.getAttribute(attr); + if (dataValue && dataValue.trim()) { + textContent = dataValue.trim(); + break; } } } + return textContent || null; + } else if (attribute === "innerHTML") { + return element.innerHTML?.trim() || null; + } else if (attribute === "href") { + // For href, we need to find the anchor tag if the current element isn't one + let anchorElement = element; + + // If current element is not an anchor, look for parent anchor + if (element.tagName !== "A") { + anchorElement = + element.closest("a") || + element.parentElement?.closest("a") || + element; + } + + const hrefValue = anchorElement.getAttribute("href"); + if (!hrefValue || hrefValue.trim() === "") { + return null; + } + + try { + return new URL(hrefValue, baseURL).href; + } catch (e) { + console.warn("Error creating URL from", hrefValue, e); + return hrefValue; + } + } else if (attribute === "src") { const attrValue = element.getAttribute(attribute); const dataAttr = attrValue || element.getAttribute("data-" + attribute); if (!dataAttr || dataAttr.trim() === "") { - if (attribute === "src") { - const style = window.getComputedStyle(element as HTMLElement); - const bgImage = style.backgroundImage; - if (bgImage && bgImage !== "none") { - const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); - return matches ? new URL(matches[1], baseURL).href : null; - } + const style = window.getComputedStyle(element as HTMLElement); + const bgImage = style.backgroundImage; + if (bgImage && bgImage !== "none") { + const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); + return matches ? new URL(matches[1], baseURL).href : null; } return null; } @@ -236,187 +402,8 @@ class ClientListExtractor { return element.getAttribute(attribute); }; - private findTableAncestor = ( - element: Element - ): { type: string; element: Element } | null => { - let currentElement: Element | null = element; - const MAX_DEPTH = 5; - let depth = 0; - - while (currentElement && depth < MAX_DEPTH) { - if (currentElement.getRootNode() instanceof ShadowRoot) { - currentElement = (currentElement.getRootNode() as ShadowRoot).host; - continue; - } - - if (currentElement.tagName === "TD") { - return { type: "TD", element: currentElement }; - } else if (currentElement.tagName === "TR") { - return { type: "TR", element: currentElement }; - } - - if ( - currentElement.tagName === "IFRAME" || - currentElement.tagName === "FRAME" - ) { - try { - const frameElement = currentElement as - | HTMLIFrameElement - | HTMLFrameElement; - currentElement = frameElement.contentDocument?.body || null; - } catch (e) { - return null; - } - } else { - currentElement = currentElement.parentElement; - } - depth++; - } - return null; - }; - - private getCellIndex = (td: Element): number => { - if (td.getRootNode() instanceof ShadowRoot) { - const shadowRoot = td.getRootNode() as ShadowRoot; - const allCells = Array.from(shadowRoot.querySelectorAll("td")); - return allCells.indexOf(td as HTMLTableCellElement); - } - - let index = 0; - let sibling = td; - while ((sibling = sibling.previousElementSibling as Element)) { - index++; - } - return index; - }; - - private hasThElement = ( - row: Element, - tableFields: Record - ): boolean => { - for (const [_, { selector }] of Object.entries(tableFields)) { - const element = this.queryElement(row, selector); - if (element) { - let current: Element | ShadowRoot | Document | null = element; - while (current && current !== row) { - if (current.getRootNode() instanceof ShadowRoot) { - current = (current.getRootNode() as ShadowRoot).host; - continue; - } - - if ((current as Element).tagName === "TH") return true; - - if ( - (current as Element).tagName === "IFRAME" || - (current as Element).tagName === "FRAME" - ) { - try { - const frameElement = current as - | HTMLIFrameElement - | HTMLFrameElement; - current = frameElement.contentDocument?.body || null; - } catch (e) { - break; - } - } else { - current = (current as Element).parentElement; - } - } - } - } - return false; - }; - - private filterRowsBasedOnTag = ( - rows: Element[], - tableFields: Record - ): Element[] => { - for (const row of rows) { - if (this.hasThElement(row, tableFields)) { - return rows; - } - } - return rows.filter((row) => { - const directTH = row.getElementsByTagName("TH").length === 0; - const shadowTH = row.shadowRoot - ? row.shadowRoot.querySelector("th") === null - : true; - return directTH && shadowTH; - }); - }; - - private calculateClassSimilarity = ( - classList1: string[], - classList2: string[] - ): number => { - const set1 = new Set(classList1); - const set2 = new Set(classList2); - const intersection = new Set([...set1].filter((x) => set2.has(x))); - const union = new Set([...set1, ...set2]); - return intersection.size / union.size; - }; - - private findSimilarElements = ( - baseElement: Element, - document: Document, - similarityThreshold: number = 0.7 - ): Element[] => { - const baseClasses = Array.from(baseElement.classList); - if (baseClasses.length === 0) return []; - - const allElements: Element[] = []; - - allElements.push( - ...Array.from(document.getElementsByTagName(baseElement.tagName)) - ); - - if (baseElement.getRootNode() instanceof ShadowRoot) { - const shadowHost = (baseElement.getRootNode() as ShadowRoot).host; - allElements.push( - ...Array.from(shadowHost.getElementsByTagName(baseElement.tagName)) - ); - } - - const frames = [ - ...Array.from(document.getElementsByTagName("iframe")), - ...Array.from(document.getElementsByTagName("frame")), - ]; - - for (const frame of frames) { - try { - const frameElement = frame as HTMLIFrameElement | HTMLFrameElement; - const frameDoc = - frameElement.contentDocument || frameElement.contentWindow?.document; - if (frameDoc) { - allElements.push( - ...Array.from(frameDoc.getElementsByTagName(baseElement.tagName)) - ); - } - } catch (e) { - console.warn( - `Cannot access ${frame.tagName.toLowerCase()} content:`, - e - ); - } - } - - return allElements.filter((element) => { - if (element === baseElement) return false; - const similarity = this.calculateClassSimilarity( - baseClasses, - Array.from(element.classList) - ); - return similarity >= similarityThreshold; - }); - }; - - private convertFields = ( - fields: any - ): Record => { - const convertedFields: Record< - string, - { selector: string; attribute: string } - > = {}; + private convertFields = (fields: any): Record => { + const convertedFields: Record = {}; for (const [key, field] of Object.entries(fields)) { const typedField = field as TextStep; @@ -439,285 +426,134 @@ class ClientListExtractor { // Convert fields to the format expected by the extraction logic const convertedFields = this.convertFields(fields); - // Get all container elements matching the list selector - let containers = this.queryElementAll(iframeDocument, listSelector); + // Step 1: Get all container elements matching the list selector + const containers = this.queryElementAll(iframeDocument, listSelector); if (containers.length === 0) { - console.warn("No containers found for listSelector:", listSelector); + console.warn("❌ No containers found for listSelector:", listSelector); return []; } - // Enhanced container discovery: find similar elements if we need more containers - if (limit > 1 && containers.length === 1) { - const baseContainer = containers[0]; - const similarContainers = this.findSimilarElements( - baseContainer, - iframeDocument, - 0.7 - ); + // Step 2: Extract data from each container up to the limit + const extractedData: ExtractedListData[] = []; + const containersToProcess = Math.min(containers.length, limit); - if (similarContainers.length > 0) { - const newContainers = similarContainers.filter( - (container) => !container.matches(listSelector) - ); - containers = [...containers, ...newContainers]; - } - } + for ( + let containerIndex = 0; + containerIndex < containersToProcess; + containerIndex++ + ) { + const container = containers[containerIndex]; + const record: ExtractedListData = {}; - // Analyze fields for table vs non-table context - const containerFields: ContainerFields[] = containers.map(() => ({ - tableFields: {}, - nonTableFields: {}, - })); + // Step 3: For each field, extract data from the current container + for (const [label, { selector, attribute }] of Object.entries( + convertedFields + )) { + let element: Element | null = null; - containers.forEach((container, containerIndex) => { - for (const [label, field] of Object.entries(convertedFields)) { - const sampleElement = this.queryElement(container, field.selector); + // CORRECT APPROACH: Create indexed absolute XPath + if (selector.startsWith("//")) { + // Convert the absolute selector to target the specific container instance + const indexedSelector = this.createIndexedXPath( + selector, + listSelector, + containerIndex + 1 + ); - if (sampleElement) { - const ancestor = this.findTableAncestor(sampleElement); - if (ancestor) { - containerFields[containerIndex].tableFields[label] = { - ...field, - tableContext: ancestor.type, - cellIndex: - ancestor.type === "TD" - ? this.getCellIndex(ancestor.element) - : -1, - }; - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } + element = this.evaluateXPathSingle(iframeDocument, indexedSelector); } else { - containerFields[containerIndex].nonTableFields[label] = field; + // Fallback for non-XPath selectors + element = this.queryElement(container, selector); } - } - }); - // Extract table data - const tableData: ExtractedListData[] = []; - for ( - let containerIndex = 0; - containerIndex < containers.length; - containerIndex++ - ) { - const container = containers[containerIndex]; - const { tableFields } = containerFields[containerIndex]; - - if (Object.keys(tableFields).length > 0) { - const firstField = Object.values(tableFields)[0]; - const firstElement = this.queryElement( - container, - firstField.selector - ); - let tableContext: Element | null = firstElement; - - // Find the table context - while ( - tableContext && - tableContext.tagName !== "TABLE" && - tableContext !== container - ) { - if (tableContext.getRootNode() instanceof ShadowRoot) { - tableContext = (tableContext.getRootNode() as ShadowRoot).host; - continue; - } - - if ( - tableContext.tagName === "IFRAME" || - tableContext.tagName === "FRAME" - ) { - try { - const frameElement = tableContext as - | HTMLIFrameElement - | HTMLFrameElement; - tableContext = frameElement.contentDocument?.body || null; - } catch (e) { - break; - } + // Step 4: Extract the value from the found element + if (element) { + const value = this.extractValue(element, attribute); + if (value !== null && value !== "") { + record[label] = value; } else { - tableContext = tableContext.parentElement; - } - } - - if (tableContext) { - const rows: Element[] = []; - rows.push(...Array.from(tableContext.getElementsByTagName("TR"))); - - if ( - tableContext.tagName === "IFRAME" || - tableContext.tagName === "FRAME" - ) { - try { - const frameElement = tableContext as - | HTMLIFrameElement - | HTMLFrameElement; - const frameDoc = - frameElement.contentDocument || - frameElement.contentWindow?.document; - if (frameDoc) { - rows.push(...Array.from(frameDoc.getElementsByTagName("TR"))); - } - } catch (e) { - console.warn( - `Cannot access ${tableContext.tagName.toLowerCase()} rows:`, - e - ); - } - } - - const processedRows = this.filterRowsBasedOnTag(rows, tableFields); - - for ( - let rowIndex = 0; - rowIndex < Math.min(processedRows.length, limit); - rowIndex++ - ) { - const record: ExtractedListData = {}; - const currentRow = processedRows[rowIndex]; - - for (const [ - label, - { selector, attribute, cellIndex }, - ] of Object.entries(tableFields)) { - let element: Element | null = null; - - if (cellIndex !== undefined && cellIndex >= 0) { - let td: Element | null = - currentRow.children[cellIndex] || null; - - if (!td && currentRow.shadowRoot) { - const shadowCells = currentRow.shadowRoot.children; - if (shadowCells && shadowCells.length > cellIndex) { - td = shadowCells[cellIndex]; - } - } - - if (td) { - element = this.queryElement(td, selector); - - if ( - !element && - selector - .split(/(?:>>|:>>)/) - .pop() - ?.includes("td:nth-child") - ) { - element = td; - } - - if (!element) { - const tagOnlySelector = selector.split(".")[0]; - element = this.queryElement(td, tagOnlySelector); - } - - if (!element) { - let currentElement: Element | null = td; - while ( - currentElement && - currentElement.children.length > 0 - ) { - let foundContentChild = false; - for (const child of Array.from( - currentElement.children - )) { - if (this.extractValue(child, attribute)) { - currentElement = child; - foundContentChild = true; - break; - } - } - if (!foundContentChild) break; - } - element = currentElement; - } - } - } else { - element = this.queryElement(currentRow, selector); - } - - if (element) { - const value = this.extractValue(element, attribute); - if (value !== null && value !== "") { - record[label] = value; - } else { - console.warn( - `❌ No value for ${label} in row ${rowIndex + 1}` - ); - record[label] = ""; - } - } else { - console.warn( - `❌ Element not found for ${label} with selector:`, - selector - ); - record[label] = ""; - } - } - - if (Object.values(record).some((value) => value !== "")) { - tableData.push(record); - } - } - } - } - } - - // Extract non-table data - const nonTableData: ExtractedListData[] = []; - for ( - let containerIndex = 0; - containerIndex < containers.length; - containerIndex++ - ) { - if (nonTableData.length >= limit) break; - - const container = containers[containerIndex]; - const { nonTableFields } = containerFields[containerIndex]; - - if (Object.keys(nonTableFields).length > 0) { - const record: ExtractedListData = {}; - - for (const [label, { selector, attribute }] of Object.entries( - nonTableFields - )) { - const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0]; - const element = this.queryElement(container, relativeSelector); - - if (element) { - const value = this.extractValue(element, attribute); - if (value !== null && value !== "") { - record[label] = value; - } else { - console.warn( - `❌ No value for ${label} in container ${containerIndex + 1}` - ); - record[label] = ""; - } - } else { - console.warn( - `❌ Element not found for ${label} with selector:`, - selector - ); + console.warn(` ⚠️ Empty value for "${label}"`); record[label] = ""; } - } - - if (Object.values(record).some((value) => value !== "")) { - nonTableData.push(record); + } else { + console.warn(` ❌ Element not found for "${label}"`); + record[label] = ""; } } - } - // Combine and limit results - const extractedData = [...tableData, ...nonTableData].slice(0, limit); + // Step 5: Add record if it has any non-empty values + if (Object.values(record).some((value) => value !== "")) { + extractedData.push(record); + } else { + console.warn( + ` ⚠️ Skipping empty record for container ${containerIndex + 1}` + ); + } + } return extractedData; } catch (error) { - console.error("Error in client-side extractListData:", error); + console.error("💥 Error in client-side extractListData:", error); return []; } }; + + // Create indexed XPath for specific container instance + private createIndexedXPath( + childSelector: string, + listSelector: string, + containerIndex: number + ): string { + // Check if the child selector contains the list selector pattern + if (childSelector.includes(listSelector.replace("//", ""))) { + // Replace the list selector part with indexed version + const listPattern = listSelector.replace("//", ""); + const indexedListSelector = `(${listSelector})[${containerIndex}]`; + + const indexedSelector = childSelector.replace( + `//${listPattern}`, + indexedListSelector + ); + + return indexedSelector; + } else { + // If pattern doesn't match, create a more generic indexed selector + // This is a fallback approach + console.warn(` ⚠️ Pattern doesn't match, using fallback approach`); + return `(${listSelector})[${containerIndex}]${childSelector.replace( + "//", + "/" + )}`; + } + } + + // Helper method for single XPath evaluation + private evaluateXPathSingle = ( + document: Document, + xpath: string + ): Element | null => { + try { + const result = document.evaluate( + xpath, + document, + null, + XPathResult.FIRST_ORDERED_NODE_TYPE, + null + ); + + const element = result.singleNodeValue as Element | null; + + if (!element) { + console.warn(`❌ XPath found no element for: ${xpath}`); + } + + return element; + } catch (error) { + console.error("❌ XPath evaluation failed:", xpath, error); + return null; + } + }; } export const clientListExtractor = new ClientListExtractor();