feat: revamp client list extractor with xpath logic
This commit is contained in:
@@ -15,30 +15,89 @@ interface ExtractedListData {
|
|||||||
[key: string]: string;
|
[key: string]: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
interface TableField {
|
interface Field {
|
||||||
selector: string;
|
selector: string;
|
||||||
attribute: string;
|
attribute: string;
|
||||||
tableContext?: string;
|
|
||||||
cellIndex?: number;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface NonTableField {
|
|
||||||
selector: string;
|
|
||||||
attribute: string;
|
|
||||||
}
|
|
||||||
|
|
||||||
interface ContainerFields {
|
|
||||||
tableFields: Record<string, TableField>;
|
|
||||||
nonTableFields: Record<string, NonTableField>;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
class ClientListExtractor {
|
class ClientListExtractor {
|
||||||
|
/**
 * Evaluates an XPath expression with `rootElement` as the context node and
 * returns the first matching element in document order.
 *
 * @param rootElement - Context node for the expression (an element or a document).
 * @param xpath - XPath expression to evaluate.
 * @returns The first matching element, or `null` when nothing matches, when
 *   the first match is not an element node, when the context node has no
 *   owner document, or when evaluation throws.
 */
private evaluateXPath = (
  rootElement: Element | Document,
  xpath: string
): Element | null => {
  try {
    // `evaluate` is a Document method, so resolve the document that owns the context node.
    const ownerDoc =
      rootElement.nodeType === Node.DOCUMENT_NODE
        ? (rootElement as Document)
        : rootElement.ownerDocument;

    if (!ownerDoc) return null;

    const match = ownerDoc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    ).singleNodeValue;

    // Expressions ending in `text()` or `@attr` select non-element nodes.
    // Reject them here (as evaluateXPathAll does) instead of handing callers a
    // node that lacks Element methods such as getAttribute.
    if (!match || match.nodeType !== Node.ELEMENT_NODE) return null;

    return match as Element;
  } catch (error) {
    // Invalid expressions throw; treat them as "no match" but leave a trace.
    console.warn("XPath evaluation failed:", xpath, error);
    return null;
  }
};
|
||||||
|
|
||||||
|
/**
 * Evaluates an XPath expression with `rootElement` as the context node and
 * collects every matching element, in document order.
 *
 * @param rootElement - Context node for the expression (an element or a document).
 * @param xpath - XPath expression to evaluate.
 * @returns The matching element nodes; non-element matches are dropped. An
 *   empty array is returned when the context node has no owner document or
 *   when evaluation throws.
 */
private evaluateXPathAll = (
  rootElement: Element | Document,
  xpath: string
): Element[] => {
  try {
    const isDocument = rootElement.nodeType === Node.DOCUMENT_NODE;
    const contextDoc = isDocument
      ? (rootElement as Document)
      : rootElement.ownerDocument;

    if (!contextDoc) {
      return [];
    }

    const snapshot = contextDoc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    // Materialise the snapshot, then keep only element nodes.
    return Array.from({ length: snapshot.snapshotLength }, (_unused, position) =>
      snapshot.snapshotItem(position)
    ).filter(
      (candidate): candidate is Element =>
        !!candidate && candidate.nodeType === Node.ELEMENT_NODE
    );
  } catch (error) {
    console.warn("XPath evaluation failed:", xpath, error);
    return [];
  }
};
|
||||||
|
|
||||||
private queryElement = (
|
private queryElement = (
|
||||||
rootElement: Element | Document,
|
rootElement: Element | Document,
|
||||||
selector: string
|
selector: string
|
||||||
): Element | null => {
|
): Element | null => {
|
||||||
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
||||||
return rootElement.querySelector(selector);
|
// Check if it's an XPath selector (starts with // or / or ./)
|
||||||
|
if (
|
||||||
|
selector.startsWith("//") ||
|
||||||
|
selector.startsWith("/") ||
|
||||||
|
selector.startsWith("./")
|
||||||
|
) {
|
||||||
|
return this.evaluateXPath(rootElement, selector);
|
||||||
|
} else {
|
||||||
|
return rootElement.querySelector(selector);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
||||||
@@ -59,7 +118,17 @@ class ClientListExtractor {
|
|||||||
frameElement.contentDocument ||
|
frameElement.contentDocument ||
|
||||||
frameElement.contentWindow?.document;
|
frameElement.contentWindow?.document;
|
||||||
if (!frameDoc) return null;
|
if (!frameDoc) return null;
|
||||||
currentElement = frameDoc.querySelector(parts[i]);
|
|
||||||
|
// Handle XPath in iframe context
|
||||||
|
if (
|
||||||
|
parts[i].startsWith("//") ||
|
||||||
|
parts[i].startsWith("/") ||
|
||||||
|
parts[i].startsWith("./")
|
||||||
|
) {
|
||||||
|
currentElement = this.evaluateXPath(frameDoc, parts[i]);
|
||||||
|
} else {
|
||||||
|
currentElement = frameDoc.querySelector(parts[i]);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(
|
console.warn(
|
||||||
@@ -75,7 +144,16 @@ class ClientListExtractor {
|
|||||||
let nextElement: Element | null = null;
|
let nextElement: Element | null = null;
|
||||||
|
|
||||||
if ("querySelector" in currentElement) {
|
if ("querySelector" in currentElement) {
|
||||||
nextElement = currentElement.querySelector(parts[i]);
|
// Handle XPath vs CSS selector
|
||||||
|
if (
|
||||||
|
parts[i].startsWith("//") ||
|
||||||
|
parts[i].startsWith("/") ||
|
||||||
|
parts[i].startsWith("./")
|
||||||
|
) {
|
||||||
|
nextElement = this.evaluateXPath(currentElement, parts[i]);
|
||||||
|
} else {
|
||||||
|
nextElement = currentElement.querySelector(parts[i]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
if (
|
||||||
@@ -83,9 +161,20 @@ class ClientListExtractor {
|
|||||||
"shadowRoot" in currentElement &&
|
"shadowRoot" in currentElement &&
|
||||||
(currentElement as Element).shadowRoot
|
(currentElement as Element).shadowRoot
|
||||||
) {
|
) {
|
||||||
nextElement = (currentElement as Element).shadowRoot!.querySelector(
|
if (
|
||||||
parts[i]
|
parts[i].startsWith("//") ||
|
||||||
);
|
parts[i].startsWith("/") ||
|
||||||
|
parts[i].startsWith("./")
|
||||||
|
) {
|
||||||
|
nextElement = this.evaluateXPath(
|
||||||
|
(currentElement as Element).shadowRoot as unknown as Document,
|
||||||
|
parts[i]
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
nextElement = (currentElement as Element).shadowRoot!.querySelector(
|
||||||
|
parts[i]
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!nextElement && "children" in currentElement) {
|
if (!nextElement && "children" in currentElement) {
|
||||||
@@ -94,7 +183,18 @@ class ClientListExtractor {
|
|||||||
);
|
);
|
||||||
for (const child of children) {
|
for (const child of children) {
|
||||||
if (child.shadowRoot) {
|
if (child.shadowRoot) {
|
||||||
nextElement = child.shadowRoot.querySelector(parts[i]);
|
if (
|
||||||
|
parts[i].startsWith("//") ||
|
||||||
|
parts[i].startsWith("/") ||
|
||||||
|
parts[i].startsWith("./")
|
||||||
|
) {
|
||||||
|
nextElement = this.evaluateXPath(
|
||||||
|
child.shadowRoot as unknown as Document,
|
||||||
|
parts[i]
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
nextElement = child.shadowRoot.querySelector(parts[i]);
|
||||||
|
}
|
||||||
if (nextElement) break;
|
if (nextElement) break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -111,7 +211,12 @@ class ClientListExtractor {
|
|||||||
selector: string
|
selector: string
|
||||||
): Element[] => {
|
): Element[] => {
|
||||||
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
||||||
return Array.from(rootElement.querySelectorAll(selector));
|
// Check if it's an XPath selector (starts with // or /)
|
||||||
|
if (selector.startsWith("//") || selector.startsWith("/")) {
|
||||||
|
return this.evaluateXPathAll(rootElement, selector);
|
||||||
|
} else {
|
||||||
|
return Array.from(rootElement.querySelectorAll(selector));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
||||||
@@ -133,7 +238,14 @@ class ClientListExtractor {
|
|||||||
frameElement.contentDocument ||
|
frameElement.contentDocument ||
|
||||||
frameElement.contentWindow?.document;
|
frameElement.contentWindow?.document;
|
||||||
if (frameDoc) {
|
if (frameDoc) {
|
||||||
nextElements.push(...Array.from(frameDoc.querySelectorAll(part)));
|
// Handle XPath in iframe context
|
||||||
|
if (part.startsWith("//") || part.startsWith("/")) {
|
||||||
|
nextElements.push(...this.evaluateXPathAll(frameDoc, part));
|
||||||
|
} else {
|
||||||
|
nextElements.push(
|
||||||
|
...Array.from(frameDoc.querySelectorAll(part))
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(
|
console.warn(
|
||||||
@@ -146,24 +258,47 @@ class ClientListExtractor {
|
|||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
if ("querySelectorAll" in element) {
|
if ("querySelectorAll" in element) {
|
||||||
nextElements.push(...Array.from(element.querySelectorAll(part)));
|
// Handle XPath vs CSS selector
|
||||||
|
if (part.startsWith("//") || part.startsWith("/")) {
|
||||||
|
nextElements.push(...this.evaluateXPathAll(element, part));
|
||||||
|
} else {
|
||||||
|
nextElements.push(...Array.from(element.querySelectorAll(part)));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ("shadowRoot" in element && (element as Element).shadowRoot) {
|
if ("shadowRoot" in element && (element as Element).shadowRoot) {
|
||||||
nextElements.push(
|
if (part.startsWith("//") || part.startsWith("/")) {
|
||||||
...Array.from(
|
nextElements.push(
|
||||||
(element as Element).shadowRoot!.querySelectorAll(part)
|
...this.evaluateXPathAll(
|
||||||
)
|
(element as Element).shadowRoot as unknown as Document,
|
||||||
);
|
part
|
||||||
|
)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
nextElements.push(
|
||||||
|
...Array.from(
|
||||||
|
(element as Element).shadowRoot!.querySelectorAll(part)
|
||||||
|
)
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ("children" in element) {
|
if ("children" in element) {
|
||||||
const children = Array.from((element as Element).children || []);
|
const children = Array.from((element as Element).children || []);
|
||||||
for (const child of children) {
|
for (const child of children) {
|
||||||
if (child.shadowRoot) {
|
if (child.shadowRoot) {
|
||||||
nextElements.push(
|
if (part.startsWith("//") || part.startsWith("/")) {
|
||||||
...Array.from(child.shadowRoot.querySelectorAll(part))
|
nextElements.push(
|
||||||
);
|
...this.evaluateXPathAll(
|
||||||
|
child.shadowRoot as unknown as Document,
|
||||||
|
part
|
||||||
|
)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
nextElements.push(
|
||||||
|
...Array.from(child.shadowRoot.querySelectorAll(part))
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -193,35 +328,66 @@ class ClientListExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (attribute === "innerText") {
|
if (attribute === "innerText") {
|
||||||
return (element as HTMLElement).innerText?.trim() || null;
|
// First try standard innerText/textContent
|
||||||
} else if (attribute === "innerHTML") {
|
let textContent =
|
||||||
return element.innerHTML?.trim() || null;
|
(element as HTMLElement).innerText?.trim() ||
|
||||||
} else if (attribute === "src" || attribute === "href") {
|
(element as HTMLElement).textContent?.trim();
|
||||||
if (attribute === "href" && element.tagName !== "A") {
|
|
||||||
const parentElement = element.parentElement;
|
// If empty, check for common data attributes that might contain the text
|
||||||
if (parentElement && parentElement.tagName === "A") {
|
if (!textContent) {
|
||||||
const parentHref = parentElement.getAttribute("href");
|
// Check for data-* attributes that commonly contain text values
|
||||||
if (parentHref) {
|
const dataAttributes = [
|
||||||
try {
|
"data-600",
|
||||||
return new URL(parentHref, baseURL).href;
|
"data-text",
|
||||||
} catch (e) {
|
"data-label",
|
||||||
return parentHref;
|
"data-value",
|
||||||
}
|
"data-content",
|
||||||
|
];
|
||||||
|
for (const attr of dataAttributes) {
|
||||||
|
const dataValue = element.getAttribute(attr);
|
||||||
|
if (dataValue && dataValue.trim()) {
|
||||||
|
textContent = dataValue.trim();
|
||||||
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
return textContent || null;
|
||||||
|
} else if (attribute === "innerHTML") {
|
||||||
|
return element.innerHTML?.trim() || null;
|
||||||
|
} else if (attribute === "href") {
|
||||||
|
// For href, we need to find the anchor tag if the current element isn't one
|
||||||
|
let anchorElement = element;
|
||||||
|
|
||||||
|
// If current element is not an anchor, look for parent anchor
|
||||||
|
if (element.tagName !== "A") {
|
||||||
|
anchorElement =
|
||||||
|
element.closest("a") ||
|
||||||
|
element.parentElement?.closest("a") ||
|
||||||
|
element;
|
||||||
|
}
|
||||||
|
|
||||||
|
const hrefValue = anchorElement.getAttribute("href");
|
||||||
|
if (!hrefValue || hrefValue.trim() === "") {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return new URL(hrefValue, baseURL).href;
|
||||||
|
} catch (e) {
|
||||||
|
console.warn("Error creating URL from", hrefValue, e);
|
||||||
|
return hrefValue;
|
||||||
|
}
|
||||||
|
} else if (attribute === "src") {
|
||||||
const attrValue = element.getAttribute(attribute);
|
const attrValue = element.getAttribute(attribute);
|
||||||
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
|
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
|
||||||
|
|
||||||
if (!dataAttr || dataAttr.trim() === "") {
|
if (!dataAttr || dataAttr.trim() === "") {
|
||||||
if (attribute === "src") {
|
const style = window.getComputedStyle(element as HTMLElement);
|
||||||
const style = window.getComputedStyle(element as HTMLElement);
|
const bgImage = style.backgroundImage;
|
||||||
const bgImage = style.backgroundImage;
|
if (bgImage && bgImage !== "none") {
|
||||||
if (bgImage && bgImage !== "none") {
|
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
||||||
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
return matches ? new URL(matches[1], baseURL).href : null;
|
||||||
return matches ? new URL(matches[1], baseURL).href : null;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
@@ -236,187 +402,8 @@ class ClientListExtractor {
|
|||||||
return element.getAttribute(attribute);
|
return element.getAttribute(attribute);
|
||||||
};
|
};
|
||||||
|
|
||||||
private findTableAncestor = (
|
private convertFields = (fields: any): Record<string, Field> => {
|
||||||
element: Element
|
const convertedFields: Record<string, Field> = {};
|
||||||
): { type: string; element: Element } | null => {
|
|
||||||
let currentElement: Element | null = element;
|
|
||||||
const MAX_DEPTH = 5;
|
|
||||||
let depth = 0;
|
|
||||||
|
|
||||||
while (currentElement && depth < MAX_DEPTH) {
|
|
||||||
if (currentElement.getRootNode() instanceof ShadowRoot) {
|
|
||||||
currentElement = (currentElement.getRootNode() as ShadowRoot).host;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (currentElement.tagName === "TD") {
|
|
||||||
return { type: "TD", element: currentElement };
|
|
||||||
} else if (currentElement.tagName === "TR") {
|
|
||||||
return { type: "TR", element: currentElement };
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
currentElement.tagName === "IFRAME" ||
|
|
||||||
currentElement.tagName === "FRAME"
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
const frameElement = currentElement as
|
|
||||||
| HTMLIFrameElement
|
|
||||||
| HTMLFrameElement;
|
|
||||||
currentElement = frameElement.contentDocument?.body || null;
|
|
||||||
} catch (e) {
|
|
||||||
return null;
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
currentElement = currentElement.parentElement;
|
|
||||||
}
|
|
||||||
depth++;
|
|
||||||
}
|
|
||||||
return null;
|
|
||||||
};
|
|
||||||
|
|
||||||
/**
 * Computes a positional index for a table cell.
 *
 * When the cell lives inside a shadow root, the index is its position among
 * all `td` elements of that shadow root (or -1 if it is not among them).
 * Otherwise it is the number of element siblings that precede the cell.
 *
 * @param td - The cell element to locate.
 * @returns The zero-based index described above.
 */
private getCellIndex = (td: Element): number => {
  const root = td.getRootNode();

  if (root instanceof ShadowRoot) {
    const shadowCells = Array.from(root.querySelectorAll("td"));
    return shadowCells.indexOf(td as HTMLTableCellElement);
  }

  // Count preceding element siblings by walking backwards until none remain.
  let precedingSiblings = 0;
  for (
    let cursor = td.previousElementSibling;
    cursor;
    cursor = cursor.previousElementSibling
  ) {
    precedingSiblings += 1;
  }
  return precedingSiblings;
};
|
|
||||||
|
|
||||||
/**
 * Reports whether any field selector, resolved inside `row`, lands on or
 * beneath a `TH` element.
 *
 * For each field the matched element's ancestor chain is walked upwards until
 * `row` is reached or the chain ends. Nodes inside a shadow root hop to the
 * shadow host; `IFRAME`/`FRAME` nodes continue from the frame document's body.
 *
 * @param row - Row element the selectors are resolved against.
 * @param tableFields - Field definitions whose `selector` values are probed.
 * @returns `true` as soon as a `TH` is met on any walk, otherwise `false`.
 */
private hasThElement = (
  row: Element,
  tableFields: Record<string, TableField>
): boolean => {
  for (const { selector } of Object.values(tableFields)) {
    const match = this.queryElement(row, selector);
    if (!match) continue;

    let node: Element | ShadowRoot | Document | null = match;
    while (node && node !== row) {
      const root = node.getRootNode();
      if (root instanceof ShadowRoot) {
        // Leave the shadow tree through its host before inspecting tags.
        node = root.host;
        continue;
      }

      const tag = (node as Element).tagName;
      if (tag === "TH") return true;

      if (tag !== "IFRAME" && tag !== "FRAME") {
        node = (node as Element).parentElement;
        continue;
      }

      try {
        const frameElement = node as HTMLIFrameElement | HTMLFrameElement;
        node = frameElement.contentDocument?.body || null;
      } catch (e) {
        // Frame content is not accessible; give up on this field only.
        break;
      }
    }
  }

  return false;
};
|
|
||||||
|
|
||||||
/**
 * Drops header rows from `rows` unless the fields themselves target header
 * cells.
 *
 * If `hasThElement` is true for any row, the input array is returned as is.
 * Otherwise only rows with no `TH` descendant, and no `th` in their own
 * shadow root when they have one, are kept.
 *
 * @param rows - Candidate row elements.
 * @param tableFields - Field definitions passed through to `hasThElement`.
 * @returns The original array or a filtered copy, as described above.
 */
private filterRowsBasedOnTag = (
  rows: Element[],
  tableFields: Record<string, TableField>
): Element[] => {
  const fieldsTargetHeader = rows.some((row) =>
    this.hasThElement(row, tableFields)
  );
  if (fieldsTargetHeader) {
    return rows;
  }

  return rows.filter((row) => {
    if (row.getElementsByTagName("TH").length !== 0) {
      return false;
    }
    const shadow = row.shadowRoot;
    return shadow ? shadow.querySelector("th") === null : true;
  });
};
|
|
||||||
|
|
||||||
/**
 * Computes the Jaccard similarity of two class-name lists: the number of
 * distinct names present in both, divided by the number of distinct names
 * present in either.
 *
 * @param classList1 - First list of class names (duplicates are ignored).
 * @param classList2 - Second list of class names (duplicates are ignored).
 * @returns A value in the range 0..1. Two empty lists yield 0 rather than the
 *   `NaN` that 0/0 would produce.
 */
private calculateClassSimilarity = (
  classList1: string[],
  classList2: string[]
): number => {
  const set1 = new Set(classList1);
  const set2 = new Set(classList2);
  const union = new Set([...set1, ...set2]);

  // Nothing to compare: avoid 0/0. NaN fails every `>=` threshold test just
  // as 0 does, so callers comparing against a positive threshold see no change.
  if (union.size === 0) return 0;

  let shared = 0;
  for (const name of set1) {
    if (set2.has(name)) shared++;
  }

  return shared / union.size;
};
|
|
||||||
|
|
||||||
/**
 * Finds elements with the same tag name as `baseElement` whose class list is
 * similar enough to it.
 *
 * Candidates are gathered from `document`, from the light-DOM descendants of
 * the shadow host when `baseElement` sits inside a shadow root, and from the
 * documents of every accessible `iframe`/`frame` in `document`. Each
 * candidate is returned at most once, in first-seen order.
 *
 * @param baseElement - Reference element; it is never part of the result.
 * @param document - Document to search.
 * @param similarityThreshold - Minimum `calculateClassSimilarity` score
 *   (inclusive) for a candidate to be kept. Defaults to 0.7.
 * @returns The similar elements, or an empty array when `baseElement` has no
 *   classes.
 */
private findSimilarElements = (
  baseElement: Element,
  document: Document,
  similarityThreshold: number = 0.7
): Element[] => {
  const baseClasses = Array.from(baseElement.classList);
  if (baseClasses.length === 0) return [];

  const tagName = baseElement.tagName;

  // A Set keeps insertion order and prevents the same element from being
  // returned twice when two scans overlap (e.g. the shadow host's descendants
  // are also reachable from the document scan). Adding one by one also avoids
  // `push(...largeArray)`, which can exceed the engine's argument limit.
  const candidates = new Set<Element>();
  const collect = (found: ArrayLike<Element>): void => {
    Array.from(found).forEach((candidate) => candidates.add(candidate));
  };

  collect(document.getElementsByTagName(tagName));

  const root = baseElement.getRootNode();
  if (root instanceof ShadowRoot) {
    // NOTE(review): this scans the host's light-DOM descendants, not the
    // shadow tree that contains baseElement — confirm that is the intent.
    collect(root.host.getElementsByTagName(tagName));
  }

  const frames = [
    ...Array.from(document.getElementsByTagName("iframe")),
    ...Array.from(document.getElementsByTagName("frame")),
  ];

  for (const frame of frames) {
    try {
      const frameElement = frame as HTMLIFrameElement | HTMLFrameElement;
      const frameDoc =
        frameElement.contentDocument || frameElement.contentWindow?.document;
      if (frameDoc) {
        collect(frameDoc.getElementsByTagName(tagName));
      }
    } catch (e) {
      // Accessing a frame's document can throw; skip that frame.
      console.warn(
        `Cannot access ${frame.tagName.toLowerCase()} content:`,
        e
      );
    }
  }

  return Array.from(candidates).filter((element) => {
    if (element === baseElement) return false;
    const similarity = this.calculateClassSimilarity(
      baseClasses,
      Array.from(element.classList)
    );
    return similarity >= similarityThreshold;
  });
};
|
|
||||||
|
|
||||||
private convertFields = (
|
|
||||||
fields: any
|
|
||||||
): Record<string, { selector: string; attribute: string }> => {
|
|
||||||
const convertedFields: Record<
|
|
||||||
string,
|
|
||||||
{ selector: string; attribute: string }
|
|
||||||
> = {};
|
|
||||||
|
|
||||||
for (const [key, field] of Object.entries(fields)) {
|
for (const [key, field] of Object.entries(fields)) {
|
||||||
const typedField = field as TextStep;
|
const typedField = field as TextStep;
|
||||||
@@ -439,285 +426,134 @@ class ClientListExtractor {
|
|||||||
// Convert fields to the format expected by the extraction logic
|
// Convert fields to the format expected by the extraction logic
|
||||||
const convertedFields = this.convertFields(fields);
|
const convertedFields = this.convertFields(fields);
|
||||||
|
|
||||||
// Get all container elements matching the list selector
|
// Step 1: Get all container elements matching the list selector
|
||||||
let containers = this.queryElementAll(iframeDocument, listSelector);
|
const containers = this.queryElementAll(iframeDocument, listSelector);
|
||||||
|
|
||||||
if (containers.length === 0) {
|
if (containers.length === 0) {
|
||||||
console.warn("No containers found for listSelector:", listSelector);
|
console.warn("❌ No containers found for listSelector:", listSelector);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Enhanced container discovery: find similar elements if we need more containers
|
// Step 2: Extract data from each container up to the limit
|
||||||
if (limit > 1 && containers.length === 1) {
|
const extractedData: ExtractedListData[] = [];
|
||||||
const baseContainer = containers[0];
|
const containersToProcess = Math.min(containers.length, limit);
|
||||||
const similarContainers = this.findSimilarElements(
|
|
||||||
baseContainer,
|
|
||||||
iframeDocument,
|
|
||||||
0.7
|
|
||||||
);
|
|
||||||
|
|
||||||
if (similarContainers.length > 0) {
|
for (
|
||||||
const newContainers = similarContainers.filter(
|
let containerIndex = 0;
|
||||||
(container) => !container.matches(listSelector)
|
containerIndex < containersToProcess;
|
||||||
);
|
containerIndex++
|
||||||
containers = [...containers, ...newContainers];
|
) {
|
||||||
}
|
const container = containers[containerIndex];
|
||||||
}
|
const record: ExtractedListData = {};
|
||||||
|
|
||||||
// Analyze fields for table vs non-table context
|
// Step 3: For each field, extract data from the current container
|
||||||
const containerFields: ContainerFields[] = containers.map(() => ({
|
for (const [label, { selector, attribute }] of Object.entries(
|
||||||
tableFields: {},
|
convertedFields
|
||||||
nonTableFields: {},
|
)) {
|
||||||
}));
|
let element: Element | null = null;
|
||||||
|
|
||||||
containers.forEach((container, containerIndex) => {
|
// CORRECT APPROACH: Create indexed absolute XPath
|
||||||
for (const [label, field] of Object.entries(convertedFields)) {
|
if (selector.startsWith("//")) {
|
||||||
const sampleElement = this.queryElement(container, field.selector);
|
// Convert the absolute selector to target the specific container instance
|
||||||
|
const indexedSelector = this.createIndexedXPath(
|
||||||
|
selector,
|
||||||
|
listSelector,
|
||||||
|
containerIndex + 1
|
||||||
|
);
|
||||||
|
|
||||||
if (sampleElement) {
|
element = this.evaluateXPathSingle(iframeDocument, indexedSelector);
|
||||||
const ancestor = this.findTableAncestor(sampleElement);
|
|
||||||
if (ancestor) {
|
|
||||||
containerFields[containerIndex].tableFields[label] = {
|
|
||||||
...field,
|
|
||||||
tableContext: ancestor.type,
|
|
||||||
cellIndex:
|
|
||||||
ancestor.type === "TD"
|
|
||||||
? this.getCellIndex(ancestor.element)
|
|
||||||
: -1,
|
|
||||||
};
|
|
||||||
} else {
|
|
||||||
containerFields[containerIndex].nonTableFields[label] = field;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
containerFields[containerIndex].nonTableFields[label] = field;
|
// Fallback for non-XPath selectors
|
||||||
|
element = this.queryElement(container, selector);
|
||||||
}
|
}
|
||||||
}
|
|
||||||
});
|
|
||||||
|
|
||||||
// Extract table data
|
// Step 4: Extract the value from the found element
|
||||||
const tableData: ExtractedListData[] = [];
|
if (element) {
|
||||||
for (
|
const value = this.extractValue(element, attribute);
|
||||||
let containerIndex = 0;
|
if (value !== null && value !== "") {
|
||||||
containerIndex < containers.length;
|
record[label] = value;
|
||||||
containerIndex++
|
|
||||||
) {
|
|
||||||
const container = containers[containerIndex];
|
|
||||||
const { tableFields } = containerFields[containerIndex];
|
|
||||||
|
|
||||||
if (Object.keys(tableFields).length > 0) {
|
|
||||||
const firstField = Object.values(tableFields)[0];
|
|
||||||
const firstElement = this.queryElement(
|
|
||||||
container,
|
|
||||||
firstField.selector
|
|
||||||
);
|
|
||||||
let tableContext: Element | null = firstElement;
|
|
||||||
|
|
||||||
// Find the table context
|
|
||||||
while (
|
|
||||||
tableContext &&
|
|
||||||
tableContext.tagName !== "TABLE" &&
|
|
||||||
tableContext !== container
|
|
||||||
) {
|
|
||||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
|
||||||
tableContext = (tableContext.getRootNode() as ShadowRoot).host;
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (
|
|
||||||
tableContext.tagName === "IFRAME" ||
|
|
||||||
tableContext.tagName === "FRAME"
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
const frameElement = tableContext as
|
|
||||||
| HTMLIFrameElement
|
|
||||||
| HTMLFrameElement;
|
|
||||||
tableContext = frameElement.contentDocument?.body || null;
|
|
||||||
} catch (e) {
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
} else {
|
} else {
|
||||||
tableContext = tableContext.parentElement;
|
console.warn(` ⚠️ Empty value for "${label}"`);
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (tableContext) {
|
|
||||||
const rows: Element[] = [];
|
|
||||||
rows.push(...Array.from(tableContext.getElementsByTagName("TR")));
|
|
||||||
|
|
||||||
if (
|
|
||||||
tableContext.tagName === "IFRAME" ||
|
|
||||||
tableContext.tagName === "FRAME"
|
|
||||||
) {
|
|
||||||
try {
|
|
||||||
const frameElement = tableContext as
|
|
||||||
| HTMLIFrameElement
|
|
||||||
| HTMLFrameElement;
|
|
||||||
const frameDoc =
|
|
||||||
frameElement.contentDocument ||
|
|
||||||
frameElement.contentWindow?.document;
|
|
||||||
if (frameDoc) {
|
|
||||||
rows.push(...Array.from(frameDoc.getElementsByTagName("TR")));
|
|
||||||
}
|
|
||||||
} catch (e) {
|
|
||||||
console.warn(
|
|
||||||
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
|
|
||||||
e
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
const processedRows = this.filterRowsBasedOnTag(rows, tableFields);
|
|
||||||
|
|
||||||
for (
|
|
||||||
let rowIndex = 0;
|
|
||||||
rowIndex < Math.min(processedRows.length, limit);
|
|
||||||
rowIndex++
|
|
||||||
) {
|
|
||||||
const record: ExtractedListData = {};
|
|
||||||
const currentRow = processedRows[rowIndex];
|
|
||||||
|
|
||||||
for (const [
|
|
||||||
label,
|
|
||||||
{ selector, attribute, cellIndex },
|
|
||||||
] of Object.entries(tableFields)) {
|
|
||||||
let element: Element | null = null;
|
|
||||||
|
|
||||||
if (cellIndex !== undefined && cellIndex >= 0) {
|
|
||||||
let td: Element | null =
|
|
||||||
currentRow.children[cellIndex] || null;
|
|
||||||
|
|
||||||
if (!td && currentRow.shadowRoot) {
|
|
||||||
const shadowCells = currentRow.shadowRoot.children;
|
|
||||||
if (shadowCells && shadowCells.length > cellIndex) {
|
|
||||||
td = shadowCells[cellIndex];
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (td) {
|
|
||||||
element = this.queryElement(td, selector);
|
|
||||||
|
|
||||||
if (
|
|
||||||
!element &&
|
|
||||||
selector
|
|
||||||
.split(/(?:>>|:>>)/)
|
|
||||||
.pop()
|
|
||||||
?.includes("td:nth-child")
|
|
||||||
) {
|
|
||||||
element = td;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!element) {
|
|
||||||
const tagOnlySelector = selector.split(".")[0];
|
|
||||||
element = this.queryElement(td, tagOnlySelector);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!element) {
|
|
||||||
let currentElement: Element | null = td;
|
|
||||||
while (
|
|
||||||
currentElement &&
|
|
||||||
currentElement.children.length > 0
|
|
||||||
) {
|
|
||||||
let foundContentChild = false;
|
|
||||||
for (const child of Array.from(
|
|
||||||
currentElement.children
|
|
||||||
)) {
|
|
||||||
if (this.extractValue(child, attribute)) {
|
|
||||||
currentElement = child;
|
|
||||||
foundContentChild = true;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
if (!foundContentChild) break;
|
|
||||||
}
|
|
||||||
element = currentElement;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
element = this.queryElement(currentRow, selector);
|
|
||||||
}
|
|
||||||
|
|
||||||
if (element) {
|
|
||||||
const value = this.extractValue(element, attribute);
|
|
||||||
if (value !== null && value !== "") {
|
|
||||||
record[label] = value;
|
|
||||||
} else {
|
|
||||||
console.warn(
|
|
||||||
`❌ No value for ${label} in row ${rowIndex + 1}`
|
|
||||||
);
|
|
||||||
record[label] = "";
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
console.warn(
|
|
||||||
`❌ Element not found for ${label} with selector:`,
|
|
||||||
selector
|
|
||||||
);
|
|
||||||
record[label] = "";
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (Object.values(record).some((value) => value !== "")) {
|
|
||||||
tableData.push(record);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Extract non-table data
|
|
||||||
const nonTableData: ExtractedListData[] = [];
|
|
||||||
for (
|
|
||||||
let containerIndex = 0;
|
|
||||||
containerIndex < containers.length;
|
|
||||||
containerIndex++
|
|
||||||
) {
|
|
||||||
if (nonTableData.length >= limit) break;
|
|
||||||
|
|
||||||
const container = containers[containerIndex];
|
|
||||||
const { nonTableFields } = containerFields[containerIndex];
|
|
||||||
|
|
||||||
if (Object.keys(nonTableFields).length > 0) {
|
|
||||||
const record: ExtractedListData = {};
|
|
||||||
|
|
||||||
for (const [label, { selector, attribute }] of Object.entries(
|
|
||||||
nonTableFields
|
|
||||||
)) {
|
|
||||||
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
|
||||||
const element = this.queryElement(container, relativeSelector);
|
|
||||||
|
|
||||||
if (element) {
|
|
||||||
const value = this.extractValue(element, attribute);
|
|
||||||
if (value !== null && value !== "") {
|
|
||||||
record[label] = value;
|
|
||||||
} else {
|
|
||||||
console.warn(
|
|
||||||
`❌ No value for ${label} in container ${containerIndex + 1}`
|
|
||||||
);
|
|
||||||
record[label] = "";
|
|
||||||
}
|
|
||||||
} else {
|
|
||||||
console.warn(
|
|
||||||
`❌ Element not found for ${label} with selector:`,
|
|
||||||
selector
|
|
||||||
);
|
|
||||||
record[label] = "";
|
record[label] = "";
|
||||||
}
|
}
|
||||||
}
|
} else {
|
||||||
|
console.warn(` ❌ Element not found for "${label}"`);
|
||||||
if (Object.values(record).some((value) => value !== "")) {
|
record[label] = "";
|
||||||
nonTableData.push(record);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
|
||||||
|
|
||||||
// Combine and limit results
|
// Step 5: Add record if it has any non-empty values
|
||||||
const extractedData = [...tableData, ...nonTableData].slice(0, limit);
|
if (Object.values(record).some((value) => value !== "")) {
|
||||||
|
extractedData.push(record);
|
||||||
|
} else {
|
||||||
|
console.warn(
|
||||||
|
` ⚠️ Skipping empty record for container ${containerIndex + 1}`
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
return extractedData;
|
return extractedData;
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.error("Error in client-side extractListData:", error);
|
console.error("💥 Error in client-side extractListData:", error);
|
||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Create indexed XPath for specific container instance
|
||||||
|
/**
 * Rewrites a field's XPath so that it targets one specific list container.
 *
 * The first occurrence of the list selector inside `childSelector` is
 * replaced by `(<listSelector>)[<containerIndex>]`. When the list selector
 * does not occur in `childSelector`, the indexed list selector is prefixed to
 * `childSelector` instead, with the child's first `//` reduced to `/`.
 *
 * @param childSelector - XPath of the field.
 * @param listSelector - XPath that selects all list containers.
 * @param containerIndex - Position of the container to target; XPath
 *   positions are 1-based.
 * @returns The XPath scoped to the requested container.
 */
private createIndexedXPath(
  childSelector: string,
  listSelector: string,
  containerIndex: number
): string {
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;

  // Look for the `//`-prefixed form first (identical to listSelector when it
  // already starts with `//`), then for the list selector verbatim. Searching
  // for exactly the text that gets replaced guarantees that a successful match
  // is always indexed; a looser guard could otherwise return the selector
  // untouched, making every container resolve to the same first node.
  const prefixedListSelector = `//${listSelector.replace("//", "")}`;
  for (const needle of [prefixedListSelector, listSelector]) {
    if (needle === "") continue;
    const start = childSelector.indexOf(needle);
    if (start !== -1) {
      // Splice by position rather than String.replace so that `$` sequences
      // in the selector are not interpreted as replacement patterns.
      return (
        childSelector.slice(0, start) +
        indexedListSelector +
        childSelector.slice(start + needle.length)
      );
    }
  }

  // The field selector is not expressed in terms of the list selector.
  console.warn(` ⚠️ Pattern doesn't match, using fallback approach`);
  return `${indexedListSelector}${childSelector.replace("//", "/")}`;
}
|
||||||
|
|
||||||
|
// Helper method for single XPath evaluation
|
||||||
|
/**
 * Evaluates an XPath expression against a whole document and returns the
 * first matching element in document order.
 *
 * @param document - Document that both evaluates the expression and serves as
 *   its context node.
 * @param xpath - XPath expression to evaluate.
 * @returns The first matching element, or `null` when nothing matches, when
 *   the first match is not an element node, or when evaluation throws. A
 *   warning is logged for the first two cases and an error for the last.
 */
private evaluateXPathSingle = (
  document: Document,
  xpath: string
): Element | null => {
  try {
    const node = document.evaluate(
      xpath,
      document,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    ).singleNodeValue;

    // Expressions ending in `text()` or `@attr` select non-element nodes;
    // returning those typed as Element would break callers that use Element
    // methods, so they are reported as "no element" as well.
    if (!node || node.nodeType !== Node.ELEMENT_NODE) {
      console.warn(`❌ XPath found no element for: ${xpath}`);
      return null;
    }

    return node as Element;
  } catch (error) {
    console.error("❌ XPath evaluation failed:", xpath, error);
    return null;
  }
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export const clientListExtractor = new ClientListExtractor();
|
export const clientListExtractor = new ClientListExtractor();
|
||||||
|
|||||||
Reference in New Issue
Block a user