feat: shadow scrape list data preview
This commit is contained in:
@@ -6,7 +6,7 @@ interface TextStep {
|
|||||||
selectorObj: {
|
selectorObj: {
|
||||||
selector: string;
|
selector: string;
|
||||||
tag?: string;
|
tag?: string;
|
||||||
shadow?: boolean;
|
isShadow?: boolean;
|
||||||
attribute: string;
|
attribute: string;
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
@@ -18,6 +18,8 @@ interface ExtractedListData {
|
|||||||
interface Field {
|
interface Field {
|
||||||
selector: string;
|
selector: string;
|
||||||
attribute: string;
|
attribute: string;
|
||||||
|
tag?: string;
|
||||||
|
isShadow?: boolean;
|
||||||
}
|
}
|
||||||
|
|
||||||
class ClientListExtractor {
|
class ClientListExtractor {
|
||||||
@@ -156,50 +158,6 @@ class ClientListExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (
|
|
||||||
!nextElement &&
|
|
||||||
"shadowRoot" in currentElement &&
|
|
||||||
(currentElement as Element).shadowRoot
|
|
||||||
) {
|
|
||||||
if (
|
|
||||||
parts[i].startsWith("//") ||
|
|
||||||
parts[i].startsWith("/") ||
|
|
||||||
parts[i].startsWith("./")
|
|
||||||
) {
|
|
||||||
nextElement = this.evaluateXPath(
|
|
||||||
(currentElement as Element).shadowRoot as unknown as Document,
|
|
||||||
parts[i]
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
nextElement = (currentElement as Element).shadowRoot!.querySelector(
|
|
||||||
parts[i]
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if (!nextElement && "children" in currentElement) {
|
|
||||||
const children: any = Array.from(
|
|
||||||
(currentElement as Element).children || []
|
|
||||||
);
|
|
||||||
for (const child of children) {
|
|
||||||
if (child.shadowRoot) {
|
|
||||||
if (
|
|
||||||
parts[i].startsWith("//") ||
|
|
||||||
parts[i].startsWith("/") ||
|
|
||||||
parts[i].startsWith("./")
|
|
||||||
) {
|
|
||||||
nextElement = this.evaluateXPath(
|
|
||||||
child.shadowRoot as unknown as Document,
|
|
||||||
parts[i]
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
nextElement = child.shadowRoot.querySelector(parts[i]);
|
|
||||||
}
|
|
||||||
if (nextElement) break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
currentElement = nextElement;
|
currentElement = nextElement;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -265,43 +223,6 @@ class ClientListExtractor {
|
|||||||
nextElements.push(...Array.from(element.querySelectorAll(part)));
|
nextElements.push(...Array.from(element.querySelectorAll(part)));
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if ("shadowRoot" in element && (element as Element).shadowRoot) {
|
|
||||||
if (part.startsWith("//") || part.startsWith("/")) {
|
|
||||||
nextElements.push(
|
|
||||||
...this.evaluateXPathAll(
|
|
||||||
(element as Element).shadowRoot as unknown as Document,
|
|
||||||
part
|
|
||||||
)
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
nextElements.push(
|
|
||||||
...Array.from(
|
|
||||||
(element as Element).shadowRoot!.querySelectorAll(part)
|
|
||||||
)
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
if ("children" in element) {
|
|
||||||
const children = Array.from((element as Element).children || []);
|
|
||||||
for (const child of children) {
|
|
||||||
if (child.shadowRoot) {
|
|
||||||
if (part.startsWith("//") || part.startsWith("/")) {
|
|
||||||
nextElements.push(
|
|
||||||
...this.evaluateXPathAll(
|
|
||||||
child.shadowRoot as unknown as Document,
|
|
||||||
part
|
|
||||||
)
|
|
||||||
);
|
|
||||||
} else {
|
|
||||||
nextElements.push(
|
|
||||||
...Array.from(child.shadowRoot.querySelectorAll(part))
|
|
||||||
);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -328,14 +249,11 @@ class ClientListExtractor {
|
|||||||
}
|
}
|
||||||
|
|
||||||
if (attribute === "innerText") {
|
if (attribute === "innerText") {
|
||||||
// First try standard innerText/textContent
|
|
||||||
let textContent =
|
let textContent =
|
||||||
(element as HTMLElement).innerText?.trim() ||
|
(element as HTMLElement).innerText?.trim() ||
|
||||||
(element as HTMLElement).textContent?.trim();
|
(element as HTMLElement).textContent?.trim();
|
||||||
|
|
||||||
// If empty, check for common data attributes that might contain the text
|
|
||||||
if (!textContent) {
|
if (!textContent) {
|
||||||
// Check for data-* attributes that commonly contain text values
|
|
||||||
const dataAttributes = [
|
const dataAttributes = [
|
||||||
"data-600",
|
"data-600",
|
||||||
"data-text",
|
"data-text",
|
||||||
@@ -356,10 +274,8 @@ class ClientListExtractor {
|
|||||||
} else if (attribute === "innerHTML") {
|
} else if (attribute === "innerHTML") {
|
||||||
return element.innerHTML?.trim() || null;
|
return element.innerHTML?.trim() || null;
|
||||||
} else if (attribute === "href") {
|
} else if (attribute === "href") {
|
||||||
// For href, we need to find the anchor tag if the current element isn't one
|
|
||||||
let anchorElement = element;
|
let anchorElement = element;
|
||||||
|
|
||||||
// If current element is not an anchor, look for parent anchor
|
|
||||||
if (element.tagName !== "A") {
|
if (element.tagName !== "A") {
|
||||||
anchorElement =
|
anchorElement =
|
||||||
element.closest("a") ||
|
element.closest("a") ||
|
||||||
@@ -410,6 +326,7 @@ class ClientListExtractor {
|
|||||||
convertedFields[typedField.label] = {
|
convertedFields[typedField.label] = {
|
||||||
selector: typedField.selectorObj.selector,
|
selector: typedField.selectorObj.selector,
|
||||||
attribute: typedField.selectorObj.attribute,
|
attribute: typedField.selectorObj.attribute,
|
||||||
|
isShadow: typedField.selectorObj.isShadow || false,
|
||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -423,10 +340,8 @@ class ClientListExtractor {
|
|||||||
limit: number = 5
|
limit: number = 5
|
||||||
): ExtractedListData[] => {
|
): ExtractedListData[] => {
|
||||||
try {
|
try {
|
||||||
// Convert fields to the format expected by the extraction logic
|
|
||||||
const convertedFields = this.convertFields(fields);
|
const convertedFields = this.convertFields(fields);
|
||||||
|
|
||||||
// Step 1: Get all container elements matching the list selector
|
|
||||||
const containers = this.queryElementAll(iframeDocument, listSelector);
|
const containers = this.queryElementAll(iframeDocument, listSelector);
|
||||||
|
|
||||||
if (containers.length === 0) {
|
if (containers.length === 0) {
|
||||||
@@ -434,7 +349,6 @@ class ClientListExtractor {
|
|||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 2: Extract data from each container up to the limit
|
|
||||||
const extractedData: ExtractedListData[] = [];
|
const extractedData: ExtractedListData[] = [];
|
||||||
const containersToProcess = Math.min(containers.length, limit);
|
const containersToProcess = Math.min(containers.length, limit);
|
||||||
|
|
||||||
@@ -446,28 +360,27 @@ class ClientListExtractor {
|
|||||||
const container = containers[containerIndex];
|
const container = containers[containerIndex];
|
||||||
const record: ExtractedListData = {};
|
const record: ExtractedListData = {};
|
||||||
|
|
||||||
// Step 3: For each field, extract data from the current container
|
for (const [label, { selector, attribute, isShadow }] of Object.entries(
|
||||||
for (const [label, { selector, attribute }] of Object.entries(
|
|
||||||
convertedFields
|
convertedFields
|
||||||
)) {
|
)) {
|
||||||
let element: Element | null = null;
|
let element: Element | null = null;
|
||||||
|
|
||||||
// CORRECT APPROACH: Create indexed absolute XPath
|
|
||||||
if (selector.startsWith("//")) {
|
if (selector.startsWith("//")) {
|
||||||
// Convert the absolute selector to target the specific container instance
|
|
||||||
const indexedSelector = this.createIndexedXPath(
|
const indexedSelector = this.createIndexedXPath(
|
||||||
selector,
|
selector,
|
||||||
listSelector,
|
listSelector,
|
||||||
containerIndex + 1
|
containerIndex + 1
|
||||||
);
|
);
|
||||||
|
|
||||||
element = this.evaluateXPathSingle(iframeDocument, indexedSelector);
|
element = this.evaluateXPathSingle(
|
||||||
|
iframeDocument,
|
||||||
|
indexedSelector,
|
||||||
|
isShadow
|
||||||
|
);
|
||||||
} else {
|
} else {
|
||||||
// Fallback for non-XPath selectors
|
|
||||||
element = this.queryElement(container, selector);
|
element = this.queryElement(container, selector);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 4: Extract the value from the found element
|
|
||||||
if (element) {
|
if (element) {
|
||||||
const value = this.extractValue(element, attribute);
|
const value = this.extractValue(element, attribute);
|
||||||
if (value !== null && value !== "") {
|
if (value !== null && value !== "") {
|
||||||
@@ -482,7 +395,6 @@ class ClientListExtractor {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Step 5: Add record if it has any non-empty values
|
|
||||||
if (Object.values(record).some((value) => value !== "")) {
|
if (Object.values(record).some((value) => value !== "")) {
|
||||||
extractedData.push(record);
|
extractedData.push(record);
|
||||||
} else {
|
} else {
|
||||||
@@ -499,15 +411,12 @@ class ClientListExtractor {
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
// Create indexed XPath for specific container instance
|
|
||||||
private createIndexedXPath(
|
private createIndexedXPath(
|
||||||
childSelector: string,
|
childSelector: string,
|
||||||
listSelector: string,
|
listSelector: string,
|
||||||
containerIndex: number
|
containerIndex: number
|
||||||
): string {
|
): string {
|
||||||
// Check if the child selector contains the list selector pattern
|
|
||||||
if (childSelector.includes(listSelector.replace("//", ""))) {
|
if (childSelector.includes(listSelector.replace("//", ""))) {
|
||||||
// Replace the list selector part with indexed version
|
|
||||||
const listPattern = listSelector.replace("//", "");
|
const listPattern = listSelector.replace("//", "");
|
||||||
const indexedListSelector = `(${listSelector})[${containerIndex}]`;
|
const indexedListSelector = `(${listSelector})[${containerIndex}]`;
|
||||||
|
|
||||||
@@ -518,8 +427,6 @@ class ClientListExtractor {
|
|||||||
|
|
||||||
return indexedSelector;
|
return indexedSelector;
|
||||||
} else {
|
} else {
|
||||||
// If pattern doesn't match, create a more generic indexed selector
|
|
||||||
// This is a fallback approach
|
|
||||||
console.warn(` ⚠️ Pattern doesn't match, using fallback approach`);
|
console.warn(` ⚠️ Pattern doesn't match, using fallback approach`);
|
||||||
return `(${listSelector})[${containerIndex}]${childSelector.replace(
|
return `(${listSelector})[${containerIndex}]${childSelector.replace(
|
||||||
"//",
|
"//",
|
||||||
@@ -531,7 +438,8 @@ class ClientListExtractor {
|
|||||||
// Helper method for single XPath evaluation
|
// Helper method for single XPath evaluation
|
||||||
private evaluateXPathSingle = (
|
private evaluateXPathSingle = (
|
||||||
document: Document,
|
document: Document,
|
||||||
xpath: string
|
xpath: string,
|
||||||
|
isShadow: boolean = false
|
||||||
): Element | null => {
|
): Element | null => {
|
||||||
try {
|
try {
|
||||||
const result = document.evaluate(
|
const result = document.evaluate(
|
||||||
@@ -540,20 +448,228 @@ class ClientListExtractor {
|
|||||||
null,
|
null,
|
||||||
XPathResult.FIRST_ORDERED_NODE_TYPE,
|
XPathResult.FIRST_ORDERED_NODE_TYPE,
|
||||||
null
|
null
|
||||||
);
|
).singleNodeValue as Element | null;
|
||||||
|
|
||||||
const element = result.singleNodeValue as Element | null;
|
if (!isShadow) {
|
||||||
|
if (result === null) {
|
||||||
if (!element) {
|
return null;
|
||||||
console.warn(`❌ XPath found no element for: ${xpath}`);
|
}
|
||||||
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
return element;
|
let cleanPath = xpath;
|
||||||
} catch (error) {
|
let isIndexed = false;
|
||||||
console.error("❌ XPath evaluation failed:", xpath, error);
|
|
||||||
|
const indexedMatch = xpath.match(/^\((.*?)\)\[(\d+)\](.*)$/);
|
||||||
|
if (indexedMatch) {
|
||||||
|
cleanPath = indexedMatch[1] + indexedMatch[3];
|
||||||
|
isIndexed = true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const pathParts = cleanPath
|
||||||
|
.replace(/^\/\//, "")
|
||||||
|
.split("/")
|
||||||
|
.map((p) => p.trim())
|
||||||
|
.filter((p) => p.length > 0);
|
||||||
|
|
||||||
|
let currentContexts: (Document | Element | ShadowRoot)[] = [document];
|
||||||
|
|
||||||
|
for (let i = 0; i < pathParts.length; i++) {
|
||||||
|
const part = pathParts[i];
|
||||||
|
const nextContexts: (Element | ShadowRoot)[] = [];
|
||||||
|
|
||||||
|
for (const ctx of currentContexts) {
|
||||||
|
const positionalMatch = part.match(/^([^[]+)\[(\d+)\]$/);
|
||||||
|
let partWithoutPosition = part;
|
||||||
|
let requestedPosition: number | null = null;
|
||||||
|
|
||||||
|
if (positionalMatch) {
|
||||||
|
partWithoutPosition = positionalMatch[1];
|
||||||
|
requestedPosition = parseInt(positionalMatch[2]);
|
||||||
|
}
|
||||||
|
|
||||||
|
const matched = this.queryInsideContext(ctx, partWithoutPosition);
|
||||||
|
|
||||||
|
let elementsToAdd = matched;
|
||||||
|
if (requestedPosition !== null) {
|
||||||
|
const index = requestedPosition - 1; // XPath is 1-based, arrays are 0-based
|
||||||
|
if (index >= 0 && index < matched.length) {
|
||||||
|
elementsToAdd = [matched[index]];
|
||||||
|
} else {
|
||||||
|
console.warn(
|
||||||
|
` ⚠️ Position ${requestedPosition} out of range (${matched.length} elements found)`
|
||||||
|
);
|
||||||
|
elementsToAdd = [];
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
elementsToAdd.forEach((el) => {
|
||||||
|
nextContexts.push(el);
|
||||||
|
if (el.shadowRoot) {
|
||||||
|
nextContexts.push(el.shadowRoot);
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (nextContexts.length === 0) {
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
currentContexts = nextContexts;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (currentContexts.length > 0) {
|
||||||
|
if (isIndexed && indexedMatch) {
|
||||||
|
const requestedIndex = parseInt(indexedMatch[2]) - 1; // XPath is 1-based, array is 0-based
|
||||||
|
if (requestedIndex >= 0 && requestedIndex < currentContexts.length) {
|
||||||
|
return currentContexts[requestedIndex] as Element;
|
||||||
|
} else {
|
||||||
|
console.warn(
|
||||||
|
`⚠️ Requested index ${requestedIndex + 1} out of range (${
|
||||||
|
currentContexts.length
|
||||||
|
} elements found)`
|
||||||
|
);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return currentContexts[0] as Element;
|
||||||
|
}
|
||||||
|
|
||||||
|
return null;
|
||||||
|
} catch (err) {
|
||||||
|
console.error("💥 Critical XPath failure:", xpath, err);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Returns the elements under `context` that satisfy one XPath step.
// The step is split into a tag name and its bracketed predicates; every
// element matching the tag name via querySelectorAll is then checked
// against the predicates. Any error is logged and yields an empty list.
private queryInsideContext = (
  context: Document | Element | ShadowRoot,
  part: string
): Element[] => {
  try {
    const step = this.parseXPathPart(part);
    const selected: Element[] = [];

    for (const candidate of Array.from(
      context.querySelectorAll(step.tagName)
    )) {
      if (this.elementMatchesConditions(candidate, step.conditions)) {
        selected.push(candidate);
      }
    }

    return selected;
  } catch (err) {
    console.error("Error in queryInsideContext:", err);
    return [];
  }
};
|
||||||
|
|
||||||
|
// Splits one XPath step into its tag name and predicate bodies.
//   tagName    - the leading run of letters, digits and hyphens, or "*"
//                when the step does not start with such a run.
//   conditions - the text inside each [...] group, brackets removed,
//                in the order they appear.
private parseXPathPart = (
  part: string
): { tagName: string; conditions: string[] } => {
  const leadingName = /^([a-zA-Z0-9-]+)/.exec(part);
  const bracketGroups = part.match(/\[([^\]]+)\]/g) ?? [];

  return {
    tagName: leadingName === null ? "*" : leadingName[1],
    conditions: bracketGroups.map((group) => group.slice(1, -1)),
  };
};
|
||||||
|
|
||||||
|
// True when the element satisfies every condition in the list.
// Evaluation stops at the first failing condition; an empty list matches.
private elementMatchesConditions = (
  element: Element,
  conditions: string[]
): boolean =>
  conditions.every((condition) =>
    this.elementMatchesCondition(element, condition)
  );
|
||||||
|
|
||||||
|
// Tests a single XPath predicate (the text between [ and ]) against
// `element`. Rules are tried in order and the first one whose pattern
// fits decides the result:
//   N                         - always matches (no filtering is done here)
//   @attr="value"             - attribute equals value
//   contains(@class, 'v')     - class list contains the token v
//   contains(@attr, 'v')      - attribute value includes v
//   text()="v"                - trimmed textContent equals v
//   contains(text(), 'v')     - trimmed textContent includes v
//   count(*)=N                - element has exactly N child elements
// A predicate that fits none of these patterns is treated as matching.
private elementMatchesCondition = (
  element: Element,
  condition: string
): boolean => {
  const predicate = condition.trim();
  const trimmedText = (): string => element.textContent?.trim() || "";

  // Purely numeric predicate, e.g. [2].
  if (/^\d+$/.test(predicate)) {
    return true;
  }

  // @attribute="value"
  const exactAttr = /^@([^=]+)=["']([^"']+)["']$/.exec(predicate);
  if (exactAttr !== null) {
    return element.getAttribute(exactAttr[1]) === exactAttr[2];
  }

  // contains(@class, 'value') - must be tried before the generic attribute
  // form below, which would otherwise also accept it.
  // NOTE(review): this is a class-token test via classList, not a substring
  // test on the class attribute - confirm that is the intended semantics.
  const classToken = /^contains\(@class,\s*["']([^"']+)["']\)$/.exec(predicate);
  if (classToken !== null) {
    return element.classList.contains(classToken[1]);
  }

  // contains(@attribute, 'value')
  const attrSubstring = /^contains\(@([^,]+),\s*["']([^"']+)["']\)$/.exec(
    predicate
  );
  if (attrSubstring !== null) {
    const attributeValue = element.getAttribute(attrSubstring[1]) || "";
    return attributeValue.includes(attrSubstring[2]);
  }

  // text()="value"
  const exactText = /^text\(\)=["']([^"']+)["']$/.exec(predicate);
  if (exactText !== null) {
    return trimmedText() === exactText[1];
  }

  // contains(text(), 'value')
  const textSubstring = /^contains\(text\(\),\s*["']([^"']+)["']\)$/.exec(
    predicate
  );
  if (textSubstring !== null) {
    return trimmedText().includes(textSubstring[1]);
  }

  // count(*)=N, including count(*)=0 for an element with no child elements.
  const childCount = /^count\(\*\)=(\d+)$/.exec(predicate);
  if (childCount !== null) {
    return element.children.length === Number.parseInt(childCount[1], 10);
  }

  // Unrecognised predicate.
  return true;
};
|
||||||
}
|
}
|
||||||
|
|
||||||
export const clientListExtractor = new ClientListExtractor();
|
export const clientListExtractor = new ClientListExtractor();
|
||||||
|
|||||||
Reference in New Issue
Block a user