feat: xpath support for core extraction
This commit is contained in:
@@ -423,44 +423,149 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
||||
*/
|
||||
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
||||
// Enhanced query function to handle iframe, frame and shadow DOM
|
||||
const queryElement = (rootElement, selector) => {
|
||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
||||
return rootElement.querySelector(selector);
|
||||
// XPath evaluation functions
|
||||
/**
 * Evaluate an XPath expression and return the first matching node.
 *
 * @param {Node} rootElement - Context node for the expression. When it is a
 *   document it also acts as the evaluator; otherwise its `ownerDocument` does.
 * @param {string} xpath - XPath expression to evaluate.
 * @returns {Node|null} The `singleNodeValue` of a FIRST_ORDERED_NODE_TYPE
 *   result, or null when there is no owner document or evaluation throws.
 */
const evaluateXPath = (rootElement, xpath) => {
  try {
    // `evaluate` lives on the document, so resolve the document that owns the context node.
    const ownerDoc =
      rootElement.nodeType === Node.DOCUMENT_NODE
        ? rootElement
        : rootElement.ownerDocument;

    if (!ownerDoc) return null;

    const result = ownerDoc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );

    return result.singleNodeValue;
  } catch (error) {
    // Best effort: a malformed expression or unusable context node is logged, not thrown.
    console.warn("XPath evaluation failed:", xpath, error);
    return null;
  }
};
|
||||
|
||||
/**
 * Evaluate an XPath expression and return every matching element.
 *
 * @param {Node} rootElement - Context node for the expression. When it is a
 *   document it also acts as the evaluator; otherwise its `ownerDocument` does.
 * @param {string} xpath - XPath expression to evaluate.
 * @returns {Element[]} Matching element nodes in snapshot order; non-element
 *   nodes are dropped. Empty when there is no owner document or evaluation throws.
 */
const evaluateXPathAll = (rootElement, xpath) => {
  try {
    // `evaluate` lives on the document, so resolve the document that owns the context node.
    const ownerDoc =
      rootElement.nodeType === Node.DOCUMENT_NODE
        ? rootElement
        : rootElement.ownerDocument;

    if (!ownerDoc) return [];

    const result = ownerDoc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    const elements = [];
    for (let i = 0; i < result.snapshotLength; i++) {
      const node = result.snapshotItem(i);
      // An XPath can select text or attribute nodes; callers only want elements.
      if (node && node.nodeType === Node.ELEMENT_NODE) {
        elements.push(node);
      }
    }

    return elements;
  } catch (error) {
    // Best effort: a malformed expression or unusable context node is logged, not thrown.
    console.warn("XPath evaluation failed:", xpath, error);
    return [];
  }
};
|
||||
|
||||
// Helper function to detect selector type
|
||||
/**
 * Decide whether a selector string should be treated as XPath rather than CSS.
 *
 * A selector counts as XPath when it starts with `/` (which also covers `//`),
 * with `./` (which also covers `.//`), or with `(`, as in a grouped expression
 * such as `(//li)[2]`. None of these prefixes can start a valid CSS selector.
 *
 * @param {string} selector - Selector to classify.
 * @returns {boolean} True for XPath-looking selectors; false otherwise,
 *   including for non-string input.
 */
const isXPathSelector = (selector) => {
  if (typeof selector !== "string") return false;
  return (
    selector.startsWith("/") ||
    selector.startsWith("./") ||
    selector.startsWith("(")
  );
};
|
||||
|
||||
// Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
|
||||
const queryElement = (rootElement, selector) => {
|
||||
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
||||
// Check if it's an XPath selector
|
||||
if (isXPathSelector(selector)) {
|
||||
return evaluateXPath(rootElement, selector);
|
||||
} else {
|
||||
return rootElement.querySelector(selector);
|
||||
}
|
||||
}
|
||||
|
||||
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
||||
let currentElement = rootElement;
|
||||
|
||||
for (let i = 0; i < parts.length; i++) {
|
||||
if (!currentElement) return null;
|
||||
|
||||
// Handle iframe and frame traversal
|
||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||
if (
|
||||
currentElement.tagName === "IFRAME" ||
|
||||
currentElement.tagName === "FRAME"
|
||||
) {
|
||||
try {
|
||||
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
||||
const frameDoc =
|
||||
currentElement.contentDocument ||
|
||||
currentElement.contentWindow.document;
|
||||
if (!frameDoc) return null;
|
||||
|
||||
if (isXPathSelector(parts[i])) {
|
||||
currentElement = evaluateXPath(frameDoc, parts[i]);
|
||||
} else {
|
||||
currentElement = frameDoc.querySelector(parts[i]);
|
||||
}
|
||||
continue;
|
||||
} catch (e) {
|
||||
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
||||
console.warn(
|
||||
`Cannot access ${currentElement.tagName.toLowerCase()} content:`,
|
||||
e
|
||||
);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
|
||||
let nextElement = null;
|
||||
|
||||
// Try regular DOM first
|
||||
let nextElement = currentElement.querySelector(parts[i]);
|
||||
if ("querySelector" in currentElement) {
|
||||
if (isXPathSelector(parts[i])) {
|
||||
nextElement = evaluateXPath(currentElement, parts[i]);
|
||||
} else {
|
||||
nextElement = currentElement.querySelector(parts[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Try shadow DOM if not found
|
||||
if (!nextElement && currentElement.shadowRoot) {
|
||||
if (
|
||||
!nextElement &&
|
||||
"shadowRoot" in currentElement &&
|
||||
currentElement.shadowRoot
|
||||
) {
|
||||
if (isXPathSelector(parts[i])) {
|
||||
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
|
||||
} else {
|
||||
nextElement = currentElement.shadowRoot.querySelector(parts[i]);
|
||||
}
|
||||
}
|
||||
|
||||
// Check children's shadow roots if still not found
|
||||
if (!nextElement) {
|
||||
if (!nextElement && "children" in currentElement) {
|
||||
const children = Array.from(currentElement.children || []);
|
||||
for (const child of children) {
|
||||
if (child.shadowRoot) {
|
||||
if (isXPathSelector(parts[i])) {
|
||||
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
|
||||
} else {
|
||||
nextElement = child.shadowRoot.querySelector(parts[i]);
|
||||
}
|
||||
if (nextElement) break;
|
||||
}
|
||||
}
|
||||
@@ -474,11 +579,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
// Enhanced query all function for both contexts
|
||||
const queryElementAll = (rootElement, selector) => {
|
||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
||||
return rootElement.querySelectorAll(selector);
|
||||
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
||||
if (isXPathSelector(selector)) {
|
||||
return evaluateXPathAll(rootElement, selector);
|
||||
} else {
|
||||
return Array.from(rootElement.querySelectorAll(selector));
|
||||
}
|
||||
}
|
||||
|
||||
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
||||
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
||||
let currentElements = [rootElement];
|
||||
|
||||
for (const part of parts) {
|
||||
@@ -486,30 +595,64 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
for (const element of currentElements) {
|
||||
// Handle iframe and frame traversal
|
||||
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
||||
if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
|
||||
try {
|
||||
const frameDoc = element.contentDocument || element.contentWindow.document;
|
||||
nextElements.push(...frameDoc.querySelectorAll(part));
|
||||
const frameDoc =
|
||||
element.contentDocument || element.contentWindow.document;
|
||||
if (frameDoc) {
|
||||
if (isXPathSelector(part)) {
|
||||
nextElements.push(...evaluateXPathAll(frameDoc, part));
|
||||
} else {
|
||||
nextElements.push(
|
||||
...Array.from(frameDoc.querySelectorAll(part))
|
||||
);
|
||||
}
|
||||
}
|
||||
} catch (e) {
|
||||
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
||||
console.warn(
|
||||
`Cannot access ${element.tagName.toLowerCase()} content:`,
|
||||
e
|
||||
);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
// Regular DOM elements
|
||||
if (element.querySelectorAll) {
|
||||
nextElements.push(...element.querySelectorAll(part));
|
||||
if (isXPathSelector(part)) {
|
||||
nextElements.push(...evaluateXPathAll(element, part));
|
||||
} else {
|
||||
nextElements.push(
|
||||
...Array.from(element.querySelectorAll(part))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Shadow DOM elements
|
||||
if (element.shadowRoot) {
|
||||
nextElements.push(...element.shadowRoot.querySelectorAll(part));
|
||||
if (isXPathSelector(part)) {
|
||||
nextElements.push(
|
||||
...evaluateXPathAll(element.shadowRoot, part)
|
||||
);
|
||||
} else {
|
||||
nextElements.push(
|
||||
...Array.from(element.shadowRoot.querySelectorAll(part))
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
// Check children's shadow roots
|
||||
const children = Array.from(element.children || []);
|
||||
for (const child of children) {
|
||||
if (child.shadowRoot) {
|
||||
nextElements.push(...child.shadowRoot.querySelectorAll(part));
|
||||
if (isXPathSelector(part)) {
|
||||
nextElements.push(
|
||||
...evaluateXPathAll(child.shadowRoot, part)
|
||||
);
|
||||
} else {
|
||||
nextElements.push(
|
||||
...Array.from(child.shadowRoot.querySelectorAll(part))
|
||||
);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
@@ -522,11 +665,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
};
|
||||
|
||||
// Enhanced value extraction with context awareness
|
||||
function extractValue(element, attribute) {
|
||||
const extractValue = (element, attribute) => {
|
||||
if (!element) return null;
|
||||
|
||||
// Get context-aware base URL
|
||||
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
|
||||
const baseURL =
|
||||
element.ownerDocument?.location?.href || window.location.origin;
|
||||
|
||||
// Check shadow root first
|
||||
if (element.shadowRoot) {
|
||||
@@ -536,15 +680,37 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
}
|
||||
}
|
||||
|
||||
if (attribute === 'innerText') {
|
||||
return element.innerText.trim();
|
||||
} else if (attribute === 'innerHTML') {
|
||||
return element.innerHTML.trim();
|
||||
} else if (attribute === 'src' || attribute === 'href') {
|
||||
if (attribute === 'href' && element.tagName !== 'A') {
|
||||
if (attribute === "innerText") {
|
||||
// First try standard innerText/textContent
|
||||
let textContent =
|
||||
element.innerText?.trim() || element.textContent?.trim();
|
||||
|
||||
// If empty, check for common data attributes that might contain the text
|
||||
if (!textContent) {
|
||||
const dataAttributes = [
|
||||
"data-600",
|
||||
"data-text",
|
||||
"data-label",
|
||||
"data-value",
|
||||
"data-content",
|
||||
];
|
||||
for (const attr of dataAttributes) {
|
||||
const dataValue = element.getAttribute(attr);
|
||||
if (dataValue && dataValue.trim()) {
|
||||
textContent = dataValue.trim();
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return textContent || null;
|
||||
} else if (attribute === "innerHTML") {
|
||||
return element.innerHTML?.trim() || null;
|
||||
} else if (attribute === "src" || attribute === "href") {
|
||||
if (attribute === "href" && element.tagName !== "A") {
|
||||
const parentElement = element.parentElement;
|
||||
if (parentElement && parentElement.tagName === 'A') {
|
||||
const parentHref = parentElement.getAttribute('href');
|
||||
if (parentElement && parentElement.tagName === "A") {
|
||||
const parentHref = parentElement.getAttribute("href");
|
||||
if (parentHref) {
|
||||
try {
|
||||
return new URL(parentHref, baseURL).href;
|
||||
@@ -556,13 +722,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
}
|
||||
|
||||
const attrValue = element.getAttribute(attribute);
|
||||
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
||||
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
|
||||
|
||||
if (!dataAttr || dataAttr.trim() === '') {
|
||||
if (attribute === 'src') {
|
||||
if (!dataAttr || dataAttr.trim() === "") {
|
||||
if (attribute === "src") {
|
||||
const style = window.getComputedStyle(element);
|
||||
const bgImage = style.backgroundImage;
|
||||
if (bgImage && bgImage !== 'none') {
|
||||
if (bgImage && bgImage !== "none") {
|
||||
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
||||
return matches ? new URL(matches[1], baseURL).href : null;
|
||||
}
|
||||
@@ -573,15 +739,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
try {
|
||||
return new URL(dataAttr, baseURL).href;
|
||||
} catch (e) {
|
||||
console.warn('Error creating URL from', dataAttr, e);
|
||||
return dataAttr; // Return the original value if URL construction fails
|
||||
console.warn("Error creating URL from", dataAttr, e);
|
||||
return dataAttr;
|
||||
}
|
||||
}
|
||||
return element.getAttribute(attribute);
|
||||
}
|
||||
};
|
||||
|
||||
// Enhanced table ancestor finding with context support
|
||||
function findTableAncestor(element) {
|
||||
const findTableAncestor = (element) => {
|
||||
let currentElement = element;
|
||||
const MAX_DEPTH = 5;
|
||||
let depth = 0;
|
||||
@@ -593,14 +759,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (currentElement.tagName === 'TD') {
|
||||
return { type: 'TD', element: currentElement };
|
||||
} else if (currentElement.tagName === 'TR') {
|
||||
return { type: 'TR', element: currentElement };
|
||||
if (currentElement.tagName === "TD") {
|
||||
return { type: "TD", element: currentElement };
|
||||
} else if (currentElement.tagName === "TR") {
|
||||
return { type: "TR", element: currentElement };
|
||||
}
|
||||
|
||||
// Handle iframe and frame crossing
|
||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||
if (
|
||||
currentElement.tagName === "IFRAME" ||
|
||||
currentElement.tagName === "FRAME"
|
||||
) {
|
||||
try {
|
||||
currentElement = currentElement.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -612,26 +781,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
depth++;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function to get cell index
|
||||
/**
 * Compute the positional index of a table cell.
 *
 * @param {Element} td - The cell to locate.
 * @returns {number} When the cell's root node is a ShadowRoot: its index among
 *   all `td` elements of that shadow root (-1 if it is not among them).
 *   Otherwise: the number of element siblings that precede it.
 */
const getCellIndex = (td) => {
  const root = td.getRootNode();
  if (root instanceof ShadowRoot) {
    const allCells = Array.from(root.querySelectorAll("td"));
    return allCells.indexOf(td);
  }

  // Regular DOM: walk backwards over element siblings and count them.
  let index = 0;
  let sibling = td;
  while ((sibling = sibling.previousElementSibling)) {
    index++;
  }
  return index;
};
|
||||
|
||||
// Helper function to check for TH elements
|
||||
function hasThElement(row, tableFields) {
|
||||
const hasThElement = (row, tableFields) => {
|
||||
for (const [_, { selector }] of Object.entries(tableFields)) {
|
||||
const element = queryElement(row, selector);
|
||||
if (element) {
|
||||
@@ -642,9 +811,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.tagName === 'TH') return true;
|
||||
if (current.tagName === "TH") return true;
|
||||
|
||||
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
||||
if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
|
||||
try {
|
||||
current = current.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -657,35 +826,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
}
|
||||
}
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
// Helper function to filter rows
|
||||
/**
 * Decide which table rows to keep for extraction.
 *
 * If `hasThElement` reports true for any row, the rows are returned untouched.
 * Otherwise rows that contain a TH, either as a descendant or inside the row's
 * own shadow root, are removed.
 *
 * @param {Element[]} rows - Candidate table rows.
 * @param {Object} tableFields - Field definitions forwarded to `hasThElement`.
 * @returns {Element[]} The same array when a row matched `hasThElement`,
 *   otherwise a new array without TH-bearing rows.
 */
const filterRowsBasedOnTag = (rows, tableFields) => {
  if (rows.some((row) => hasThElement(row, tableFields))) {
    return rows;
  }

  // Include shadow DOM in TH search
  return rows.filter((row) => {
    const hasNoDirectTh = row.getElementsByTagName("TH").length === 0;
    const hasNoShadowTh = row.shadowRoot
      ? row.shadowRoot.querySelector("th") === null
      : true;
    return hasNoDirectTh && hasNoShadowTh;
  });
};
|
||||
|
||||
// Class similarity comparison functions
|
||||
/**
 * Jaccard similarity of two class-name lists: |intersection| / |union|,
 * with duplicates ignored.
 *
 * @param {Iterable<string>} classList1 - First list of class names.
 * @param {Iterable<string>} classList2 - Second list of class names.
 * @returns {number} A value in [0, 1]; 0 when both lists are empty (instead of
 *   the NaN a 0/0 division would produce).
 */
const calculateClassSimilarity = (classList1, classList2) => {
  const set1 = new Set(classList1);
  const set2 = new Set(classList2);
  const union = new Set([...set1, ...set2]);

  // Two empty lists share nothing; avoid dividing zero by zero.
  if (union.size === 0) return 0;

  const intersection = new Set([...set1].filter((x) => set2.has(x)));
  return intersection.size / union.size;
};
|
||||
|
||||
// Enhanced similar elements finding with context support
|
||||
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
|
||||
const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
|
||||
const baseClasses = Array.from(baseElement.classList);
|
||||
if (baseClasses.length === 0) return [];
|
||||
|
||||
@@ -697,25 +866,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
// Get elements from shadow DOM
|
||||
if (baseElement.getRootNode() instanceof ShadowRoot) {
|
||||
const shadowHost = baseElement.getRootNode().host;
|
||||
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
||||
allElements.push(
|
||||
...shadowHost.getElementsByTagName(baseElement.tagName)
|
||||
);
|
||||
}
|
||||
|
||||
// Get elements from iframes and frames
|
||||
const frames = [
|
||||
...Array.from(document.getElementsByTagName('iframe')),
|
||||
...Array.from(document.getElementsByTagName('frame'))
|
||||
...Array.from(document.getElementsByTagName("iframe")),
|
||||
...Array.from(document.getElementsByTagName("frame")),
|
||||
];
|
||||
|
||||
for (const frame of frames) {
|
||||
try {
|
||||
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
||||
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
||||
const frameDoc =
|
||||
frame.contentDocument || frame.contentWindow.document;
|
||||
allElements.push(
|
||||
...frameDoc.getElementsByTagName(baseElement.tagName)
|
||||
);
|
||||
} catch (e) {
|
||||
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
||||
console.warn(
|
||||
`Cannot access ${frame.tagName.toLowerCase()} content:`,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
return allElements.filter(element => {
|
||||
return allElements.filter((element) => {
|
||||
if (element === baseElement) return false;
|
||||
const similarity = calculateClassSimilarity(
|
||||
baseClasses,
|
||||
@@ -723,45 +900,92 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
);
|
||||
return similarity >= similarityThreshold;
|
||||
});
|
||||
}
|
||||
};
|
||||
|
||||
/**
 * Resolve a selector, relaxing its first `nth-child(N)` when nothing matches.
 *
 * The selector is tried as given. If that fails and it contains `nth-child`,
 * positions N-1 down to 1 are tried in turn; if none match, the first
 * `:nth-child(N)` is removed entirely and the result is tried once more.
 *
 * @param {Element|Document} rootElement - Root passed to `queryElement`.
 * @param {string} originalSelector - Selector to resolve.
 * @returns {Element|null} The first element found by any attempt, else the
 *   (falsy) result of the last attempt.
 */
const tryFallbackSelector = (rootElement, originalSelector) => {
  let element = queryElement(rootElement, originalSelector);
  if (element || !originalSelector.includes("nth-child")) {
    return element;
  }

  const match = originalSelector.match(/nth-child\((\d+)\)/);
  if (!match) {
    return element;
  }

  const position = parseInt(match[1], 10);

  // Walk down from the position just below the requested one.
  for (let i = position - 1; i >= 1; i--) {
    const fallbackSelector = originalSelector.replace(
      /nth-child\(\d+\)/,
      `nth-child(${i})`
    );
    element = queryElement(rootElement, fallbackSelector);
    if (element) break;
  }

  if (!element) {
    // Last resort: drop the positional constraint altogether.
    const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, "");
    element = queryElement(rootElement, baseSelector);
  }

  return element;
};
|
||||
|
||||
// Create indexed XPath for specific container instance
|
||||
/**
 * Build an XPath that targets a field inside one specific list container.
 *
 * @param {string} childSelector - XPath of the field.
 * @param {string} listSelector - XPath that selects all list containers.
 * @param {number} containerIndex - Position of the container, spliced into an
 *   XPath `[n]` predicate (the visible caller passes a 1-based index).
 * @returns {string} If `childSelector` contains the list pattern, its first
 *   `//<pattern>` is replaced by `(<listSelector>)[n]`. Otherwise the child
 *   selector, with its first `//` turned into `/`, is appended to
 *   `(<listSelector>)[n]`.
 */
const createIndexedXPath = (
  childSelector,
  listSelector,
  containerIndex
) => {
  // The list selector without its first "//", used to find it inside the child.
  const listPattern = listSelector.replace("//", "");
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;

  if (childSelector.includes(listPattern)) {
    // A replacer function keeps "$" sequences in the XPath (e.g. `$'`, `$&`)
    // literal; a plain replacement string would expand them.
    // NOTE(review): if the pattern occurs without a leading "//", nothing is
    // replaced and the child selector comes back un-indexed — confirm intended.
    return childSelector.replace(
      `//${listPattern}`,
      () => indexedListSelector
    );
  }

  // If pattern doesn't match, create a more generic indexed selector
  return `${indexedListSelector}${childSelector.replace("//", "/")}`;
};
|
||||
|
||||
// Main scraping logic with unified support for both CSS and XPath
|
||||
console.log("🚀 Starting unified list data extraction");
|
||||
console.log("List Selector:", listSelector);
|
||||
console.log("Fields:", fields);
|
||||
|
||||
// Main scraping logic with context support
|
||||
let containers = queryElementAll(document, listSelector);
|
||||
containers = Array.from(containers);
|
||||
|
||||
if (containers.length === 0) return [];
|
||||
if (containers.length === 0) {
|
||||
console.warn("❌ No containers found for listSelector:", listSelector);
|
||||
return [];
|
||||
}
|
||||
|
||||
if (limit > 1 && containers.length === 1) {
|
||||
console.log(`📦 Found ${containers.length} list containers`);
|
||||
|
||||
// For CSS selectors, try to find similar containers if needed
|
||||
if (
|
||||
!isXPathSelector(listSelector) &&
|
||||
limit > 1 &&
|
||||
containers.length === 1
|
||||
) {
|
||||
const baseContainer = containers[0];
|
||||
const similarContainers = findSimilarElements(baseContainer);
|
||||
|
||||
if (similarContainers.length > 0) {
|
||||
const newContainers = similarContainers.filter(container =>
|
||||
!container.matches(listSelector)
|
||||
const newContainers = similarContainers.filter(
|
||||
(container) => !container.matches(listSelector)
|
||||
);
|
||||
containers = [...containers, ...newContainers];
|
||||
}
|
||||
@@ -769,10 +993,60 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
const containerFields = containers.map(() => ({
|
||||
tableFields: {},
|
||||
nonTableFields: {}
|
||||
nonTableFields: {},
|
||||
}));
|
||||
|
||||
// Classify fields
|
||||
// For XPath selectors, use the new approach
|
||||
if (isXPathSelector(listSelector)) {
|
||||
const extractedData = [];
|
||||
const containersToProcess = Math.min(containers.length, limit);
|
||||
|
||||
for (
|
||||
let containerIndex = 0;
|
||||
containerIndex < containersToProcess;
|
||||
containerIndex++
|
||||
) {
|
||||
const record = {};
|
||||
|
||||
for (const [label, field] of Object.entries(fields)) {
|
||||
let element = null;
|
||||
|
||||
if (isXPathSelector(field.selector)) {
|
||||
// Create indexed absolute XPath
|
||||
const indexedSelector = createIndexedXPath(
|
||||
field.selector,
|
||||
listSelector,
|
||||
containerIndex + 1
|
||||
);
|
||||
element = evaluateXPath(document, indexedSelector);
|
||||
} else {
|
||||
// Fallback for CSS selectors within XPath containers
|
||||
const container = containers[containerIndex];
|
||||
element = queryElement(container, field.selector);
|
||||
}
|
||||
|
||||
if (element) {
|
||||
const value = extractValue(element, field.attribute);
|
||||
if (value !== null && value !== "") {
|
||||
record[label] = value;
|
||||
} else {
|
||||
record[label] = "";
|
||||
}
|
||||
} else {
|
||||
record[label] = "";
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.values(record).some((value) => value !== "")) {
|
||||
extractedData.push(record);
|
||||
}
|
||||
}
|
||||
|
||||
console.log(`📊 Total records extracted: ${extractedData.length}`);
|
||||
return extractedData;
|
||||
}
|
||||
|
||||
// For CSS selectors, use the original table-aware approach
|
||||
containers.forEach((container, containerIndex) => {
|
||||
for (const [label, field] of Object.entries(fields)) {
|
||||
const sampleElement = queryElement(container, field.selector);
|
||||
@@ -783,7 +1057,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
containerFields[containerIndex].tableFields[label] = {
|
||||
...field,
|
||||
tableContext: ancestor.type,
|
||||
cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1
|
||||
cellIndex:
|
||||
ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1,
|
||||
};
|
||||
} else {
|
||||
containerFields[containerIndex].nonTableFields[label] = field;
|
||||
@@ -798,7 +1073,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const nonTableData = [];
|
||||
|
||||
// Process table data with support for iframes, frames, and shadow DOM
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
for (
|
||||
let containerIndex = 0;
|
||||
containerIndex < containers.length;
|
||||
containerIndex++
|
||||
) {
|
||||
const container = containers[containerIndex];
|
||||
const { tableFields } = containerFields[containerIndex];
|
||||
|
||||
@@ -808,13 +1087,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
let tableContext = firstElement;
|
||||
|
||||
// Find table context including iframe, frame and shadow DOM
|
||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
||||
while (
|
||||
tableContext &&
|
||||
tableContext.tagName !== "TABLE" &&
|
||||
tableContext !== container
|
||||
) {
|
||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
||||
tableContext = tableContext.getRootNode().host;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||
if (
|
||||
tableContext.tagName === "IFRAME" ||
|
||||
tableContext.tagName === "FRAME"
|
||||
) {
|
||||
try {
|
||||
tableContext = tableContext.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -830,30 +1116,45 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const rows = [];
|
||||
|
||||
// Get rows from regular DOM
|
||||
rows.push(...tableContext.getElementsByTagName('TR'));
|
||||
rows.push(...tableContext.getElementsByTagName("TR"));
|
||||
|
||||
// Get rows from shadow DOM
|
||||
if (tableContext.shadowRoot) {
|
||||
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
||||
rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
|
||||
}
|
||||
|
||||
// Get rows from iframes and frames
|
||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||
if (
|
||||
tableContext.tagName === "IFRAME" ||
|
||||
tableContext.tagName === "FRAME"
|
||||
) {
|
||||
try {
|
||||
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
||||
rows.push(...frameDoc.getElementsByTagName('TR'));
|
||||
const frameDoc =
|
||||
tableContext.contentDocument ||
|
||||
tableContext.contentWindow.document;
|
||||
rows.push(...frameDoc.getElementsByTagName("TR"));
|
||||
} catch (e) {
|
||||
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
||||
console.warn(
|
||||
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
|
||||
e
|
||||
);
|
||||
}
|
||||
}
|
||||
|
||||
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
||||
|
||||
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
||||
for (
|
||||
let rowIndex = 0;
|
||||
rowIndex < Math.min(processedRows.length, limit);
|
||||
rowIndex++
|
||||
) {
|
||||
const record = {};
|
||||
const currentRow = processedRows[rowIndex];
|
||||
|
||||
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
|
||||
for (const [
|
||||
label,
|
||||
{ selector, attribute, cellIndex },
|
||||
] of Object.entries(tableFields)) {
|
||||
let element = null;
|
||||
|
||||
if (cellIndex >= 0) {
|
||||
@@ -871,18 +1172,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
if (td) {
|
||||
element = queryElement(td, selector);
|
||||
|
||||
if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
|
||||
if (
|
||||
!element &&
|
||||
selector
|
||||
.split(/(?:>>|:>>)/)
|
||||
.pop()
|
||||
.includes("td:nth-child")
|
||||
) {
|
||||
element = td;
|
||||
}
|
||||
|
||||
if (!element) {
|
||||
const tagOnlySelector = selector.split('.')[0];
|
||||
const tagOnlySelector = selector.split(".")[0];
|
||||
element = queryElement(td, tagOnlySelector);
|
||||
}
|
||||
|
||||
if (!element) {
|
||||
let currentElement = td;
|
||||
while (currentElement && currentElement.children.length > 0) {
|
||||
while (
|
||||
currentElement &&
|
||||
currentElement.children.length > 0
|
||||
) {
|
||||
let foundContentChild = false;
|
||||
for (const child of currentElement.children) {
|
||||
if (extractValue(child, attribute)) {
|
||||
@@ -914,7 +1224,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
}
|
||||
|
||||
// Process non-table data with all contexts support
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
for (
|
||||
let containerIndex = 0;
|
||||
containerIndex < containers.length;
|
||||
containerIndex++
|
||||
) {
|
||||
if (nonTableData.length >= limit) break;
|
||||
|
||||
const container = containers[containerIndex];
|
||||
@@ -923,7 +1237,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
if (Object.keys(nonTableFields).length > 0) {
|
||||
const record = {};
|
||||
|
||||
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
||||
for (const [label, { selector, attribute }] of Object.entries(
|
||||
nonTableFields
|
||||
)) {
|
||||
// Get the last part of the selector after any context delimiter
|
||||
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
||||
const element = tryFallbackSelector(container, relativeSelector);
|
||||
@@ -941,6 +1257,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
// Merge and limit the results
|
||||
const scrapedData = [...tableData, ...nonTableData];
|
||||
console.log(`📊 Total records extracted: ${scrapedData.length}`);
|
||||
|
||||
return scrapedData;
|
||||
};
|
||||
|
||||
|
||||
Reference in New Issue
Block a user