feat: xpath support for core extraction

This commit is contained in:
Rohit
2025-07-06 21:44:56 +05:30
parent be42c1d8ef
commit 9a065a3d3d

View File

@@ -423,44 +423,149 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list * @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/ */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
// Enhanced query function to handle iframe, frame and shadow DOM // XPath evaluation functions
const queryElement = (rootElement, selector) => { const evaluateXPath = (rootElement, xpath) => {
if (!selector.includes('>>') && !selector.includes(':>>')) { try {
return rootElement.querySelector(selector); const ownerDoc =
rootElement.nodeType === Node.DOCUMENT_NODE
? rootElement
: rootElement.ownerDocument;
if (!ownerDoc) return null;
const result = ownerDoc.evaluate(
xpath,
rootElement,
null,
XPathResult.FIRST_ORDERED_NODE_TYPE,
null
);
return result.singleNodeValue;
} catch (error) {
console.warn("XPath evaluation failed:", xpath, error);
return null;
}
};
const evaluateXPathAll = (rootElement, xpath) => {
try {
const ownerDoc =
rootElement.nodeType === Node.DOCUMENT_NODE
? rootElement
: rootElement.ownerDocument;
if (!ownerDoc) return [];
const result = ownerDoc.evaluate(
xpath,
rootElement,
null,
XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
null
);
const elements = [];
for (let i = 0; i < result.snapshotLength; i++) {
const node = result.snapshotItem(i);
if (node && node.nodeType === Node.ELEMENT_NODE) {
elements.push(node);
}
} }
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim()); return elements;
} catch (error) {
console.warn("XPath evaluation failed:", xpath, error);
return [];
}
};
/**
 * Decide whether a selector string should be evaluated as XPath instead of CSS.
 *
 * A selector is treated as XPath when it begins with one of:
 *   "/"    absolute path (this also covers "//")
 *   "./"   path relative to the context node (this also covers ".//")
 *   "../"  path that starts at the parent of the context node
 *   "("    parenthesised expression such as "(//li)[2]"
 * None of these prefixes can begin a valid CSS selector, so CSS input is
 * never misclassified.
 *
 * @param {string} selector - Selector text, either CSS or XPath.
 * @returns {boolean} true when the selector looks like an XPath expression.
 */
const isXPathSelector = (selector) => {
  const xpathPrefixes = ["/", "./", "../", "("];
  return xpathPrefixes.some((prefix) => selector.startsWith(prefix));
};
// Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
const queryElement = (rootElement, selector) => {
if (!selector.includes(">>") && !selector.includes(":>>")) {
// Check if it's an XPath selector
if (isXPathSelector(selector)) {
return evaluateXPath(rootElement, selector);
} else {
return rootElement.querySelector(selector);
}
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElement = rootElement; let currentElement = rootElement;
for (let i = 0; i < parts.length; i++) { for (let i = 0; i < parts.length; i++) {
if (!currentElement) return null; if (!currentElement) return null;
// Handle iframe and frame traversal // Handle iframe and frame traversal
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try { try {
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document; const frameDoc =
currentElement.contentDocument ||
currentElement.contentWindow.document;
if (!frameDoc) return null;
if (isXPathSelector(parts[i])) {
currentElement = evaluateXPath(frameDoc, parts[i]);
} else {
currentElement = frameDoc.querySelector(parts[i]); currentElement = frameDoc.querySelector(parts[i]);
}
continue; continue;
} catch (e) { } catch (e) {
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e); console.warn(
`Cannot access ${currentElement.tagName.toLowerCase()} content:`,
e
);
return null; return null;
} }
} }
let nextElement = null;
// Try regular DOM first // Try regular DOM first
let nextElement = currentElement.querySelector(parts[i]); if ("querySelector" in currentElement) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(currentElement, parts[i]);
} else {
nextElement = currentElement.querySelector(parts[i]);
}
}
// Try shadow DOM if not found // Try shadow DOM if not found
if (!nextElement && currentElement.shadowRoot) { if (
!nextElement &&
"shadowRoot" in currentElement &&
currentElement.shadowRoot
) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
} else {
nextElement = currentElement.shadowRoot.querySelector(parts[i]); nextElement = currentElement.shadowRoot.querySelector(parts[i]);
} }
}
// Check children's shadow roots if still not found // Check children's shadow roots if still not found
if (!nextElement) { if (!nextElement && "children" in currentElement) {
const children = Array.from(currentElement.children || []); const children = Array.from(currentElement.children || []);
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
} else {
nextElement = child.shadowRoot.querySelector(parts[i]); nextElement = child.shadowRoot.querySelector(parts[i]);
}
if (nextElement) break; if (nextElement) break;
} }
} }
@@ -474,11 +579,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Enhanced query all function for both contexts // Enhanced query all function for both contexts
const queryElementAll = (rootElement, selector) => { const queryElementAll = (rootElement, selector) => {
if (!selector.includes('>>') && !selector.includes(':>>')) { if (!selector.includes(">>") && !selector.includes(":>>")) {
return rootElement.querySelectorAll(selector); if (isXPathSelector(selector)) {
return evaluateXPathAll(rootElement, selector);
} else {
return Array.from(rootElement.querySelectorAll(selector));
}
} }
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim()); const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElements = [rootElement]; let currentElements = [rootElement];
for (const part of parts) { for (const part of parts) {
@@ -486,30 +595,64 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (const element of currentElements) { for (const element of currentElements) {
// Handle iframe and frame traversal // Handle iframe and frame traversal
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') { if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
try { try {
const frameDoc = element.contentDocument || element.contentWindow.document; const frameDoc =
nextElements.push(...frameDoc.querySelectorAll(part)); element.contentDocument || element.contentWindow.document;
if (frameDoc) {
if (isXPathSelector(part)) {
nextElements.push(...evaluateXPathAll(frameDoc, part));
} else {
nextElements.push(
...Array.from(frameDoc.querySelectorAll(part))
);
}
}
} catch (e) { } catch (e) {
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e); console.warn(
`Cannot access ${element.tagName.toLowerCase()} content:`,
e
);
continue; continue;
} }
} else { } else {
// Regular DOM elements // Regular DOM elements
if (element.querySelectorAll) { if (element.querySelectorAll) {
nextElements.push(...element.querySelectorAll(part)); if (isXPathSelector(part)) {
nextElements.push(...evaluateXPathAll(element, part));
} else {
nextElements.push(
...Array.from(element.querySelectorAll(part))
);
}
} }
// Shadow DOM elements // Shadow DOM elements
if (element.shadowRoot) { if (element.shadowRoot) {
nextElements.push(...element.shadowRoot.querySelectorAll(part)); if (isXPathSelector(part)) {
nextElements.push(
...evaluateXPathAll(element.shadowRoot, part)
);
} else {
nextElements.push(
...Array.from(element.shadowRoot.querySelectorAll(part))
);
}
} }
// Check children's shadow roots // Check children's shadow roots
const children = Array.from(element.children || []); const children = Array.from(element.children || []);
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
nextElements.push(...child.shadowRoot.querySelectorAll(part)); if (isXPathSelector(part)) {
nextElements.push(
...evaluateXPathAll(child.shadowRoot, part)
);
} else {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
}
} }
} }
} }
@@ -522,11 +665,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}; };
// Enhanced value extraction with context awareness // Enhanced value extraction with context awareness
function extractValue(element, attribute) { const extractValue = (element, attribute) => {
if (!element) return null; if (!element) return null;
// Get context-aware base URL // Get context-aware base URL
const baseURL = element.ownerDocument?.location?.href || window.location.origin; const baseURL =
element.ownerDocument?.location?.href || window.location.origin;
// Check shadow root first // Check shadow root first
if (element.shadowRoot) { if (element.shadowRoot) {
@@ -536,15 +680,37 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
} }
if (attribute === 'innerText') { if (attribute === "innerText") {
return element.innerText.trim(); // First try standard innerText/textContent
} else if (attribute === 'innerHTML') { let textContent =
return element.innerHTML.trim(); element.innerText?.trim() || element.textContent?.trim();
} else if (attribute === 'src' || attribute === 'href') {
if (attribute === 'href' && element.tagName !== 'A') { // If empty, check for common data attributes that might contain the text
if (!textContent) {
const dataAttributes = [
"data-600",
"data-text",
"data-label",
"data-value",
"data-content",
];
for (const attr of dataAttributes) {
const dataValue = element.getAttribute(attr);
if (dataValue && dataValue.trim()) {
textContent = dataValue.trim();
break;
}
}
}
return textContent || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "src" || attribute === "href") {
if (attribute === "href" && element.tagName !== "A") {
const parentElement = element.parentElement; const parentElement = element.parentElement;
if (parentElement && parentElement.tagName === 'A') { if (parentElement && parentElement.tagName === "A") {
const parentHref = parentElement.getAttribute('href'); const parentHref = parentElement.getAttribute("href");
if (parentHref) { if (parentHref) {
try { try {
return new URL(parentHref, baseURL).href; return new URL(parentHref, baseURL).href;
@@ -556,13 +722,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
const attrValue = element.getAttribute(attribute); const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute('data-' + attribute); const dataAttr = attrValue || element.getAttribute("data-" + attribute);
if (!dataAttr || dataAttr.trim() === '') { if (!dataAttr || dataAttr.trim() === "") {
if (attribute === 'src') { if (attribute === "src") {
const style = window.getComputedStyle(element); const style = window.getComputedStyle(element);
const bgImage = style.backgroundImage; const bgImage = style.backgroundImage;
if (bgImage && bgImage !== 'none') { if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null; return matches ? new URL(matches[1], baseURL).href : null;
} }
@@ -573,15 +739,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
try { try {
return new URL(dataAttr, baseURL).href; return new URL(dataAttr, baseURL).href;
} catch (e) { } catch (e) {
console.warn('Error creating URL from', dataAttr, e); console.warn("Error creating URL from", dataAttr, e);
return dataAttr; // Return the original value if URL construction fails return dataAttr;
} }
} }
return element.getAttribute(attribute); return element.getAttribute(attribute);
} };
// Enhanced table ancestor finding with context support // Enhanced table ancestor finding with context support
function findTableAncestor(element) { const findTableAncestor = (element) => {
let currentElement = element; let currentElement = element;
const MAX_DEPTH = 5; const MAX_DEPTH = 5;
let depth = 0; let depth = 0;
@@ -593,14 +759,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
continue; continue;
} }
if (currentElement.tagName === 'TD') { if (currentElement.tagName === "TD") {
return { type: 'TD', element: currentElement }; return { type: "TD", element: currentElement };
} else if (currentElement.tagName === 'TR') { } else if (currentElement.tagName === "TR") {
return { type: 'TR', element: currentElement }; return { type: "TR", element: currentElement };
} }
// Handle iframe and frame crossing // Handle iframe and frame crossing
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try { try {
currentElement = currentElement.contentDocument.body; currentElement = currentElement.contentDocument.body;
} catch (e) { } catch (e) {
@@ -612,26 +781,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
depth++; depth++;
} }
return null; return null;
} };
// Helper function to get cell index // Helper function to get cell index
function getCellIndex(td) { const getCellIndex = (td) => {
if (td.getRootNode() instanceof ShadowRoot) { if (td.getRootNode() instanceof ShadowRoot) {
const shadowRoot = td.getRootNode(); const shadowRoot = td.getRootNode();
const allCells = Array.from(shadowRoot.querySelectorAll('td')); const allCells = Array.from(shadowRoot.querySelectorAll("td"));
return allCells.indexOf(td); return allCells.indexOf(td);
} }
let index = 0; let index = 0;
let sibling = td; let sibling = td;
while (sibling = sibling.previousElementSibling) { while ((sibling = sibling.previousElementSibling)) {
index++; index++;
} }
return index; return index;
} };
// Helper function to check for TH elements // Helper function to check for TH elements
function hasThElement(row, tableFields) { const hasThElement = (row, tableFields) => {
for (const [_, { selector }] of Object.entries(tableFields)) { for (const [_, { selector }] of Object.entries(tableFields)) {
const element = queryElement(row, selector); const element = queryElement(row, selector);
if (element) { if (element) {
@@ -642,9 +811,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
continue; continue;
} }
if (current.tagName === 'TH') return true; if (current.tagName === "TH") return true;
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') { if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
try { try {
current = current.contentDocument.body; current = current.contentDocument.body;
} catch (e) { } catch (e) {
@@ -657,35 +826,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
} }
return false; return false;
} };
// Helper function to filter rows // Helper function to filter rows
function filterRowsBasedOnTag(rows, tableFields) { const filterRowsBasedOnTag = (rows, tableFields) => {
for (const row of rows) { for (const row of rows) {
if (hasThElement(row, tableFields)) { if (hasThElement(row, tableFields)) {
return rows; return rows;
} }
} }
// Include shadow DOM in TH search return rows.filter((row) => {
return rows.filter(row => { const directTH = row.getElementsByTagName("TH").length === 0;
const directTH = row.getElementsByTagName('TH').length === 0; const shadowTH = row.shadowRoot
const shadowTH = row.shadowRoot ? ? row.shadowRoot.querySelector("th") === null
row.shadowRoot.querySelector('th') === null : true; : true;
return directTH && shadowTH; return directTH && shadowTH;
}); });
} };
// Class similarity comparison functions // Class similarity comparison functions
function calculateClassSimilarity(classList1, classList2) { const calculateClassSimilarity = (classList1, classList2) => {
const set1 = new Set(classList1); const set1 = new Set(classList1);
const set2 = new Set(classList2); const set2 = new Set(classList2);
const intersection = new Set([...set1].filter(x => set2.has(x))); const intersection = new Set([...set1].filter((x) => set2.has(x)));
const union = new Set([...set1, ...set2]); const union = new Set([...set1, ...set2]);
return intersection.size / union.size; return intersection.size / union.size;
} };
// Enhanced similar elements finding with context support // Enhanced similar elements finding with context support
function findSimilarElements(baseElement, similarityThreshold = 0.7) { const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
const baseClasses = Array.from(baseElement.classList); const baseClasses = Array.from(baseElement.classList);
if (baseClasses.length === 0) return []; if (baseClasses.length === 0) return [];
@@ -697,25 +866,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Get elements from shadow DOM // Get elements from shadow DOM
if (baseElement.getRootNode() instanceof ShadowRoot) { if (baseElement.getRootNode() instanceof ShadowRoot) {
const shadowHost = baseElement.getRootNode().host; const shadowHost = baseElement.getRootNode().host;
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName)); allElements.push(
...shadowHost.getElementsByTagName(baseElement.tagName)
);
} }
// Get elements from iframes and frames // Get elements from iframes and frames
const frames = [ const frames = [
...Array.from(document.getElementsByTagName('iframe')), ...Array.from(document.getElementsByTagName("iframe")),
...Array.from(document.getElementsByTagName('frame')) ...Array.from(document.getElementsByTagName("frame")),
]; ];
for (const frame of frames) { for (const frame of frames) {
try { try {
const frameDoc = frame.contentDocument || frame.contentWindow.document; const frameDoc =
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName)); frame.contentDocument || frame.contentWindow.document;
allElements.push(
...frameDoc.getElementsByTagName(baseElement.tagName)
);
} catch (e) { } catch (e) {
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e); console.warn(
`Cannot access ${frame.tagName.toLowerCase()} content:`,
e
);
} }
} }
return allElements.filter(element => { return allElements.filter((element) => {
if (element === baseElement) return false; if (element === baseElement) return false;
const similarity = calculateClassSimilarity( const similarity = calculateClassSimilarity(
baseClasses, baseClasses,
@@ -723,45 +900,92 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
); );
return similarity >= similarityThreshold; return similarity >= similarityThreshold;
}); });
} };
function tryFallbackSelector(rootElement, originalSelector) { const tryFallbackSelector = (rootElement, originalSelector) => {
let element = queryElement(rootElement, originalSelector); let element = queryElement(rootElement, originalSelector);
if (!element && originalSelector.includes('nth-child')) { if (!element && originalSelector.includes("nth-child")) {
const match = originalSelector.match(/nth-child\((\d+)\)/); const match = originalSelector.match(/nth-child\((\d+)\)/);
if (match) { if (match) {
const position = parseInt(match[1], 10); const position = parseInt(match[1], 10);
for (let i = position - 1; i >= 1; i--) { for (let i = position - 1; i >= 1; i--) {
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`); const fallbackSelector = originalSelector.replace(
/nth-child\(\d+\)/,
`nth-child(${i})`
);
element = queryElement(rootElement, fallbackSelector); element = queryElement(rootElement, fallbackSelector);
if (element) break; if (element) break;
} }
if (!element) { if (!element) {
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, ''); const baseSelector = originalSelector.replace(
/\:nth-child\(\d+\)/,
""
);
element = queryElement(rootElement, baseSelector); element = queryElement(rootElement, baseSelector);
} }
} }
} }
return element; return element;
} };
/**
 * Build an XPath that targets a field inside one specific list container.
 *
 * The list selector is wrapped as `(listSelector)[containerIndex]` so the
 * expression addresses exactly one container, then the field's own path is
 * attached to it.
 *
 * @param {string} childSelector - XPath of the field to extract.
 * @param {string} listSelector - XPath that matches every list container.
 * @param {number} containerIndex - 1-based position of the container
 *   (XPath positions start at 1).
 * @returns {string} XPath scoped to the requested container.
 */
const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;

  // Splice by position instead of String.replace: a string replacement would
  // expand "$&", "$1", ... sequences that happen to appear in the selector.
  // Searching for the full list selector keeps the "does it match" test and
  // the substitution in agreement, so a selector is never returned unindexed.
  const matchStart = childSelector.indexOf(listSelector);
  if (matchStart !== -1) {
    const before = childSelector.slice(0, matchStart);
    const after = childSelector.slice(matchStart + listSelector.length);
    return `${before}${indexedListSelector}${after}`;
  }

  // Relative field selector ("./x" or ".//x"): drop the leading "." so the
  // remaining step attaches directly to the indexed container.
  if (childSelector.startsWith("./")) {
    return `${indexedListSelector}${childSelector.slice(1)}`;
  }

  // Generic fallback: hang the field path off the indexed container,
  // collapsing its first "//" into a single "/".
  return `${indexedListSelector}${childSelector.replace("//", "/")}`;
};
// Main scraping logic with unified support for both CSS and XPath
console.log("🚀 Starting unified list data extraction");
console.log("List Selector:", listSelector);
console.log("Fields:", fields);
// Main scraping logic with context support
let containers = queryElementAll(document, listSelector); let containers = queryElementAll(document, listSelector);
containers = Array.from(containers); containers = Array.from(containers);
if (containers.length === 0) return []; if (containers.length === 0) {
console.warn("❌ No containers found for listSelector:", listSelector);
return [];
}
if (limit > 1 && containers.length === 1) { console.log(`📦 Found ${containers.length} list containers`);
// For CSS selectors, try to find similar containers if needed
if (
!isXPathSelector(listSelector) &&
limit > 1 &&
containers.length === 1
) {
const baseContainer = containers[0]; const baseContainer = containers[0];
const similarContainers = findSimilarElements(baseContainer); const similarContainers = findSimilarElements(baseContainer);
if (similarContainers.length > 0) { if (similarContainers.length > 0) {
const newContainers = similarContainers.filter(container => const newContainers = similarContainers.filter(
!container.matches(listSelector) (container) => !container.matches(listSelector)
); );
containers = [...containers, ...newContainers]; containers = [...containers, ...newContainers];
} }
@@ -769,10 +993,60 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const containerFields = containers.map(() => ({ const containerFields = containers.map(() => ({
tableFields: {}, tableFields: {},
nonTableFields: {} nonTableFields: {},
})); }));
// Classify fields // For XPath selectors, use the new approach
// XPath list selectors take a dedicated path: each container is addressed by
// its position and every field is resolved for that one container. This
// branch returns early, so the code after it is not reached for XPath lists.
if (isXPathSelector(listSelector)) {
const extractedData = [];
// `limit` caps how many containers are examined, not how many records are
// returned: containers that yield only empty fields are dropped below.
const containersToProcess = Math.min(containers.length, limit);
for (
let containerIndex = 0;
containerIndex < containersToProcess;
containerIndex++
) {
const record = {};
for (const [label, field] of Object.entries(fields)) {
let element = null;
if (isXPathSelector(field.selector)) {
// Create indexed absolute XPath
// (containerIndex is 0-based here; XPath positions are 1-based, hence + 1)
const indexedSelector = createIndexedXPath(
field.selector,
listSelector,
containerIndex + 1
);
// Evaluated against the whole document, since the indexed expression
// already pins the container.
element = evaluateXPath(document, indexedSelector);
} else {
// Fallback for CSS selectors within XPath containers
const container = containers[containerIndex];
element = queryElement(container, field.selector);
}
// A missing element, a null value and an empty string are all stored as "",
// so every record carries every label.
if (element) {
const value = extractValue(element, field.attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
record[label] = "";
}
} else {
record[label] = "";
}
}
// Keep the record only when at least one field produced a non-empty value.
if (Object.values(record).some((value) => value !== "")) {
extractedData.push(record);
}
}
console.log(`📊 Total records extracted: ${extractedData.length}`);
return extractedData;
}
// For CSS selectors, use the original table-aware approach
containers.forEach((container, containerIndex) => { containers.forEach((container, containerIndex) => {
for (const [label, field] of Object.entries(fields)) { for (const [label, field] of Object.entries(fields)) {
const sampleElement = queryElement(container, field.selector); const sampleElement = queryElement(container, field.selector);
@@ -783,7 +1057,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
containerFields[containerIndex].tableFields[label] = { containerFields[containerIndex].tableFields[label] = {
...field, ...field,
tableContext: ancestor.type, tableContext: ancestor.type,
cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 cellIndex:
ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1,
}; };
} else { } else {
containerFields[containerIndex].nonTableFields[label] = field; containerFields[containerIndex].nonTableFields[label] = field;
@@ -798,7 +1073,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const nonTableData = []; const nonTableData = [];
// Process table data with support for iframes, frames, and shadow DOM // Process table data with support for iframes, frames, and shadow DOM
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
const container = containers[containerIndex]; const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex]; const { tableFields } = containerFields[containerIndex];
@@ -808,13 +1087,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
let tableContext = firstElement; let tableContext = firstElement;
// Find table context including iframe, frame and shadow DOM // Find table context including iframe, frame and shadow DOM
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { while (
tableContext &&
tableContext.tagName !== "TABLE" &&
tableContext !== container
) {
if (tableContext.getRootNode() instanceof ShadowRoot) { if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = tableContext.getRootNode().host; tableContext = tableContext.getRootNode().host;
continue; continue;
} }
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try { try {
tableContext = tableContext.contentDocument.body; tableContext = tableContext.contentDocument.body;
} catch (e) { } catch (e) {
@@ -830,30 +1116,45 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const rows = []; const rows = [];
// Get rows from regular DOM // Get rows from regular DOM
rows.push(...tableContext.getElementsByTagName('TR')); rows.push(...tableContext.getElementsByTagName("TR"));
// Get rows from shadow DOM // Get rows from shadow DOM
if (tableContext.shadowRoot) { if (tableContext.shadowRoot) {
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR')); rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
} }
// Get rows from iframes and frames // Get rows from iframes and frames
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try { try {
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document; const frameDoc =
rows.push(...frameDoc.getElementsByTagName('TR')); tableContext.contentDocument ||
tableContext.contentWindow.document;
rows.push(...frameDoc.getElementsByTagName("TR"));
} catch (e) { } catch (e) {
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e); console.warn(
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
e
);
} }
} }
const processedRows = filterRowsBasedOnTag(rows, tableFields); const processedRows = filterRowsBasedOnTag(rows, tableFields);
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { for (
let rowIndex = 0;
rowIndex < Math.min(processedRows.length, limit);
rowIndex++
) {
const record = {}; const record = {};
const currentRow = processedRows[rowIndex]; const currentRow = processedRows[rowIndex];
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { for (const [
label,
{ selector, attribute, cellIndex },
] of Object.entries(tableFields)) {
let element = null; let element = null;
if (cellIndex >= 0) { if (cellIndex >= 0) {
@@ -871,18 +1172,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (td) { if (td) {
element = queryElement(td, selector); element = queryElement(td, selector);
if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) { if (
!element &&
selector
.split(/(?:>>|:>>)/)
.pop()
.includes("td:nth-child")
) {
element = td; element = td;
} }
if (!element) { if (!element) {
const tagOnlySelector = selector.split('.')[0]; const tagOnlySelector = selector.split(".")[0];
element = queryElement(td, tagOnlySelector); element = queryElement(td, tagOnlySelector);
} }
if (!element) { if (!element) {
let currentElement = td; let currentElement = td;
while (currentElement && currentElement.children.length > 0) { while (
currentElement &&
currentElement.children.length > 0
) {
let foundContentChild = false; let foundContentChild = false;
for (const child of currentElement.children) { for (const child of currentElement.children) {
if (extractValue(child, attribute)) { if (extractValue(child, attribute)) {
@@ -914,7 +1224,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
// Process non-table data with all contexts support // Process non-table data with all contexts support
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
if (nonTableData.length >= limit) break; if (nonTableData.length >= limit) break;
const container = containers[containerIndex]; const container = containers[containerIndex];
@@ -923,7 +1237,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (Object.keys(nonTableFields).length > 0) { if (Object.keys(nonTableFields).length > 0) {
const record = {}; const record = {};
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { for (const [label, { selector, attribute }] of Object.entries(
nonTableFields
)) {
// Get the last part of the selector after any context delimiter // Get the last part of the selector after any context delimiter
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0]; const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = tryFallbackSelector(container, relativeSelector); const element = tryFallbackSelector(container, relativeSelector);
@@ -941,6 +1257,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Merge and limit the results // Merge and limit the results
const scrapedData = [...tableData, ...nonTableData]; const scrapedData = [...tableData, ...nonTableData];
console.log(`📊 Total records extracted: ${scrapedData.length}`);
return scrapedData; return scrapedData;
}; };