feat: xpath support for core extraction

This commit is contained in:
Rohit
2025-07-06 21:44:56 +05:30
parent be42c1d8ef
commit 9a065a3d3d

View File

@@ -423,44 +423,149 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
// Enhanced query function to handle iframe, frame and shadow DOM
const queryElement = (rootElement, selector) => {
if (!selector.includes('>>') && !selector.includes(':>>')) {
return rootElement.querySelector(selector);
// XPath evaluation functions
/**
 * Resolve the first node matched by an XPath expression.
 *
 * The expression is evaluated by the document that owns `rootElement`
 * (or by `rootElement` itself when it is a document), using `rootElement`
 * as the context node.
 *
 * @param {Node} rootElement - Context node for the expression.
 * @param {string} xpath - XPath expression to evaluate.
 * @returns {Node|null} The first matching node in document order, or null
 *   when there is no owning document or evaluation throws.
 */
const evaluateXPath = (rootElement, xpath) => {
  let firstMatch = null;
  try {
    const isDocument = rootElement.nodeType === Node.DOCUMENT_NODE;
    const contextDocument = isDocument
      ? rootElement
      : rootElement.ownerDocument;
    if (contextDocument) {
      const evaluation = contextDocument.evaluate(
        xpath,
        rootElement,
        null,
        XPathResult.FIRST_ORDERED_NODE_TYPE,
        null
      );
      firstMatch = evaluation.singleNodeValue;
    }
  } catch (error) {
    // Invalid expressions and unsupported context nodes are reported, not thrown.
    console.warn("XPath evaluation failed:", xpath, error);
    firstMatch = null;
  }
  return firstMatch;
};
/**
 * Collect every element matched by an XPath expression.
 *
 * The expression is evaluated by the document that owns `rootElement`
 * (or by `rootElement` itself when it is a document), using `rootElement`
 * as the context node. Non-element matches (text nodes, attributes, ...)
 * are dropped so callers always receive elements.
 *
 * @param {Node} rootElement - Context node for the expression.
 * @param {string} xpath - XPath expression to evaluate.
 * @returns {Element[]} Matching elements in document order; an empty array
 *   when there is no owning document or evaluation throws.
 */
const evaluateXPathAll = (rootElement, xpath) => {
  try {
    const ownerDoc =
      rootElement.nodeType === Node.DOCUMENT_NODE
        ? rootElement
        : rootElement.ownerDocument;
    if (!ownerDoc) return [];

    const result = ownerDoc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    const elements = [];
    for (let i = 0; i < result.snapshotLength; i++) {
      const node = result.snapshotItem(i);
      // Keep element nodes only; the snapshot may also hold text/attribute nodes.
      if (node && node.nodeType === Node.ELEMENT_NODE) {
        elements.push(node);
      }
    }
    return elements;
  } catch (error) {
    // Invalid expressions and unsupported context nodes are reported, not thrown.
    console.warn("XPath evaluation failed:", xpath, error);
    return [];
  }
};
// Helper function to detect selector type
/**
 * Decide whether a selector string should be evaluated as XPath rather
 * than CSS.
 *
 * Recognised XPath shapes (after optional leading whitespace):
 *   - absolute / descendant paths:  `/html/body`, `//div`
 *   - context-relative paths:       `./span`, `.//span`, `../td`
 *   - grouped expressions:          `(//li)[2]/a`
 *
 * None of these can start a valid CSS selector, so CSS selectors such as
 * `.card` or `div > a` are never misclassified.
 *
 * @param {string} selector - Selector to classify.
 * @returns {boolean} True when the selector looks like XPath; false for
 *   CSS selectors and for non-string input.
 */
const isXPathSelector = (selector) => {
  if (typeof selector !== "string") return false;
  // Optional opening parentheses, then "/", "./" or "../".
  return /^\s*\(*\s*(?:\/|\.\.?\/)/.test(selector);
};
// Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
const queryElement = (rootElement, selector) => {
if (!selector.includes(">>") && !selector.includes(":>>")) {
// Check if it's an XPath selector
if (isXPathSelector(selector)) {
return evaluateXPath(rootElement, selector);
} else {
return rootElement.querySelector(selector);
}
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElement = rootElement;
for (let i = 0; i < parts.length; i++) {
if (!currentElement) return null;
// Handle iframe and frame traversal
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try {
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
const frameDoc =
currentElement.contentDocument ||
currentElement.contentWindow.document;
if (!frameDoc) return null;
if (isXPathSelector(parts[i])) {
currentElement = evaluateXPath(frameDoc, parts[i]);
} else {
currentElement = frameDoc.querySelector(parts[i]);
}
continue;
} catch (e) {
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
console.warn(
`Cannot access ${currentElement.tagName.toLowerCase()} content:`,
e
);
return null;
}
}
let nextElement = null;
// Try regular DOM first
let nextElement = currentElement.querySelector(parts[i]);
if ("querySelector" in currentElement) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(currentElement, parts[i]);
} else {
nextElement = currentElement.querySelector(parts[i]);
}
}
// Try shadow DOM if not found
if (!nextElement && currentElement.shadowRoot) {
if (
!nextElement &&
"shadowRoot" in currentElement &&
currentElement.shadowRoot
) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
} else {
nextElement = currentElement.shadowRoot.querySelector(parts[i]);
}
}
// Check children's shadow roots if still not found
if (!nextElement) {
if (!nextElement && "children" in currentElement) {
const children = Array.from(currentElement.children || []);
for (const child of children) {
if (child.shadowRoot) {
if (isXPathSelector(parts[i])) {
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
} else {
nextElement = child.shadowRoot.querySelector(parts[i]);
}
if (nextElement) break;
}
}
@@ -474,11 +579,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Enhanced query all function for both contexts
const queryElementAll = (rootElement, selector) => {
if (!selector.includes('>>') && !selector.includes(':>>')) {
return rootElement.querySelectorAll(selector);
if (!selector.includes(">>") && !selector.includes(":>>")) {
if (isXPathSelector(selector)) {
return evaluateXPathAll(rootElement, selector);
} else {
return Array.from(rootElement.querySelectorAll(selector));
}
}
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
let currentElements = [rootElement];
for (const part of parts) {
@@ -486,30 +595,64 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (const element of currentElements) {
// Handle iframe and frame traversal
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
try {
const frameDoc = element.contentDocument || element.contentWindow.document;
nextElements.push(...frameDoc.querySelectorAll(part));
const frameDoc =
element.contentDocument || element.contentWindow.document;
if (frameDoc) {
if (isXPathSelector(part)) {
nextElements.push(...evaluateXPathAll(frameDoc, part));
} else {
nextElements.push(
...Array.from(frameDoc.querySelectorAll(part))
);
}
}
} catch (e) {
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
console.warn(
`Cannot access ${element.tagName.toLowerCase()} content:`,
e
);
continue;
}
} else {
// Regular DOM elements
if (element.querySelectorAll) {
nextElements.push(...element.querySelectorAll(part));
if (isXPathSelector(part)) {
nextElements.push(...evaluateXPathAll(element, part));
} else {
nextElements.push(
...Array.from(element.querySelectorAll(part))
);
}
}
// Shadow DOM elements
if (element.shadowRoot) {
nextElements.push(...element.shadowRoot.querySelectorAll(part));
if (isXPathSelector(part)) {
nextElements.push(
...evaluateXPathAll(element.shadowRoot, part)
);
} else {
nextElements.push(
...Array.from(element.shadowRoot.querySelectorAll(part))
);
}
}
// Check children's shadow roots
const children = Array.from(element.children || []);
for (const child of children) {
if (child.shadowRoot) {
nextElements.push(...child.shadowRoot.querySelectorAll(part));
if (isXPathSelector(part)) {
nextElements.push(
...evaluateXPathAll(child.shadowRoot, part)
);
} else {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
}
}
}
}
@@ -522,11 +665,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
};
// Enhanced value extraction with context awareness
function extractValue(element, attribute) {
const extractValue = (element, attribute) => {
if (!element) return null;
// Get context-aware base URL
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
const baseURL =
element.ownerDocument?.location?.href || window.location.origin;
// Check shadow root first
if (element.shadowRoot) {
@@ -536,15 +680,37 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}
}
if (attribute === 'innerText') {
return element.innerText.trim();
} else if (attribute === 'innerHTML') {
return element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
if (attribute === 'href' && element.tagName !== 'A') {
if (attribute === "innerText") {
// First try standard innerText/textContent
let textContent =
element.innerText?.trim() || element.textContent?.trim();
// If empty, check for common data attributes that might contain the text
if (!textContent) {
const dataAttributes = [
"data-600",
"data-text",
"data-label",
"data-value",
"data-content",
];
for (const attr of dataAttributes) {
const dataValue = element.getAttribute(attr);
if (dataValue && dataValue.trim()) {
textContent = dataValue.trim();
break;
}
}
}
return textContent || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "src" || attribute === "href") {
if (attribute === "href" && element.tagName !== "A") {
const parentElement = element.parentElement;
if (parentElement && parentElement.tagName === 'A') {
const parentHref = parentElement.getAttribute('href');
if (parentElement && parentElement.tagName === "A") {
const parentHref = parentElement.getAttribute("href");
if (parentHref) {
try {
return new URL(parentHref, baseURL).href;
@@ -556,13 +722,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}
const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
if (!dataAttr || dataAttr.trim() === '') {
if (attribute === 'src') {
if (!dataAttr || dataAttr.trim() === "") {
if (attribute === "src") {
const style = window.getComputedStyle(element);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== 'none') {
if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
@@ -573,15 +739,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
try {
return new URL(dataAttr, baseURL).href;
} catch (e) {
console.warn('Error creating URL from', dataAttr, e);
return dataAttr; // Return the original value if URL construction fails
console.warn("Error creating URL from", dataAttr, e);
return dataAttr;
}
}
return element.getAttribute(attribute);
}
};
// Enhanced table ancestor finding with context support
function findTableAncestor(element) {
const findTableAncestor = (element) => {
let currentElement = element;
const MAX_DEPTH = 5;
let depth = 0;
@@ -593,14 +759,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
continue;
}
if (currentElement.tagName === 'TD') {
return { type: 'TD', element: currentElement };
} else if (currentElement.tagName === 'TR') {
return { type: 'TR', element: currentElement };
if (currentElement.tagName === "TD") {
return { type: "TD", element: currentElement };
} else if (currentElement.tagName === "TR") {
return { type: "TR", element: currentElement };
}
// Handle iframe and frame crossing
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try {
currentElement = currentElement.contentDocument.body;
} catch (e) {
@@ -612,26 +781,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
depth++;
}
return null;
}
};
// Helper function to get cell index.
// Returns the zero-based position of `td`:
//  - when the cell's root node is a ShadowRoot, its index among every `td`
//    found by querying that shadow root;
//  - otherwise, the number of element siblings that precede it.
function getCellIndex(td) {
const getCellIndex = (td) => {
if (td.getRootNode() instanceof ShadowRoot) {
const shadowRoot = td.getRootNode();
const allCells = Array.from(shadowRoot.querySelectorAll('td'));
const allCells = Array.from(shadowRoot.querySelectorAll("td"));
return allCells.indexOf(td);
}
let index = 0;
let sibling = td;
// Walk backwards through the preceding element siblings, counting each one;
// the assignment inside the condition ends the loop once there is none left.
while (sibling = sibling.previousElementSibling) {
while ((sibling = sibling.previousElementSibling)) {
index++;
}
return index;
}
};
// Helper function to check for TH elements
function hasThElement(row, tableFields) {
const hasThElement = (row, tableFields) => {
for (const [_, { selector }] of Object.entries(tableFields)) {
const element = queryElement(row, selector);
if (element) {
@@ -642,9 +811,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
continue;
}
if (current.tagName === 'TH') return true;
if (current.tagName === "TH") return true;
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
try {
current = current.contentDocument.body;
} catch (e) {
@@ -657,35 +826,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}
}
return false;
}
};
// Helper function to filter rows.
// If hasThElement(row, tableFields) is true for any row, the full `rows`
// array is returned untouched. Otherwise only the rows that contain no TH
// (neither as a descendant nor inside the row's own shadow root) are kept.
function filterRowsBasedOnTag(rows, tableFields) {
const filterRowsBasedOnTag = (rows, tableFields) => {
for (const row of rows) {
if (hasThElement(row, tableFields)) {
return rows;
}
}
// Include shadow DOM in TH search
// Note: despite their names, directTH / shadowTH are true when NO TH is present.
return rows.filter(row => {
const directTH = row.getElementsByTagName('TH').length === 0;
const shadowTH = row.shadowRoot ?
row.shadowRoot.querySelector('th') === null : true;
return rows.filter((row) => {
const directTH = row.getElementsByTagName("TH").length === 0;
const shadowTH = row.shadowRoot
? row.shadowRoot.querySelector("th") === null
: true;
return directTH && shadowTH;
});
}
};
// Class similarity comparison functions
// Returns the Jaccard index of the two class lists: the number of distinct
// names present in both, divided by the number of distinct names present in
// either. The result is 1 for identical sets and 0 for disjoint ones.
// NOTE(review): when both lists are empty the union size is 0 and the
// division yields NaN — confirm callers never pass two empty lists.
function calculateClassSimilarity(classList1, classList2) {
const calculateClassSimilarity = (classList1, classList2) => {
const set1 = new Set(classList1);
const set2 = new Set(classList2);
const intersection = new Set([...set1].filter(x => set2.has(x)));
const intersection = new Set([...set1].filter((x) => set2.has(x)));
const union = new Set([...set1, ...set2]);
return intersection.size / union.size;
}
};
// Enhanced similar elements finding with context support
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
const baseClasses = Array.from(baseElement.classList);
if (baseClasses.length === 0) return [];
@@ -697,25 +866,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Get elements from shadow DOM
if (baseElement.getRootNode() instanceof ShadowRoot) {
const shadowHost = baseElement.getRootNode().host;
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
allElements.push(
...shadowHost.getElementsByTagName(baseElement.tagName)
);
}
// Get elements from iframes and frames
const frames = [
...Array.from(document.getElementsByTagName('iframe')),
...Array.from(document.getElementsByTagName('frame'))
...Array.from(document.getElementsByTagName("iframe")),
...Array.from(document.getElementsByTagName("frame")),
];
for (const frame of frames) {
try {
const frameDoc = frame.contentDocument || frame.contentWindow.document;
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
const frameDoc =
frame.contentDocument || frame.contentWindow.document;
allElements.push(
...frameDoc.getElementsByTagName(baseElement.tagName)
);
} catch (e) {
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
console.warn(
`Cannot access ${frame.tagName.toLowerCase()} content:`,
e
);
}
}
return allElements.filter(element => {
return allElements.filter((element) => {
if (element === baseElement) return false;
const similarity = calculateClassSimilarity(
baseClasses,
@@ -723,45 +900,92 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
);
return similarity >= similarityThreshold;
});
}
};
// Looks up `originalSelector` under `rootElement` via queryElement and, when
// nothing is found and the selector contains "nth-child", retries with
// progressively looser variants:
//   1. the first nth-child(N) rewritten to nth-child(N-1), N-2, ... down to 1;
//   2. the first ":nth-child(N)" removed altogether.
// Returns whatever queryElement returned for the first variant that matched,
// or the last (falsy) result when none did.
function tryFallbackSelector(rootElement, originalSelector) {
const tryFallbackSelector = (rootElement, originalSelector) => {
let element = queryElement(rootElement, originalSelector);
if (!element && originalSelector.includes('nth-child')) {
if (!element && originalSelector.includes("nth-child")) {
// The regex is not global, so only the first nth-child(...) is inspected.
const match = originalSelector.match(/nth-child\((\d+)\)/);
if (match) {
const position = parseInt(match[1], 10);
// Try each lower position in turn, stopping at the first hit.
for (let i = position - 1; i >= 1; i--) {
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
const fallbackSelector = originalSelector.replace(
/nth-child\(\d+\)/,
`nth-child(${i})`
);
element = queryElement(rootElement, fallbackSelector);
if (element) break;
}
// Last resort: drop the positional constraint entirely.
if (!element) {
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
const baseSelector = originalSelector.replace(
/\:nth-child\(\d+\)/,
""
);
element = queryElement(rootElement, baseSelector);
}
}
}
return element;
}
};
// Create indexed XPath for specific container instance
/**
 * Build an XPath that targets a field inside the N-th list container.
 *
 * The container is addressed as `(listSelector)[containerIndex]`; the field
 * path is then appended to it:
 *
 *   - If `childSelector` begins with `listSelector` followed by a path
 *     boundary (`/`, `[` or end of string), that prefix is swapped for the
 *     indexed container. Requiring a boundary prevents `//li` from matching
 *     inside `//link/a`, and also handles absolute list selectors that
 *     start with a single `/`.
 *   - If `childSelector` is context-relative (`./x`, `.//x`), the leading
 *     `.` is dropped so the result stays valid XPath.
 *   - Otherwise the selector is appended with its first `//` collapsed to
 *     `/`, which is what earlier versions of this helper produced.
 *
 * @param {string} childSelector - XPath of the field to extract.
 * @param {string} listSelector - XPath that matches every list container.
 * @param {number} containerIndex - 1-based position of the container.
 * @returns {string} XPath scoped to the requested container.
 */
const createIndexedXPath = (
  childSelector,
  listSelector,
  containerIndex
) => {
  const indexedContainer = `(${listSelector})[${containerIndex}]`;

  if (childSelector.startsWith(listSelector)) {
    const remainder = childSelector.slice(listSelector.length);
    const atBoundary =
      remainder === "" || remainder[0] === "/" || remainder[0] === "[";
    if (atBoundary) {
      return `${indexedContainer}${remainder}`;
    }
  }

  if (childSelector.startsWith("./")) {
    // "./span" -> "/span", ".//span" -> "//span"
    return `${indexedContainer}${childSelector.slice(1)}`;
  }

  // Legacy fallback: only the first "//" is collapsed (string replace).
  return `${indexedContainer}${childSelector.replace("//", "/")}`;
};
// Main scraping logic with unified support for both CSS and XPath
console.log("🚀 Starting unified list data extraction");
console.log("List Selector:", listSelector);
console.log("Fields:", fields);
// Main scraping logic with context support
let containers = queryElementAll(document, listSelector);
containers = Array.from(containers);
if (containers.length === 0) return [];
if (containers.length === 0) {
console.warn("❌ No containers found for listSelector:", listSelector);
return [];
}
if (limit > 1 && containers.length === 1) {
console.log(`📦 Found ${containers.length} list containers`);
// For CSS selectors, try to find similar containers if needed
if (
!isXPathSelector(listSelector) &&
limit > 1 &&
containers.length === 1
) {
const baseContainer = containers[0];
const similarContainers = findSimilarElements(baseContainer);
if (similarContainers.length > 0) {
const newContainers = similarContainers.filter(container =>
!container.matches(listSelector)
const newContainers = similarContainers.filter(
(container) => !container.matches(listSelector)
);
containers = [...containers, ...newContainers];
}
@@ -769,10 +993,60 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const containerFields = containers.map(() => ({
tableFields: {},
nonTableFields: {}
nonTableFields: {},
}));
// Classify fields
// For XPath selectors, use the new approach
if (isXPathSelector(listSelector)) {
const extractedData = [];
const containersToProcess = Math.min(containers.length, limit);
for (
let containerIndex = 0;
containerIndex < containersToProcess;
containerIndex++
) {
const record = {};
for (const [label, field] of Object.entries(fields)) {
let element = null;
if (isXPathSelector(field.selector)) {
// Create indexed absolute XPath
const indexedSelector = createIndexedXPath(
field.selector,
listSelector,
containerIndex + 1
);
element = evaluateXPath(document, indexedSelector);
} else {
// Fallback for CSS selectors within XPath containers
const container = containers[containerIndex];
element = queryElement(container, field.selector);
}
if (element) {
const value = extractValue(element, field.attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
record[label] = "";
}
} else {
record[label] = "";
}
}
if (Object.values(record).some((value) => value !== "")) {
extractedData.push(record);
}
}
console.log(`📊 Total records extracted: ${extractedData.length}`);
return extractedData;
}
// For CSS selectors, use the original table-aware approach
containers.forEach((container, containerIndex) => {
for (const [label, field] of Object.entries(fields)) {
const sampleElement = queryElement(container, field.selector);
@@ -783,7 +1057,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
containerFields[containerIndex].tableFields[label] = {
...field,
tableContext: ancestor.type,
cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1
cellIndex:
ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1,
};
} else {
containerFields[containerIndex].nonTableFields[label] = field;
@@ -798,7 +1073,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const nonTableData = [];
// Process table data with support for iframes, frames, and shadow DOM
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex];
@@ -808,13 +1087,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
let tableContext = firstElement;
// Find table context including iframe, frame and shadow DOM
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
while (
tableContext &&
tableContext.tagName !== "TABLE" &&
tableContext !== container
) {
if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = tableContext.getRootNode().host;
continue;
}
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
tableContext = tableContext.contentDocument.body;
} catch (e) {
@@ -830,30 +1116,45 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const rows = [];
// Get rows from regular DOM
rows.push(...tableContext.getElementsByTagName('TR'));
rows.push(...tableContext.getElementsByTagName("TR"));
// Get rows from shadow DOM
if (tableContext.shadowRoot) {
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
}
// Get rows from iframes and frames
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
rows.push(...frameDoc.getElementsByTagName('TR'));
const frameDoc =
tableContext.contentDocument ||
tableContext.contentWindow.document;
rows.push(...frameDoc.getElementsByTagName("TR"));
} catch (e) {
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
console.warn(
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
e
);
}
}
const processedRows = filterRowsBasedOnTag(rows, tableFields);
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
for (
let rowIndex = 0;
rowIndex < Math.min(processedRows.length, limit);
rowIndex++
) {
const record = {};
const currentRow = processedRows[rowIndex];
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
for (const [
label,
{ selector, attribute, cellIndex },
] of Object.entries(tableFields)) {
let element = null;
if (cellIndex >= 0) {
@@ -871,18 +1172,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (td) {
element = queryElement(td, selector);
if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
if (
!element &&
selector
.split(/(?:>>|:>>)/)
.pop()
.includes("td:nth-child")
) {
element = td;
}
if (!element) {
const tagOnlySelector = selector.split('.')[0];
const tagOnlySelector = selector.split(".")[0];
element = queryElement(td, tagOnlySelector);
}
if (!element) {
let currentElement = td;
while (currentElement && currentElement.children.length > 0) {
while (
currentElement &&
currentElement.children.length > 0
) {
let foundContentChild = false;
for (const child of currentElement.children) {
if (extractValue(child, attribute)) {
@@ -914,7 +1224,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}
// Process non-table data with all contexts support
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
if (nonTableData.length >= limit) break;
const container = containers[containerIndex];
@@ -923,7 +1237,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (Object.keys(nonTableFields).length > 0) {
const record = {};
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
for (const [label, { selector, attribute }] of Object.entries(
nonTableFields
)) {
// Get the last part of the selector after any context delimiter
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = tryFallbackSelector(container, relativeSelector);
@@ -941,6 +1257,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Merge and limit the results
const scrapedData = [...tableData, ...nonTableData];
console.log(`📊 Total records extracted: ${scrapedData.length}`);
return scrapedData;
};