feat: xpath support for core extraction
This commit is contained in:
@@ -423,44 +423,149 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
||||||
*/
|
*/
|
||||||
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
||||||
// Enhanced query function to handle iframe, frame and shadow DOM
|
// XPath evaluation functions
|
||||||
const queryElement = (rootElement, selector) => {
|
const evaluateXPath = (rootElement, xpath) => {
|
||||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
try {
|
||||||
return rootElement.querySelector(selector);
|
const ownerDoc =
|
||||||
|
rootElement.nodeType === Node.DOCUMENT_NODE
|
||||||
|
? rootElement
|
||||||
|
: rootElement.ownerDocument;
|
||||||
|
|
||||||
|
if (!ownerDoc) return null;
|
||||||
|
|
||||||
|
const result = ownerDoc.evaluate(
|
||||||
|
xpath,
|
||||||
|
rootElement,
|
||||||
|
null,
|
||||||
|
XPathResult.FIRST_ORDERED_NODE_TYPE,
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
return result.singleNodeValue;
|
||||||
|
} catch (error) {
|
||||||
|
console.warn("XPath evaluation failed:", xpath, error);
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
const evaluateXPathAll = (rootElement, xpath) => {
|
||||||
|
try {
|
||||||
|
const ownerDoc =
|
||||||
|
rootElement.nodeType === Node.DOCUMENT_NODE
|
||||||
|
? rootElement
|
||||||
|
: rootElement.ownerDocument;
|
||||||
|
|
||||||
|
if (!ownerDoc) return [];
|
||||||
|
|
||||||
|
const result = ownerDoc.evaluate(
|
||||||
|
xpath,
|
||||||
|
rootElement,
|
||||||
|
null,
|
||||||
|
XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
|
||||||
|
null
|
||||||
|
);
|
||||||
|
|
||||||
|
const elements = [];
|
||||||
|
for (let i = 0; i < result.snapshotLength; i++) {
|
||||||
|
const node = result.snapshotItem(i);
|
||||||
|
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
||||||
|
elements.push(node);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
return elements;
|
||||||
|
} catch (error) {
|
||||||
|
console.warn("XPath evaluation failed:", xpath, error);
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
};
|
||||||
|
|
||||||
|
/**
 * Decide whether a selector string should be evaluated as XPath rather than CSS.
 *
 * A selector is treated as XPath when, ignoring leading whitespace, it begins
 * with one of the tokens an XPath location path or grouped expression can
 * start with:
 *   - "/"   (absolute path; this also covers the "//" abbreviation)
 *   - "./"  (path relative to the context node, including ".//")
 *   - "../" (path relative to the parent of the context node)
 *   - "("   (grouped expression such as "(//li)[2]/a")
 * None of these can begin a valid CSS selector, so CSS selectors are never
 * misclassified.
 *
 * @param {string} selector - Selector text to classify.
 * @returns {boolean} True when the selector looks like XPath; false for CSS
 *   selectors, empty strings and non-string values.
 */
const isXPathSelector = (selector) => {
  // Guard so that a missing/non-string selector is reported as "not XPath"
  // instead of throwing a TypeError from inside this predicate.
  if (typeof selector !== "string") return false;

  const candidate = selector.trimStart();
  const xpathPrefixes = ["/", "./", "../", "("];

  return xpathPrefixes.some((prefix) => candidate.startsWith(prefix));
};
|
||||||
|
|
||||||
|
// Enhanced query function to handle iframe, frame, shadow DOM, CSS selectors, and XPath
|
||||||
|
const queryElement = (rootElement, selector) => {
|
||||||
|
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
||||||
|
// Check if it's an XPath selector
|
||||||
|
if (isXPathSelector(selector)) {
|
||||||
|
return evaluateXPath(rootElement, selector);
|
||||||
|
} else {
|
||||||
|
return rootElement.querySelector(selector);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
||||||
let currentElement = rootElement;
|
let currentElement = rootElement;
|
||||||
|
|
||||||
for (let i = 0; i < parts.length; i++) {
|
for (let i = 0; i < parts.length; i++) {
|
||||||
if (!currentElement) return null;
|
if (!currentElement) return null;
|
||||||
|
|
||||||
// Handle iframe and frame traversal
|
// Handle iframe and frame traversal
|
||||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
if (
|
||||||
|
currentElement.tagName === "IFRAME" ||
|
||||||
|
currentElement.tagName === "FRAME"
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
const frameDoc =
|
||||||
|
currentElement.contentDocument ||
|
||||||
|
currentElement.contentWindow.document;
|
||||||
|
if (!frameDoc) return null;
|
||||||
|
|
||||||
|
if (isXPathSelector(parts[i])) {
|
||||||
|
currentElement = evaluateXPath(frameDoc, parts[i]);
|
||||||
|
} else {
|
||||||
currentElement = frameDoc.querySelector(parts[i]);
|
currentElement = frameDoc.querySelector(parts[i]);
|
||||||
|
}
|
||||||
continue;
|
continue;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
console.warn(
|
||||||
|
`Cannot access ${currentElement.tagName.toLowerCase()} content:`,
|
||||||
|
e
|
||||||
|
);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
let nextElement = null;
|
||||||
|
|
||||||
// Try regular DOM first
|
// Try regular DOM first
|
||||||
let nextElement = currentElement.querySelector(parts[i]);
|
if ("querySelector" in currentElement) {
|
||||||
|
if (isXPathSelector(parts[i])) {
|
||||||
|
nextElement = evaluateXPath(currentElement, parts[i]);
|
||||||
|
} else {
|
||||||
|
nextElement = currentElement.querySelector(parts[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Try shadow DOM if not found
|
// Try shadow DOM if not found
|
||||||
if (!nextElement && currentElement.shadowRoot) {
|
if (
|
||||||
|
!nextElement &&
|
||||||
|
"shadowRoot" in currentElement &&
|
||||||
|
currentElement.shadowRoot
|
||||||
|
) {
|
||||||
|
if (isXPathSelector(parts[i])) {
|
||||||
|
nextElement = evaluateXPath(currentElement.shadowRoot, parts[i]);
|
||||||
|
} else {
|
||||||
nextElement = currentElement.shadowRoot.querySelector(parts[i]);
|
nextElement = currentElement.shadowRoot.querySelector(parts[i]);
|
||||||
}
|
}
|
||||||
|
}
|
||||||
|
|
||||||
// Check children's shadow roots if still not found
|
// Check children's shadow roots if still not found
|
||||||
if (!nextElement) {
|
if (!nextElement && "children" in currentElement) {
|
||||||
const children = Array.from(currentElement.children || []);
|
const children = Array.from(currentElement.children || []);
|
||||||
for (const child of children) {
|
for (const child of children) {
|
||||||
if (child.shadowRoot) {
|
if (child.shadowRoot) {
|
||||||
|
if (isXPathSelector(parts[i])) {
|
||||||
|
nextElement = evaluateXPath(child.shadowRoot, parts[i]);
|
||||||
|
} else {
|
||||||
nextElement = child.shadowRoot.querySelector(parts[i]);
|
nextElement = child.shadowRoot.querySelector(parts[i]);
|
||||||
|
}
|
||||||
if (nextElement) break;
|
if (nextElement) break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -474,11 +579,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
// Enhanced query all function for both contexts
|
// Enhanced query all function for both contexts
|
||||||
const queryElementAll = (rootElement, selector) => {
|
const queryElementAll = (rootElement, selector) => {
|
||||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
if (!selector.includes(">>") && !selector.includes(":>>")) {
|
||||||
return rootElement.querySelectorAll(selector);
|
if (isXPathSelector(selector)) {
|
||||||
|
return evaluateXPathAll(rootElement, selector);
|
||||||
|
} else {
|
||||||
|
return Array.from(rootElement.querySelectorAll(selector));
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const parts = selector.split(/(?:>>|:>>)/).map(part => part.trim());
|
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
|
||||||
let currentElements = [rootElement];
|
let currentElements = [rootElement];
|
||||||
|
|
||||||
for (const part of parts) {
|
for (const part of parts) {
|
||||||
@@ -486,30 +595,64 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
for (const element of currentElements) {
|
for (const element of currentElements) {
|
||||||
// Handle iframe and frame traversal
|
// Handle iframe and frame traversal
|
||||||
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
if (element.tagName === "IFRAME" || element.tagName === "FRAME") {
|
||||||
try {
|
try {
|
||||||
const frameDoc = element.contentDocument || element.contentWindow.document;
|
const frameDoc =
|
||||||
nextElements.push(...frameDoc.querySelectorAll(part));
|
element.contentDocument || element.contentWindow.document;
|
||||||
|
if (frameDoc) {
|
||||||
|
if (isXPathSelector(part)) {
|
||||||
|
nextElements.push(...evaluateXPathAll(frameDoc, part));
|
||||||
|
} else {
|
||||||
|
nextElements.push(
|
||||||
|
...Array.from(frameDoc.querySelectorAll(part))
|
||||||
|
);
|
||||||
|
}
|
||||||
|
}
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
console.warn(
|
||||||
|
`Cannot access ${element.tagName.toLowerCase()} content:`,
|
||||||
|
e
|
||||||
|
);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
// Regular DOM elements
|
// Regular DOM elements
|
||||||
if (element.querySelectorAll) {
|
if (element.querySelectorAll) {
|
||||||
nextElements.push(...element.querySelectorAll(part));
|
if (isXPathSelector(part)) {
|
||||||
|
nextElements.push(...evaluateXPathAll(element, part));
|
||||||
|
} else {
|
||||||
|
nextElements.push(
|
||||||
|
...Array.from(element.querySelectorAll(part))
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Shadow DOM elements
|
// Shadow DOM elements
|
||||||
if (element.shadowRoot) {
|
if (element.shadowRoot) {
|
||||||
nextElements.push(...element.shadowRoot.querySelectorAll(part));
|
if (isXPathSelector(part)) {
|
||||||
|
nextElements.push(
|
||||||
|
...evaluateXPathAll(element.shadowRoot, part)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
nextElements.push(
|
||||||
|
...Array.from(element.shadowRoot.querySelectorAll(part))
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Check children's shadow roots
|
// Check children's shadow roots
|
||||||
const children = Array.from(element.children || []);
|
const children = Array.from(element.children || []);
|
||||||
for (const child of children) {
|
for (const child of children) {
|
||||||
if (child.shadowRoot) {
|
if (child.shadowRoot) {
|
||||||
nextElements.push(...child.shadowRoot.querySelectorAll(part));
|
if (isXPathSelector(part)) {
|
||||||
|
nextElements.push(
|
||||||
|
...evaluateXPathAll(child.shadowRoot, part)
|
||||||
|
);
|
||||||
|
} else {
|
||||||
|
nextElements.push(
|
||||||
|
...Array.from(child.shadowRoot.querySelectorAll(part))
|
||||||
|
);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -522,11 +665,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Enhanced value extraction with context awareness
|
// Enhanced value extraction with context awareness
|
||||||
function extractValue(element, attribute) {
|
const extractValue = (element, attribute) => {
|
||||||
if (!element) return null;
|
if (!element) return null;
|
||||||
|
|
||||||
// Get context-aware base URL
|
// Get context-aware base URL
|
||||||
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
|
const baseURL =
|
||||||
|
element.ownerDocument?.location?.href || window.location.origin;
|
||||||
|
|
||||||
// Check shadow root first
|
// Check shadow root first
|
||||||
if (element.shadowRoot) {
|
if (element.shadowRoot) {
|
||||||
@@ -536,15 +680,37 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (attribute === 'innerText') {
|
if (attribute === "innerText") {
|
||||||
return element.innerText.trim();
|
// First try standard innerText/textContent
|
||||||
} else if (attribute === 'innerHTML') {
|
let textContent =
|
||||||
return element.innerHTML.trim();
|
element.innerText?.trim() || element.textContent?.trim();
|
||||||
} else if (attribute === 'src' || attribute === 'href') {
|
|
||||||
if (attribute === 'href' && element.tagName !== 'A') {
|
// If empty, check for common data attributes that might contain the text
|
||||||
|
if (!textContent) {
|
||||||
|
const dataAttributes = [
|
||||||
|
"data-600",
|
||||||
|
"data-text",
|
||||||
|
"data-label",
|
||||||
|
"data-value",
|
||||||
|
"data-content",
|
||||||
|
];
|
||||||
|
for (const attr of dataAttributes) {
|
||||||
|
const dataValue = element.getAttribute(attr);
|
||||||
|
if (dataValue && dataValue.trim()) {
|
||||||
|
textContent = dataValue.trim();
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
return textContent || null;
|
||||||
|
} else if (attribute === "innerHTML") {
|
||||||
|
return element.innerHTML?.trim() || null;
|
||||||
|
} else if (attribute === "src" || attribute === "href") {
|
||||||
|
if (attribute === "href" && element.tagName !== "A") {
|
||||||
const parentElement = element.parentElement;
|
const parentElement = element.parentElement;
|
||||||
if (parentElement && parentElement.tagName === 'A') {
|
if (parentElement && parentElement.tagName === "A") {
|
||||||
const parentHref = parentElement.getAttribute('href');
|
const parentHref = parentElement.getAttribute("href");
|
||||||
if (parentHref) {
|
if (parentHref) {
|
||||||
try {
|
try {
|
||||||
return new URL(parentHref, baseURL).href;
|
return new URL(parentHref, baseURL).href;
|
||||||
@@ -556,13 +722,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
|
|
||||||
const attrValue = element.getAttribute(attribute);
|
const attrValue = element.getAttribute(attribute);
|
||||||
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
|
||||||
|
|
||||||
if (!dataAttr || dataAttr.trim() === '') {
|
if (!dataAttr || dataAttr.trim() === "") {
|
||||||
if (attribute === 'src') {
|
if (attribute === "src") {
|
||||||
const style = window.getComputedStyle(element);
|
const style = window.getComputedStyle(element);
|
||||||
const bgImage = style.backgroundImage;
|
const bgImage = style.backgroundImage;
|
||||||
if (bgImage && bgImage !== 'none') {
|
if (bgImage && bgImage !== "none") {
|
||||||
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
||||||
return matches ? new URL(matches[1], baseURL).href : null;
|
return matches ? new URL(matches[1], baseURL).href : null;
|
||||||
}
|
}
|
||||||
@@ -573,15 +739,15 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
try {
|
try {
|
||||||
return new URL(dataAttr, baseURL).href;
|
return new URL(dataAttr, baseURL).href;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Error creating URL from', dataAttr, e);
|
console.warn("Error creating URL from", dataAttr, e);
|
||||||
return dataAttr; // Return the original value if URL construction fails
|
return dataAttr;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return element.getAttribute(attribute);
|
return element.getAttribute(attribute);
|
||||||
}
|
};
|
||||||
|
|
||||||
// Enhanced table ancestor finding with context support
|
// Enhanced table ancestor finding with context support
|
||||||
function findTableAncestor(element) {
|
const findTableAncestor = (element) => {
|
||||||
let currentElement = element;
|
let currentElement = element;
|
||||||
const MAX_DEPTH = 5;
|
const MAX_DEPTH = 5;
|
||||||
let depth = 0;
|
let depth = 0;
|
||||||
@@ -593,14 +759,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (currentElement.tagName === 'TD') {
|
if (currentElement.tagName === "TD") {
|
||||||
return { type: 'TD', element: currentElement };
|
return { type: "TD", element: currentElement };
|
||||||
} else if (currentElement.tagName === 'TR') {
|
} else if (currentElement.tagName === "TR") {
|
||||||
return { type: 'TR', element: currentElement };
|
return { type: "TR", element: currentElement };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle iframe and frame crossing
|
// Handle iframe and frame crossing
|
||||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
if (
|
||||||
|
currentElement.tagName === "IFRAME" ||
|
||||||
|
currentElement.tagName === "FRAME"
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
currentElement = currentElement.contentDocument.body;
|
currentElement = currentElement.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -612,26 +781,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
depth++;
|
depth++;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
}
|
};
|
||||||
|
|
||||||
// Helper function to get cell index
|
// Helper function to get cell index
|
||||||
function getCellIndex(td) {
|
const getCellIndex = (td) => {
|
||||||
if (td.getRootNode() instanceof ShadowRoot) {
|
if (td.getRootNode() instanceof ShadowRoot) {
|
||||||
const shadowRoot = td.getRootNode();
|
const shadowRoot = td.getRootNode();
|
||||||
const allCells = Array.from(shadowRoot.querySelectorAll('td'));
|
const allCells = Array.from(shadowRoot.querySelectorAll("td"));
|
||||||
return allCells.indexOf(td);
|
return allCells.indexOf(td);
|
||||||
}
|
}
|
||||||
|
|
||||||
let index = 0;
|
let index = 0;
|
||||||
let sibling = td;
|
let sibling = td;
|
||||||
while (sibling = sibling.previousElementSibling) {
|
while ((sibling = sibling.previousElementSibling)) {
|
||||||
index++;
|
index++;
|
||||||
}
|
}
|
||||||
return index;
|
return index;
|
||||||
}
|
};
|
||||||
|
|
||||||
// Helper function to check for TH elements
|
// Helper function to check for TH elements
|
||||||
function hasThElement(row, tableFields) {
|
const hasThElement = (row, tableFields) => {
|
||||||
for (const [_, { selector }] of Object.entries(tableFields)) {
|
for (const [_, { selector }] of Object.entries(tableFields)) {
|
||||||
const element = queryElement(row, selector);
|
const element = queryElement(row, selector);
|
||||||
if (element) {
|
if (element) {
|
||||||
@@ -642,9 +811,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (current.tagName === 'TH') return true;
|
if (current.tagName === "TH") return true;
|
||||||
|
|
||||||
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
if (current.tagName === "IFRAME" || current.tagName === "FRAME") {
|
||||||
try {
|
try {
|
||||||
current = current.contentDocument.body;
|
current = current.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -657,35 +826,35 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
return false;
|
||||||
}
|
};
|
||||||
|
|
||||||
// Helper function to filter rows
|
// Helper function to filter rows
|
||||||
function filterRowsBasedOnTag(rows, tableFields) {
|
const filterRowsBasedOnTag = (rows, tableFields) => {
|
||||||
for (const row of rows) {
|
for (const row of rows) {
|
||||||
if (hasThElement(row, tableFields)) {
|
if (hasThElement(row, tableFields)) {
|
||||||
return rows;
|
return rows;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// Include shadow DOM in TH search
|
return rows.filter((row) => {
|
||||||
return rows.filter(row => {
|
const directTH = row.getElementsByTagName("TH").length === 0;
|
||||||
const directTH = row.getElementsByTagName('TH').length === 0;
|
const shadowTH = row.shadowRoot
|
||||||
const shadowTH = row.shadowRoot ?
|
? row.shadowRoot.querySelector("th") === null
|
||||||
row.shadowRoot.querySelector('th') === null : true;
|
: true;
|
||||||
return directTH && shadowTH;
|
return directTH && shadowTH;
|
||||||
});
|
});
|
||||||
}
|
};
|
||||||
|
|
||||||
// Class similarity comparison functions
|
// Class similarity comparison functions
|
||||||
function calculateClassSimilarity(classList1, classList2) {
|
const calculateClassSimilarity = (classList1, classList2) => {
|
||||||
const set1 = new Set(classList1);
|
const set1 = new Set(classList1);
|
||||||
const set2 = new Set(classList2);
|
const set2 = new Set(classList2);
|
||||||
const intersection = new Set([...set1].filter(x => set2.has(x)));
|
const intersection = new Set([...set1].filter((x) => set2.has(x)));
|
||||||
const union = new Set([...set1, ...set2]);
|
const union = new Set([...set1, ...set2]);
|
||||||
return intersection.size / union.size;
|
return intersection.size / union.size;
|
||||||
}
|
};
|
||||||
|
|
||||||
// Enhanced similar elements finding with context support
|
// Enhanced similar elements finding with context support
|
||||||
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
|
const findSimilarElements = (baseElement, similarityThreshold = 0.7) => {
|
||||||
const baseClasses = Array.from(baseElement.classList);
|
const baseClasses = Array.from(baseElement.classList);
|
||||||
if (baseClasses.length === 0) return [];
|
if (baseClasses.length === 0) return [];
|
||||||
|
|
||||||
@@ -697,25 +866,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
// Get elements from shadow DOM
|
// Get elements from shadow DOM
|
||||||
if (baseElement.getRootNode() instanceof ShadowRoot) {
|
if (baseElement.getRootNode() instanceof ShadowRoot) {
|
||||||
const shadowHost = baseElement.getRootNode().host;
|
const shadowHost = baseElement.getRootNode().host;
|
||||||
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
allElements.push(
|
||||||
|
...shadowHost.getElementsByTagName(baseElement.tagName)
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get elements from iframes and frames
|
// Get elements from iframes and frames
|
||||||
const frames = [
|
const frames = [
|
||||||
...Array.from(document.getElementsByTagName('iframe')),
|
...Array.from(document.getElementsByTagName("iframe")),
|
||||||
...Array.from(document.getElementsByTagName('frame'))
|
...Array.from(document.getElementsByTagName("frame")),
|
||||||
];
|
];
|
||||||
|
|
||||||
for (const frame of frames) {
|
for (const frame of frames) {
|
||||||
try {
|
try {
|
||||||
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
const frameDoc =
|
||||||
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
frame.contentDocument || frame.contentWindow.document;
|
||||||
|
allElements.push(
|
||||||
|
...frameDoc.getElementsByTagName(baseElement.tagName)
|
||||||
|
);
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
console.warn(
|
||||||
|
`Cannot access ${frame.tagName.toLowerCase()} content:`,
|
||||||
|
e
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return allElements.filter(element => {
|
return allElements.filter((element) => {
|
||||||
if (element === baseElement) return false;
|
if (element === baseElement) return false;
|
||||||
const similarity = calculateClassSimilarity(
|
const similarity = calculateClassSimilarity(
|
||||||
baseClasses,
|
baseClasses,
|
||||||
@@ -723,45 +900,92 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
);
|
);
|
||||||
return similarity >= similarityThreshold;
|
return similarity >= similarityThreshold;
|
||||||
});
|
});
|
||||||
}
|
};
|
||||||
|
|
||||||
function tryFallbackSelector(rootElement, originalSelector) {
|
const tryFallbackSelector = (rootElement, originalSelector) => {
|
||||||
let element = queryElement(rootElement, originalSelector);
|
let element = queryElement(rootElement, originalSelector);
|
||||||
|
|
||||||
if (!element && originalSelector.includes('nth-child')) {
|
if (!element && originalSelector.includes("nth-child")) {
|
||||||
const match = originalSelector.match(/nth-child\((\d+)\)/);
|
const match = originalSelector.match(/nth-child\((\d+)\)/);
|
||||||
if (match) {
|
if (match) {
|
||||||
const position = parseInt(match[1], 10);
|
const position = parseInt(match[1], 10);
|
||||||
|
|
||||||
for (let i = position - 1; i >= 1; i--) {
|
for (let i = position - 1; i >= 1; i--) {
|
||||||
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
|
const fallbackSelector = originalSelector.replace(
|
||||||
|
/nth-child\(\d+\)/,
|
||||||
|
`nth-child(${i})`
|
||||||
|
);
|
||||||
element = queryElement(rootElement, fallbackSelector);
|
element = queryElement(rootElement, fallbackSelector);
|
||||||
if (element) break;
|
if (element) break;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!element) {
|
if (!element) {
|
||||||
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
|
const baseSelector = originalSelector.replace(
|
||||||
|
/\:nth-child\(\d+\)/,
|
||||||
|
""
|
||||||
|
);
|
||||||
element = queryElement(rootElement, baseSelector);
|
element = queryElement(rootElement, baseSelector);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
return element;
|
return element;
|
||||||
}
|
};
|
||||||
|
|
||||||
|
/**
 * Build an XPath that targets a field inside one specific list container.
 *
 * The container is addressed as `(listSelector)[containerIndex]`, i.e. the
 * N-th node (1-based, in document order) matched by the list selector.
 *
 * Two cases are handled:
 *   1. The child selector starts with the list selector as a whole path
 *      prefix: the prefix is swapped for the indexed container and the rest
 *      of the child path is kept verbatim.
 *   2. Otherwise the child selector is appended to the indexed container as a
 *      path relative to it. A leading "//" is narrowed to "/" (the behaviour
 *      this helper has always had for that case), a leading "./" is dropped
 *      so ".//x" becomes "//x", and anything else is joined with "/".
 *
 * @param {string} childSelector - XPath of the field to extract.
 * @param {string} listSelector - XPath matching every list container.
 * @param {number} containerIndex - 1-based position of the container.
 * @returns {string} XPath scoped to the requested container.
 */
const createIndexedXPath = (childSelector, listSelector, containerIndex) => {
  const indexedContainer = `(${listSelector})[${containerIndex}]`;

  if (childSelector.startsWith(listSelector)) {
    const remainder = childSelector.slice(listSelector.length);
    // Only accept the prefix when it ends on a step boundary. Without this,
    // "//li" would match inside "//link/x" and yield "(//li)[1]nk/x".
    const continuesName = /^[\w.:-]/.test(remainder);
    if (!continuesName) {
      return `${indexedContainer}${remainder}`;
    }
  }

  // The child does not extend the list path: anchor it under the container.
  let relativePath;
  if (childSelector.startsWith("//")) {
    // Legacy behaviour: a leading descendant step becomes a child step.
    relativePath = childSelector.slice(1);
  } else if (childSelector.startsWith("/")) {
    relativePath = childSelector;
  } else if (childSelector.startsWith("./")) {
    // "./x" -> "/x", ".//x" -> "//x"; appending the "." itself would produce
    // "(list)[i]./x", which is not valid XPath.
    relativePath = childSelector.slice(1);
  } else {
    relativePath = `/${childSelector}`;
  }

  return `${indexedContainer}${relativePath}`;
};
|
||||||
|
|
||||||
|
// Main scraping logic with unified support for both CSS and XPath
|
||||||
|
console.log("🚀 Starting unified list data extraction");
|
||||||
|
console.log("List Selector:", listSelector);
|
||||||
|
console.log("Fields:", fields);
|
||||||
|
|
||||||
// Main scraping logic with context support
|
|
||||||
let containers = queryElementAll(document, listSelector);
|
let containers = queryElementAll(document, listSelector);
|
||||||
containers = Array.from(containers);
|
containers = Array.from(containers);
|
||||||
|
|
||||||
if (containers.length === 0) return [];
|
if (containers.length === 0) {
|
||||||
|
console.warn("❌ No containers found for listSelector:", listSelector);
|
||||||
|
return [];
|
||||||
|
}
|
||||||
|
|
||||||
if (limit > 1 && containers.length === 1) {
|
console.log(`📦 Found ${containers.length} list containers`);
|
||||||
|
|
||||||
|
// For CSS selectors, try to find similar containers if needed
|
||||||
|
if (
|
||||||
|
!isXPathSelector(listSelector) &&
|
||||||
|
limit > 1 &&
|
||||||
|
containers.length === 1
|
||||||
|
) {
|
||||||
const baseContainer = containers[0];
|
const baseContainer = containers[0];
|
||||||
const similarContainers = findSimilarElements(baseContainer);
|
const similarContainers = findSimilarElements(baseContainer);
|
||||||
|
|
||||||
if (similarContainers.length > 0) {
|
if (similarContainers.length > 0) {
|
||||||
const newContainers = similarContainers.filter(container =>
|
const newContainers = similarContainers.filter(
|
||||||
!container.matches(listSelector)
|
(container) => !container.matches(listSelector)
|
||||||
);
|
);
|
||||||
containers = [...containers, ...newContainers];
|
containers = [...containers, ...newContainers];
|
||||||
}
|
}
|
||||||
@@ -769,10 +993,60 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
const containerFields = containers.map(() => ({
|
const containerFields = containers.map(() => ({
|
||||||
tableFields: {},
|
tableFields: {},
|
||||||
nonTableFields: {}
|
nonTableFields: {},
|
||||||
}));
|
}));
|
||||||
|
|
||||||
// Classify fields
|
// For XPath selectors, use the new approach
|
||||||
|
if (isXPathSelector(listSelector)) {
|
||||||
|
const extractedData = [];
|
||||||
|
const containersToProcess = Math.min(containers.length, limit);
|
||||||
|
|
||||||
|
for (
|
||||||
|
let containerIndex = 0;
|
||||||
|
containerIndex < containersToProcess;
|
||||||
|
containerIndex++
|
||||||
|
) {
|
||||||
|
const record = {};
|
||||||
|
|
||||||
|
for (const [label, field] of Object.entries(fields)) {
|
||||||
|
let element = null;
|
||||||
|
|
||||||
|
if (isXPathSelector(field.selector)) {
|
||||||
|
// Create indexed absolute XPath
|
||||||
|
const indexedSelector = createIndexedXPath(
|
||||||
|
field.selector,
|
||||||
|
listSelector,
|
||||||
|
containerIndex + 1
|
||||||
|
);
|
||||||
|
element = evaluateXPath(document, indexedSelector);
|
||||||
|
} else {
|
||||||
|
// Fallback for CSS selectors within XPath containers
|
||||||
|
const container = containers[containerIndex];
|
||||||
|
element = queryElement(container, field.selector);
|
||||||
|
}
|
||||||
|
|
||||||
|
if (element) {
|
||||||
|
const value = extractValue(element, field.attribute);
|
||||||
|
if (value !== null && value !== "") {
|
||||||
|
record[label] = value;
|
||||||
|
} else {
|
||||||
|
record[label] = "";
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
record[label] = "";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
if (Object.values(record).some((value) => value !== "")) {
|
||||||
|
extractedData.push(record);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
console.log(`📊 Total records extracted: ${extractedData.length}`);
|
||||||
|
return extractedData;
|
||||||
|
}
|
||||||
|
|
||||||
|
// For CSS selectors, use the original table-aware approach
|
||||||
containers.forEach((container, containerIndex) => {
|
containers.forEach((container, containerIndex) => {
|
||||||
for (const [label, field] of Object.entries(fields)) {
|
for (const [label, field] of Object.entries(fields)) {
|
||||||
const sampleElement = queryElement(container, field.selector);
|
const sampleElement = queryElement(container, field.selector);
|
||||||
@@ -783,7 +1057,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
containerFields[containerIndex].tableFields[label] = {
|
containerFields[containerIndex].tableFields[label] = {
|
||||||
...field,
|
...field,
|
||||||
tableContext: ancestor.type,
|
tableContext: ancestor.type,
|
||||||
cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1
|
cellIndex:
|
||||||
|
ancestor.type === "TD" ? getCellIndex(ancestor.element) : -1,
|
||||||
};
|
};
|
||||||
} else {
|
} else {
|
||||||
containerFields[containerIndex].nonTableFields[label] = field;
|
containerFields[containerIndex].nonTableFields[label] = field;
|
||||||
@@ -798,7 +1073,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const nonTableData = [];
|
const nonTableData = [];
|
||||||
|
|
||||||
// Process table data with support for iframes, frames, and shadow DOM
|
// Process table data with support for iframes, frames, and shadow DOM
|
||||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
for (
|
||||||
|
let containerIndex = 0;
|
||||||
|
containerIndex < containers.length;
|
||||||
|
containerIndex++
|
||||||
|
) {
|
||||||
const container = containers[containerIndex];
|
const container = containers[containerIndex];
|
||||||
const { tableFields } = containerFields[containerIndex];
|
const { tableFields } = containerFields[containerIndex];
|
||||||
|
|
||||||
@@ -808,13 +1087,20 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
let tableContext = firstElement;
|
let tableContext = firstElement;
|
||||||
|
|
||||||
// Find table context including iframe, frame and shadow DOM
|
// Find table context including iframe, frame and shadow DOM
|
||||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
while (
|
||||||
|
tableContext &&
|
||||||
|
tableContext.tagName !== "TABLE" &&
|
||||||
|
tableContext !== container
|
||||||
|
) {
|
||||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
||||||
tableContext = tableContext.getRootNode().host;
|
tableContext = tableContext.getRootNode().host;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
if (
|
||||||
|
tableContext.tagName === "IFRAME" ||
|
||||||
|
tableContext.tagName === "FRAME"
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
tableContext = tableContext.contentDocument.body;
|
tableContext = tableContext.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -830,30 +1116,45 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const rows = [];
|
const rows = [];
|
||||||
|
|
||||||
// Get rows from regular DOM
|
// Get rows from regular DOM
|
||||||
rows.push(...tableContext.getElementsByTagName('TR'));
|
rows.push(...tableContext.getElementsByTagName("TR"));
|
||||||
|
|
||||||
// Get rows from shadow DOM
|
// Get rows from shadow DOM
|
||||||
if (tableContext.shadowRoot) {
|
if (tableContext.shadowRoot) {
|
||||||
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
rows.push(...tableContext.shadowRoot.getElementsByTagName("TR"));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get rows from iframes and frames
|
// Get rows from iframes and frames
|
||||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
if (
|
||||||
|
tableContext.tagName === "IFRAME" ||
|
||||||
|
tableContext.tagName === "FRAME"
|
||||||
|
) {
|
||||||
try {
|
try {
|
||||||
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
const frameDoc =
|
||||||
rows.push(...frameDoc.getElementsByTagName('TR'));
|
tableContext.contentDocument ||
|
||||||
|
tableContext.contentWindow.document;
|
||||||
|
rows.push(...frameDoc.getElementsByTagName("TR"));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
console.warn(
|
||||||
|
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
|
||||||
|
e
|
||||||
|
);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
||||||
|
|
||||||
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
for (
|
||||||
|
let rowIndex = 0;
|
||||||
|
rowIndex < Math.min(processedRows.length, limit);
|
||||||
|
rowIndex++
|
||||||
|
) {
|
||||||
const record = {};
|
const record = {};
|
||||||
const currentRow = processedRows[rowIndex];
|
const currentRow = processedRows[rowIndex];
|
||||||
|
|
||||||
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
|
for (const [
|
||||||
|
label,
|
||||||
|
{ selector, attribute, cellIndex },
|
||||||
|
] of Object.entries(tableFields)) {
|
||||||
let element = null;
|
let element = null;
|
||||||
|
|
||||||
if (cellIndex >= 0) {
|
if (cellIndex >= 0) {
|
||||||
@@ -871,18 +1172,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
if (td) {
|
if (td) {
|
||||||
element = queryElement(td, selector);
|
element = queryElement(td, selector);
|
||||||
|
|
||||||
if (!element && selector.split(/(?:>>|:>>)/).pop().includes('td:nth-child')) {
|
if (
|
||||||
|
!element &&
|
||||||
|
selector
|
||||||
|
.split(/(?:>>|:>>)/)
|
||||||
|
.pop()
|
||||||
|
.includes("td:nth-child")
|
||||||
|
) {
|
||||||
element = td;
|
element = td;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!element) {
|
if (!element) {
|
||||||
const tagOnlySelector = selector.split('.')[0];
|
const tagOnlySelector = selector.split(".")[0];
|
||||||
element = queryElement(td, tagOnlySelector);
|
element = queryElement(td, tagOnlySelector);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!element) {
|
if (!element) {
|
||||||
let currentElement = td;
|
let currentElement = td;
|
||||||
while (currentElement && currentElement.children.length > 0) {
|
while (
|
||||||
|
currentElement &&
|
||||||
|
currentElement.children.length > 0
|
||||||
|
) {
|
||||||
let foundContentChild = false;
|
let foundContentChild = false;
|
||||||
for (const child of currentElement.children) {
|
for (const child of currentElement.children) {
|
||||||
if (extractValue(child, attribute)) {
|
if (extractValue(child, attribute)) {
|
||||||
@@ -914,7 +1224,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Process non-table data with all contexts support
|
// Process non-table data with all contexts support
|
||||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
for (
|
||||||
|
let containerIndex = 0;
|
||||||
|
containerIndex < containers.length;
|
||||||
|
containerIndex++
|
||||||
|
) {
|
||||||
if (nonTableData.length >= limit) break;
|
if (nonTableData.length >= limit) break;
|
||||||
|
|
||||||
const container = containers[containerIndex];
|
const container = containers[containerIndex];
|
||||||
@@ -923,7 +1237,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
if (Object.keys(nonTableFields).length > 0) {
|
if (Object.keys(nonTableFields).length > 0) {
|
||||||
const record = {};
|
const record = {};
|
||||||
|
|
||||||
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
for (const [label, { selector, attribute }] of Object.entries(
|
||||||
|
nonTableFields
|
||||||
|
)) {
|
||||||
// Get the last part of the selector after any context delimiter
|
// Get the last part of the selector after any context delimiter
|
||||||
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
||||||
const element = tryFallbackSelector(container, relativeSelector);
|
const element = tryFallbackSelector(container, relativeSelector);
|
||||||
@@ -941,6 +1257,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
// Merge and limit the results
|
// Merge and limit the results
|
||||||
const scrapedData = [...tableData, ...nonTableData];
|
const scrapedData = [...tableData, ...nonTableData];
|
||||||
|
console.log(`📊 Total records extracted: ${scrapedData.length}`);
|
||||||
|
|
||||||
return scrapedData;
|
return scrapedData;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user