feat: add logic to scrape multiple nested shadow dom elements

This commit is contained in:
RohitR311
2024-12-31 01:52:38 +05:30
parent b757d9c4f8
commit 4b4074b70d

View File

@@ -204,69 +204,68 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
/**
 * Collects every element matched by a list selector.
 *
 * A plain selector is passed straight to `document.querySelectorAll`. When
 * `config.shadow` is truthy and the selector contains `>>`, each
 * `>>`-separated part is resolved inside the open shadow roots of the
 * elements matched by the previous part, so any number of nested shadow DOM
 * levels can be crossed.
 *
 * @param {{selector: string, shadow?: boolean}} config - Selector settings.
 * @returns {Element[]} Matches for the final selector part, or an empty array
 *   as soon as one level yields no matches.
 */
function findAllElements(config) {
  if (!config.shadow || !config.selector.includes('>>')) {
    return Array.from(document.querySelectorAll(config.selector));
  }

  const parts = config.selector.split('>>').map((part) => part.trim());
  // The first part is looked up in the document itself; every later part is
  // looked up inside the shadow roots of the previous level's matches.
  let scopes = [document];

  for (let depth = 0; depth < parts.length; depth++) {
    const matches = [];

    for (const scope of scopes) {
      let root = scope;
      if (depth > 0) {
        // Hosts without an open shadow root cannot be descended into.
        const shadowRoot = scope.shadowRoot;
        if (!shadowRoot || shadowRoot.mode !== 'open') continue;
        root = shadowRoot;
      }

      // Append one at a time: `push(...targets)` throws a RangeError once a
      // level matches more elements than the engine accepts as call arguments.
      for (const match of root.querySelectorAll(parts[depth])) {
        matches.push(match);
      }
    }

    if (matches.length === 0) return [];
    scopes = matches;
  }

  return scopes;
}
/**
 * Extracts a single value from an element.
 *
 * - `href` / `src`: the attribute resolved to an absolute URL, or `null` when
 *   the attribute is missing or empty.
 * - `innerText` / `textContent`: the trimmed property value.
 * - anything else: the named attribute, falling back to the trimmed
 *   `innerText` when the attribute is missing or empty.
 *
 * @param {Element|null|undefined} element - Element to read from.
 * @param {string} attribute - Attribute or text property to extract.
 * @returns {string|null|undefined} The extracted value; `null` when `element`
 *   is falsy.
 */
function getElementValue(element, attribute) {
  if (!element) return null;

  switch (attribute) {
    case 'href':
    case 'src': {
      const rawUrl = element.getAttribute(attribute);
      if (!rawUrl) return null;
      try {
        // Resolve against the document base URL rather than the bare origin,
        // so path-relative links ("page2.html", "../img/a.png") keep the
        // current directory, matching how the browser itself resolves them.
        return new URL(rawUrl, document.baseURI).href;
      } catch {
        // A single malformed URL should not abort the whole scrape; hand back
        // the attribute exactly as written instead.
        return rawUrl;
      }
    }
    case 'innerText':
      return element.innerText?.trim();
    case 'textContent':
      return element.textContent?.trim();
    default:
      return element.getAttribute(attribute) || element.innerText?.trim();
  }
}
// Get the seed key based on the maximum number of elements found // Get the seed key based on the maximum number of elements found
function getSeedKey(listObj) { function getSeedKey(listObj) {
@@ -280,26 +279,26 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
/**
 * Finds the minimal bounding element (MBE) for each seed element.
 *
 * Starting from a seed, the search climbs to the parent for as long as that
 * parent contains exactly one of the seed elements, and returns the node where
 * the climb stops.
 *
 * @param {Node[]} elements - Seed elements.
 * @returns {Node[]} One bounding node per seed, in the same order.
 */
function getMBEs(elements) {
  // True when the parent of `node` contains exactly one of the seed elements.
  // Defined once rather than per seed, and stops counting at the second hit.
  const isUniqueChild = (node) => {
    const parent = node.parentNode;
    if (!parent) return false;

    let contained = 0;
    for (const seed of elements) {
      if (!parent.contains(seed)) continue;
      contained += 1;
      if (contained > 1) return false;
    }
    return contained === 1;
  };

  return elements.map((element) => {
    let candidate = element;
    while (candidate && isUniqueChild(candidate)) {
      candidate = candidate.parentNode;
    }
    return candidate;
  });
}
// Main scraping logic // First try the MBE approach
const seedName = getSeedKey(lists); const seedName = getSeedKey(lists);
const seedElements = findAllElements(lists[seedName]); const seedElements = findAllElements(lists[seedName]);
const MBEs = getMBEs(seedElements); const MBEs = getMBEs(seedElements);
return MBEs.map((mbe) => omap( const mbeResults = MBEs.map((mbe) => omap(
lists, lists,
(config) => { (config) => {
const elem = findAllElements(config) const elem = findAllElements(config)
@@ -309,6 +308,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}, },
(key) => key (key) => key
)) || []; )) || [];
// If MBE approach didn't find all elements, try independent scraping
if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
// Fall back to independent scraping
const results = [];
const foundElements = new Map();
// Find all elements for each selector
Object.entries(lists).forEach(([key, config]) => {
const elements = findAllElements(config);
foundElements.set(key, elements);
});
// Create result objects for each found element
foundElements.forEach((elements, key) => {
elements.forEach((element, index) => {
if (!results[index]) {
results[index] = {};
}
results[index][key] = getElementValue(element, lists[key].attribute);
});
});
return results.filter(result => Object.keys(result).length > 0);
}
return mbeResults;
}; };
/** /**