feat: add deep nested iframe scraping support for capture text

This commit is contained in:
RohitR311
2025-01-05 01:30:11 +05:30
parent 934ffbb0f5
commit 9c8a980c43

View File

@@ -207,69 +207,82 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
/**
 * Resolves `config.selector` to the matching elements, optionally crossing
 * iframe or shadow-DOM boundaries.
 *
 * Supported selector forms:
 *  - plain CSS selector: queried against the top-level `document`;
 *  - `a :>> b :>> c`: iframe chain. Every part except the last must match
 *    `<iframe>` elements; the next part is queried inside each iframe's
 *    document. The last part may match any element.
 *  - `a >> b >> c`: shadow-DOM chain. Each part is queried inside the shadow
 *    root of the previous matches, or inside the match itself when it exposes
 *    no shadow root.
 *
 * A selector containing `:>>` is treated purely as an iframe chain: it is
 * split on `:>>` only, so a `>>` left inside one of its parts is handed to
 * `querySelectorAll` unchanged.
 *
 * @param {{selector: string}} config - Holder of the selector to resolve.
 * @returns {Element[]} Matching elements; empty when nothing matches.
 */
function findAllElements(config) {
  const { selector } = config;

  // ':>>' contains '>>', so a single check covers both delimiters.
  if (!selector.includes('>>')) {
    return Array.from(document.querySelectorAll(selector));
  }

  const splitOn = (delimiter) => selector.split(delimiter).map((s) => s.trim());

  // Walks an iframe chain, one selector part per nesting level.
  function queryThroughIframes(parts) {
    let contexts = [document];

    for (let depth = 0; depth < parts.length; depth++) {
      const part = parts[depth];
      const isLast = depth === parts.length - 1;
      const matches = [];

      for (const context of contexts) {
        try {
          // Depth 0 is the top-level document; deeper contexts are iframes,
          // so query their inner document and never the iframe element itself.
          const root = depth === 0
            ? context
            : (context.contentDocument ?? context.contentWindow?.document);
          if (!root) continue;

          const found = Array.from(root.querySelectorAll(part));
          // Only iframes can be descended into on the next iteration.
          matches.push(...(isLast ? found : found.filter((el) => el.tagName === 'IFRAME')));
        } catch (error) {
          console.warn('Cannot access iframe content:', error, {
            part,
            element: context,
            index: depth
          });
        }
      }

      if (matches.length === 0) {
        console.warn('No elements found for part:', part, 'at depth:', depth);
        return [];
      }
      contexts = matches;
    }

    return contexts;
  }

  // Walks a shadow-DOM chain, one selector part per nesting level.
  function queryThroughShadowRoots(parts) {
    let scopes = [document];

    parts.forEach((part, depth) => {
      const isLast = depth === parts.length - 1;
      const next = [];

      for (const scope of scopes) {
        for (const el of Array.from(scope.querySelectorAll(part))) {
          // Intermediate matches become the scope for the next part: their
          // shadow root when exposed, otherwise the element itself.
          // Final matches are always returned as elements, even when they
          // host a shadow root of their own.
          next.push(isLast ? el : (el.shadowRoot ?? el));
        }
      }

      scopes = next;
    });

    return scopes;
  }

  if (selector.includes(':>>')) {
    return queryThroughIframes(splitOn(':>>'));
  }
  return queryThroughShadowRoots(splitOn('>>'));
}
// Modified to handle iframe context for URL resolution // Modified to handle iframe context for URL resolution