feat: add deep nested iframe scraping support for capture text

This commit is contained in:
RohitR311
2025-01-05 01:30:11 +05:30
parent 934ffbb0f5
commit 9c8a980c43

View File

@@ -207,69 +207,82 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
/**
 * Resolve a scraper selector to the list of matching elements.
 *
 * Three selector forms are supported:
 *  - a plain CSS selector, queried against the top-level document;
 *  - `a :>> b :>> c` — iframe traversal: every part except the last must match
 *    `<iframe>` elements, and the next part is queried inside each frame's
 *    document (any nesting depth);
 *  - `a >> b >> c` — shadow DOM traversal: when a matched element hosts a
 *    shadow root, the next part is queried inside that shadow root.
 *
 * A selector that mixes both delimiters is handled by the iframe branch only;
 * a part that still contains `>>` is passed to `querySelectorAll` unchanged.
 *
 * @param {{selector: string}} config - Field configuration; only `selector` is read.
 * @returns {Array} Matching elements, or an empty array when any step of an
 *   iframe traversal finds nothing.
 */
function findAllElements(config) {
  const { selector } = config;

  // ':>>' itself contains '>>', so a single check covers both delimiters.
  if (!selector.includes('>>')) {
    return Array.from(document.querySelectorAll(selector));
  }

  // Iframe traversal takes precedence when ':>>' is present.
  if (selector.includes(':>>')) {
    const parts = selector.split(':>>').map((s) => s.trim());
    let currentElements = [document];

    for (let i = 0; i < parts.length; i++) {
      const part = parts[i];
      const isLast = i === parts.length - 1;
      const nextElements = [];

      for (const element of currentElements) {
        try {
          // Depth 0 queries the top-level document directly; deeper levels
          // hold <iframe> elements whose inner document must be resolved.
          const doc = i === 0
            ? element
            : element.contentDocument || element.contentWindow?.document;
          if (!doc) continue;

          const found = Array.from(doc.querySelectorAll(part));

          // Intermediate parts may only hand iframes to the next level;
          // the final part keeps every match.
          nextElements.push(
            ...(isLast ? found : found.filter((el) => el.tagName === 'IFRAME'))
          );
        } catch (error) {
          // Best-effort: a frame that cannot be read (or a part that is not a
          // valid selector) is reported and skipped, not fatal.
          console.warn('Cannot access iframe content:', error, {
            part,
            element,
            index: i
          });
        }
      }

      if (nextElements.length === 0) {
        console.warn('No elements found for part:', part, 'at depth:', i);
        return [];
      }
      currentElements = nextElements;
    }

    return currentElements;
  }

  // Shadow DOM traversal: only '>>' delimiters remain at this point.
  const parts = selector.split('>>').map((s) => s.trim());
  let currentElements = [document];

  for (const part of parts) {
    const nextElements = [];
    for (const element of currentElements) {
      for (const foundEl of Array.from(element.querySelectorAll(part))) {
        // Descend into the shadow root when the match hosts one so the next
        // part is queried inside it; otherwise keep the element itself.
        nextElements.push(foundEl.shadowRoot ? foundEl.shadowRoot : foundEl);
      }
    }
    currentElements = nextElements;
  }

  // Shadow roots are traversal contexts, not results. NOTE: a match on the
  // final part that hosts a shadow root is therefore dropped from the output.
  return currentElements.filter((el) => !(el instanceof ShadowRoot));
}
// Modified to handle iframe context for URL resolution