feat: add deep nested iframe scraping support for capture text
This commit is contained in:
@@ -207,69 +207,82 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
function findAllElements(config) {
|
function findAllElements(config) {
|
||||||
// Regular DOM query if no special delimiters
|
// Regular DOM query if no special delimiters
|
||||||
if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
|
if (!config.selector.includes('>>') && !config.selector.includes(':>>')) {
|
||||||
return Array.from(document.querySelectorAll(config.selector));
|
return Array.from(document.querySelectorAll(config.selector));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Split by both types of delimiters
|
// First handle iframe traversal if present
|
||||||
const parts = config.selector.split(/(?:>>|:>>)/).map(s => s.trim());
|
if (config.selector.includes(':>>')) {
|
||||||
const delimiters = config.selector.match(/(?:>>|:>>)/g) || [];
|
const parts = config.selector.split(':>>').map(s => s.trim());
|
||||||
let currentElements = [document];
|
let currentElements = [document];
|
||||||
|
|
||||||
for (let i = 0; i < parts.length; i++) {
|
// Traverse through each part of the selector
|
||||||
const part = parts[i];
|
for (let i = 0; i < parts.length; i++) {
|
||||||
const nextElements = [];
|
const part = parts[i];
|
||||||
const isLast = i === parts.length - 1;
|
const nextElements = [];
|
||||||
const delimiter = delimiters[i] || '';
|
const isLast = i === parts.length - 1;
|
||||||
const isIframeTraversal = delimiter === ':>>';
|
|
||||||
|
for (const element of currentElements) {
|
||||||
for (const element of currentElements) {
|
try {
|
||||||
try {
|
// For document or iframe document
|
||||||
let targets;
|
const doc = element.contentDocument || element || element.contentWindow?.document;
|
||||||
|
if (!doc) continue;
|
||||||
if (i === 0) {
|
|
||||||
// First selector is queried from main document
|
// Query elements in current context
|
||||||
targets = Array.from(element.querySelectorAll(part))
|
const found = Array.from(doc.querySelectorAll(part));
|
||||||
.filter(el => {
|
|
||||||
if (isLast) return true;
|
if (isLast) {
|
||||||
// For iframe traversal, only include iframes
|
// If it's the last part, keep all matching elements
|
||||||
if (isIframeTraversal) return el.tagName === 'IFRAME';
|
nextElements.push(...found);
|
||||||
// For shadow DOM traversal, only include elements with shadow root
|
} else {
|
||||||
return el.shadowRoot && el.shadowRoot.mode === 'open';
|
// If not last, only keep iframes for next iteration
|
||||||
});
|
const iframes = found.filter(el => el.tagName === 'IFRAME');
|
||||||
} else {
|
nextElements.push(...iframes);
|
||||||
if (isIframeTraversal) {
|
}
|
||||||
// Handle iframe traversal
|
} catch (error) {
|
||||||
const iframeDocument = element.contentDocument || element.contentWindow?.document;
|
console.warn('Cannot access iframe content:', error, {
|
||||||
if (!iframeDocument) continue;
|
part,
|
||||||
|
element,
|
||||||
targets = Array.from(iframeDocument.querySelectorAll(part));
|
index: i
|
||||||
if (!isLast) {
|
});
|
||||||
targets = targets.filter(el => el.tagName === 'IFRAME');
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
// Handle shadow DOM traversal
|
|
||||||
const shadowRoot = element.shadowRoot;
|
|
||||||
if (!shadowRoot || shadowRoot.mode !== 'open') continue;
|
|
||||||
|
|
||||||
targets = Array.from(shadowRoot.querySelectorAll(part));
|
|
||||||
if (!isLast) {
|
|
||||||
targets = targets.filter(el => el.shadowRoot && el.shadowRoot.mode === 'open');
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
nextElements.push(...targets);
|
if (nextElements.length === 0) {
|
||||||
} catch (error) {
|
console.warn('No elements found for part:', part, 'at depth:', i);
|
||||||
console.warn('Cannot access content:', error);
|
return [];
|
||||||
continue;
|
}
|
||||||
}
|
currentElements = nextElements;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (nextElements.length === 0) return [];
|
return currentElements;
|
||||||
currentElements = nextElements;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
return currentElements;
|
// Handle shadow DOM traversal
|
||||||
|
if (config.selector.includes('>>')) {
|
||||||
|
const parts = config.selector.split('>>').map(s => s.trim());
|
||||||
|
let currentElements = [document];
|
||||||
|
|
||||||
|
for (const part of parts) {
|
||||||
|
const nextElements = [];
|
||||||
|
for (const element of currentElements) {
|
||||||
|
// Try regular DOM first
|
||||||
|
const found = Array.from(element.querySelectorAll(part));
|
||||||
|
|
||||||
|
// Then check shadow roots
|
||||||
|
for (const foundEl of found) {
|
||||||
|
if (foundEl.shadowRoot) {
|
||||||
|
nextElements.push(foundEl.shadowRoot);
|
||||||
|
} else {
|
||||||
|
nextElements.push(foundEl);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
currentElements = nextElements;
|
||||||
|
}
|
||||||
|
return currentElements.filter(el => !(el instanceof ShadowRoot));
|
||||||
|
}
|
||||||
|
|
||||||
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Modified to handle iframe context for URL resolution
|
// Modified to handle iframe context for URL resolution
|
||||||
|
|||||||
Reference in New Issue
Block a user