feat: add frame element support for scrapeList action

This commit is contained in:
Rohit
2025-03-04 09:14:27 +05:30
parent 9400ebe032
commit cb4dabfcf7

View File

@@ -423,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list * @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/ */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
// Enhanced query function to handle both iframe and shadow DOM // Enhanced query function to handle iframe, frame and shadow DOM
const queryElement = (rootElement, selector) => { const queryElement = (rootElement, selector) => {
if (!selector.includes('>>') && !selector.includes(':>>')) { if (!selector.includes('>>') && !selector.includes(':>>')) {
return rootElement.querySelector(selector); return rootElement.querySelector(selector);
@@ -435,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (let i = 0; i < parts.length; i++) { for (let i = 0; i < parts.length; i++) {
if (!currentElement) return null; if (!currentElement) return null;
// Handle iframe traversal // Handle iframe and frame traversal
if (currentElement.tagName === 'IFRAME') { if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
try { try {
const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document; const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
currentElement = iframeDoc.querySelector(parts[i]); currentElement = frameDoc.querySelector(parts[i]);
continue; continue;
} catch (e) { } catch (e) {
console.warn('Cannot access iframe content:', e); console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
return null; return null;
} }
} }
@@ -485,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const nextElements = []; const nextElements = [];
for (const element of currentElements) { for (const element of currentElements) {
// Handle iframe traversal // Handle iframe and frame traversal
if (element.tagName === 'IFRAME') { if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
try { try {
const iframeDoc = element.contentDocument || element.contentWindow.document; const frameDoc = element.contentDocument || element.contentWindow.document;
nextElements.push(...iframeDoc.querySelectorAll(part)); nextElements.push(...frameDoc.querySelectorAll(part));
} catch (e) { } catch (e) {
console.warn('Cannot access iframe content:', e); console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
continue; continue;
} }
} else { } else {
@@ -566,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return { type: 'TR', element: currentElement }; return { type: 'TR', element: currentElement };
} }
// Handle iframe crossing // Handle iframe and frame crossing
if (currentElement.tagName === 'IFRAME') { if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
try { try {
currentElement = currentElement.contentDocument.body; currentElement = currentElement.contentDocument.body;
} catch (e) { } catch (e) {
@@ -611,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (current.tagName === 'TH') return true; if (current.tagName === 'TH') return true;
if (current.tagName === 'IFRAME') { if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
try { try {
current = current.contentDocument.body; current = current.contentDocument.body;
} catch (e) { } catch (e) {
@@ -667,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName)); allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
} }
// Get elements from iframes // Get elements from iframes and frames
const iframes = document.getElementsByTagName('iframe'); const frames = [
for (const iframe of iframes) { ...Array.from(document.getElementsByTagName('iframe')),
...Array.from(document.getElementsByTagName('frame'))
];
for (const frame of frames) {
try { try {
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; const frameDoc = frame.contentDocument || frame.contentWindow.document;
allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName)); allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
} catch (e) { } catch (e) {
console.warn('Cannot access iframe content:', e); console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
} }
} }
@@ -736,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const tableData = []; const tableData = [];
const nonTableData = []; const nonTableData = [];
// Process table data with both iframe and shadow DOM support // Process table data with support for iframes, frames, and shadow DOM
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
const container = containers[containerIndex]; const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex]; const { tableFields } = containerFields[containerIndex];
@@ -746,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const firstElement = queryElement(container, firstField.selector); const firstElement = queryElement(container, firstField.selector);
let tableContext = firstElement; let tableContext = firstElement;
// Find table context including both iframe and shadow DOM // Find table context including iframe, frame and shadow DOM
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
if (tableContext.getRootNode() instanceof ShadowRoot) { if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = tableContext.getRootNode().host; tableContext = tableContext.getRootNode().host;
continue; continue;
} }
if (tableContext.tagName === 'IFRAME') { if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
try { try {
tableContext = tableContext.contentDocument.body; tableContext = tableContext.contentDocument.body;
} catch (e) { } catch (e) {
@@ -776,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR')); rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
} }
// Get rows from iframes // Get rows from iframes and frames
if (tableContext.tagName === 'IFRAME') { if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
try { try {
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document; const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
rows.push(...iframeDoc.getElementsByTagName('TR')); rows.push(...frameDoc.getElementsByTagName('TR'));
} catch (e) { } catch (e) {
console.warn('Cannot access iframe rows:', e); console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
} }
} }
@@ -852,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
} }
// Process non-table data with both contexts support // Process non-table data with all contexts support
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
if (nonTableData.length >= limit) break; if (nonTableData.length >= limit) break;