From cb4dabfcf7bdd88d5d9c320c7350a87d81d8654e Mon Sep 17 00:00:00 2001 From: Rohit Date: Tue, 4 Mar 2025 09:14:27 +0530 Subject: [PATCH] feat: add frame element support for scrapeList action --- maxun-core/src/browserSide/scraper.js | 62 ++++++++++++++------------- 1 file changed, 33 insertions(+), 29 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 84649286..c04a9993 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -423,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - // Enhanced query function to handle both iframe and shadow DOM + // Enhanced query function to handle iframe, frame and shadow DOM const queryElement = (rootElement, selector) => { if (!selector.includes('>>') && !selector.includes(':>>')) { return rootElement.querySelector(selector); @@ -435,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, for (let i = 0; i < parts.length; i++) { if (!currentElement) return null; - // Handle iframe traversal - if (currentElement.tagName === 'IFRAME') { + // Handle iframe and frame traversal + if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { try { - const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document; - currentElement = iframeDoc.querySelector(parts[i]); + const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document; + currentElement = frameDoc.querySelector(parts[i]); continue; } catch (e) { - console.warn('Cannot access iframe content:', e); + console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e); return null; } } @@ -485,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const nextElements = []; for (const element of currentElements) { - // Handle iframe traversal - if (element.tagName === 'IFRAME') { + // Handle iframe and frame traversal + if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') { try { - const iframeDoc = element.contentDocument || element.contentWindow.document; - nextElements.push(...iframeDoc.querySelectorAll(part)); + const frameDoc = element.contentDocument || element.contentWindow.document; + nextElements.push(...frameDoc.querySelectorAll(part)); } catch (e) { - console.warn('Cannot access iframe content:', e); + console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e); continue; } } else { @@ -566,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return { type: 'TR', element: currentElement }; } - // Handle iframe crossing - if (currentElement.tagName === 'IFRAME') { + // Handle iframe and frame crossing + if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') { try { currentElement = currentElement.contentDocument.body; } catch (e) { @@ -611,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, if (current.tagName === 'TH') return true; - if (current.tagName === 'IFRAME') { + if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') { try { current = current.contentDocument.body; } catch (e) { @@ -667,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName)); } - // Get elements from iframes - const iframes = document.getElementsByTagName('iframe'); - for (const iframe of iframes) { + // Get elements from iframes and frames + const frames = [ + ...Array.from(document.getElementsByTagName('iframe')), + ...Array.from(document.getElementsByTagName('frame')) + ]; + + for (const frame of frames) { try { - const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; - allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName)); + const frameDoc = frame.contentDocument || frame.contentWindow.document; + allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName)); } catch (e) { - console.warn('Cannot access iframe content:', e); + console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e); } } @@ -736,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const tableData = []; const nonTableData = []; - // Process table data with both iframe and shadow DOM support + // Process table data with support for iframes, frames, and shadow DOM for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { const container = containers[containerIndex]; const { tableFields } = containerFields[containerIndex]; @@ -746,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const firstElement = queryElement(container, firstField.selector); let tableContext = firstElement; - // Find table context including both iframe and shadow DOM + // Find table context including iframe, frame and shadow DOM while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { if (tableContext.getRootNode() instanceof ShadowRoot) { tableContext = tableContext.getRootNode().host; continue; } - if (tableContext.tagName === 'IFRAME') { + if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { try { tableContext = tableContext.contentDocument.body; } catch (e) { @@ -776,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, rows.push(...tableContext.shadowRoot.getElementsByTagName('TR')); } - // Get rows from iframes - if (tableContext.tagName === 'IFRAME') { + // Get rows from iframes and frames + if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') { try { - const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document; - rows.push(...iframeDoc.getElementsByTagName('TR')); + const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document; + rows.push(...frameDoc.getElementsByTagName('TR')); } catch (e) { - console.warn('Cannot access iframe rows:', e); + console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e); } } @@ -852,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } } - // Process non-table data with both contexts support + // Process non-table data with all contexts support for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { if (nonTableData.length >= limit) break;