From ccbf9d074bf98c964b85e0e3afd0f7ea1aca3a34 Mon Sep 17 00:00:00 2001
From: Rohit <rohit.rajan031101@gmail.com>
Date: Mon, 3 Mar 2025 18:11:19 +0530
Subject: [PATCH] feat: add scraping support for frame elements

---
 maxun-core/src/browserSide/scraper.js | 51 +++++++++++++++++++++------
 1 file changed, 40 insertions(+), 11 deletions(-)

diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js
index f6b53da2..84649286 100644
--- a/maxun-core/src/browserSide/scraper.js
+++ b/maxun-core/src/browserSide/scraper.js
@@ -210,7 +210,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
           return Array.from(document.querySelectorAll(config.selector));
       }
   
-      // First handle iframe traversal if present
       if (config.selector.includes(':>>')) {
         const parts = config.selector.split(':>>').map(s => s.trim());
         let currentElements = [document];
@@ -223,23 +222,44 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
 
             for (const element of currentElements) {
                 try {
-                    // For document or iframe document
                     const doc = element.contentDocument || element || element.contentWindow?.document;
                     if (!doc) continue;
 
-                    // Query elements in current context
+                    if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
+                        const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
+                        if (nameMatch && nameMatch[1]) {
+                            const frameName = nameMatch[1];
+                            let foundFrames = [];
+                            
+                            if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
+                                foundFrames = Array.from(doc.getElementsByName(frameName))
+                                    .filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
+                            }
+                            
+                            if (foundFrames.length === 0) {
+                                const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
+                                foundFrames = framesBySelector;
+                            }
+                            
+                            if (isLast) {
+                                nextElements.push(...foundFrames);
+                            } else {
+                                nextElements.push(...foundFrames);
+                            }
+                            continue;
+                        }
+                    }
+
                     const found = Array.from(doc.querySelectorAll(part));
                     
                     if (isLast) {
-                        // If it's the last part, keep all matching elements
                         nextElements.push(...found);
                     } else {
-                        // If not last, only keep iframes for next iteration
-                        const iframes = found.filter(el => el.tagName === 'IFRAME');
-                        nextElements.push(...iframes);
+                        const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
+                        nextElements.push(...frames);
                     }
                 } catch (error) {
-                    console.warn('Cannot access iframe content:', error, {
+                    console.warn('Cannot access iframe/frame content:', error, {
                         part,
                         element,
                         index: i
@@ -285,12 +305,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
       return [];
     }
 
-    // Modified to handle iframe context for URL resolution
     function getElementValue(element, attribute) {
       if (!element) return null;
   
-      // Get the base URL for resolving relative URLs
-      const baseURL = element.ownerDocument?.location?.href || window.location.origin;
+      let baseURL;
+      try {
+          baseURL = element.ownerDocument?.location?.href || 
+                    element.ownerDocument?.baseURI || 
+                    window.location.origin;
+      } catch (e) {
+          baseURL = window.location.origin;
+      }
   
       switch (attribute) {
         case 'href': {
@@ -305,6 +330,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
             return element.innerText?.trim();
         case 'textContent':
             return element.textContent?.trim();
+        case 'innerHTML':
+            return element.innerHTML;
+        case 'outerHTML':
+            return element.outerHTML;
         default:
             return element.getAttribute(attribute) || element.innerText?.trim();
       }