From 540c9ec7095c97cd7be9235e7e51dd5cffccfab5 Mon Sep 17 00:00:00 2001 From: Rohit Date: Wed, 23 Apr 2025 17:21:14 +0530 Subject: [PATCH] feat: check parent element url and extract --- maxun-core/src/browserSide/scraper.js | 95 +++++++++++++++------------ 1 file changed, 54 insertions(+), 41 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 6e4bf029..e1f99c1d 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -523,49 +523,62 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, // Enhanced value extraction with context awareness function extractValue(element, attribute) { - if (!element) return null; - - // Get context-aware base URL - const baseURL = element.ownerDocument?.location?.href || window.location.origin; + if (!element) return null; - // Check shadow root first - if (element.shadowRoot) { - const shadowContent = element.shadowRoot.textContent; - if (shadowContent?.trim()) { - return shadowContent.trim(); - } + // Get context-aware base URL + const baseURL = element.ownerDocument?.location?.href || window.location.origin; + + // Check shadow root first + if (element.shadowRoot) { + const shadowContent = element.shadowRoot.textContent; + if (shadowContent?.trim()) { + return shadowContent.trim(); + } + } + + if (attribute === 'innerText') { + return element.innerText.trim(); + } else if (attribute === 'innerHTML') { + return element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + if (attribute === 'href' && element.tagName !== 'A') { + const parentElement = element.parentElement; + if (parentElement && parentElement.tagName === 'A') { + const parentHref = parentElement.getAttribute('href'); + if (parentHref) { + try { + return new URL(parentHref, baseURL).href; + } catch (e) { + return parentHref; + } + } + } + } + + const attrValue = element.getAttribute(attribute); + const dataAttr = attrValue || element.getAttribute('data-' + attribute); + + if (!dataAttr || dataAttr.trim() === '') { + if (attribute === 'src') { + const style = window.getComputedStyle(element); + const bgImage = style.backgroundImage; + if (bgImage && bgImage !== 'none') { + const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); + return matches ? new URL(matches[1], baseURL).href : null; + } + } + return null; + } + + try { + return new URL(dataAttr, baseURL).href; + } catch (e) { + console.warn('Error creating URL from', dataAttr, e); + return dataAttr; // Return the original value if URL construction fails + } + } + return element.getAttribute(attribute); } - - if (attribute === 'innerText') { - return element.innerText.trim(); - } else if (attribute === 'innerHTML') { - return element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - - const dataAttr = attrValue || element.getAttribute('data-' + attribute); - - if (!dataAttr || dataAttr.trim() === '') { - if (attribute === 'src') { - const style = window.getComputedStyle(element); - const bgImage = style.backgroundImage; - if (bgImage && bgImage !== 'none') { - const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); - return matches ? new URL(matches[1], baseURL).href : null; - } - } - return null; - } - - try { - return new URL(dataAttr, baseURL).href; - } catch (e) { - console.warn('Error creating URL from', dataAttr, e); - return dataAttr; // Return the original value if URL construction fails - } - } - return element.getAttribute(attribute); - } // Enhanced table ancestor finding with context support function findTableAncestor(element) {