diff --git a/maxun-core/package.json b/maxun-core/package.json index 45c69ffe..faa133ec 100644 --- a/maxun-core/package.json +++ b/maxun-core/package.json @@ -1,6 +1,6 @@ { "name": "maxun-core", - "version": "0.0.3", + "version": "0.0.4", "description": "Core package for Maxun, responsible for data extraction", "main": "build/index.js", "typings": "build/index.d.ts", @@ -20,7 +20,12 @@ "automation", "workflow", "data extraction", - "scraping" + "scraping", + "web scraper", + "web scraping", + "data scraping", + "no-code web scraper", + "no-code web scraping" ], "author": "Maxun", "license": "AGPL-3.0-or-later", diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 79893568..828a4f84 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -235,9 +235,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, switch (attribute) { case 'href': - return elem.getAttribute('href'); + const relativeHref = elem.getAttribute('href'); + return relativeHref ? new URL(relativeHref, window.location.origin).href : null; case 'src': - return elem.getAttribute('src'); + const relativeSrc = elem.getAttribute('src'); + return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; case 'innerText': return elem.innerText; case 'textContent': @@ -281,9 +283,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } else if (attribute === 'innerHTML') { record[label] = fieldElement.innerHTML.trim(); } else if (attribute === 'src') { - record[label] = fieldElement.src; + // Handle relative 'src' URLs + const src = fieldElement.getAttribute('src'); + record[label] = src ? new URL(src, baseUrl).href : null; } else if (attribute === 'href') { - record[label] = fieldElement.href; + // Handle relative 'href' URLs + const href = fieldElement.getAttribute('href'); + record[label] = href ? new URL(href, baseUrl).href : null; } else { record[label] = fieldElement.getAttribute(attribute); } diff --git a/package.json b/package.json index 8c914f55..b6b73537 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,7 @@ "jwt-decode": "^4.0.0", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", - "maxun-core": "^0.0.3", + "maxun-core": "0.0.4", "minio": "^8.0.1", "moment-timezone": "^0.5.45", "node-cron": "^3.0.3", @@ -110,4 +110,4 @@ "ts-node": "^10.4.0", "vite": "^5.4.10" } -} +} \ No newline at end of file diff --git a/server/src/workflow-management/utils.ts b/server/src/workflow-management/utils.ts index c10d2c13..b3dadd60 100644 --- a/server/src/workflow-management/utils.ts +++ b/server/src/workflow-management/utils.ts @@ -17,7 +17,7 @@ export const getBestSelectorForAction = (action: Action) => { selectors?.text?.length != null && selectors?.text?.length < 25 && action.hasOnlyText - ? `text=${selectors.text}` + ? selectors.generalSelector : null; if (action.tagName === TagName.Input) {