From 1e0890800f386f08b439c090da63772e4a6fe341 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 06:48:17 +0530 Subject: [PATCH 1/8] feat: handle if element has only text content --- server/src/workflow-management/utils.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/server/src/workflow-management/utils.ts b/server/src/workflow-management/utils.ts index c10d2c13..b3dadd60 100644 --- a/server/src/workflow-management/utils.ts +++ b/server/src/workflow-management/utils.ts @@ -17,7 +17,7 @@ export const getBestSelectorForAction = (action: Action) => { selectors?.text?.length != null && selectors?.text?.length < 25 && action.hasOnlyText - ? `text=${selectors.text}` + ? selectors.generalSelector : null; if (action.tagName === TagName.Input) { From fc2e3dc1c18fc2e11b72ae2c4ae7fa5065d6289d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 07:04:18 +0530 Subject: [PATCH 2/8] feat: join base url with href --- maxun-core/src/browserSide/scraper.js | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 79893568..7ee64dc1 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -235,7 +235,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, switch (attribute) { case 'href': - return elem.getAttribute('href'); + const relativeHref = elem.getAttribute('href'); // Get the href attribute + return relativeHref ? new URL(relativeHref, window.location.origin).href : null; // Convert to full URL case 'src': return elem.getAttribute('src'); case 'innerText': From 94aa8bcf42d0266febb54516c12a98a562be47b3 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 07:05:55 +0530 Subject: [PATCH 3/8] feat: handle relative href urls in scrape list --- maxun-core/src/browserSide/scraper.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 7ee64dc1..e5e1c697 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -284,7 +284,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } else if (attribute === 'src') { record[label] = fieldElement.src; } else if (attribute === 'href') { - record[label] = fieldElement.href; + // Handle relative 'href' URLs + const href = fieldElement.getAttribute('href'); + record[label] = href ? new URL(href, baseUrl).href : null; } else { record[label] = fieldElement.getAttribute(attribute); } From 232f83890209f0e84f59b906640065ab8759126d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 07:06:15 +0530 Subject: [PATCH 4/8] feat: handle relative src urls in scrape list --- maxun-core/src/browserSide/scraper.js | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index e5e1c697..467d077b 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -282,7 +282,9 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } else if (attribute === 'innerHTML') { record[label] = fieldElement.innerHTML.trim(); } else if (attribute === 'src') { - record[label] = fieldElement.src; + // Handle relative 'src' URLs + const src = fieldElement.getAttribute('src'); + record[label] = src ? new URL(src, baseUrl).href : null; } else if (attribute === 'href') { // Handle relative 'href' URLs const href = fieldElement.getAttribute('href'); From 6d59a58e304bfa2a4a32cda496dca5a0c001d8d2 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 07:07:12 +0530 Subject: [PATCH 5/8] feat: handle relative src urls in scrape schema --- maxun-core/src/browserSide/scraper.js | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 467d077b..828a4f84 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -235,10 +235,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, switch (attribute) { case 'href': - const relativeHref = elem.getAttribute('href'); // Get the href attribute - return relativeHref ? new URL(relativeHref, window.location.origin).href : null; // Convert to full URL + const relativeHref = elem.getAttribute('href'); + return relativeHref ? new URL(relativeHref, window.location.origin).href : null; case 'src': - return elem.getAttribute('src'); + const relativeSrc = elem.getAttribute('src'); + return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null; case 'innerText': return elem.innerText; case 'textContent': From 2b255c454e0d06171471f5a9e8ccfdd7961c7214 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 07:08:24 +0530 Subject: [PATCH 6/8] chore: v0.0.4 --- maxun-core/package.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maxun-core/package.json b/maxun-core/package.json index 45c69ffe..ca621e3a 100644 --- a/maxun-core/package.json +++ b/maxun-core/package.json @@ -1,6 +1,6 @@ { "name": "maxun-core", - "version": "0.0.3", + "version": "0.0.4", "description": "Core package for Maxun, responsible for data extraction", "main": "build/index.js", "typings": "build/index.d.ts", From 3f58867f5784475b0d9fa4195768ae4fc72863cf Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 07:09:13 +0530 Subject: [PATCH 7/8] chore: add more keywords --- maxun-core/package.json | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/maxun-core/package.json b/maxun-core/package.json index ca621e3a..faa133ec 100644 --- a/maxun-core/package.json +++ b/maxun-core/package.json @@ -20,7 +20,12 @@ "automation", "workflow", "data extraction", - "scraping" + "scraping", + "web scraper", + "web scraping", + "data scraping", + "no-code web scraper", + "no-code web scraping" ], "author": "Maxun", "license": "AGPL-3.0-or-later", From a0b8d73519e4fd474d4c4e95f10f31cabc8067a3 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sat, 23 Nov 2024 08:30:07 +0530 Subject: [PATCH 8/8] chore: upgrade maxun-core to v0.0.4 --- package.json | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/package.json b/package.json index 8c914f55..b6b73537 100644 --- a/package.json +++ b/package.json @@ -42,7 +42,7 @@ "jwt-decode": "^4.0.0", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", - "maxun-core": "^0.0.3", + "maxun-core": "0.0.4", "minio": "^8.0.1", "moment-timezone": "^0.5.45", "node-cron": "^3.0.3", @@ -110,4 +110,4 @@ "ts-node": "^10.4.0", "vite": "^5.4.10" } -} +} \ No newline at end of file