Merge pull request #202 from getmaxun/develop

feat: handle relative src & href paths in scrapeList & scrapeSchema (missing in v0.0.2)
This commit is contained in:
Karishma Shukla
2024-11-23 08:53:33 +05:30
committed by GitHub
5 changed files with 21 additions and 10 deletions

View File

@@ -43,7 +43,7 @@ services:
#build:
#context: .
#dockerfile: server/Dockerfile
image: getmaxun/maxun-backend:v0.0.1
image: getmaxun/maxun-backend:v0.0.2
ports:
- "8080:8080"
env_file: .env

View File

@@ -1,6 +1,6 @@
{
"name": "maxun-core",
"version": "0.0.3",
"version": "0.0.4",
"description": "Core package for Maxun, responsible for data extraction",
"main": "build/index.js",
"typings": "build/index.d.ts",
@@ -20,7 +20,12 @@
"automation",
"workflow",
"data extraction",
"scraping"
"scraping",
"web scraper",
"web scraping",
"data scraping",
"no-code web scraper",
"no-code web scraping"
],
"author": "Maxun",
"license": "AGPL-3.0-or-later",

View File

@@ -235,9 +235,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
switch (attribute) {
case 'href':
return elem.getAttribute('href');
const relativeHref = elem.getAttribute('href');
return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
case 'src':
return elem.getAttribute('src');
const relativeSrc = elem.getAttribute('src');
return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
case 'innerText':
return elem.innerText;
case 'textContent':
@@ -281,9 +283,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} else if (attribute === 'innerHTML') {
record[label] = fieldElement.innerHTML.trim();
} else if (attribute === 'src') {
record[label] = fieldElement.src;
// Handle relative 'src' URLs
const src = fieldElement.getAttribute('src');
record[label] = src ? new URL(src, baseUrl).href : null;
} else if (attribute === 'href') {
record[label] = fieldElement.href;
// Handle relative 'href' URLs
const href = fieldElement.getAttribute('href');
record[label] = href ? new URL(href, baseUrl).href : null;
} else {
record[label] = fieldElement.getAttribute(attribute);
}

View File

@@ -42,7 +42,7 @@
"jwt-decode": "^4.0.0",
"loglevel": "^1.8.0",
"loglevel-plugin-remote": "^0.6.8",
"maxun-core": "^0.0.3",
"maxun-core": "0.0.4",
"minio": "^8.0.1",
"moment-timezone": "^0.5.45",
"node-cron": "^3.0.3",
@@ -110,4 +110,4 @@
"ts-node": "^10.4.0",
"vite": "^5.4.10"
}
}
}

View File

@@ -17,7 +17,7 @@ export const getBestSelectorForAction = (action: Action) => {
selectors?.text?.length != null &&
selectors?.text?.length < 25 &&
action.hasOnlyText
? `text=${selectors.text}`
? selectors.generalSelector
: null;
if (action.tagName === TagName.Input) {