Merge pull request #202 from getmaxun/develop
feat: handle relative src & href paths in scrapeList & scrapeSchema (missing in v0.0.2)
This commit is contained in:
@@ -43,7 +43,7 @@ services:
|
|||||||
#build:
|
#build:
|
||||||
#context: .
|
#context: .
|
||||||
#dockerfile: server/Dockerfile
|
#dockerfile: server/Dockerfile
|
||||||
image: getmaxun/maxun-backend:v0.0.1
|
image: getmaxun/maxun-backend:v0.0.2
|
||||||
ports:
|
ports:
|
||||||
- "8080:8080"
|
- "8080:8080"
|
||||||
env_file: .env
|
env_file: .env
|
||||||
|
|||||||
@@ -1,6 +1,6 @@
|
|||||||
{
|
{
|
||||||
"name": "maxun-core",
|
"name": "maxun-core",
|
||||||
"version": "0.0.3",
|
"version": "0.0.4",
|
||||||
"description": "Core package for Maxun, responsible for data extraction",
|
"description": "Core package for Maxun, responsible for data extraction",
|
||||||
"main": "build/index.js",
|
"main": "build/index.js",
|
||||||
"typings": "build/index.d.ts",
|
"typings": "build/index.d.ts",
|
||||||
@@ -20,7 +20,12 @@
|
|||||||
"automation",
|
"automation",
|
||||||
"workflow",
|
"workflow",
|
||||||
"data extraction",
|
"data extraction",
|
||||||
"scraping"
|
"scraping",
|
||||||
|
"web scraper",
|
||||||
|
"web scraping",
|
||||||
|
"data scraping",
|
||||||
|
"no-code web scraper",
|
||||||
|
"no-code web scraping"
|
||||||
],
|
],
|
||||||
"author": "Maxun",
|
"author": "Maxun",
|
||||||
"license": "AGPL-3.0-or-later",
|
"license": "AGPL-3.0-or-later",
|
||||||
|
|||||||
@@ -235,9 +235,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
switch (attribute) {
|
switch (attribute) {
|
||||||
case 'href':
|
case 'href':
|
||||||
return elem.getAttribute('href');
|
const relativeHref = elem.getAttribute('href');
|
||||||
|
return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
|
||||||
case 'src':
|
case 'src':
|
||||||
return elem.getAttribute('src');
|
const relativeSrc = elem.getAttribute('src');
|
||||||
|
return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
|
||||||
case 'innerText':
|
case 'innerText':
|
||||||
return elem.innerText;
|
return elem.innerText;
|
||||||
case 'textContent':
|
case 'textContent':
|
||||||
@@ -281,9 +283,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
} else if (attribute === 'innerHTML') {
|
} else if (attribute === 'innerHTML') {
|
||||||
record[label] = fieldElement.innerHTML.trim();
|
record[label] = fieldElement.innerHTML.trim();
|
||||||
} else if (attribute === 'src') {
|
} else if (attribute === 'src') {
|
||||||
record[label] = fieldElement.src;
|
// Handle relative 'src' URLs
|
||||||
|
const src = fieldElement.getAttribute('src');
|
||||||
|
record[label] = src ? new URL(src, baseUrl).href : null;
|
||||||
} else if (attribute === 'href') {
|
} else if (attribute === 'href') {
|
||||||
record[label] = fieldElement.href;
|
// Handle relative 'href' URLs
|
||||||
|
const href = fieldElement.getAttribute('href');
|
||||||
|
record[label] = href ? new URL(href, baseUrl).href : null;
|
||||||
} else {
|
} else {
|
||||||
record[label] = fieldElement.getAttribute(attribute);
|
record[label] = fieldElement.getAttribute(attribute);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -42,7 +42,7 @@
|
|||||||
"jwt-decode": "^4.0.0",
|
"jwt-decode": "^4.0.0",
|
||||||
"loglevel": "^1.8.0",
|
"loglevel": "^1.8.0",
|
||||||
"loglevel-plugin-remote": "^0.6.8",
|
"loglevel-plugin-remote": "^0.6.8",
|
||||||
"maxun-core": "^0.0.3",
|
"maxun-core": "0.0.4",
|
||||||
"minio": "^8.0.1",
|
"minio": "^8.0.1",
|
||||||
"moment-timezone": "^0.5.45",
|
"moment-timezone": "^0.5.45",
|
||||||
"node-cron": "^3.0.3",
|
"node-cron": "^3.0.3",
|
||||||
@@ -110,4 +110,4 @@
|
|||||||
"ts-node": "^10.4.0",
|
"ts-node": "^10.4.0",
|
||||||
"vite": "^5.4.10"
|
"vite": "^5.4.10"
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -17,7 +17,7 @@ export const getBestSelectorForAction = (action: Action) => {
|
|||||||
selectors?.text?.length != null &&
|
selectors?.text?.length != null &&
|
||||||
selectors?.text?.length < 25 &&
|
selectors?.text?.length < 25 &&
|
||||||
action.hasOnlyText
|
action.hasOnlyText
|
||||||
? `text=${selectors.text}`
|
? selectors.generalSelector
|
||||||
: null;
|
: null;
|
||||||
|
|
||||||
if (action.tagName === TagName.Input) {
|
if (action.tagName === TagName.Input) {
|
||||||
|
|||||||
Reference in New Issue
Block a user