Merge pull request #202 from getmaxun/develop
feat: handle relative src & href paths in scrapeList & scrapeSchema (missing in v0.0.2)
This commit is contained in:
@@ -43,7 +43,7 @@ services:
|
||||
#build:
|
||||
#context: .
|
||||
#dockerfile: server/Dockerfile
|
||||
image: getmaxun/maxun-backend:v0.0.1
|
||||
image: getmaxun/maxun-backend:v0.0.2
|
||||
ports:
|
||||
- "8080:8080"
|
||||
env_file: .env
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
{
|
||||
"name": "maxun-core",
|
||||
"version": "0.0.3",
|
||||
"version": "0.0.4",
|
||||
"description": "Core package for Maxun, responsible for data extraction",
|
||||
"main": "build/index.js",
|
||||
"typings": "build/index.d.ts",
|
||||
@@ -20,7 +20,12 @@
|
||||
"automation",
|
||||
"workflow",
|
||||
"data extraction",
|
||||
"scraping"
|
||||
"scraping",
|
||||
"web scraper",
|
||||
"web scraping",
|
||||
"data scraping",
|
||||
"no-code web scraper",
|
||||
"no-code web scraping"
|
||||
],
|
||||
"author": "Maxun",
|
||||
"license": "AGPL-3.0-or-later",
|
||||
|
||||
@@ -235,9 +235,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
switch (attribute) {
|
||||
case 'href':
|
||||
return elem.getAttribute('href');
|
||||
const relativeHref = elem.getAttribute('href');
|
||||
return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
|
||||
case 'src':
|
||||
return elem.getAttribute('src');
|
||||
const relativeSrc = elem.getAttribute('src');
|
||||
return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
|
||||
case 'innerText':
|
||||
return elem.innerText;
|
||||
case 'textContent':
|
||||
@@ -281,9 +283,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
} else if (attribute === 'innerHTML') {
|
||||
record[label] = fieldElement.innerHTML.trim();
|
||||
} else if (attribute === 'src') {
|
||||
record[label] = fieldElement.src;
|
||||
// Handle relative 'src' URLs
|
||||
const src = fieldElement.getAttribute('src');
|
||||
record[label] = src ? new URL(src, baseUrl).href : null;
|
||||
} else if (attribute === 'href') {
|
||||
record[label] = fieldElement.href;
|
||||
// Handle relative 'href' URLs
|
||||
const href = fieldElement.getAttribute('href');
|
||||
record[label] = href ? new URL(href, baseUrl).href : null;
|
||||
} else {
|
||||
record[label] = fieldElement.getAttribute(attribute);
|
||||
}
|
||||
|
||||
@@ -42,7 +42,7 @@
|
||||
"jwt-decode": "^4.0.0",
|
||||
"loglevel": "^1.8.0",
|
||||
"loglevel-plugin-remote": "^0.6.8",
|
||||
"maxun-core": "^0.0.3",
|
||||
"maxun-core": "0.0.4",
|
||||
"minio": "^8.0.1",
|
||||
"moment-timezone": "^0.5.45",
|
||||
"node-cron": "^3.0.3",
|
||||
|
||||
@@ -17,7 +17,7 @@ export const getBestSelectorForAction = (action: Action) => {
|
||||
selectors?.text?.length != null &&
|
||||
selectors?.text?.length < 25 &&
|
||||
action.hasOnlyText
|
||||
? `text=${selectors.text}`
|
||||
? selectors.generalSelector
|
||||
: null;
|
||||
|
||||
if (action.tagName === TagName.Input) {
|
||||
|
||||
Reference in New Issue
Block a user