feat: check parent element url and extract
This commit is contained in:
@@ -523,50 +523,63 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
// Enhanced value extraction with context awareness
|
// Enhanced value extraction with context awareness
|
||||||
/**
 * Extracts a value from a DOM element for the requested attribute.
 *
 * Resolution order:
 *   1. If the element hosts a shadow root with non-blank text, that trimmed
 *      text is returned regardless of `attribute`.
 *   2. 'innerText' / 'innerHTML' return the trimmed property value (an empty
 *      string when the element does not expose the property).
 *   3. 'src' / 'href' return an absolute URL resolved against the element's
 *      own document. For 'href' on a non-anchor element whose direct parent
 *      is an <a> with an href, the parent's href is used. For 'src' with no
 *      attribute value, the computed CSS background-image URL is used.
 *   4. Any other attribute is returned verbatim via getAttribute().
 *
 * @param {Element|null|undefined} element - Element to read from.
 * @param {string} attribute - 'innerText', 'innerHTML', 'src', 'href', or any attribute name.
 * @returns {string|null} The extracted value, or null when nothing usable is found.
 */
function extractValue(element, attribute) {
    if (!element) return null;

    // Get context-aware base URL: prefer the element's own document (it may
    // live in an iframe) and fall back to the top window's origin.
    const baseURL = element.ownerDocument?.location?.href || window.location.origin;

    // Resolve a possibly-relative URL against baseURL. Resolution can fail
    // (e.g. a relative URL against an "about:blank" document), in which case
    // the raw value is returned instead of throwing.
    const resolveUrl = (rawUrl) => {
        try {
            return new URL(rawUrl, baseURL).href;
        } catch (e) {
            console.warn('Error creating URL from', rawUrl, e);
            return rawUrl; // Return the original value if URL construction fails
        }
    };

    // Check shadow root first.
    // NOTE(review): this short-circuits for every attribute, including
    // 'src'/'href'; kept as-is because callers may rely on it.
    if (element.shadowRoot) {
        const shadowContent = element.shadowRoot.textContent;
        if (shadowContent?.trim()) {
            return shadowContent.trim();
        }
    }

    if (attribute === 'innerText') {
        // Not every node exposes innerText (e.g. SVG elements); fall back to
        // textContent rather than throwing on undefined.
        return (element.innerText ?? element.textContent ?? '').trim();
    }

    if (attribute === 'innerHTML') {
        return (element.innerHTML ?? '').trim();
    }

    if (attribute === 'src' || attribute === 'href') {
        // A non-anchor element wrapped directly in a link: use the link target.
        if (attribute === 'href' && element.tagName !== 'A') {
            const parentElement = element.parentElement;
            if (parentElement && parentElement.tagName === 'A') {
                const parentHref = parentElement.getAttribute('href');
                if (parentHref) {
                    return resolveUrl(parentHref);
                }
            }
        }

        // Prefer the real attribute, then its lazy-loading "data-" variant.
        const attrValue = element.getAttribute(attribute);
        const dataAttr = attrValue || element.getAttribute('data-' + attribute);

        if (!dataAttr || dataAttr.trim() === '') {
            if (attribute === 'src') {
                // No src attribute: the image may be a CSS background.
                const style = window.getComputedStyle(element);
                const bgImage = style.backgroundImage;
                if (bgImage && bgImage !== 'none') {
                    const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
                    return matches ? resolveUrl(matches[1]) : null;
                }
            }
            return null;
        }

        return resolveUrl(dataAttr);
    }

    return element.getAttribute(attribute);
}
|
|
||||||
|
|
||||||
// Enhanced table ancestor finding with context support
|
// Enhanced table ancestor finding with context support
|
||||||
function findTableAncestor(element) {
|
function findTableAncestor(element) {
|
||||||
let currentElement = element;
|
let currentElement = element;
|
||||||
|
|||||||
Reference in New Issue
Block a user