Merge pull request #552 from getmaxun/href-fix

fix: links not being scraped
This commit is contained in:
Karishma Shukla
2025-04-23 18:38:29 +05:30
committed by GitHub
2 changed files with 54 additions and 44 deletions

View File

@@ -523,49 +523,62 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
// Enhanced value extraction with context awareness
function extractValue(element, attribute) {
if (!element) return null;
// Get context-aware base URL
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
if (!element) return null;
// Check shadow root first
if (element.shadowRoot) {
const shadowContent = element.shadowRoot.textContent;
if (shadowContent?.trim()) {
return shadowContent.trim();
}
// Get context-aware base URL
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
// Check shadow root first
if (element.shadowRoot) {
const shadowContent = element.shadowRoot.textContent;
if (shadowContent?.trim()) {
return shadowContent.trim();
}
}
if (attribute === 'innerText') {
return element.innerText.trim();
} else if (attribute === 'innerHTML') {
return element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
if (attribute === 'href' && element.tagName !== 'A') {
const parentElement = element.parentElement;
if (parentElement && parentElement.tagName === 'A') {
const parentHref = parentElement.getAttribute('href');
if (parentHref) {
try {
return new URL(parentHref, baseURL).href;
} catch (e) {
return parentHref;
}
}
}
}
const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
if (!dataAttr || dataAttr.trim() === '') {
if (attribute === 'src') {
const style = window.getComputedStyle(element);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== 'none') {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
}
return null;
}
try {
return new URL(dataAttr, baseURL).href;
} catch (e) {
console.warn('Error creating URL from', dataAttr, e);
return dataAttr; // Return the original value if URL construction fails
}
}
return element.getAttribute(attribute);
}
if (attribute === 'innerText') {
return element.innerText.trim();
} else if (attribute === 'innerHTML') {
return element.innerHTML.trim();
} else if (attribute === 'src' || attribute === 'href') {
const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
if (!dataAttr || dataAttr.trim() === '') {
if (attribute === 'src') {
const style = window.getComputedStyle(element);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== 'none') {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
}
return null;
}
try {
return new URL(dataAttr, baseURL).href;
} catch (e) {
console.warn('Error creating URL from', dataAttr, e);
return dataAttr; // Return the original value if URL construction fails
}
}
return element.getAttribute(attribute);
}
// Enhanced table ancestor finding with context support
function findTableAncestor(element) {

View File

@@ -302,9 +302,6 @@ export const getElementInformation = async (
);
return elementInfo;
} else {
page.on('console', msg => {
console.log(`Browser console: ${msg.text()}`);
});
const elementInfo = await page.evaluate(
async ({ x, y }) => {
const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => {