feat: add functionality to scrape shadowDOM elements

This commit is contained in:
RohitR311
2024-12-30 01:24:32 +05:30
parent 542f4d31fa
commit b60f4b73b8

View File

@@ -189,68 +189,102 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Object.<string, string>>} * @returns {Array.<Object.<string, string>>}
*/ */
/**
 * Scrapes one record per repeated item on the page, according to a schema of
 * field configs.
 *
 * The field whose selector matches the most elements is used as the "seed".
 * Each seed element is widened to the largest ancestor that still contains
 * only that one seed element, and every field is then read from the first of
 * its matches that lies inside that ancestor.
 *
 * @param {Object.<string, {selector: string, attribute?: string, shadow?: boolean}>} lists
 *   Field name -> config. When `shadow` is truthy and `selector` contains
 *   `>>`, the selector is read as `host >> inner [>> inner ...]`, where each
 *   `>>` steps into the open shadow root of the elements matched so far.
 * @returns {Array.<Object.<string, string>>} One object per seed element,
 *   keyed by field name. A field with no match inside an item is `undefined`.
 */
window.scrapeSchema = function (lists) {
  /**
   * Returns every element matched by a field config.
   * Shadow selectors are resolved across all matching hosts and through any
   * number of `>>` steps; hosts without an accessible (open) shadow root are
   * skipped because `shadowRoot` is null for them.
   */
  function findAllElements(config) {
    const { selector, shadow } = config;
    if (!shadow || !selector.includes('>>')) {
      return Array.from(document.querySelectorAll(selector));
    }

    const segments = selector.split('>>').map((segment) => segment.trim());
    const targetSelector = segments.pop();

    // Walk host by host, replacing the current search roots with the shadow
    // roots of every host matched at this level.
    let roots = [document];
    for (const hostSelector of segments) {
      roots = roots
        .flatMap((root) => Array.from(root.querySelectorAll(hostSelector)))
        .map((host) => host.shadowRoot)
        .filter(Boolean);
    }

    return roots.flatMap((root) => Array.from(root.querySelectorAll(targetSelector)));
  }

  /**
   * For each element, climbs the parent chain while the parent still contains
   * exactly one of the given elements, and returns where the climb stopped.
   */
  function getMBEs(elements) {
    return elements.map((element) => {
      let candidate = element;
      const isUniqueChild = (e) => elements
        .filter((elem) => e.parentNode?.contains(elem))
        .length === 1;

      while (candidate && isUniqueChild(candidate)) {
        candidate = candidate.parentNode;
      }

      return candidate;
    });
  }

  /**
   * Reads a URL-valued attribute and resolves it the way the browser would:
   * against the document base URL, so path-relative values such as
   * "next.html" keep their directory. Returns null when the attribute is
   * missing or empty, and the raw value when it cannot be parsed as a URL.
   */
  function toAbsoluteUrl(elem, attributeName) {
    const raw = elem.getAttribute(attributeName);
    if (!raw) return null;
    try {
      return new URL(raw, document.baseURI).href;
    } catch (err) {
      // A single malformed link should not abort the whole scrape.
      return raw;
    }
  }

  /** Extracts the configured attribute (or the visible text) from an element. */
  function extractValue(elem, attribute) {
    switch (attribute) {
      case 'href':
      case 'src':
        return toAbsoluteUrl(elem, attribute);
      case 'innerText':
        return elem.innerText;
      case 'textContent':
        return elem.textContent;
      default:
        // Any other attribute name is read directly; fall back to the text
        // when no attribute is configured or it is absent/empty.
        return (attribute && elem.getAttribute(attribute)) || elem.innerText;
    }
  }

  const fields = Object.entries(lists);
  if (fields.length === 0) return [];

  // Query the DOM once per field instead of once per field per item.
  const matchesByKey = new Map(
    fields.map(([key, config]) => [key, findAllElements(config)]),
  );

  // Seed = first field (in declaration order) with the highest match count.
  let seedElements = [];
  for (const found of matchesByKey.values()) {
    if (found.length > seedElements.length) seedElements = found;
  }

  return getMBEs(seedElements).map((mbe) => Object.fromEntries(
    fields.map(([key, { attribute }]) => {
      const elem = matchesByKey.get(key).find((match) => mbe.contains(match));
      return [key, elem ? extractValue(elem, attribute) : undefined];
    }),
  ));
};
/** /**
* Scrapes multiple lists of similar items based on a template item. * Scrapes multiple lists of similar items based on a template item.