feat: add functionality to scrape shadowDOM elements
This commit is contained in:
@@ -189,68 +189,102 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
* @returns {Array.<Object.<string, string>>}
|
* @returns {Array.<Object.<string, string>>}
|
||||||
*/
|
*/
|
||||||
window.scrapeSchema = function (lists) {
|
window.scrapeSchema = function (lists) {
|
||||||
|
// These utility functions remain unchanged as they work perfectly
|
||||||
function omap(object, f, kf = (x) => x) {
|
function omap(object, f, kf = (x) => x) {
|
||||||
return Object.fromEntries(
|
return Object.fromEntries(
|
||||||
Object.entries(object)
|
Object.entries(object)
|
||||||
.map(([k, v]) => [kf(k), f(v)]),
|
.map(([k, v]) => [kf(k), f(v)]),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function ofilter(object, f) {
|
function ofilter(object, f) {
|
||||||
return Object.fromEntries(
|
return Object.fromEntries(
|
||||||
Object.entries(object)
|
Object.entries(object)
|
||||||
.filter(([k, v]) => f(k, v)),
|
.filter(([k, v]) => f(k, v)),
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
function getSeedKey(listObj) {
|
function findElement(config) {
|
||||||
const maxLength = Math.max(...Object.values(omap(listObj, (x) => document.querySelectorAll(x.selector).length)));
|
// If this is a shadow DOM query
|
||||||
return Object.keys(ofilter(listObj, (_, v) => document.querySelectorAll(v.selector).length === maxLength))[0];
|
if (config.shadow && config.selector.includes('>>')) {
|
||||||
}
|
const [hostSelector, shadowSelector] = config.selector.split('>>').map(s => s.trim());
|
||||||
|
const host = document.querySelector(hostSelector);
|
||||||
function getMBEs(elements) {
|
return host?.shadowRoot?.querySelector(shadowSelector) || null;
|
||||||
return elements.map((element) => {
|
|
||||||
let candidate = element;
|
|
||||||
const isUniqueChild = (e) => elements
|
|
||||||
.filter((elem) => e.parentNode?.contains(elem))
|
|
||||||
.length === 1;
|
|
||||||
|
|
||||||
while (candidate && isUniqueChild(candidate)) {
|
|
||||||
candidate = candidate.parentNode;
|
|
||||||
}
|
}
|
||||||
|
// Otherwise, use regular querySelector
|
||||||
|
return document.querySelector(config.selector);
|
||||||
|
}
|
||||||
|
|
||||||
return candidate;
|
function findAllElements(config) {
|
||||||
});
|
// If this is a shadow DOM query
|
||||||
|
if (config.shadow && config.selector.includes('>>')) {
|
||||||
|
const element = findElement(config);
|
||||||
|
return element ? [element] : [];
|
||||||
|
}
|
||||||
|
// Otherwise, use regular querySelectorAll
|
||||||
|
return Array.from(document.querySelectorAll(config.selector));
|
||||||
|
}
|
||||||
|
|
||||||
|
// Modified to use our new element finding functions
|
||||||
|
function getSeedKey(listObj) {
|
||||||
|
const maxLength = Math.max(...Object.values(
|
||||||
|
omap(listObj, (x) => findAllElements(x).length)
|
||||||
|
));
|
||||||
|
return Object.keys(
|
||||||
|
ofilter(listObj, (_, v) => findAllElements(v).length === maxLength)
|
||||||
|
)[0];
|
||||||
|
}
|
||||||
|
|
||||||
|
// This function remains unchanged as it works with DOM elements
|
||||||
|
// regardless of how they were found
|
||||||
|
function getMBEs(elements) {
|
||||||
|
return elements.map((element) => {
|
||||||
|
let candidate = element;
|
||||||
|
const isUniqueChild = (e) => elements
|
||||||
|
.filter((elem) => e.parentNode?.contains(elem))
|
||||||
|
.length === 1;
|
||||||
|
|
||||||
|
while (candidate && isUniqueChild(candidate)) {
|
||||||
|
candidate = candidate.parentNode;
|
||||||
|
}
|
||||||
|
|
||||||
|
return candidate;
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const seedName = getSeedKey(lists);
|
const seedName = getSeedKey(lists);
|
||||||
const seedElements = Array.from(document.querySelectorAll(lists[seedName].selector));
|
const seedElements = findAllElements(lists[seedName]);
|
||||||
const MBEs = getMBEs(seedElements);
|
const MBEs = getMBEs(seedElements);
|
||||||
|
|
||||||
return MBEs.map((mbe) => omap(
|
return MBEs.map((mbe) => omap(
|
||||||
lists,
|
lists,
|
||||||
({ selector, attribute }, key) => {
|
(config, key) => {
|
||||||
const elem = Array.from(document.querySelectorAll(selector)).find((elem) => mbe.contains(elem));
|
// Use our new findAllElements function
|
||||||
if (!elem) return undefined;
|
const elem = findAllElements(config)
|
||||||
|
.find((elem) => mbe.contains(elem));
|
||||||
|
|
||||||
switch (attribute) {
|
if (!elem) return undefined;
|
||||||
case 'href':
|
|
||||||
const relativeHref = elem.getAttribute('href');
|
switch (config.attribute) {
|
||||||
return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
|
case 'href': {
|
||||||
case 'src':
|
const relativeHref = elem.getAttribute('href');
|
||||||
const relativeSrc = elem.getAttribute('src');
|
return relativeHref ? new URL(relativeHref, window.location.origin).href : null;
|
||||||
return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
|
}
|
||||||
case 'innerText':
|
case 'src': {
|
||||||
return elem.innerText;
|
const relativeSrc = elem.getAttribute('src');
|
||||||
case 'textContent':
|
return relativeSrc ? new URL(relativeSrc, window.location.origin).href : null;
|
||||||
return elem.textContent;
|
}
|
||||||
default:
|
case 'innerText':
|
||||||
return elem.innerText;
|
return elem.innerText;
|
||||||
}
|
case 'textContent':
|
||||||
},
|
return elem.textContent;
|
||||||
(key) => key // Use the original key in the output
|
default:
|
||||||
|
return elem.getAttribute(config.attribute) || elem.innerText;
|
||||||
|
}
|
||||||
|
},
|
||||||
|
(key) => key
|
||||||
)) || [];
|
)) || [];
|
||||||
}
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Scrapes multiple lists of similar items based on a template item.
|
* Scrapes multiple lists of similar items based on a template item.
|
||||||
|
|||||||
Reference in New Issue
Block a user