feat: scrapeListAuto
This commit is contained in:
@@ -317,15 +317,18 @@ async function scrollDownToLoadMore(selector, limit) {
|
|||||||
for (const [fieldName, fieldConfig] of Object.entries(fields)) {
|
for (const [fieldName, fieldConfig] of Object.entries(fields)) {
|
||||||
let element;
|
let element;
|
||||||
|
|
||||||
if (flexible) {
|
// if (flexible) {
|
||||||
// try multiple strategies to find the element
|
// // try multiple strategies to find the element
|
||||||
element = item.querySelector(fieldConfig.selector) ||
|
// element = item.querySelector(fieldConfig.selector) ||
|
||||||
item.querySelector(`[class*="${fieldConfig.selector}"]`) ||
|
// item.querySelector(`[class*="${fieldConfig.selector}"]`) ||
|
||||||
Array.from(item.querySelectorAll('*'))
|
// Array.from(item.querySelectorAll('*'))
|
||||||
.find(el => el.textContent.trim() === fieldConfig.selector);
|
// .find(el => el.textContent.trim() === fieldConfig.selector);
|
||||||
} else {
|
// } else {
|
||||||
element = item.querySelector(fieldConfig.selector);
|
element = item.querySelector(fieldConfig.selector);
|
||||||
}
|
|
||||||
|
console.debug('Element:', element);
|
||||||
|
|
||||||
|
// }
|
||||||
|
|
||||||
if (element) {
|
if (element) {
|
||||||
switch (fieldConfig.attribute) {
|
switch (fieldConfig.attribute) {
|
||||||
@@ -344,7 +347,8 @@ async function scrollDownToLoadMore(selector, limit) {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
scrapedItem[fieldName] = null;
|
// send a message that says it failed
|
||||||
|
scrapedItem[fieldName] = `Failed to scrape ${fieldName}`;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -353,4 +357,51 @@ async function scrollDownToLoadMore(selector, limit) {
|
|||||||
});
|
});
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
|
/**
 * Gets all children of the elements matching the listSelector,
 * returning their CSS selectors and innerText.
 *
 * For each direct child of every matched list container, a selector path is
 * built by walking up through `parentElement`. Every visited element
 * contributes `tag.class1.class2`; the walk stops at the first element that
 * has an id, which contributes `tag#id`. The segments are joined with ' > ',
 * outermost ancestor first.
 *
 * @param {string} listSelector - Selector for the list container(s)
 * @returns {Array.<Object>} Array of objects, each containing the CSS selector
 *   (`selector`) and trimmed text (`innerText`) of a child. Empty when
 *   listSelector is not a non-empty string or matches nothing.
 */
window.scrapeListAuto = function (listSelector) {
    // querySelectorAll('') throws a SyntaxError; treat "no selector" as "no results".
    if (typeof listSelector !== 'string' || listSelector.trim() === '') {
        return [];
    }

    // Escape identifiers so ids/classes containing characters such as ':' or a
    // leading digit still yield a valid selector. Plain identifiers pass through
    // unchanged. Falls back to the raw value where CSS.escape is unavailable.
    const escapeIdentifier = (identifier) => (
        typeof CSS !== 'undefined' && typeof CSS.escape === 'function'
            ? CSS.escape(identifier)
            : identifier
    );

    // Builds the selector segment for a single element: `tag#id` or `tag.a.b`.
    const describeElement = (element) => {
        const tagName = element.nodeName.toLowerCase();
        if (element.id) {
            return `${tagName}#${escapeIdentifier(element.id)}`;
        }
        // Read the attribute rather than `className`: on SVG elements
        // `className` is an SVGAnimatedString object, not a string.
        const classAttribute = element.getAttribute('class') || '';
        const classSuffix = classAttribute
            .trim()
            .split(/\s+/)
            .filter(Boolean)
            .map((name) => `.${escapeIdentifier(name)}`)
            .join('');
        return `${tagName}${classSuffix}`;
    };

    // Walks from the element up to the nearest ancestor with an id (inclusive),
    // or to the root element when no ancestor has one.
    const buildSelectorPath = (start) => {
        const segments = [];
        for (let element = start; element; element = element.parentElement) {
            segments.unshift(describeElement(element));
            if (element.id) {
                break;
            }
        }
        return segments.join(' > ');
    };

    // `innerText` is only defined on HTML elements; fall back to textContent
    // for other element types instead of throwing on `.trim()`.
    const readText = (element) => {
        const text = typeof element.innerText === 'string'
            ? element.innerText
            : (element.textContent || '');
        return text.trim();
    };

    const results = [];

    for (const list of Array.from(document.querySelectorAll(listSelector))) {
        for (const child of Array.from(list.children)) {
            results.push({
                selector: buildSelectorPath(child),
                innerText: readText(child)
            });
        }
    }

    return results;
};
|
||||||
|
|
||||||
|
|
||||||
})(window);
|
})(window);
|
||||||
Reference in New Issue
Block a user