feat: scrapeList
This commit is contained in:
@@ -278,76 +278,42 @@ async function scrollDownToLoadMore(selector, limit) {
|
|||||||
* @param {boolean} [config.flexible=false] - Whether to use flexible matching for field selectors
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/
window.scrapeList = async function (config) {
  // NOTE(review): `flexible` is destructured per the documented interface but is
  // not yet used in this body — presumably reserved for flexible selector
  // matching; confirm against callers before removing.
  const { listSelector, fields, limit, flexible = false, pagination } = config;

  // Every container matching the list selector; each container yields one
  // sub-array of scraped items in the returned array-of-arrays.
  const lists = Array.from(document.querySelectorAll(listSelector));

  // Run pagination first so lazily-loaded items exist in the DOM before we read it.
  if (pagination) {
    const { type, selector } = pagination;
    switch (type) {
      case 'scrollDown':
        // Use the destructured `selector`/`limit` (previously re-read from
        // `pagination.selector` / `config.limit` — same values, clearer intent).
        await scrollDownToLoadMore(selector, limit);
        break;
      // TODO: implement 'scrollUp', 'clickNext', and 'clickLoadMore' pagination types.
      default:
        // Unknown/absent pagination type: scrape whatever is already rendered.
        break;
    }
  }

  return lists.map((list) => {
    // Direct children of the container are treated as the list items.
    const listItems = Array.from(list.children);
    // A falsy `limit` (undefined/0) means "scrape every item".
    const itemsToScrape = limit ? listItems.slice(0, limit) : listItems;

    return itemsToScrape.map((item) => {
      const scrapedItem = {};

      for (const [fieldName, fieldConfig] of Object.entries(fields)) {
        const element = item.querySelector(fieldConfig.selector);

        if (element) {
          // Extract according to the requested attribute; 'innerText' is the default.
          switch (fieldConfig.attribute) {
            case 'href':
              scrapedItem[fieldName] = element.getAttribute('href');
              break;
            case 'src':
              scrapedItem[fieldName] = element.getAttribute('src');
              break;
            case 'textContent':
              scrapedItem[fieldName] = element.textContent.trim();
              break;
            case 'innerText':
            default:
              scrapedItem[fieldName] = element.innerText.trim();
              break;
          }
        } else {
          // Record the miss inline so callers can see which fields failed to match.
          scrapedItem[fieldName] = `Failed to scrape ${fieldName}`;
        }
      }

      return scrapedItem;
    });
  });
};
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user