feat: add logic to scrape multiple nested shadow DOM elements
This commit is contained in:
@@ -246,7 +246,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return currentElements;
|
||||
}
|
||||
|
||||
// Helper function to extract value from element based on attribute
|
||||
function getElementValue(element, attribute) {
|
||||
if (!element) return null;
|
||||
|
||||
@@ -294,12 +293,12 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
});
|
||||
}
|
||||
|
||||
// Main scraping logic
|
||||
// First try the MBE approach
|
||||
const seedName = getSeedKey(lists);
|
||||
const seedElements = findAllElements(lists[seedName]);
|
||||
const MBEs = getMBEs(seedElements);
|
||||
|
||||
return MBEs.map((mbe) => omap(
|
||||
const mbeResults = MBEs.map((mbe) => omap(
|
||||
lists,
|
||||
(config) => {
|
||||
const elem = findAllElements(config)
|
||||
@@ -309,6 +308,33 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
},
|
||||
(key) => key
|
||||
)) || [];
|
||||
|
||||
// If MBE approach didn't find all elements, try independent scraping
|
||||
if (mbeResults.some(result => Object.values(result).some(v => v === undefined))) {
|
||||
// Fall back to independent scraping
|
||||
const results = [];
|
||||
const foundElements = new Map();
|
||||
|
||||
// Find all elements for each selector
|
||||
Object.entries(lists).forEach(([key, config]) => {
|
||||
const elements = findAllElements(config);
|
||||
foundElements.set(key, elements);
|
||||
});
|
||||
|
||||
// Create result objects for each found element
|
||||
foundElements.forEach((elements, key) => {
|
||||
elements.forEach((element, index) => {
|
||||
if (!results[index]) {
|
||||
results[index] = {};
|
||||
}
|
||||
results[index][key] = getElementValue(element, lists[key].attribute);
|
||||
});
|
||||
});
|
||||
|
||||
return results.filter(result => Object.keys(result).length > 0);
|
||||
}
|
||||
|
||||
return mbeResults;
|
||||
};
|
||||
|
||||
/**
|
||||
|
||||
Reference in New Issue
Block a user