feat: wrap scrape & scrapeSchema in IIFE
This commit is contained in:
@@ -130,58 +130,65 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
* Returns a "scrape" result from the current page.
|
* Returns a "scrape" result from the current page.
|
||||||
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
|
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
|
||||||
*/
|
*/
|
||||||
function scrape(selector = null) {
|
// Wrap the entire function in an IIFE (Immediately Invoked Function Expression)
|
||||||
|
// and attach it to the window object
|
||||||
|
(function(window) {
|
||||||
/**
|
/**
|
||||||
* **crudeRecords** contains uncurated rundowns of "scrapable" elements
|
* Returns a "scrape" result from the current page.
|
||||||
* @type {Array<Object>}
|
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
|
||||||
*/
|
*/
|
||||||
const crudeRecords = (selector
|
window.scrape = function(selector = null) {
|
||||||
? Array.from(document.querySelectorAll(selector))
|
/**
|
||||||
: scrapableHeuristics())
|
* **crudeRecords** contains uncurated rundowns of "scrapable" elements
|
||||||
.map((record) => ({
|
* @type {Array<Object>}
|
||||||
...Array.from(record.querySelectorAll('img'))
|
*/
|
||||||
.reduce((p, x, i) => {
|
const crudeRecords = (selector
|
||||||
let url = null;
|
? Array.from(document.querySelectorAll(selector))
|
||||||
if (x.srcset) {
|
: scrapableHeuristics())
|
||||||
const urls = x.srcset.split(', ');
|
.map((record) => ({
|
||||||
[url] = urls[urls.length - 1].split(' ');
|
...Array.from(record.querySelectorAll('img'))
|
||||||
}
|
.reduce((p, x, i) => {
|
||||||
|
let url = null;
|
||||||
|
if (x.srcset) {
|
||||||
|
const urls = x.srcset.split(', ');
|
||||||
|
[url] = urls[urls.length - 1].split(' ');
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Contains the largest elements from `srcset` - if `srcset` is not present, contains
|
* Contains the largest elements from `srcset` - if `srcset` is not present, contains
|
||||||
* URL from the `src` attribute
|
* URL from the `src` attribute
|
||||||
*
|
*
|
||||||
* If the `src` attribute contains a data url, imgUrl contains `undefined`.
|
* If the `src` attribute contains a data url, imgUrl contains `undefined`.
|
||||||
*/
|
*/
|
||||||
let imgUrl;
|
let imgUrl;
|
||||||
if (x.srcset) {
|
if (x.srcset) {
|
||||||
imgUrl = url;
|
imgUrl = url;
|
||||||
} else if (x.src.indexOf('data:') === -1) {
|
} else if (x.src.indexOf('data:') === -1) {
|
||||||
imgUrl = x.src;
|
imgUrl = x.src;
|
||||||
}
|
}
|
||||||
|
|
||||||
return ({
|
return ({
|
||||||
|
...p,
|
||||||
|
...(imgUrl ? { [`img_${i}`]: imgUrl } : {}),
|
||||||
|
});
|
||||||
|
}, {}),
|
||||||
|
...record.innerText.split('\n')
|
||||||
|
.reduce((p, x, i) => ({
|
||||||
...p,
|
...p,
|
||||||
...(imgUrl ? { [`img_${i}`]: imgUrl } : {}),
|
[`record_${String(i).padStart(4, '0')}`]: x.trim(),
|
||||||
});
|
}), {}),
|
||||||
}, {}),
|
}));
|
||||||
...record.innerText.split('\n')
|
|
||||||
.reduce((p, x, i) => ({
|
|
||||||
...p,
|
|
||||||
[`record_${String(i).padStart(4, '0')}`]: x.trim(),
|
|
||||||
}), {}),
|
|
||||||
}));
|
|
||||||
|
|
||||||
return crudeRecords;
|
return crudeRecords;
|
||||||
}
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Given an object with named lists of elements,
|
* Given an object with named lists of elements,
|
||||||
* groups the elements by their distance in the DOM tree.
|
* groups the elements by their distance in the DOM tree.
|
||||||
* @param {Object.<string, object[]>} lists The named lists of HTML elements.
|
* @param {Object.<string, object[]>} lists The named lists of HTML elements.
|
||||||
* @returns {Array.<Object.<string, string>>}
|
* @returns {Array.<Object.<string, string>>}
|
||||||
*/
|
*/
|
||||||
function scrapeSchema(lists) {
|
window.scrapeSchema = function (lists) {
|
||||||
function omap(object, f, kf = (x) => x) {
|
function omap(object, f, kf = (x) => x) {
|
||||||
return Object.fromEntries(
|
return Object.fromEntries(
|
||||||
Object.entries(object)
|
Object.entries(object)
|
||||||
@@ -224,3 +231,6 @@ function scrapeSchema(lists) {
|
|||||||
(listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText,
|
(listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText,
|
||||||
));
|
));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
|
})(window);
|
||||||
Reference in New Issue
Block a user