Merge pull request #462 from getmaxun/improve-extract
feat: improve highlighting and data extraction
This commit is contained in:
@@ -210,7 +210,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return Array.from(document.querySelectorAll(config.selector));
|
return Array.from(document.querySelectorAll(config.selector));
|
||||||
}
|
}
|
||||||
|
|
||||||
// First handle iframe traversal if present
|
|
||||||
if (config.selector.includes(':>>')) {
|
if (config.selector.includes(':>>')) {
|
||||||
const parts = config.selector.split(':>>').map(s => s.trim());
|
const parts = config.selector.split(':>>').map(s => s.trim());
|
||||||
let currentElements = [document];
|
let currentElements = [document];
|
||||||
@@ -223,23 +222,44 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
for (const element of currentElements) {
|
for (const element of currentElements) {
|
||||||
try {
|
try {
|
||||||
// For document or iframe document
|
|
||||||
const doc = element.contentDocument || element || element.contentWindow?.document;
|
const doc = element.contentDocument || element || element.contentWindow?.document;
|
||||||
if (!doc) continue;
|
if (!doc) continue;
|
||||||
|
|
||||||
// Query elements in current context
|
if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
|
||||||
|
const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
|
||||||
|
if (nameMatch && nameMatch[1]) {
|
||||||
|
const frameName = nameMatch[1];
|
||||||
|
let foundFrames = [];
|
||||||
|
|
||||||
|
if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
|
||||||
|
foundFrames = Array.from(doc.getElementsByName(frameName))
|
||||||
|
.filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
|
||||||
|
}
|
||||||
|
|
||||||
|
if (foundFrames.length === 0) {
|
||||||
|
const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
|
||||||
|
foundFrames = framesBySelector;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (isLast) {
|
||||||
|
nextElements.push(...foundFrames);
|
||||||
|
} else {
|
||||||
|
nextElements.push(...foundFrames);
|
||||||
|
}
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const found = Array.from(doc.querySelectorAll(part));
|
const found = Array.from(doc.querySelectorAll(part));
|
||||||
|
|
||||||
if (isLast) {
|
if (isLast) {
|
||||||
// If it's the last part, keep all matching elements
|
|
||||||
nextElements.push(...found);
|
nextElements.push(...found);
|
||||||
} else {
|
} else {
|
||||||
// If not last, only keep iframes for next iteration
|
const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
|
||||||
const iframes = found.filter(el => el.tagName === 'IFRAME');
|
nextElements.push(...frames);
|
||||||
nextElements.push(...iframes);
|
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
console.warn('Cannot access iframe content:', error, {
|
console.warn('Cannot access iframe/frame content:', error, {
|
||||||
part,
|
part,
|
||||||
element,
|
element,
|
||||||
index: i
|
index: i
|
||||||
@@ -285,12 +305,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return [];
|
return [];
|
||||||
}
|
}
|
||||||
|
|
||||||
// Modified to handle iframe context for URL resolution
|
|
||||||
function getElementValue(element, attribute) {
|
function getElementValue(element, attribute) {
|
||||||
if (!element) return null;
|
if (!element) return null;
|
||||||
|
|
||||||
// Get the base URL for resolving relative URLs
|
let baseURL;
|
||||||
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
|
try {
|
||||||
|
baseURL = element.ownerDocument?.location?.href ||
|
||||||
|
element.ownerDocument?.baseURI ||
|
||||||
|
window.location.origin;
|
||||||
|
} catch (e) {
|
||||||
|
baseURL = window.location.origin;
|
||||||
|
}
|
||||||
|
|
||||||
switch (attribute) {
|
switch (attribute) {
|
||||||
case 'href': {
|
case 'href': {
|
||||||
@@ -305,6 +330,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return element.innerText?.trim();
|
return element.innerText?.trim();
|
||||||
case 'textContent':
|
case 'textContent':
|
||||||
return element.textContent?.trim();
|
return element.textContent?.trim();
|
||||||
|
case 'innerHTML':
|
||||||
|
return element.innerHTML;
|
||||||
|
case 'outerHTML':
|
||||||
|
return element.outerHTML;
|
||||||
default:
|
default:
|
||||||
return element.getAttribute(attribute) || element.innerText?.trim();
|
return element.getAttribute(attribute) || element.innerText?.trim();
|
||||||
}
|
}
|
||||||
@@ -394,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
||||||
*/
|
*/
|
||||||
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
||||||
// Enhanced query function to handle both iframe and shadow DOM
|
// Enhanced query function to handle iframe, frame and shadow DOM
|
||||||
const queryElement = (rootElement, selector) => {
|
const queryElement = (rootElement, selector) => {
|
||||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
||||||
return rootElement.querySelector(selector);
|
return rootElement.querySelector(selector);
|
||||||
@@ -406,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
for (let i = 0; i < parts.length; i++) {
|
for (let i = 0; i < parts.length; i++) {
|
||||||
if (!currentElement) return null;
|
if (!currentElement) return null;
|
||||||
|
|
||||||
// Handle iframe traversal
|
// Handle iframe and frame traversal
|
||||||
if (currentElement.tagName === 'IFRAME') {
|
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
||||||
currentElement = iframeDoc.querySelector(parts[i]);
|
currentElement = frameDoc.querySelector(parts[i]);
|
||||||
continue;
|
continue;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe content:', e);
|
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -456,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const nextElements = [];
|
const nextElements = [];
|
||||||
|
|
||||||
for (const element of currentElements) {
|
for (const element of currentElements) {
|
||||||
// Handle iframe traversal
|
// Handle iframe and frame traversal
|
||||||
if (element.tagName === 'IFRAME') {
|
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = element.contentDocument || element.contentWindow.document;
|
const frameDoc = element.contentDocument || element.contentWindow.document;
|
||||||
nextElements.push(...iframeDoc.querySelectorAll(part));
|
nextElements.push(...frameDoc.querySelectorAll(part));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe content:', e);
|
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -537,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return { type: 'TR', element: currentElement };
|
return { type: 'TR', element: currentElement };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle iframe crossing
|
// Handle iframe and frame crossing
|
||||||
if (currentElement.tagName === 'IFRAME') {
|
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
currentElement = currentElement.contentDocument.body;
|
currentElement = currentElement.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -582,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
if (current.tagName === 'TH') return true;
|
if (current.tagName === 'TH') return true;
|
||||||
|
|
||||||
if (current.tagName === 'IFRAME') {
|
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
current = current.contentDocument.body;
|
current = current.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -638,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get elements from iframes
|
// Get elements from iframes and frames
|
||||||
const iframes = document.getElementsByTagName('iframe');
|
const frames = [
|
||||||
for (const iframe of iframes) {
|
...Array.from(document.getElementsByTagName('iframe')),
|
||||||
|
...Array.from(document.getElementsByTagName('frame'))
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const frame of frames) {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
|
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
||||||
allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
|
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe content:', e);
|
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -707,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const tableData = [];
|
const tableData = [];
|
||||||
const nonTableData = [];
|
const nonTableData = [];
|
||||||
|
|
||||||
// Process table data with both iframe and shadow DOM support
|
// Process table data with support for iframes, frames, and shadow DOM
|
||||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||||
const container = containers[containerIndex];
|
const container = containers[containerIndex];
|
||||||
const { tableFields } = containerFields[containerIndex];
|
const { tableFields } = containerFields[containerIndex];
|
||||||
@@ -717,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const firstElement = queryElement(container, firstField.selector);
|
const firstElement = queryElement(container, firstField.selector);
|
||||||
let tableContext = firstElement;
|
let tableContext = firstElement;
|
||||||
|
|
||||||
// Find table context including both iframe and shadow DOM
|
// Find table context including iframe, frame and shadow DOM
|
||||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
||||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
||||||
tableContext = tableContext.getRootNode().host;
|
tableContext = tableContext.getRootNode().host;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tableContext.tagName === 'IFRAME') {
|
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
tableContext = tableContext.contentDocument.body;
|
tableContext = tableContext.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -747,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get rows from iframes
|
// Get rows from iframes and frames
|
||||||
if (tableContext.tagName === 'IFRAME') {
|
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
||||||
rows.push(...iframeDoc.getElementsByTagName('TR'));
|
rows.push(...frameDoc.getElementsByTagName('TR'));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe rows:', e);
|
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -823,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process non-table data with both contexts support
|
// Process non-table data with all contexts support
|
||||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||||
if (nonTableData.length >= limit) break;
|
if (nonTableData.length >= limit) break;
|
||||||
|
|
||||||
|
|||||||
@@ -286,6 +286,12 @@ export default class Interpreter extends EventEmitter {
|
|||||||
? arrayToObject(<any>superset[key])
|
? arrayToObject(<any>superset[key])
|
||||||
: superset[key];
|
: superset[key];
|
||||||
|
|
||||||
|
if ((key === 'url' || key === 'selectors') &&
|
||||||
|
Array.isArray(value) && Array.isArray(superset[key]) &&
|
||||||
|
value.length === 0 && (superset[key] as any[]).length === 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
|
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
|
||||||
return value.some(selector =>
|
return value.some(selector =>
|
||||||
(superset[key] as any[]).includes(selector)
|
(superset[key] as any[]).includes(selector)
|
||||||
@@ -592,33 +598,52 @@ export default class Interpreter extends EventEmitter {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Enhanced button finder with retry mechanism
|
// Enhanced button finder with retry mechanism
|
||||||
const findWorkingButton = async (selectors: string[], retryCount = 0): Promise<{
|
const findWorkingButton = async (selectors: string[]): Promise<{
|
||||||
button: ElementHandle | null,
|
button: ElementHandle | null,
|
||||||
workingSelector: string | null
|
workingSelector: string | null,
|
||||||
|
updatedSelectors: string[]
|
||||||
}> => {
|
}> => {
|
||||||
for (const selector of selectors) {
|
let updatedSelectors = [...selectors];
|
||||||
try {
|
|
||||||
const button = await page.waitForSelector(selector, {
|
for (let i = 0; i < selectors.length; i++) {
|
||||||
state: 'attached',
|
const selector = selectors[i];
|
||||||
timeout: 10000 // Reduced timeout for faster checks
|
let retryCount = 0;
|
||||||
});
|
let selectorSuccess = false;
|
||||||
if (button) {
|
|
||||||
debugLog('Found working selector:', selector);
|
while (retryCount < MAX_RETRIES && !selectorSuccess) {
|
||||||
return { button, workingSelector: selector };
|
try {
|
||||||
|
const button = await page.waitForSelector(selector, {
|
||||||
|
state: 'attached',
|
||||||
|
timeout: 10000
|
||||||
|
});
|
||||||
|
|
||||||
|
if (button) {
|
||||||
|
debugLog('Found working selector:', selector);
|
||||||
|
return {
|
||||||
|
button,
|
||||||
|
workingSelector: selector,
|
||||||
|
updatedSelectors
|
||||||
|
};
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
retryCount++;
|
||||||
|
debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
|
||||||
|
|
||||||
|
if (retryCount < MAX_RETRIES) {
|
||||||
|
await page.waitForTimeout(RETRY_DELAY);
|
||||||
|
} else {
|
||||||
|
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
||||||
|
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
|
||||||
debugLog(`Selector failed: ${selector}`);
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Implement retry mechanism when no selectors work
|
return {
|
||||||
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
|
button: null,
|
||||||
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
workingSelector: null,
|
||||||
await page.waitForTimeout(RETRY_DELAY);
|
updatedSelectors
|
||||||
return findWorkingButton(selectors, retryCount + 1);
|
};
|
||||||
}
|
|
||||||
|
|
||||||
return { button: null, workingSelector: null };
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
|
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
|
||||||
@@ -680,7 +705,10 @@ export default class Interpreter extends EventEmitter {
|
|||||||
await scrapeCurrentPage();
|
await scrapeCurrentPage();
|
||||||
if (checkLimit()) return allResults;
|
if (checkLimit()) return allResults;
|
||||||
|
|
||||||
const { button, workingSelector } = await findWorkingButton(availableSelectors);
|
const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
|
||||||
|
|
||||||
|
availableSelectors = updatedSelectors;
|
||||||
|
|
||||||
if (!button || !workingSelector) {
|
if (!button || !workingSelector) {
|
||||||
// Final retry for navigation when no selectors work
|
// Final retry for navigation when no selectors work
|
||||||
const success = await retryOperation(async () => {
|
const success = await retryOperation(async () => {
|
||||||
@@ -697,10 +725,6 @@ export default class Interpreter extends EventEmitter {
|
|||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
availableSelectors = availableSelectors.slice(
|
|
||||||
availableSelectors.indexOf(workingSelector)
|
|
||||||
);
|
|
||||||
|
|
||||||
let retryCount = 0;
|
let retryCount = 0;
|
||||||
let navigationSuccess = false;
|
let navigationSuccess = false;
|
||||||
|
|
||||||
@@ -768,22 +792,25 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
case 'clickLoadMore': {
|
case 'clickLoadMore': {
|
||||||
|
await scrapeCurrentPage();
|
||||||
|
if (checkLimit()) return allResults;
|
||||||
|
|
||||||
|
let loadMoreCounter = 0;
|
||||||
|
let previousResultCount = allResults.length;
|
||||||
|
let noNewItemsCounter = 0;
|
||||||
|
const MAX_NO_NEW_ITEMS = 2;
|
||||||
|
|
||||||
while (true) {
|
while (true) {
|
||||||
// Find working button with retry mechanism, consistent with clickNext
|
// Find working button with retry mechanism
|
||||||
const { button: loadMoreButton, workingSelector } = await findWorkingButton(availableSelectors);
|
const { button: loadMoreButton, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
|
||||||
|
|
||||||
|
availableSelectors = updatedSelectors;
|
||||||
|
|
||||||
if (!workingSelector || !loadMoreButton) {
|
if (!workingSelector || !loadMoreButton) {
|
||||||
debugLog('No working Load More selector found after retries');
|
debugLog('No working Load More selector found after retries');
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update available selectors to start from the working one
|
|
||||||
availableSelectors = availableSelectors.slice(
|
|
||||||
availableSelectors.indexOf(workingSelector)
|
|
||||||
);
|
|
||||||
|
|
||||||
// Implement retry mechanism for clicking the button
|
// Implement retry mechanism for clicking the button
|
||||||
let retryCount = 0;
|
let retryCount = 0;
|
||||||
let clickSuccess = false;
|
let clickSuccess = false;
|
||||||
@@ -808,6 +835,8 @@ export default class Interpreter extends EventEmitter {
|
|||||||
|
|
||||||
if (clickSuccess) {
|
if (clickSuccess) {
|
||||||
await page.waitForTimeout(1000);
|
await page.waitForTimeout(1000);
|
||||||
|
loadMoreCounter++;
|
||||||
|
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
|
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
|
||||||
@@ -822,8 +851,6 @@ export default class Interpreter extends EventEmitter {
|
|||||||
|
|
||||||
if (!clickSuccess) {
|
if (!clickSuccess) {
|
||||||
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -833,20 +860,34 @@ export default class Interpreter extends EventEmitter {
|
|||||||
await page.waitForTimeout(2000);
|
await page.waitForTimeout(2000);
|
||||||
|
|
||||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||||
if (currentHeight === previousHeight) {
|
const heightChanged = currentHeight !== previousHeight;
|
||||||
debugLog('No more items loaded after Load More');
|
|
||||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
||||||
allResults = allResults.concat(finalResults);
|
|
||||||
return allResults;
|
|
||||||
}
|
|
||||||
previousHeight = currentHeight;
|
previousHeight = currentHeight;
|
||||||
|
|
||||||
if (config.limit && allResults.length >= config.limit) {
|
await scrapeCurrentPage();
|
||||||
allResults = allResults.slice(0, config.limit);
|
|
||||||
break;
|
const currentResultCount = allResults.length;
|
||||||
|
const newItemsAdded = currentResultCount > previousResultCount;
|
||||||
|
|
||||||
|
if (!newItemsAdded) {
|
||||||
|
noNewItemsCounter++;
|
||||||
|
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
|
||||||
|
|
||||||
|
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
|
||||||
|
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
|
||||||
|
return allResults;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
noNewItemsCounter = 0;
|
||||||
|
previousResultCount = currentResultCount;
|
||||||
|
}
|
||||||
|
|
||||||
|
if (checkLimit()) return allResults;
|
||||||
|
|
||||||
|
if (!heightChanged) {
|
||||||
|
debugLog('No more items loaded after Load More');
|
||||||
|
return allResults;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
break;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
default: {
|
default: {
|
||||||
|
|||||||
@@ -825,6 +825,7 @@ export class WorkflowGenerator {
|
|||||||
selectors?.testIdSelector,
|
selectors?.testIdSelector,
|
||||||
selectors?.id,
|
selectors?.id,
|
||||||
selectors?.hrefSelector,
|
selectors?.hrefSelector,
|
||||||
|
selectors?.relSelector,
|
||||||
selectors?.accessibilitySelector,
|
selectors?.accessibilitySelector,
|
||||||
selectors?.attrSelector
|
selectors?.attrSelector
|
||||||
]
|
]
|
||||||
|
|||||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user