Merge pull request #482 from getmaxun/eff-scrape
feat: click next improvements
This commit is contained in:
@@ -524,7 +524,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
// Enhanced value extraction with context awareness
|
||||
function extractValue(element, attribute) {
|
||||
if (!element) return null;
|
||||
|
||||
|
||||
// Get context-aware base URL
|
||||
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
|
||||
|
||||
@@ -535,14 +535,34 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return shadowContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
if (attribute === 'innerText') {
|
||||
return element.innerText.trim();
|
||||
} else if (attribute === 'innerHTML') {
|
||||
return element.innerHTML.trim();
|
||||
} else if (attribute === 'src' || attribute === 'href') {
|
||||
const attrValue = element.getAttribute(attribute);
|
||||
return attrValue ? new URL(attrValue, baseURL).href : null;
|
||||
|
||||
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
||||
|
||||
if (!dataAttr || dataAttr.trim() === '') {
|
||||
if (attribute === 'src') {
|
||||
const style = window.getComputedStyle(element);
|
||||
const bgImage = style.backgroundImage;
|
||||
if (bgImage && bgImage !== 'none') {
|
||||
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
||||
return matches ? new URL(matches[1], baseURL).href : null;
|
||||
}
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
try {
|
||||
return new URL(dataAttr, baseURL).href;
|
||||
} catch (e) {
|
||||
console.warn('Error creating URL from', dataAttr, e);
|
||||
return dataAttr; // Return the original value if URL construction fails
|
||||
}
|
||||
}
|
||||
return element.getAttribute(attribute);
|
||||
}
|
||||
|
||||
@@ -704,13 +704,13 @@ export default class Interpreter extends EventEmitter {
|
||||
|
||||
await scrapeCurrentPage();
|
||||
if (checkLimit()) return allResults;
|
||||
|
||||
|
||||
const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
|
||||
|
||||
availableSelectors = updatedSelectors;
|
||||
|
||||
|
||||
if (!button || !workingSelector) {
|
||||
// Final retry for navigation when no selectors work
|
||||
// Final retry for navigation when no selectors work
|
||||
const success = await retryOperation(async () => {
|
||||
try {
|
||||
await page.evaluate(() => window.history.forward());
|
||||
@@ -724,70 +724,102 @@ export default class Interpreter extends EventEmitter {
|
||||
if (!success) return allResults;
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
let retryCount = 0;
|
||||
let navigationSuccess = false;
|
||||
|
||||
while (retryCount < MAX_RETRIES && !navigationSuccess) {
|
||||
let paginationSuccess = false;
|
||||
|
||||
// Capture basic content signature before click
|
||||
const captureContentSignature = async () => {
|
||||
return await page.evaluate((selector) => {
|
||||
const items = document.querySelectorAll(selector);
|
||||
return {
|
||||
url: window.location.href,
|
||||
itemCount: items.length,
|
||||
firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
|
||||
};
|
||||
}, config.listSelector);
|
||||
};
|
||||
|
||||
const beforeSignature = await captureContentSignature();
|
||||
debugLog(`Before click: ${beforeSignature.itemCount} items`);
|
||||
|
||||
while (retryCount < MAX_RETRIES && !paginationSuccess) {
|
||||
try {
|
||||
try {
|
||||
await Promise.all([
|
||||
page.waitForNavigation({
|
||||
waitUntil: 'networkidle',
|
||||
timeout: 15000
|
||||
}).catch(e => {
|
||||
throw e;
|
||||
}),
|
||||
button.click()
|
||||
]);
|
||||
navigationSuccess = true;
|
||||
} catch (error) {
|
||||
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
||||
|
||||
// If regular click fails, try dispatchEvent
|
||||
if (page.url() === currentUrl) {
|
||||
debugLog("Navigation successful after regular click");
|
||||
paginationSuccess = true;
|
||||
} catch (navError) {
|
||||
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
||||
try {
|
||||
await Promise.all([
|
||||
page.waitForNavigation({
|
||||
waitUntil: 'networkidle',
|
||||
timeout: 15000
|
||||
}).catch(e => {
|
||||
throw e;
|
||||
}),
|
||||
button.dispatchEvent('click')
|
||||
]);
|
||||
debugLog("Navigation successful after dispatch event");
|
||||
paginationSuccess = true;
|
||||
} catch (dispatchNavError) {
|
||||
try {
|
||||
await Promise.all([
|
||||
page.waitForNavigation({
|
||||
waitUntil: 'networkidle',
|
||||
timeout: 15000
|
||||
}),
|
||||
button.dispatchEvent('click')
|
||||
]);
|
||||
navigationSuccess = true;
|
||||
} catch (dispatchError) {
|
||||
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
||||
await button.click();
|
||||
await page.waitForTimeout(2000);
|
||||
} catch (clickError) {
|
||||
await button.dispatchEvent('click');
|
||||
await page.waitForTimeout(2000);
|
||||
}
|
||||
} else {
|
||||
navigationSuccess = true;
|
||||
}
|
||||
}
|
||||
|
||||
const newUrl = page.url();
|
||||
if (visitedUrls.has(newUrl)) {
|
||||
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
|
||||
navigationSuccess = false;
|
||||
}
|
||||
|
||||
if (navigationSuccess) {
|
||||
await page.waitForTimeout(1000);
|
||||
|
||||
await page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => {});
|
||||
|
||||
if (!paginationSuccess) {
|
||||
const newUrl = page.url();
|
||||
const afterSignature = await captureContentSignature();
|
||||
|
||||
if (newUrl !== currentUrl) {
|
||||
debugLog(`URL changed to ${newUrl}`);
|
||||
visitedUrls.add(newUrl);
|
||||
paginationSuccess = true;
|
||||
}
|
||||
else if (afterSignature.firstItems !== beforeSignature.firstItems) {
|
||||
debugLog("Content changed without URL change");
|
||||
paginationSuccess = true;
|
||||
}
|
||||
else if (afterSignature.itemCount !== beforeSignature.itemCount) {
|
||||
debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
|
||||
paginationSuccess = true;
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
|
||||
navigationSuccess = false;
|
||||
debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
|
||||
}
|
||||
|
||||
if (!navigationSuccess) {
|
||||
|
||||
if (!paginationSuccess) {
|
||||
retryCount++;
|
||||
if (retryCount < MAX_RETRIES) {
|
||||
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
||||
debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
||||
await page.waitForTimeout(RETRY_DELAY);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (!navigationSuccess) {
|
||||
debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
|
||||
|
||||
if (!paginationSuccess) {
|
||||
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
|
||||
return allResults;
|
||||
}
|
||||
|
||||
break;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user