Merge pull request #482 from getmaxun/eff-scrape
feat: click next improvements
This commit is contained in:
@@ -542,7 +542,27 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return element.innerHTML.trim();
|
return element.innerHTML.trim();
|
||||||
} else if (attribute === 'src' || attribute === 'href') {
|
} else if (attribute === 'src' || attribute === 'href') {
|
||||||
const attrValue = element.getAttribute(attribute);
|
const attrValue = element.getAttribute(attribute);
|
||||||
return attrValue ? new URL(attrValue, baseURL).href : null;
|
|
||||||
|
const dataAttr = attrValue || element.getAttribute('data-' + attribute);
|
||||||
|
|
||||||
|
if (!dataAttr || dataAttr.trim() === '') {
|
||||||
|
if (attribute === 'src') {
|
||||||
|
const style = window.getComputedStyle(element);
|
||||||
|
const bgImage = style.backgroundImage;
|
||||||
|
if (bgImage && bgImage !== 'none') {
|
||||||
|
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
|
||||||
|
return matches ? new URL(matches[1], baseURL).href : null;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return null;
|
||||||
|
}
|
||||||
|
|
||||||
|
try {
|
||||||
|
return new URL(dataAttr, baseURL).href;
|
||||||
|
} catch (e) {
|
||||||
|
console.warn('Error creating URL from', dataAttr, e);
|
||||||
|
return dataAttr; // Return the original value if URL construction fails
|
||||||
|
}
|
||||||
}
|
}
|
||||||
return element.getAttribute(attribute);
|
return element.getAttribute(attribute);
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -710,7 +710,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
availableSelectors = updatedSelectors;
|
availableSelectors = updatedSelectors;
|
||||||
|
|
||||||
if (!button || !workingSelector) {
|
if (!button || !workingSelector) {
|
||||||
// Final retry for navigation when no selectors work
|
// Final retry for navigation when no selectors work
|
||||||
const success = await retryOperation(async () => {
|
const success = await retryOperation(async () => {
|
||||||
try {
|
try {
|
||||||
await page.evaluate(() => window.history.forward());
|
await page.evaluate(() => window.history.forward());
|
||||||
@@ -726,68 +726,100 @@ export default class Interpreter extends EventEmitter {
|
|||||||
}
|
}
|
||||||
|
|
||||||
let retryCount = 0;
|
let retryCount = 0;
|
||||||
let navigationSuccess = false;
|
let paginationSuccess = false;
|
||||||
|
|
||||||
while (retryCount < MAX_RETRIES && !navigationSuccess) {
|
// Capture basic content signature before click
|
||||||
|
const captureContentSignature = async () => {
|
||||||
|
return await page.evaluate((selector) => {
|
||||||
|
const items = document.querySelectorAll(selector);
|
||||||
|
return {
|
||||||
|
url: window.location.href,
|
||||||
|
itemCount: items.length,
|
||||||
|
firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
|
||||||
|
};
|
||||||
|
}, config.listSelector);
|
||||||
|
};
|
||||||
|
|
||||||
|
const beforeSignature = await captureContentSignature();
|
||||||
|
debugLog(`Before click: ${beforeSignature.itemCount} items`);
|
||||||
|
|
||||||
|
while (retryCount < MAX_RETRIES && !paginationSuccess) {
|
||||||
try {
|
try {
|
||||||
try {
|
try {
|
||||||
await Promise.all([
|
await Promise.all([
|
||||||
page.waitForNavigation({
|
page.waitForNavigation({
|
||||||
waitUntil: 'networkidle',
|
waitUntil: 'networkidle',
|
||||||
timeout: 15000
|
timeout: 15000
|
||||||
|
}).catch(e => {
|
||||||
|
throw e;
|
||||||
}),
|
}),
|
||||||
button.click()
|
button.click()
|
||||||
]);
|
]);
|
||||||
navigationSuccess = true;
|
debugLog("Navigation successful after regular click");
|
||||||
} catch (error) {
|
paginationSuccess = true;
|
||||||
debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying DispatchEvent`);
|
} catch (navError) {
|
||||||
|
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
||||||
// If regular click fails, try dispatchEvent
|
try {
|
||||||
if (page.url() === currentUrl) {
|
await Promise.all([
|
||||||
|
page.waitForNavigation({
|
||||||
|
waitUntil: 'networkidle',
|
||||||
|
timeout: 15000
|
||||||
|
}).catch(e => {
|
||||||
|
throw e;
|
||||||
|
}),
|
||||||
|
button.dispatchEvent('click')
|
||||||
|
]);
|
||||||
|
debugLog("Navigation successful after dispatch event");
|
||||||
|
paginationSuccess = true;
|
||||||
|
} catch (dispatchNavError) {
|
||||||
try {
|
try {
|
||||||
await Promise.all([
|
await button.click();
|
||||||
page.waitForNavigation({
|
await page.waitForTimeout(2000);
|
||||||
waitUntil: 'networkidle',
|
} catch (clickError) {
|
||||||
timeout: 15000
|
await button.dispatchEvent('click');
|
||||||
}),
|
await page.waitForTimeout(2000);
|
||||||
button.dispatchEvent('click')
|
|
||||||
]);
|
|
||||||
navigationSuccess = true;
|
|
||||||
} catch (dispatchError) {
|
|
||||||
debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
|
|
||||||
}
|
}
|
||||||
} else {
|
|
||||||
navigationSuccess = true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const newUrl = page.url();
|
await page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => {});
|
||||||
if (visitedUrls.has(newUrl)) {
|
|
||||||
debugLog(`Detected navigation to previously visited URL ${newUrl} on attempt ${retryCount + 1}`);
|
|
||||||
navigationSuccess = false;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (navigationSuccess) {
|
if (!paginationSuccess) {
|
||||||
await page.waitForTimeout(1000);
|
const newUrl = page.url();
|
||||||
|
const afterSignature = await captureContentSignature();
|
||||||
|
|
||||||
|
if (newUrl !== currentUrl) {
|
||||||
|
debugLog(`URL changed to ${newUrl}`);
|
||||||
|
visitedUrls.add(newUrl);
|
||||||
|
paginationSuccess = true;
|
||||||
|
}
|
||||||
|
else if (afterSignature.firstItems !== beforeSignature.firstItems) {
|
||||||
|
debugLog("Content changed without URL change");
|
||||||
|
paginationSuccess = true;
|
||||||
|
}
|
||||||
|
else if (afterSignature.itemCount !== beforeSignature.itemCount) {
|
||||||
|
debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
|
||||||
|
paginationSuccess = true;
|
||||||
|
}
|
||||||
}
|
}
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
debugLog(`Navigation attempt ${retryCount + 1} failed completely.`);
|
debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
|
||||||
navigationSuccess = false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!navigationSuccess) {
|
if (!paginationSuccess) {
|
||||||
retryCount++;
|
retryCount++;
|
||||||
if (retryCount < MAX_RETRIES) {
|
if (retryCount < MAX_RETRIES) {
|
||||||
debugLog(`Retrying navigation - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
||||||
await page.waitForTimeout(RETRY_DELAY);
|
await page.waitForTimeout(RETRY_DELAY);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!navigationSuccess) {
|
if (!paginationSuccess) {
|
||||||
debugLog(`Navigation failed after ${MAX_RETRIES} attempts`);
|
debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
|
||||||
return allResults;
|
return allResults;
|
||||||
}
|
}
|
||||||
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user