Merge pull request #591 from getmaxun/scrape-improve

feat: improve selector generation highlighting
This commit is contained in:
Karishma Shukla
2025-05-21 20:22:02 +05:30
committed by GitHub
3 changed files with 43 additions and 5 deletions

View File

@@ -725,6 +725,30 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
});
}
function tryFallbackSelector(rootElement, originalSelector) {
let element = queryElement(rootElement, originalSelector);
if (!element && originalSelector.includes('nth-child')) {
const match = originalSelector.match(/nth-child\((\d+)\)/);
if (match) {
const position = parseInt(match[1], 10);
for (let i = position - 1; i >= 1; i--) {
const fallbackSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${i})`);
element = queryElement(rootElement, fallbackSelector);
if (element) break;
}
if (!element) {
const baseSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
element = queryElement(rootElement, baseSelector);
}
}
}
return element;
}
// Main scraping logic with context support
let containers = queryElementAll(document, listSelector);
containers = Array.from(containers);
@@ -902,7 +926,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
// Get the last part of the selector after any context delimiter
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = queryElement(container, relativeSelector);
const element = tryFallbackSelector(container, relativeSelector);
if (element) {
record[label] = extractValue(element, attribute);

View File

@@ -825,6 +825,7 @@ export default class Interpreter extends EventEmitter {
button.click()
]);
debugLog("Navigation successful after regular click");
await page.waitForTimeout(2000);
paginationSuccess = true;
} catch (navError) {
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
@@ -839,6 +840,7 @@ export default class Interpreter extends EventEmitter {
button.dispatchEvent('click')
]);
debugLog("Navigation successful after dispatch event");
await page.waitForTimeout(2000);
paginationSuccess = true;
} catch (dispatchNavError) {
try {

View File

@@ -2299,7 +2299,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
}
// Regular DOM path generation
const elementSelector = getNonUniqueSelector(element);
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
return elementSelector;
}
const path: string[] = [];
let currentElement = element;
const MAX_DEPTH = 2;
@@ -2656,7 +2661,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
}
// Regular DOM path generation
const elementSelector = getNonUniqueSelector(element);
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
return elementSelector;
}
const path: string[] = [];
let currentElement = element;
const MAX_DEPTH = 2;
@@ -2753,12 +2763,14 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro
const frameElement = ownerDocument?.defaultView?.frameElement;
if (frameElement) {
const frameSelector = getNonUniqueSelector(frameElement as HTMLElement);
const isFrame = frameElement.tagName === 'FRAME';
// Use the appropriate delimiter based on whether it's a frame or iframe
return `${frameSelector} :>> ${elementSelector}`;
}
// Regular DOM context
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
return elementSelector;
}
const parentSelector = getNonUniqueSelector(element.parentElement);
return `${parentSelector} > ${elementSelector}`;
}