Merge pull request #591 from getmaxun/scrape-improve
feat: improve selector generation highlighting
This commit is contained in:
@@ -725,6 +725,30 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
});
|
||||
}
|
||||
|
||||
/**
 * Resolves `originalSelector` under `rootElement`, progressively relaxing the
 * first `:nth-child(n)` constraint when the exact selector finds nothing.
 *
 * Lookup order:
 *   1. the selector exactly as given;
 *   2. the same selector with the first `nth-child(n)` index lowered to
 *      n-1, n-2, ... 1, stopping at the first truthy result;
 *   3. the selector with that `:nth-child(n)` constraint dropped entirely.
 *
 * Only the first `nth-child(<digits>)` occurrence is relaxed; selectors with
 * no such occurrence (including formula arguments such as `2n+1`) get the
 * exact lookup only.
 *
 * @param rootElement - Search root, passed through to `queryElement` unchanged.
 * @param originalSelector - Selector string to resolve.
 * @returns The first truthy result produced by `queryElement`, otherwise the
 *          (falsy) result of the last lookup that was attempted.
 */
function tryFallbackSelector(rootElement, originalSelector) {
  const exactMatch = queryElement(rootElement, originalSelector);
  if (exactMatch) return exactMatch;

  const nthChild = /nth-child\((\d+)\)/.exec(originalSelector);
  if (!nthChild) return exactMatch;

  // Walk the index downwards: the wanted element may sit at an earlier
  // position in this container than it did where the selector was recorded.
  const position = parseInt(nthChild[1], 10);
  for (let index = position - 1; index >= 1; index--) {
    const loweredSelector = originalSelector.replace(/nth-child\(\d+\)/, `nth-child(${index})`);
    const candidate = queryElement(rootElement, loweredSelector);
    if (candidate) return candidate;
  }

  // Last resort: drop the positional constraint. When the pseudo-class is
  // attached to a compound selector (`div:nth-child(3)`), removing it leaves a
  // valid selector (`div`). When it stands alone -- at the start of the string
  // or right after a combinator/comma/paren -- plain removal would leave an
  // empty or dangling selector (`""`, `ul > `), which DOM selector APIs reject
  // with a SyntaxError. In that case substitute the universal selector `*`,
  // which is what a bare `:nth-child(n)` implicitly applies to.
  const baseSelector = originalSelector.replace(
    /(^|[\s>+~,(])?:nth-child\(\d+\)/,
    (_whole, lead) => (lead === undefined ? '' : `${lead}*`)
  );
  return queryElement(rootElement, baseSelector);
}
|
||||
|
||||
// Main scraping logic with context support
|
||||
let containers = queryElementAll(document, listSelector);
|
||||
containers = Array.from(containers);
|
||||
@@ -902,7 +926,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
||||
// Get the last part of the selector after any context delimiter
|
||||
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
||||
const element = queryElement(container, relativeSelector);
|
||||
const element = tryFallbackSelector(container, relativeSelector);
|
||||
|
||||
if (element) {
|
||||
record[label] = extractValue(element, attribute);
|
||||
|
||||
@@ -825,6 +825,7 @@ export default class Interpreter extends EventEmitter {
|
||||
button.click()
|
||||
]);
|
||||
debugLog("Navigation successful after regular click");
|
||||
await page.waitForTimeout(2000);
|
||||
paginationSuccess = true;
|
||||
} catch (navError) {
|
||||
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
||||
@@ -839,6 +840,7 @@ export default class Interpreter extends EventEmitter {
|
||||
button.dispatchEvent('click')
|
||||
]);
|
||||
debugLog("Navigation successful after dispatch event");
|
||||
await page.waitForTimeout(2000);
|
||||
paginationSuccess = true;
|
||||
} catch (dispatchNavError) {
|
||||
try {
|
||||
|
||||
@@ -2299,7 +2299,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
||||
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
||||
}
|
||||
|
||||
// Regular DOM path generation
|
||||
const elementSelector = getNonUniqueSelector(element);
|
||||
|
||||
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
||||
return elementSelector;
|
||||
}
|
||||
|
||||
const path: string[] = [];
|
||||
let currentElement = element;
|
||||
const MAX_DEPTH = 2;
|
||||
@@ -2656,7 +2661,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
||||
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
||||
}
|
||||
|
||||
// Regular DOM path generation
|
||||
const elementSelector = getNonUniqueSelector(element);
|
||||
|
||||
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
||||
return elementSelector;
|
||||
}
|
||||
|
||||
const path: string[] = [];
|
||||
let currentElement = element;
|
||||
const MAX_DEPTH = 2;
|
||||
@@ -2753,12 +2763,14 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro
|
||||
const frameElement = ownerDocument?.defaultView?.frameElement;
|
||||
if (frameElement) {
|
||||
const frameSelector = getNonUniqueSelector(frameElement as HTMLElement);
|
||||
const isFrame = frameElement.tagName === 'FRAME';
|
||||
// Use the appropriate delimiter based on whether it's a frame or iframe
|
||||
return `${frameSelector} :>> ${elementSelector}`;
|
||||
}
|
||||
|
||||
// Regular DOM context
|
||||
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
||||
return elementSelector;
|
||||
}
|
||||
|
||||
const parentSelector = getNonUniqueSelector(element.parentElement);
|
||||
return `${parentSelector} > ${elementSelector}`;
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user