Merge pull request #591 from getmaxun/scrape-improve
feat: improve selector generation highlighting
This commit is contained in:
@@ -725,6 +725,30 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Resolves `originalSelector` under `rootElement`, relaxing the first
 * numeric `nth-child(N)` index when the exact selector finds nothing.
 *
 * Lookup order:
 *   1. the selector exactly as given;
 *   2. the same selector with the first `nth-child(N)` rewritten to
 *      N-1, N-2, ... down to 1 (first hit wins);
 *   3. the selector with the first `:nth-child(N)` removed entirely.
 *
 * Steps 2 and 3 only run when the selector contains a numeric
 * `nth-child(...)`; otherwise the result of step 1 is returned as-is.
 *
 * @param rootElement      Node passed through to `queryElement` as the search root.
 * @param originalSelector Selector string to resolve.
 * @returns Whatever `queryElement` returned for the first successful
 *          lookup, or its falsy result when every attempt failed.
 */
function tryFallbackSelector(rootElement, originalSelector) {
  const directHit = queryElement(rootElement, originalSelector);
  if (directHit || !originalSelector.includes('nth-child')) {
    return directHit;
  }

  // Only a purely numeric index (e.g. nth-child(3)) is eligible for relaxation.
  const indexed = /nth-child\((\d+)\)/.exec(originalSelector);
  if (indexed === null) {
    return directHit;
  }

  // Walk the position downwards: N-1, N-2, ..., 1.
  let candidateIndex = parseInt(indexed[1], 10) - 1;
  while (candidateIndex >= 1) {
    const relaxedSelector = originalSelector.replace(
      /nth-child\(\d+\)/,
      `nth-child(${candidateIndex})`
    );
    const found = queryElement(rootElement, relaxedSelector);
    if (found) {
      return found;
    }
    candidateIndex -= 1;
  }

  // Last resort: drop the positional constraint altogether.
  const unindexedSelector = originalSelector.replace(/\:nth-child\(\d+\)/, '');
  return queryElement(rootElement, unindexedSelector);
}
|
||||||
|
|
||||||
// Main scraping logic with context support
|
// Main scraping logic with context support
|
||||||
let containers = queryElementAll(document, listSelector);
|
let containers = queryElementAll(document, listSelector);
|
||||||
containers = Array.from(containers);
|
containers = Array.from(containers);
|
||||||
@@ -902,7 +926,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
||||||
// Get the last part of the selector after any context delimiter
|
// Get the last part of the selector after any context delimiter
|
||||||
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
|
||||||
const element = queryElement(container, relativeSelector);
|
const element = tryFallbackSelector(container, relativeSelector);
|
||||||
|
|
||||||
if (element) {
|
if (element) {
|
||||||
record[label] = extractValue(element, attribute);
|
record[label] = extractValue(element, attribute);
|
||||||
|
|||||||
@@ -825,6 +825,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
button.click()
|
button.click()
|
||||||
]);
|
]);
|
||||||
debugLog("Navigation successful after regular click");
|
debugLog("Navigation successful after regular click");
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
paginationSuccess = true;
|
paginationSuccess = true;
|
||||||
} catch (navError) {
|
} catch (navError) {
|
||||||
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
debugLog("Regular click with navigation failed, trying dispatch event with navigation");
|
||||||
@@ -839,6 +840,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
button.dispatchEvent('click')
|
button.dispatchEvent('click')
|
||||||
]);
|
]);
|
||||||
debugLog("Navigation successful after dispatch event");
|
debugLog("Navigation successful after dispatch event");
|
||||||
|
await page.waitForTimeout(2000);
|
||||||
paginationSuccess = true;
|
paginationSuccess = true;
|
||||||
} catch (dispatchNavError) {
|
} catch (dispatchNavError) {
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -2299,7 +2299,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
|||||||
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Regular DOM path generation
|
const elementSelector = getNonUniqueSelector(element);
|
||||||
|
|
||||||
|
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
||||||
|
return elementSelector;
|
||||||
|
}
|
||||||
|
|
||||||
const path: string[] = [];
|
const path: string[] = [];
|
||||||
let currentElement = element;
|
let currentElement = element;
|
||||||
const MAX_DEPTH = 2;
|
const MAX_DEPTH = 2;
|
||||||
@@ -2656,7 +2661,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
|||||||
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
return selectorParts.join(contextPath[0].type === 'shadow' ? ' >> ' : ' :>> ');
|
||||||
}
|
}
|
||||||
|
|
||||||
// Regular DOM path generation
|
const elementSelector = getNonUniqueSelector(element);
|
||||||
|
|
||||||
|
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
||||||
|
return elementSelector;
|
||||||
|
}
|
||||||
|
|
||||||
const path: string[] = [];
|
const path: string[] = [];
|
||||||
let currentElement = element;
|
let currentElement = element;
|
||||||
const MAX_DEPTH = 2;
|
const MAX_DEPTH = 2;
|
||||||
@@ -2753,12 +2763,14 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro
|
|||||||
const frameElement = ownerDocument?.defaultView?.frameElement;
|
const frameElement = ownerDocument?.defaultView?.frameElement;
|
||||||
if (frameElement) {
|
if (frameElement) {
|
||||||
const frameSelector = getNonUniqueSelector(frameElement as HTMLElement);
|
const frameSelector = getNonUniqueSelector(frameElement as HTMLElement);
|
||||||
const isFrame = frameElement.tagName === 'FRAME';
|
|
||||||
// Use the appropriate delimiter based on whether it's a frame or iframe
|
// Use the appropriate delimiter based on whether it's a frame or iframe
|
||||||
return `${frameSelector} :>> ${elementSelector}`;
|
return `${frameSelector} :>> ${elementSelector}`;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Regular DOM context
|
if (elementSelector.includes('.') && elementSelector.split('.').length > 1) {
|
||||||
|
return elementSelector;
|
||||||
|
}
|
||||||
|
|
||||||
const parentSelector = getNonUniqueSelector(element.parentElement);
|
const parentSelector = getNonUniqueSelector(element.parentElement);
|
||||||
return `${parentSelector} > ${elementSelector}`;
|
return `${parentSelector} > ${elementSelector}`;
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user