diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 09b6578b..a2009d78 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -265,41 +265,72 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const scrapedData = []; while (scrapedData.length < limit) { - // Get all parent elements matching the listSelector - const parentElements = Array.from(document.querySelectorAll(listSelector)); + let parentElements = Array.from(document.querySelectorAll(listSelector)); + + // If we only got one element or none, try a more generic approach + if (limit > 1 && parentElements.length <= 1) { + const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); + const container = document.querySelector(containerSelector); + + if (container) { + const allChildren = Array.from(container.children); + + const firstMatch = document.querySelector(listSelector); + if (firstMatch) { + // Get classes from the first matching element + const firstMatchClasses = Array.from(firstMatch.classList); + + // Find similar elements by matching most of their classes + parentElements = allChildren.filter(element => { + const elementClasses = Array.from(element.classList); - // Iterate through each parent element - for (const parent of parentElements) { - if (scrapedData.length >= limit) break; - const record = {}; - - // For each field, select the corresponding element within the parent - for (const [label, { selector, attribute }] of Object.entries(fields)) { - const fieldElement = parent.querySelector(selector); - - if (fieldElement) { - if (attribute === 'innerText') { - record[label] = fieldElement.innerText.trim(); - } else if (attribute === 'innerHTML') { - record[label] = fieldElement.innerHTML.trim(); - } else if (attribute === 'src') { - // Handle relative 'src' URLs - const src = fieldElement.getAttribute('src'); - record[label] = src ? new URL(src, window.location.origin).href : null; - } else if (attribute === 'href') { - // Handle relative 'href' URLs - const href = fieldElement.getAttribute('href'); - record[label] = href ? new URL(href, window.location.origin).href : null; - } else { - record[label] = fieldElement.getAttribute(attribute); + // Element should share at least 70% of classes with the first match + const commonClasses = firstMatchClasses.filter(cls => + elementClasses.includes(cls)); + return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); + }); + } } - } } - scrapedData.push(record); - } + + // Iterate through each parent element + for (const parent of parentElements) { + if (scrapedData.length >= limit) break; + const record = {}; + + // For each field, select the corresponding element within the parent + for (const [label, { selector, attribute }] of Object.entries(fields)) { + const fieldElement = parent.querySelector(selector); + + if (fieldElement) { + if (attribute === 'innerText') { + record[label] = fieldElement.innerText.trim(); + } else if (attribute === 'innerHTML') { + record[label] = fieldElement.innerHTML.trim(); + } else if (attribute === 'src') { + // Handle relative 'src' URLs + const src = fieldElement.getAttribute('src'); + record[label] = src ? new URL(src, window.location.origin).href : null; + } else if (attribute === 'href') { + // Handle relative 'href' URLs + const href = fieldElement.getAttribute('href'); + record[label] = href ? new URL(href, window.location.origin).href : null; + } else { + record[label] = fieldElement.getAttribute(attribute); + } + } + } + scrapedData.push(record); + } + + // If we've processed all available elements and still haven't reached the limit, + // break to avoid infinite loop + if (parentElements.length === 0 || scrapedData.length >= parentElements.length) { + break; + } } - return scrapedData - }; + return scrapedData; +}; /** diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 9fab04d3..213a0e86 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -544,9 +544,9 @@ export class WorkflowGenerator { * @returns {Promise} */ private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => { - const elementInfo = await getElementInformation(page, coordinates, this.listSelector); + const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); const selectorBasedOnCustomAction = (this.getList === true) - ? await getNonUniqueSelectors(page, coordinates) + ? await getNonUniqueSelectors(page, coordinates, this.listSelector) : await getSelectors(page, coordinates); const bestSelector = getBestSelectorForAction( @@ -572,9 +572,9 @@ export class WorkflowGenerator { * @returns {Promise} */ public generateDataForHighlighter = async (page: Page, coordinates: Coordinates) => { - const rect = await getRect(page, coordinates, this.listSelector); + const rect = await getRect(page, coordinates, this.listSelector, this.getList); const displaySelector = await this.generateSelector(page, coordinates, ActionType.Click); - const elementInfo = await getElementInformation(page, coordinates, this.listSelector); + const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); if (rect) { if (this.getList === true) { if (this.listSelector !== '') { diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 891c0e3b..9c62139b 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -17,9 +17,10 @@ export const getElementInformation = async ( page: Page, coordinates: Coordinates, listSelector: string, + getList: boolean ) => { try { - if (listSelector !== '') { + if (!getList || listSelector !== '') { const elementInfo = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; @@ -74,22 +75,10 @@ export const getElementInformation = async ( if (originalEl) { let element = originalEl; - const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE', - 'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME', - 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET', - 'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT', - 'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT', - 'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET', - 'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A' - ]; while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); - if (!containerTags.includes(element.parentElement.tagName)) { - break; - } - const fullyContained = parentRect.left <= childRect.left && parentRect.right >= childRect.right && @@ -167,9 +156,9 @@ export const getElementInformation = async ( * @category WorkflowManagement-Selectors * @returns {Promise} */ -export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string) => { +export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => { try { - if (listSelector !== '') { + if (!getList || listSelector !== '') { const rect = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; @@ -202,22 +191,10 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (originalEl) { let element = originalEl; - const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE', - 'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME', - 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET', - 'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT', - 'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT', - 'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET', - 'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A' - ]; while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); - if (!containerTags.includes(element.parentElement.tagName)) { - break; - } - const fullyContained = parentRect.left <= childRect.left && parentRect.right >= childRect.right && @@ -875,8 +852,10 @@ interface SelectorResult { * @returns {Promise} */ -export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise => { +export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise => { try { + if (!listSelector) { + console.log(`NON UNIQUE: MODE 1`) const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); @@ -914,47 +893,82 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let element = originalEl; - const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE', - 'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME', - 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET', - 'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT', - 'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT', - 'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET', - 'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A' - ]; - - while (element.parentElement) { - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); - - if (!containerTags.includes(element.parentElement.tagName)) { - break; + // if (listSelector === '') { + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); + + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } } - - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; - - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; - - if (fullyContained && significantOverlap) { - element = element.parentElement; - } else { - break; - } - } + // } const generalSelector = getSelectorPath(element); return { generalSelector, }; }, coordinates); - return selectors || { generalSelector: '' }; + } else { + console.log(`NON UNIQUE: MODE 2`) + const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { + function getNonUniqueSelector(element: HTMLElement): string { + let selector = element.tagName.toLowerCase(); + + if (element.className) { + const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); + if (classes.length > 0) { + const validClasses = classes.filter((cls: string) => !cls.startsWith('!') && !cls.includes(':')); + if (validClasses.length > 0) { + selector += '.' + validClasses.map(cls => CSS.escape(cls)).join('.'); + } + } + } + + return selector; + } + + function getSelectorPath(element: HTMLElement | null): string { + const path: string[] = []; + let depth = 0; + const maxDepth = 2; + + while (element && element !== document.body && depth < maxDepth) { + const selector = getNonUniqueSelector(element); + path.unshift(selector); + element = element.parentElement; + depth++; + } + + return path.join(' > '); + } + + const originalEl = document.elementFromPoint(x, y) as HTMLElement; + if (!originalEl) return null; + + let element = originalEl; + + const generalSelector = getSelectorPath(element); + return { + generalSelector, + }; + }, coordinates); + return selectors || { generalSelector: '' }; + } + } catch (error) { console.error('Error in getNonUniqueSelectors:', error); return { generalSelector: '' };