From e70145219eca9a092487db350487f4a6bb711906 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sun, 15 Dec 2024 04:58:13 +0530 Subject: [PATCH 01/11] feat: remove container tags --- server/src/workflow-management/selector.ts | 37 ---------------------- 1 file changed, 37 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 891c0e3b..bde38300 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -74,22 +74,10 @@ export const getElementInformation = async ( if (originalEl) { let element = originalEl; - const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE', - 'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME', - 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET', - 'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT', - 'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT', - 'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET', - 'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A' - ]; while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); - if (!containerTags.includes(element.parentElement.tagName)) { - break; - } - const fullyContained = parentRect.left <= childRect.left && parentRect.right >= childRect.right && @@ -202,22 +190,10 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (originalEl) { let element = originalEl; - const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE', - 'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 
'MARK', 'SUMMARY', 'TIME', - 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET', - 'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT', - 'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT', - 'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET', - 'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A' - ]; while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); - if (!containerTags.includes(element.parentElement.tagName)) { - break; - } - const fullyContained = parentRect.left <= childRect.left && parentRect.right >= childRect.right && @@ -914,23 +890,10 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let element = originalEl; - const containerTags = ['DIV', 'SECTION', 'ARTICLE', 'MAIN', 'HEADER', 'FOOTER', 'NAV', 'ASIDE', - 'ADDRESS', 'BLOCKQUOTE', 'DETAILS', 'DIALOG', 'FIGURE', 'FIGCAPTION', 'MAIN', 'MARK', 'SUMMARY', 'TIME', - 'TABLE', 'THEAD', 'TBODY', 'TFOOT', 'TR', 'TH', 'TD', 'CAPTION', 'COLGROUP', 'COL', 'FORM', 'FIELDSET', - 'LEGEND', 'LABEL', 'INPUT', 'BUTTON', 'SELECT', 'DATALIST', 'OPTGROUP', 'OPTION', 'TEXTAREA', 'OUTPUT', - 'PROGRESS', 'METER', 'DETAILS', 'SUMMARY', 'MENU', 'MENUITEM', 'MENUITEM', 'APPLET', 'EMBED', 'OBJECT', - 'PARAM', 'VIDEO', 'AUDIO', 'SOURCE', 'TRACK', 'CANVAS', 'MAP', 'AREA', 'SVG', 'IFRAME', 'FRAME', 'FRAMESET', - 'LI', 'UL', 'OL', 'DL', 'DT', 'DD', 'HR', 'P', 'PRE', 'LISTING', 'PLAINTEXT', 'A' - ]; - while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); - if (!containerTags.includes(element.parentElement.tagName)) { - break; - } - const fullyContained = parentRect.left <= 
childRect.left && parentRect.right >= childRect.right && From cb0965323e3a8f13483e198bb1e7dbf086a4481d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sun, 15 Dec 2024 05:01:15 +0530 Subject: [PATCH 02/11] feat: accept getList in getRect and getElementInfo --- server/src/workflow-management/classes/Generator.ts | 6 +++--- server/src/workflow-management/selector.ts | 3 ++- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 57be015e..2ab54753 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -541,7 +541,7 @@ export class WorkflowGenerator { * @returns {Promise} */ private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => { - const elementInfo = await getElementInformation(page, coordinates, this.listSelector); + const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); const selectorBasedOnCustomAction = (this.getList === true) ? 
await getNonUniqueSelectors(page, coordinates) : await getSelectors(page, coordinates); @@ -569,9 +569,9 @@ export class WorkflowGenerator { * @returns {Promise} */ public generateDataForHighlighter = async (page: Page, coordinates: Coordinates) => { - const rect = await getRect(page, coordinates, this.listSelector); + const rect = await getRect(page, coordinates, this.listSelector, this.getList); const displaySelector = await this.generateSelector(page, coordinates, ActionType.Click); - const elementInfo = await getElementInformation(page, coordinates, this.listSelector); + const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); if (rect) { if (this.getList === true) { if (this.listSelector !== '') { diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index bde38300..fd25a617 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -17,6 +17,7 @@ export const getElementInformation = async ( page: Page, coordinates: Coordinates, listSelector: string, + getList: boolean ) => { try { if (listSelector !== '') { @@ -155,7 +156,7 @@ export const getElementInformation = async ( * @category WorkflowManagement-Selectors * @returns {Promise} */ -export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string) => { +export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => { try { if (listSelector !== '') { const rect = await page.evaluate( From ddb880df668e84ebce1d7731f6dda6aa2413a486 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sun, 15 Dec 2024 05:07:45 +0530 Subject: [PATCH 03/11] fix: capture text selection --- server/src/workflow-management/selector.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index fd25a617..36b592a6 100644 --- 
a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -20,7 +20,7 @@ export const getElementInformation = async ( getList: boolean ) => { try { - if (listSelector !== '') { + if (!getList) { const elementInfo = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; @@ -158,7 +158,7 @@ export const getElementInformation = async ( */ export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => { try { - if (listSelector !== '') { + if (!getList) { const rect = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; From 97e7c89105132d42864be52386e9971208f88d8f Mon Sep 17 00:00:00 2001 From: amhsirak Date: Sun, 15 Dec 2024 05:13:08 +0530 Subject: [PATCH 04/11] feat: re-add listSelector empty check for child selection --- server/src/workflow-management/selector.ts | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 36b592a6..699cb669 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -20,7 +20,7 @@ export const getElementInformation = async ( getList: boolean ) => { try { - if (!getList) { + if (!getList || (getList && listSelector !== '')) { const elementInfo = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; @@ -158,7 +158,7 @@ export const getElementInformation = async ( */ export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => { try { - if (!getList) { + if (!getList || (getList && listSelector !== '')) { const rect = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; From 0c3b1e3e53c4e52c7898a2f8637884c0f07e118a Mon Sep 17 00:00:00 2001 From: amhsirak Date: 
Sun, 15 Dec 2024 05:29:30 +0530 Subject: [PATCH 05/11] feat: pass listSelector --- .../workflow-management/classes/Generator.ts | 2 +- server/src/workflow-management/selector.ts | 40 ++++++++++--------- 2 files changed, 22 insertions(+), 20 deletions(-) diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 2ab54753..31775261 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -543,7 +543,7 @@ export class WorkflowGenerator { private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => { const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); const selectorBasedOnCustomAction = (this.getList === true) - ? await getNonUniqueSelectors(page, coordinates) + ? await getNonUniqueSelectors(page, coordinates, this.listSelector) : await getSelectors(page, coordinates); const bestSelector = getBestSelectorForAction( diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 699cb669..527a800f 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -852,7 +852,7 @@ interface SelectorResult { * @returns {Promise} */ -export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise => { +export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise => { try { const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { function getNonUniqueSelector(element: HTMLElement): string { @@ -891,24 +891,26 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let element = originalEl; - while (element.parentElement) { - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); - - const 
fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; - - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; - - if (fullyContained && significantOverlap) { - element = element.parentElement; - } else { - break; + if (listSelector === '') { + while (element.parentElement) { + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); + + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; + + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + element = element.parentElement; + } else { + break; + } } } From e1476935db354b7030eced9447c280241defb713 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 16 Dec 2024 07:25:13 +0530 Subject: [PATCH 06/11] fix: dont pass listSelector to non unique --- server/src/workflow-management/classes/Generator.ts | 2 +- server/src/workflow-management/selector.ts | 11 ++++++----- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 31775261..2ab54753 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -543,7 +543,7 @@ export class WorkflowGenerator { private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => { const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); const selectorBasedOnCustomAction = (this.getList === true) - ? 
await getNonUniqueSelectors(page, coordinates, this.listSelector) + ? await getNonUniqueSelectors(page, coordinates) : await getSelectors(page, coordinates); const bestSelector = getBestSelectorForAction( diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 527a800f..070a897d 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -20,7 +20,8 @@ export const getElementInformation = async ( getList: boolean ) => { try { - if (!getList || (getList && listSelector !== '')) { + console.log(`List Selector Value From EL INFO: ->> ${listSelector !== '' ? listSelector: 'It is empty'}`); + if (!getList ||listSelector !== '') { const elementInfo = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; @@ -158,7 +159,7 @@ export const getElementInformation = async ( */ export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => { try { - if (!getList || (getList && listSelector !== '')) { + if (!getList || listSelector !== '') { const rect = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; @@ -852,7 +853,7 @@ interface SelectorResult { * @returns {Promise} */ -export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise => { +export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise => { try { const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { function getNonUniqueSelector(element: HTMLElement): string { @@ -891,7 +892,7 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let element = originalEl; - if (listSelector === '') { + // if (listSelector === '') { while (element.parentElement) { const parentRect = element.parentElement.getBoundingClientRect(); const childRect = 
element.getBoundingClientRect(); @@ -912,7 +913,7 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates break; } } - } + // } const generalSelector = getSelectorPath(element); return { From 4a9496053177663e4081850cf9db57742899a578 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 16 Dec 2024 08:22:21 +0530 Subject: [PATCH 07/11] feat: push parentSelector --- server/src/workflow-management/selector.ts | 1 + 1 file changed, 1 insertion(+) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 070a897d..e326f21b 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -968,6 +968,7 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro const childPath = getSelectorPath(child); if (childPath) { selectors.push(childPath); // Add direct child path + selectors.push(parentSelector) selectors = selectors.concat(getAllDescendantSelectors(child)); // Recursively process descendants } } From 94df79404011d99cc6cc5d80bb1c77208f9abcc5 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 17 Dec 2024 02:48:14 +0530 Subject: [PATCH 08/11] feat: conditionally compute non unique --- server/src/workflow-management/selector.ts | 55 ++++++++++++++++++++-- 1 file changed, 51 insertions(+), 4 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index e326f21b..60f5bdbd 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -20,8 +20,7 @@ export const getElementInformation = async ( getList: boolean ) => { try { - console.log(`List Selector Value From EL INFO: ->> ${listSelector !== '' ? 
listSelector: 'It is empty'}`); - if (!getList ||listSelector !== '') { + if (!getList || listSelector !== '') { const elementInfo = await page.evaluate( async ({ x, y }) => { const el = document.elementFromPoint(x, y) as HTMLElement; @@ -853,8 +852,10 @@ interface SelectorResult { * @returns {Promise} */ -export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise => { +export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise => { try { + if (!listSelector) { + console.log(`NON UNIQUE: MODE 1`) const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); @@ -920,8 +921,54 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates generalSelector, }; }, coordinates); - return selectors || { generalSelector: '' }; + } else { + console.log(`NON UNIQUE: MODE 2`) + const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { + function getNonUniqueSelector(element: HTMLElement): string { + let selector = element.tagName.toLowerCase(); + + if (element.className) { + const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); + if (classes.length > 0) { + const validClasses = classes.filter((cls: string) => !cls.startsWith('!') && !cls.includes(':')); + if (validClasses.length > 0) { + selector += '.' 
+ validClasses.map(cls => CSS.escape(cls)).join('.'); + } + } + } + + return selector; + } + + function getSelectorPath(element: HTMLElement | null): string { + const path: string[] = []; + let depth = 0; + const maxDepth = 2; + + while (element && element !== document.body && depth < maxDepth) { + const selector = getNonUniqueSelector(element); + path.unshift(selector); + element = element.parentElement; + depth++; + } + + return path.join(' > '); + } + + const originalEl = document.elementFromPoint(x, y) as HTMLElement; + if (!originalEl) return null; + + let element = originalEl; + + const generalSelector = getSelectorPath(element); + return { + generalSelector, + }; + }, coordinates); + return selectors || { generalSelector: '' }; + } + } catch (error) { console.error('Error in getNonUniqueSelectors:', error); return { generalSelector: '' }; From 52b767188eedd3ef3c3053a3e50d054fb9b35e44 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 17 Dec 2024 02:48:38 +0530 Subject: [PATCH 09/11] feat: !push parentSelector --- server/src/workflow-management/selector.ts | 1 - 1 file changed, 1 deletion(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 60f5bdbd..9c62139b 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -1015,7 +1015,6 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro const childPath = getSelectorPath(child); if (childPath) { selectors.push(childPath); // Add direct child path - selectors.push(parentSelector) selectors = selectors.concat(getAllDescendantSelectors(child)); // Recursively process descendants } } From 647cd62e32fba8fc55084dbacfef6029f071d076 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 17 Dec 2024 11:37:05 +0530 Subject: [PATCH 10/11] feat: add listSelector param --- server/src/workflow-management/classes/Generator.ts | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git 
a/server/src/workflow-management/classes/Generator.ts b/server/src/workflow-management/classes/Generator.ts index 2ab54753..31775261 100644 --- a/server/src/workflow-management/classes/Generator.ts +++ b/server/src/workflow-management/classes/Generator.ts @@ -543,7 +543,7 @@ export class WorkflowGenerator { private generateSelector = async (page: Page, coordinates: Coordinates, action: ActionType) => { const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList); const selectorBasedOnCustomAction = (this.getList === true) - ? await getNonUniqueSelectors(page, coordinates) + ? await getNonUniqueSelectors(page, coordinates, this.listSelector) : await getSelectors(page, coordinates); const bestSelector = getBestSelectorForAction( From a9dc4c8f4ceeca8abe45268331b1369d4a1cbbb9 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 17 Dec 2024 12:21:56 +0530 Subject: [PATCH 11/11] feat: add conditional check to collect matching classes --- maxun-core/src/browserSide/scraper.js | 93 ++++++++++++++++++--------- 1 file changed, 62 insertions(+), 31 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 09b6578b..a2009d78 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -265,41 +265,72 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const scrapedData = []; while (scrapedData.length < limit) { - // Get all parent elements matching the listSelector - const parentElements = Array.from(document.querySelectorAll(listSelector)); + let parentElements = Array.from(document.querySelectorAll(listSelector)); + + // If we only got one element or none, try a more generic approach + if (limit > 1 && parentElements.length <= 1) { + const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); + const container = document.querySelector(containerSelector); + + if (container) { + const allChildren = 
Array.from(container.children); + + const firstMatch = document.querySelector(listSelector); + if (firstMatch) { + // Get classes from the first matching element + const firstMatchClasses = Array.from(firstMatch.classList); + + // Find similar elements by matching most of their classes + parentElements = allChildren.filter(element => { + const elementClasses = Array.from(element.classList); - // Iterate through each parent element - for (const parent of parentElements) { - if (scrapedData.length >= limit) break; - const record = {}; - - // For each field, select the corresponding element within the parent - for (const [label, { selector, attribute }] of Object.entries(fields)) { - const fieldElement = parent.querySelector(selector); - - if (fieldElement) { - if (attribute === 'innerText') { - record[label] = fieldElement.innerText.trim(); - } else if (attribute === 'innerHTML') { - record[label] = fieldElement.innerHTML.trim(); - } else if (attribute === 'src') { - // Handle relative 'src' URLs - const src = fieldElement.getAttribute('src'); - record[label] = src ? new URL(src, window.location.origin).href : null; - } else if (attribute === 'href') { - // Handle relative 'href' URLs - const href = fieldElement.getAttribute('href'); - record[label] = href ? 
new URL(href, window.location.origin).href : null; - } else { - record[label] = fieldElement.getAttribute(attribute); + // Element should share at least 70% of classes with the first match + const commonClasses = firstMatchClasses.filter(cls => + elementClasses.includes(cls)); + return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); + }); + } } - } } - scrapedData.push(record); - } + + // Iterate through each parent element + for (const parent of parentElements) { + if (scrapedData.length >= limit) break; + const record = {}; + + // For each field, select the corresponding element within the parent + for (const [label, { selector, attribute }] of Object.entries(fields)) { + const fieldElement = parent.querySelector(selector); + + if (fieldElement) { + if (attribute === 'innerText') { + record[label] = fieldElement.innerText.trim(); + } else if (attribute === 'innerHTML') { + record[label] = fieldElement.innerHTML.trim(); + } else if (attribute === 'src') { + // Handle relative 'src' URLs + const src = fieldElement.getAttribute('src'); + record[label] = src ? new URL(src, window.location.origin).href : null; + } else if (attribute === 'href') { + // Handle relative 'href' URLs + const href = fieldElement.getAttribute('href'); + record[label] = href ? new URL(href, window.location.origin).href : null; + } else { + record[label] = fieldElement.getAttribute(attribute); + } + } + } + scrapedData.push(record); + } + + // If we've processed all available elements and still haven't reached the limit, + // break to avoid infinite loop + if (parentElements.length === 0 || scrapedData.length >= parentElements.length) { + break; + } } - return scrapedData - }; + return scrapedData; +}; /**