From feb30b9f9e1ffd047e3054f7c10d90e07ec134d5 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 02:30:36 +0530 Subject: [PATCH 01/12] feat: add nth-child selectors for td tag --- server/src/workflow-management/selector.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 240f8921..af9de4af 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -869,6 +869,13 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); + if (selector === 'td' && element.parentElement) { + // Find position among td siblings + const siblings = Array.from(element.parentElement.children); + const position = siblings.indexOf(element) + 1; + return `${selector}:nth-child(${position})`; + } + if (element.className) { const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); if (classes.length > 0) { @@ -937,6 +944,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); + if (selector === 'td' && element.parentElement) { + const siblings = Array.from(element.parentElement.children); + const position = siblings.indexOf(element) + 1; + return `${selector}:nth-child(${position})`; + } + if (element.className) { const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); if (classes.length > 0) { @@ -991,6 +1004,12 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); + if (selector === 'td' && element.parentElement) { + const siblings = Array.from(element.parentElement.children); + const position = siblings.indexOf(element) + 1; + return `${selector}:nth-child(${position})`; + } + const className = typeof element.className === 'string' ? element.className : ''; if (className) { const classes = className.split(/\s+/).filter((cls: string) => Boolean(cls)); From 5ac88c6eda67f67229ca71e511cb7de25bfefd06 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 02:31:32 +0530 Subject: [PATCH 02/12] feat: add scraping logic for tabular data in scrapeList --- maxun-core/src/browserSide/scraper.js | 116 ++++++++++++++------------ 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index a2009d78..82341fbd 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,73 +262,83 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { + // Separate fields into table and non-table categories + const tableFields = {}; + const nonTableFields = {}; + + for (const [label, field] of Object.entries(fields)) { + if (['TD', 'TH', 'TR'].includes(field.tag)) { + tableFields[label] = field; + } else { + nonTableFields[label] = field; + } + } + + const parentElements = Array.from(document.querySelectorAll(listSelector)); const scrapedData = []; - while (scrapedData.length < limit) { - let parentElements = Array.from(document.querySelectorAll(listSelector)); - - // If we only got one element or none, try a more generic approach - if (limit > 1 && parentElements.length <= 1) { - const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); - const container = document.querySelector(containerSelector); - - if (container) { - const allChildren = Array.from(container.children); - - const firstMatch = document.querySelector(listSelector); - if (firstMatch) { - // Get classes from the first matching element - const firstMatchClasses = Array.from(firstMatch.classList); - - // Find similar elements by matching most of their classes - parentElements = allChildren.filter(element => { - const elementClasses = Array.from(element.classList); + for (const parent of parentElements) { + // First, get the number of rows we'll need by checking the first table field + const firstTableField = Object.values(tableFields)[0]; + const tableRows = firstTableField + ? Array.from(parent.querySelectorAll(firstTableField.selector)).slice(0, limit) + : [null]; - // Element should share at least 70% of classes with the first match - const commonClasses = firstMatchClasses.filter(cls => - elementClasses.includes(cls)); - return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); - }); - } - } - } - - // Iterate through each parent element - for (const parent of parentElements) { - if (scrapedData.length >= limit) break; + tableRows.forEach((_, rowIndex) => { const record = {}; - - // For each field, select the corresponding element within the parent - for (const [label, { selector, attribute }] of Object.entries(fields)) { - const fieldElement = parent.querySelector(selector); - - if (fieldElement) { + + // Table fields + for (const [label, { selector, attribute }] of Object.entries(tableFields)) { + const elements = Array.from(parent.querySelectorAll(selector)); + const element = elements[rowIndex]; + + if (element) { + let value; if (attribute === 'innerText') { - record[label] = fieldElement.innerText.trim(); + value = element.innerText.trim(); } else if (attribute === 'innerHTML') { - record[label] = fieldElement.innerHTML.trim(); - } else if (attribute === 'src') { - // Handle relative 'src' URLs - const src = fieldElement.getAttribute('src'); - record[label] = src ? new URL(src, window.location.origin).href : null; - } else if (attribute === 'href') { - // Handle relative 'href' URLs - const href = fieldElement.getAttribute('href'); - record[label] = href ? new URL(href, window.location.origin).href : null; + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; } else { - record[label] = fieldElement.getAttribute(attribute); + value = element.getAttribute(attribute); } + record[label] = value; } } - scrapedData.push(record); - } + + // Non table fields + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const element = parent.querySelector(selector); + + if (element) { + let value; + if (attribute === 'innerText') { + value = element.innerText.trim(); + } else if (attribute === 'innerHTML') { + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; + } else { + value = element.getAttribute(attribute); + } + record[label] = value; + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); + } + }); - // If we've processed all available elements and still haven't reached the limit, - // break to avoid infinite loop - if (parentElements.length === 0 || scrapedData.length >= parentElements.length) { + if (scrapedData.length >= limit) { + scrapedData.length = limit; break; } } + return scrapedData; }; From b411faf6812d78ffb763af4f3bf0016a397760a7 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 17:25:52 +0530 Subject: [PATCH 03/12] feat: add condition to ignore html and body tags on highlighting --- server/src/workflow-management/selector.ts | 42 ++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index af9de4af..9f64b418 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -85,6 +85,11 @@ export const getElementInformation = async ( let element = originalEl; while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } + const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -99,7 +104,14 @@ export const getElementInformation = async ( (parentRect.width * parentRect.height) > 0.5; if (fullyContained && significantOverlap) { - element = element.parentElement; + // Only traverse up if next parent isn't body or html + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } @@ -201,6 +213,11 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector let element = originalEl; while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } + const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -215,7 +232,14 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector (parentRect.width * parentRect.height) > 0.5; if (fullyContained && significantOverlap) { - element = element.parentElement; + // Only traverse up if next parent isn't body or html + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } @@ -911,6 +935,11 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates // if (listSelector === '') { while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } + const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -925,7 +954,14 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates (parentRect.width * parentRect.height) > 0.5; if (fullyContained && significantOverlap) { - element = element.parentElement; + // Only traverse up if next parent isn't body or html + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } From 3176fa275f5728095845ba9394b53cf268d1e56f Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 17:27:42 +0530 Subject: [PATCH 04/12] feat: add loop to handle nested scraping --- maxun-core/src/browserSide/scraper.js | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 82341fbd..2135311c 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,7 +262,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - // Separate fields into table and non-table categories const tableFields = {}; const nonTableFields = {}; @@ -278,19 +277,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const scrapedData = []; for (const parent of parentElements) { - // First, get the number of rows we'll need by checking the first table field - const firstTableField = Object.values(tableFields)[0]; - const tableRows = firstTableField - ? Array.from(parent.querySelectorAll(firstTableField.selector)).slice(0, limit) - : [null]; - - tableRows.forEach((_, rowIndex) => { + // Get the first field's elements to determine how many items we have + const firstField = Object.values(fields)[0]; + const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); + + // Process each item up to the limit + for (let i = 0; i < Math.min(baseElements.length, limit); i++) { const record = {}; - // Table fields + // Process table fields for (const [label, { selector, attribute }] of Object.entries(tableFields)) { const elements = Array.from(parent.querySelectorAll(selector)); - const element = elements[rowIndex]; + // Use the same index to maintain correspondence between fields + const element = elements[i]; if (element) { let value; @@ -308,9 +307,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } } - // Non table fields + // Process non-table fields for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const element = parent.querySelector(selector); + const elements = Array.from(parent.querySelectorAll(selector)); + // Use the same index to maintain correspondence between fields + const element = elements[i]; if (element) { let value; @@ -331,7 +332,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, if (Object.keys(record).length > 0) { scrapedData.push(record); } - }); + } if (scrapedData.length >= limit) { scrapedData.length = limit; @@ -342,7 +343,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return scrapedData; }; - /** * Gets all children of the elements matching the listSelector, * returning their CSS selectors and innerText. From 50bcd4b8dd54bba291ff7b7380422cac116c93c0 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Wed, 25 Dec 2024 22:19:18 +0530 Subject: [PATCH 05/12] feat: add logic to seperate table fields --- maxun-core/src/browserSide/scraper.js | 244 +++++++++++++++++++------- 1 file changed, 182 insertions(+), 62 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 2135311c..4fafd25b 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,83 +262,203 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - const tableFields = {}; - const nonTableFields = {}; - - for (const [label, field] of Object.entries(fields)) { - if (['TD', 'TH', 'TR'].includes(field.tag)) { - tableFields[label] = field; - } else { - nonTableFields[label] = field; + // Helper function to extract values from elements + function extractValue(element, attribute) { + if (!element) return null; + + if (attribute === 'innerText') { + return element.innerText.trim(); + } else if (attribute === 'innerHTML') { + return element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + return attrValue ? new URL(attrValue, window.location.origin).href : null; } + return element.getAttribute(attribute); } - const parentElements = Array.from(document.querySelectorAll(listSelector)); + // Helper function to find table ancestors + function findTableAncestor(element) { + let currentElement = element; + const MAX_DEPTH = 5; + let depth = 0; + + while (currentElement && depth < MAX_DEPTH) { + if (currentElement.tagName === 'TD') { + return { type: 'TD', element: currentElement }; + } else if (currentElement.tagName === 'TR') { + return { type: 'TR', element: currentElement }; + } + currentElement = currentElement.parentElement; + depth++; + } + return null; + } + + function getCellIndex(td) { + let index = 0; + let sibling = td; + while (sibling = sibling.previousElementSibling) { + index++; + } + return index; + } + + function hasThElement(row, tableFields) { + for (const [label, { selector }] of Object.entries(tableFields)) { + const element = row.querySelector(selector); + if (element) { + let current = element; + while (current && current !== row) { + if (current.tagName === 'TH') { + return true; + } + current = current.parentElement; + } + } + } + return false; + } + + function filterRowsBasedOnTag(rows, tableFields) { + for (const row of rows) { + if (hasThElement(row, tableFields)) { + return rows; + } + } + return rows.filter(row => row.getElementsByTagName('TH').length === 0); + } + + // Get all containers that match the listSelector + const containers = Array.from(document.querySelectorAll(listSelector)); + if (containers.length === 0) return []; + + // Initialize arrays to store field classifications for each container + const containerFields = containers.map(() => ({ + tableFields: {}, + nonTableFields: {} + })); + + // Analyze field types for each container + containers.forEach((container, containerIndex) => { + for (const [label, field] of Object.entries(fields)) { + const sampleElement = container.querySelector(field.selector); + + if (sampleElement) { + const ancestor = findTableAncestor(sampleElement); + if (ancestor) { + containerFields[containerIndex].tableFields[label] = { + ...field, + tableContext: ancestor.type, + cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 + }; + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } + }); + const scrapedData = []; - for (const parent of parentElements) { - // Get the first field's elements to determine how many items we have - const firstField = Object.values(fields)[0]; - const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); - - // Process each item up to the limit - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; + // Process each container + containers.forEach((container, containerIndex) => { + const { tableFields, nonTableFields } = containerFields[containerIndex]; + + // Handle table fields + if (Object.keys(tableFields).length > 0) { + // Find the common table ancestor + const firstField = Object.values(tableFields)[0]; + const firstElement = container.querySelector(firstField.selector); + let tableContext = firstElement; - // Process table fields - for (const [label, { selector, attribute }] of Object.entries(tableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); - } - record[label] = value; - } + while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { + tableContext = tableContext.parentElement; } - // Process non-table fields - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; + if (tableContext) { + const rows = Array.from(tableContext.getElementsByTagName('TR')); + const processedRows = filterRowsBasedOnTag(rows, tableFields); - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); + for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { + const record = {}; + const currentRow = processedRows[rowIndex]; + + for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { + let element = null; + + if (cellIndex >= 0) { + const td = currentRow.children[cellIndex]; + if (td) { + element = td.querySelector(selector); + + if (!element && selector.split(">").pop().includes('td:nth-child')) { + element = td; + } + + if (!element) { + const tagOnlySelector = selector.split('.')[0]; + element = td.querySelector(tagOnlySelector); + } + + if (!element) { + let currentElement = td; + while (currentElement && currentElement.children.length > 0) { + let foundContentChild = false; + for (const child of currentElement.children) { + if (extractValue(child, attribute)) { + currentElement = child; + foundContentChild = true; + break; + } + } + if (!foundContentChild) break; + } + element = currentElement; + } + } + } else { + element = currentRow.querySelector(selector); + } + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); } - record[label] = value; } } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } } - if (scrapedData.length >= limit) { - scrapedData.length = limit; - break; + // Handle non-table fields + if (Object.keys(nonTableFields).length > 0) { + const firstField = Object.values(nonTableFields)[0]; + const baseElements = Array.from(container.querySelectorAll(firstField.selector)); + + for (let i = 0; i < Math.min(baseElements.length, limit); i++) { + const record = {}; + + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const elements = Array.from(parent.querySelectorAll(selector)); + // Use the same index to maintain correspondence between fields + const element = elements[i]; + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); + } + } } - } + }); return scrapedData; }; From 641b6559a77d6efd671b4a5221aae59bb84a38ca Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 25 Dec 2024 23:01:35 +0530 Subject: [PATCH 06/12] fix: revert --- maxun-core/src/browserSide/scraper.js | 244 +++++++------------------- 1 file changed, 62 insertions(+), 182 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 4fafd25b..2135311c 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,203 +262,83 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - // Helper function to extract values from elements - function extractValue(element, attribute) { - if (!element) return null; - - if (attribute === 'innerText') { - return element.innerText.trim(); - } else if (attribute === 'innerHTML') { - return element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - return attrValue ? new URL(attrValue, window.location.origin).href : null; + const tableFields = {}; + const nonTableFields = {}; + + for (const [label, field] of Object.entries(fields)) { + if (['TD', 'TH', 'TR'].includes(field.tag)) { + tableFields[label] = field; + } else { + nonTableFields[label] = field; } - return element.getAttribute(attribute); } - // Helper function to find table ancestors - function findTableAncestor(element) { - let currentElement = element; - const MAX_DEPTH = 5; - let depth = 0; - - while (currentElement && depth < MAX_DEPTH) { - if (currentElement.tagName === 'TD') { - return { type: 'TD', element: currentElement }; - } else if (currentElement.tagName === 'TR') { - return { type: 'TR', element: currentElement }; - } - currentElement = currentElement.parentElement; - depth++; - } - return null; - } - - function getCellIndex(td) { - let index = 0; - let sibling = td; - while (sibling = sibling.previousElementSibling) { - index++; - } - return index; - } - - function hasThElement(row, tableFields) { - for (const [label, { selector }] of Object.entries(tableFields)) { - const element = row.querySelector(selector); - if (element) { - let current = element; - while (current && current !== row) { - if (current.tagName === 'TH') { - return true; - } - current = current.parentElement; - } - } - } - return false; - } - - function filterRowsBasedOnTag(rows, tableFields) { - for (const row of rows) { - if (hasThElement(row, tableFields)) { - return rows; - } - } - return rows.filter(row => row.getElementsByTagName('TH').length === 0); - } - - // Get all containers that match the listSelector - const containers = Array.from(document.querySelectorAll(listSelector)); - if (containers.length === 0) return []; - - // Initialize arrays to store field classifications for each container - const containerFields = containers.map(() => ({ - tableFields: {}, - nonTableFields: {} - })); - - // Analyze field types for each container - containers.forEach((container, containerIndex) => { - for (const [label, field] of Object.entries(fields)) { - const sampleElement = container.querySelector(field.selector); - - if (sampleElement) { - const ancestor = findTableAncestor(sampleElement); - if (ancestor) { - containerFields[containerIndex].tableFields[label] = { - ...field, - tableContext: ancestor.type, - cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 - }; - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } - } - }); - + const parentElements = Array.from(document.querySelectorAll(listSelector)); const scrapedData = []; - // Process each container - containers.forEach((container, containerIndex) => { - const { tableFields, nonTableFields } = containerFields[containerIndex]; - - // Handle table fields - if (Object.keys(tableFields).length > 0) { - // Find the common table ancestor - const firstField = Object.values(tableFields)[0]; - const firstElement = container.querySelector(firstField.selector); - let tableContext = firstElement; + for (const parent of parentElements) { + // Get the first field's elements to determine how many items we have + const firstField = Object.values(fields)[0]; + const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); + + // Process each item up to the limit + for (let i = 0; i < Math.min(baseElements.length, limit); i++) { + const record = {}; - while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { - tableContext = tableContext.parentElement; - } - - if (tableContext) { - const rows = Array.from(tableContext.getElementsByTagName('TR')); - const processedRows = filterRowsBasedOnTag(rows, tableFields); - - for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { - const record = {}; - const currentRow = processedRows[rowIndex]; - - for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { - let element = null; - - if (cellIndex >= 0) { - const td = currentRow.children[cellIndex]; - if (td) { - element = td.querySelector(selector); - - if (!element && selector.split(">").pop().includes('td:nth-child')) { - element = td; - } - - if (!element) { - const tagOnlySelector = selector.split('.')[0]; - element = td.querySelector(tagOnlySelector); - } - - if (!element) { - let currentElement = td; - while (currentElement && currentElement.children.length > 0) { - let foundContentChild = false; - for (const child of currentElement.children) { - if (extractValue(child, attribute)) { - currentElement = child; - foundContentChild = true; - break; - } - } - if (!foundContentChild) break; - } - element = currentElement; - } - } - } else { - element = currentRow.querySelector(selector); - } - - if (element) { - record[label] = extractValue(element, attribute); - } - } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } - } - } - } - - // Handle non-table fields - if (Object.keys(nonTableFields).length > 0) { - const firstField = Object.values(nonTableFields)[0]; - const baseElements = Array.from(container.querySelectorAll(firstField.selector)); - - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; - - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + // Process table fields + for (const [label, { selector, attribute }] of Object.entries(tableFields)) { const elements = Array.from(parent.querySelectorAll(selector)); // Use the same index to maintain correspondence between fields const element = elements[i]; if (element) { - record[label] = extractValue(element, attribute); + let value; + if (attribute === 'innerText') { + value = element.innerText.trim(); + } else if (attribute === 'innerHTML') { + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; + } else { + value = element.getAttribute(attribute); + } + record[label] = value; } - } + } + + // Process non-table fields + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const elements = Array.from(parent.querySelectorAll(selector)); + // Use the same index to maintain correspondence between fields + const element = elements[i]; - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } + if (element) { + let value; + if (attribute === 'innerText') { + value = element.innerText.trim(); + } else if (attribute === 'innerHTML') { + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; + } else { + value = element.getAttribute(attribute); + } + record[label] = value; + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); } } - }); + + if (scrapedData.length >= limit) { + scrapedData.length = limit; + break; + } + } return scrapedData; }; From 991b55446480347db4cafe4f1c3901ac07c10ad2 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Wed, 25 Dec 2024 23:09:27 +0530 Subject: [PATCH 07/12] feat: revert non-table scraping logic --- maxun-core/src/browserSide/scraper.js | 29 ++++++++++----------------- 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 4fafd25b..91feaef0 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -437,26 +437,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, // Handle non-table fields if (Object.keys(nonTableFields).length > 0) { - const firstField = Object.values(nonTableFields)[0]; - const baseElements = Array.from(container.querySelectorAll(firstField.selector)); + const record = {}; - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; - - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - record[label] = extractValue(element, attribute); - } - } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const element = container.querySelector(selector); + + if (element) { + record[label] = extractValue(element, attribute); } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); + } } }); From 33de0f14898feb90d659f0b2a9d0412604884eaf Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 01:16:29 +0530 Subject: [PATCH 08/12] feat: separate table and non-table data logic --- maxun-core/src/browserSide/scraper.js | 272 +++++++++++++++++++------- 1 file changed, 197 insertions(+), 75 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 2135311c..c12267d3 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,84 +262,206 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - const tableFields = {}; - const nonTableFields = {}; - - for (const [label, field] of Object.entries(fields)) { - if (['TD', 'TH', 'TR'].includes(field.tag)) { - tableFields[label] = field; - } else { - nonTableFields[label] = field; - } - } - - const parentElements = Array.from(document.querySelectorAll(listSelector)); - const scrapedData = []; - - for (const parent of parentElements) { - // Get the first field's elements to determine how many items we have - const firstField = Object.values(fields)[0]; - const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); + // Helper function to extract values from elements + function extractValue(element, attribute) { + if (!element) return null; - // Process each item up to the limit - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; - - // Process table fields - for (const [label, { selector, attribute }] of Object.entries(tableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); - } - record[label] = value; - } - } - - // Process non-table fields - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); - } - record[label] = value; - } - } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } - } - - if (scrapedData.length >= limit) { - scrapedData.length = limit; - break; + if (attribute === 'innerText') { + return element.innerText.trim(); + } else if (attribute === 'innerHTML') { + return element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + return attrValue ? new URL(attrValue, window.location.origin).href : null; } + return element.getAttribute(attribute); } - + + // Helper function to find table ancestors + function findTableAncestor(element) { + let currentElement = element; + const MAX_DEPTH = 5; + let depth = 0; + + while (currentElement && depth < MAX_DEPTH) { + if (currentElement.tagName === 'TD') { + return { type: 'TD', element: currentElement }; + } else if (currentElement.tagName === 'TR') { + return { type: 'TR', element: currentElement }; + } + currentElement = currentElement.parentElement; + depth++; + } + return null; + } + + function getCellIndex(td) { + let index = 0; + let sibling = td; + while (sibling = sibling.previousElementSibling) { + index++; + } + return index; + } + + function hasThElement(row, tableFields) { + for (const [label, { selector }] of Object.entries(tableFields)) { + const element = row.querySelector(selector); + if (element) { + let current = element; + while (current && current !== row) { + if (current.tagName === 'TH') { + return true; + } + current = current.parentElement; + } + } + } + return false; + } + + function filterRowsBasedOnTag(rows, tableFields) { + for (const row of rows) { + if (hasThElement(row, tableFields)) { + return rows; + } + } + return rows.filter(row => row.getElementsByTagName('TH').length === 0); + } + + // Get all containers that match the listSelector + const containers = Array.from(document.querySelectorAll(listSelector)); + if (containers.length === 0) return []; + + // Initialize arrays to store field classifications for each container + const containerFields = containers.map(() => ({ + tableFields: {}, + nonTableFields: {} + })); + + // Analyze field types for each container + containers.forEach((container, containerIndex) => { + for (const [label, field] of Object.entries(fields)) { + const sampleElement = container.querySelector(field.selector); + + if (sampleElement) { + const ancestor = findTableAncestor(sampleElement); + if (ancestor) { + containerFields[containerIndex].tableFields[label] = { + ...field, + tableContext: ancestor.type, + cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 + }; + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } + }); + + const tableData = []; + const nonTableData = []; + + // Process table fields across all containers + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + const container = containers[containerIndex]; + const { tableFields } = containerFields[containerIndex]; + + if (Object.keys(tableFields).length > 0) { + const firstField = Object.values(tableFields)[0]; + const firstElement = container.querySelector(firstField.selector); + let tableContext = firstElement; + + while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { + tableContext = tableContext.parentElement; + } + + if (tableContext) { + const rows = Array.from(tableContext.getElementsByTagName('TR')); + const processedRows = filterRowsBasedOnTag(rows, tableFields); + + for (let rowIndex = 0; rowIndex < processedRows.length; rowIndex++) { + const record = {}; + const currentRow = processedRows[rowIndex]; + + for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { + let element = null; + + if (cellIndex >= 0) { + const td = currentRow.children[cellIndex]; + if (td) { + element = td.querySelector(selector); + + if (!element && selector.split(">").pop().includes('td:nth-child')) { + element = td; + } + + if (!element) { + const tagOnlySelector = selector.split('.')[0]; + element = td.querySelector(tagOnlySelector); + } + + if (!element) { + let currentElement = td; + while (currentElement && currentElement.children.length > 0) { + let foundContentChild = false; + for (const child of currentElement.children) { + if (extractValue(child, attribute)) { + currentElement = child; + foundContentChild = true; + break; + } + } + if (!foundContentChild) break; + } + element = currentElement; + } + } + } else { + element = currentRow.querySelector(selector); + } + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + tableData.push(record); + } + } + } + } + } + + // Process non-table fields across all containers + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + if (nonTableData.length >= limit) break; + + const container = containers[containerIndex]; + const { nonTableFields } = containerFields[containerIndex]; + + if (Object.keys(nonTableFields).length > 0) { + const record = {}; + + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const element = container.querySelector(selector); + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + nonTableData.push(record); + } + } + } + + // Merge and limit the results + const scrapedData = [...tableData, ...nonTableData]; return scrapedData; }; From 31d973023741111ac033102466ed848927031c97 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 03:17:18 +0530 Subject: [PATCH 09/12] feat: add logic to match 70% classes --- maxun-core/src/browserSide/scraper.js | 29 +++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index c12267d3..e109839b 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -330,9 +330,34 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } // Get all containers that match the listSelector - const containers = Array.from(document.querySelectorAll(listSelector)); + let containers = Array.from(document.querySelectorAll(listSelector)); if (containers.length === 0) return []; + if (limit > 1 && containers.length <= 1) { + const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); + const container = document.querySelector(containerSelector); + + if (container) { + const allChildren = Array.from(container.children); + + const firstMatch = document.querySelector(listSelector); + if (firstMatch) { + // Get classes from the first matching element + const firstMatchClasses = Array.from(firstMatch.classList); + + // Find similar elements by matching most of their classes + containers = allChildren.filter(element => { + const elementClasses = Array.from(element.classList); + + // Element should share at least 70% of classes with the first match + const commonClasses = firstMatchClasses.filter(cls => + elementClasses.includes(cls)); + return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); + }); + } + } + } + // Initialize arrays to store field classifications for each container const containerFields = containers.map(() => ({ tableFields: {}, @@ -436,7 +461,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } } - // Process non-table fields across all containers + // Process non-table fields across all containers for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { if (nonTableData.length >= limit) break; From fcc71e08413f2f586815b404c5fab3faaa8d4ba5 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 03:38:40 +0530 Subject: [PATCH 10/12] feat: add limit to table scraping --- maxun-core/src/browserSide/scraper.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index e109839b..89d5abe4 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -407,7 +407,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const rows = Array.from(tableContext.getElementsByTagName('TR')); const processedRows = filterRowsBasedOnTag(rows, tableFields); - for (let rowIndex = 0; rowIndex < processedRows.length; rowIndex++) { + for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { const record = {}; const currentRow = processedRows[rowIndex]; From 97efd156318549d9289f9b8bed89de1962a73299 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 19:16:44 +0530 Subject: [PATCH 11/12] feat: add check for table highlighting --- server/src/workflow-management/selector.ts | 158 ++++++++++++--------- 1 file changed, 91 insertions(+), 67 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 9f64b418..1f3eac76 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -84,36 +84,44 @@ export const getElementInformation = async ( if (originalEl) { let element = originalEl; - while (element.parentElement) { - if (element.tagName.toLowerCase() === 'body' || - element.tagName.toLowerCase() === 'html') { - break; + if (element.tagName === 'TD' || element.tagName === 'TH') { + const tableParent = element.closest('table'); + if (tableParent) { + element = tableParent; } + } - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); + if (element.tagName !== 'TABLE') { + while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; - if (fullyContained && significantOverlap) { - // Only traverse up if next parent isn't body or html - const nextParent = element.parentElement; - if (nextParent.tagName.toLowerCase() !== 'body' && - nextParent.tagName.toLowerCase() !== 'html') { - element = nextParent; + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } - } else { - break; } } @@ -212,36 +220,44 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (originalEl) { let element = originalEl; - while (element.parentElement) { - if (element.tagName.toLowerCase() === 'body' || - element.tagName.toLowerCase() === 'html') { - break; + if (element.tagName === 'TD' || element.tagName === 'TH') { + const tableParent = element.closest('table'); + if (tableParent) { + element = tableParent; } + } - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); + if (element.tagName !== 'TABLE') { + while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; - if (fullyContained && significantOverlap) { - // Only traverse up if next parent isn't body or html - const nextParent = element.parentElement; - if (nextParent.tagName.toLowerCase() !== 'body' && - nextParent.tagName.toLowerCase() !== 'html') { - element = nextParent; + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } - } else { - break; } } @@ -933,37 +949,45 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let element = originalEl; - // if (listSelector === '') { - while (element.parentElement) { - if (element.tagName.toLowerCase() === 'body' || - element.tagName.toLowerCase() === 'html') { - break; + if (element.tagName === 'TD' || element.tagName === 'TH') { + const tableParent = element.closest('table'); + if (tableParent) { + element = tableParent; } + } - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); + // if (listSelector === '') { + if (element.tagName !== 'TABLE') { + while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; - if (fullyContained && significantOverlap) { - // Only traverse up if next parent isn't body or html - const nextParent = element.parentElement; - if (nextParent.tagName.toLowerCase() !== 'body' && - nextParent.tagName.toLowerCase() !== 'html') { - element = nextParent; + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } - } else { - break; } } // } From fe38f5a581694561dccb852d7199af0066c75069 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 19:27:59 +0530 Subject: [PATCH 12/12] feat: improve 70% class matching logic for scraping --- maxun-core/src/browserSide/scraper.js | 67 ++++++++++++++++++--------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 89d5abe4..0c5d74ac 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -329,32 +329,53 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return rows.filter(row => row.getElementsByTagName('TH').length === 0); } - // Get all containers that match the listSelector + function calculateClassSimilarity(classList1, classList2) { + const set1 = new Set(classList1); + const set2 = new Set(classList2); + + // Calculate intersection + const intersection = new Set([...set1].filter(x => set2.has(x))); + + // Calculate union + const union = new Set([...set1, ...set2]); + + // Return Jaccard similarity coefficient + return intersection.size / union.size; + } + + // New helper function to find elements with similar classes + function findSimilarElements(baseElement, similarityThreshold = 0.7) { + const baseClasses = Array.from(baseElement.classList); + + if (baseClasses.length === 0) return []; + + const potentialElements = document.getElementsByTagName(baseElement.tagName); + + return Array.from(potentialElements).filter(element => { + if (element === baseElement) return false; + + const similarity = calculateClassSimilarity( + baseClasses, + Array.from(element.classList) + ); + + return similarity >= similarityThreshold; + }); + } + let containers = Array.from(document.querySelectorAll(listSelector)); if (containers.length === 0) return []; - if (limit > 1 && containers.length <= 1) { - const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); - const container = document.querySelector(containerSelector); + if (limit > 1 && containers.length === 1) { + const baseContainer = containers[0]; + const similarContainers = findSimilarElements(baseContainer); - if (container) { - const allChildren = Array.from(container.children); - - const firstMatch = document.querySelector(listSelector); - if (firstMatch) { - // Get classes from the first matching element - const firstMatchClasses = Array.from(firstMatch.classList); - - // Find similar elements by matching most of their classes - containers = allChildren.filter(element => { - const elementClasses = Array.from(element.classList); - - // Element should share at least 70% of classes with the first match - const commonClasses = firstMatchClasses.filter(cls => - elementClasses.includes(cls)); - return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); - }); - } + if (similarContainers.length > 0) { + const newContainers = similarContainers.filter(container => + !container.matches(listSelector) + ); + + containers = [...containers, ...newContainers]; } } @@ -389,7 +410,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const tableData = []; const nonTableData = []; - // Process table fields across all containers + // Process table fields across all containers for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { const container = containers[containerIndex]; const { tableFields } = containerFields[containerIndex];