From feb30b9f9e1ffd047e3054f7c10d90e07ec134d5 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 02:30:36 +0530 Subject: [PATCH 01/53] feat: add nth-child selectors for td tag --- server/src/workflow-management/selector.ts | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 240f8921..af9de4af 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -869,6 +869,13 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); + if (selector === 'td' && element.parentElement) { + // Find position among td siblings + const siblings = Array.from(element.parentElement.children); + const position = siblings.indexOf(element) + 1; + return `${selector}:nth-child(${position})`; + } + if (element.className) { const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); if (classes.length > 0) { @@ -937,6 +944,12 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); + if (selector === 'td' && element.parentElement) { + const siblings = Array.from(element.parentElement.children); + const position = siblings.indexOf(element) + 1; + return `${selector}:nth-child(${position})`; + } + if (element.className) { const classes = element.className.split(/\s+/).filter((cls: string) => Boolean(cls)); if (classes.length > 0) { @@ -991,6 +1004,12 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro function getNonUniqueSelector(element: HTMLElement): string { let selector = element.tagName.toLowerCase(); + if (selector === 'td' && element.parentElement) { + const siblings = Array.from(element.parentElement.children); + const position = siblings.indexOf(element) + 1; + return `${selector}:nth-child(${position})`; + } + const className = typeof element.className === 'string' ? element.className : ''; if (className) { const classes = className.split(/\s+/).filter((cls: string) => Boolean(cls)); From 5ac88c6eda67f67229ca71e511cb7de25bfefd06 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 02:31:32 +0530 Subject: [PATCH 02/53] feat: add scraping logic for tabular data in scrapeList --- maxun-core/src/browserSide/scraper.js | 116 ++++++++++++++------------ 1 file changed, 63 insertions(+), 53 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index a2009d78..82341fbd 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,73 +262,83 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { + // Separate fields into table and non-table categories + const tableFields = {}; + const nonTableFields = {}; + + for (const [label, field] of Object.entries(fields)) { + if (['TD', 'TH', 'TR'].includes(field.tag)) { + tableFields[label] = field; + } else { + nonTableFields[label] = field; + } + } + + const parentElements = Array.from(document.querySelectorAll(listSelector)); const scrapedData = []; - while (scrapedData.length < limit) { - let parentElements = Array.from(document.querySelectorAll(listSelector)); - - // If we only got one element or none, try a more generic approach - if (limit > 1 && parentElements.length <= 1) { - const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); - const container = document.querySelector(containerSelector); - - if (container) { - const allChildren = Array.from(container.children); - - const firstMatch = document.querySelector(listSelector); - if (firstMatch) { - // Get classes from the first matching element - const firstMatchClasses = Array.from(firstMatch.classList); - - // Find similar elements by matching most of their classes - parentElements = allChildren.filter(element => { - const elementClasses = Array.from(element.classList); + for (const parent of parentElements) { + // First, get the number of rows we'll need by checking the first table field + const firstTableField = Object.values(tableFields)[0]; + const tableRows = firstTableField + ? Array.from(parent.querySelectorAll(firstTableField.selector)).slice(0, limit) + : [null]; - // Element should share at least 70% of classes with the first match - const commonClasses = firstMatchClasses.filter(cls => - elementClasses.includes(cls)); - return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); - }); - } - } - } - - // Iterate through each parent element - for (const parent of parentElements) { - if (scrapedData.length >= limit) break; + tableRows.forEach((_, rowIndex) => { const record = {}; - - // For each field, select the corresponding element within the parent - for (const [label, { selector, attribute }] of Object.entries(fields)) { - const fieldElement = parent.querySelector(selector); - - if (fieldElement) { + + // Table fields + for (const [label, { selector, attribute }] of Object.entries(tableFields)) { + const elements = Array.from(parent.querySelectorAll(selector)); + const element = elements[rowIndex]; + + if (element) { + let value; if (attribute === 'innerText') { - record[label] = fieldElement.innerText.trim(); + value = element.innerText.trim(); } else if (attribute === 'innerHTML') { - record[label] = fieldElement.innerHTML.trim(); - } else if (attribute === 'src') { - // Handle relative 'src' URLs - const src = fieldElement.getAttribute('src'); - record[label] = src ? new URL(src, window.location.origin).href : null; - } else if (attribute === 'href') { - // Handle relative 'href' URLs - const href = fieldElement.getAttribute('href'); - record[label] = href ? new URL(href, window.location.origin).href : null; + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; } else { - record[label] = fieldElement.getAttribute(attribute); + value = element.getAttribute(attribute); } + record[label] = value; } } - scrapedData.push(record); - } + + // Non table fields + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const element = parent.querySelector(selector); + + if (element) { + let value; + if (attribute === 'innerText') { + value = element.innerText.trim(); + } else if (attribute === 'innerHTML') { + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; + } else { + value = element.getAttribute(attribute); + } + record[label] = value; + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); + } + }); - // If we've processed all available elements and still haven't reached the limit, - // break to avoid infinite loop - if (parentElements.length === 0 || scrapedData.length >= parentElements.length) { + if (scrapedData.length >= limit) { + scrapedData.length = limit; break; } } + return scrapedData; }; From b411faf6812d78ffb763af4f3bf0016a397760a7 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 17:25:52 +0530 Subject: [PATCH 03/53] feat: add condition to ignore html and body tags on highlighting --- server/src/workflow-management/selector.ts | 42 ++++++++++++++++++++-- 1 file changed, 39 insertions(+), 3 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index af9de4af..9f64b418 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -85,6 +85,11 @@ export const getElementInformation = async ( let element = originalEl; while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } + const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -99,7 +104,14 @@ export const getElementInformation = async ( (parentRect.width * parentRect.height) > 0.5; if (fullyContained && significantOverlap) { - element = element.parentElement; + // Only traverse up if next parent isn't body or html + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } @@ -201,6 +213,11 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector let element = originalEl; while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } + const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -215,7 +232,14 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector (parentRect.width * parentRect.height) > 0.5; if (fullyContained && significantOverlap) { - element = element.parentElement; + // Only traverse up if next parent isn't body or html + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } @@ -911,6 +935,11 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates // if (listSelector === '') { while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } + const parentRect = element.parentElement.getBoundingClientRect(); const childRect = element.getBoundingClientRect(); @@ -925,7 +954,14 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates (parentRect.width * parentRect.height) > 0.5; if (fullyContained && significantOverlap) { - element = element.parentElement; + // Only traverse up if next parent isn't body or html + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } From 3176fa275f5728095845ba9394b53cf268d1e56f Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Tue, 24 Dec 2024 17:27:42 +0530 Subject: [PATCH 04/53] feat: add loop to handle nested scraping --- maxun-core/src/browserSide/scraper.js | 28 +++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 82341fbd..2135311c 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,7 +262,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - // Separate fields into table and non-table categories const tableFields = {}; const nonTableFields = {}; @@ -278,19 +277,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const scrapedData = []; for (const parent of parentElements) { - // First, get the number of rows we'll need by checking the first table field - const firstTableField = Object.values(tableFields)[0]; - const tableRows = firstTableField - ? Array.from(parent.querySelectorAll(firstTableField.selector)).slice(0, limit) - : [null]; - - tableRows.forEach((_, rowIndex) => { + // Get the first field's elements to determine how many items we have + const firstField = Object.values(fields)[0]; + const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); + + // Process each item up to the limit + for (let i = 0; i < Math.min(baseElements.length, limit); i++) { const record = {}; - // Table fields + // Process table fields for (const [label, { selector, attribute }] of Object.entries(tableFields)) { const elements = Array.from(parent.querySelectorAll(selector)); - const element = elements[rowIndex]; + // Use the same index to maintain correspondence between fields + const element = elements[i]; if (element) { let value; @@ -308,9 +307,11 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } } - // Non table fields + // Process non-table fields for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const element = parent.querySelector(selector); + const elements = Array.from(parent.querySelectorAll(selector)); + // Use the same index to maintain correspondence between fields + const element = elements[i]; if (element) { let value; @@ -331,7 +332,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, if (Object.keys(record).length > 0) { scrapedData.push(record); } - }); + } if (scrapedData.length >= limit) { scrapedData.length = limit; @@ -342,7 +343,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return scrapedData; }; - /** * Gets all children of the elements matching the listSelector, * returning their CSS selectors and innerText. From 4d39db4ef6b80a2b8e12110e14146e0ef10add01 Mon Sep 17 00:00:00 2001 From: Naveen Date: Wed, 25 Dec 2024 05:42:36 +0530 Subject: [PATCH 05/53] chore: add refs in urls --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index b63cf8a7..cebcedd2 100644 --- a/README.md +++ b/README.md @@ -1,6 +1,6 @@

- +
Maxun @@ -15,11 +15,11 @@ Maxun lets you train a robot in 2 minutes and scrape the web on auto-pilot. Web

- Website | + Website | Discord | - Twitter | + Twitter | Join Maxun Cloud | - Watch Tutorials + Watch Tutorials

getmaxun%2Fmaxun | Trendshift From 50bcd4b8dd54bba291ff7b7380422cac116c93c0 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Wed, 25 Dec 2024 22:19:18 +0530 Subject: [PATCH 06/53] feat: add logic to seperate table fields --- maxun-core/src/browserSide/scraper.js | 244 +++++++++++++++++++------- 1 file changed, 182 insertions(+), 62 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 2135311c..4fafd25b 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,83 +262,203 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - const tableFields = {}; - const nonTableFields = {}; - - for (const [label, field] of Object.entries(fields)) { - if (['TD', 'TH', 'TR'].includes(field.tag)) { - tableFields[label] = field; - } else { - nonTableFields[label] = field; + // Helper function to extract values from elements + function extractValue(element, attribute) { + if (!element) return null; + + if (attribute === 'innerText') { + return element.innerText.trim(); + } else if (attribute === 'innerHTML') { + return element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + return attrValue ? new URL(attrValue, window.location.origin).href : null; } + return element.getAttribute(attribute); } - const parentElements = Array.from(document.querySelectorAll(listSelector)); + // Helper function to find table ancestors + function findTableAncestor(element) { + let currentElement = element; + const MAX_DEPTH = 5; + let depth = 0; + + while (currentElement && depth < MAX_DEPTH) { + if (currentElement.tagName === 'TD') { + return { type: 'TD', element: currentElement }; + } else if (currentElement.tagName === 'TR') { + return { type: 'TR', element: currentElement }; + } + currentElement = currentElement.parentElement; + depth++; + } + return null; + } + + function getCellIndex(td) { + let index = 0; + let sibling = td; + while (sibling = sibling.previousElementSibling) { + index++; + } + return index; + } + + function hasThElement(row, tableFields) { + for (const [label, { selector }] of Object.entries(tableFields)) { + const element = row.querySelector(selector); + if (element) { + let current = element; + while (current && current !== row) { + if (current.tagName === 'TH') { + return true; + } + current = current.parentElement; + } + } + } + return false; + } + + function filterRowsBasedOnTag(rows, tableFields) { + for (const row of rows) { + if (hasThElement(row, tableFields)) { + return rows; + } + } + return rows.filter(row => row.getElementsByTagName('TH').length === 0); + } + + // Get all containers that match the listSelector + const containers = Array.from(document.querySelectorAll(listSelector)); + if (containers.length === 0) return []; + + // Initialize arrays to store field classifications for each container + const containerFields = containers.map(() => ({ + tableFields: {}, + nonTableFields: {} + })); + + // Analyze field types for each container + containers.forEach((container, containerIndex) => { + for (const [label, field] of Object.entries(fields)) { + const sampleElement = container.querySelector(field.selector); + + if (sampleElement) { + const ancestor = findTableAncestor(sampleElement); + if (ancestor) { + containerFields[containerIndex].tableFields[label] = { + ...field, + tableContext: ancestor.type, + cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 + }; + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } + }); + const scrapedData = []; - for (const parent of parentElements) { - // Get the first field's elements to determine how many items we have - const firstField = Object.values(fields)[0]; - const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); - - // Process each item up to the limit - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; + // Process each container + containers.forEach((container, containerIndex) => { + const { tableFields, nonTableFields } = containerFields[containerIndex]; + + // Handle table fields + if (Object.keys(tableFields).length > 0) { + // Find the common table ancestor + const firstField = Object.values(tableFields)[0]; + const firstElement = container.querySelector(firstField.selector); + let tableContext = firstElement; - // Process table fields - for (const [label, { selector, attribute }] of Object.entries(tableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); - } - record[label] = value; - } + while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { + tableContext = tableContext.parentElement; } - // Process non-table fields - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; + if (tableContext) { + const rows = Array.from(tableContext.getElementsByTagName('TR')); + const processedRows = filterRowsBasedOnTag(rows, tableFields); - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); + for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { + const record = {}; + const currentRow = processedRows[rowIndex]; + + for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { + let element = null; + + if (cellIndex >= 0) { + const td = currentRow.children[cellIndex]; + if (td) { + element = td.querySelector(selector); + + if (!element && selector.split(">").pop().includes('td:nth-child')) { + element = td; + } + + if (!element) { + const tagOnlySelector = selector.split('.')[0]; + element = td.querySelector(tagOnlySelector); + } + + if (!element) { + let currentElement = td; + while (currentElement && currentElement.children.length > 0) { + let foundContentChild = false; + for (const child of currentElement.children) { + if (extractValue(child, attribute)) { + currentElement = child; + foundContentChild = true; + break; + } + } + if (!foundContentChild) break; + } + element = currentElement; + } + } + } else { + element = currentRow.querySelector(selector); + } + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); } - record[label] = value; } } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } } - if (scrapedData.length >= limit) { - scrapedData.length = limit; - break; + // Handle non-table fields + if (Object.keys(nonTableFields).length > 0) { + const firstField = Object.values(nonTableFields)[0]; + const baseElements = Array.from(container.querySelectorAll(firstField.selector)); + + for (let i = 0; i < Math.min(baseElements.length, limit); i++) { + const record = {}; + + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const elements = Array.from(parent.querySelectorAll(selector)); + // Use the same index to maintain correspondence between fields + const element = elements[i]; + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); + } + } } - } + }); return scrapedData; }; From 641b6559a77d6efd671b4a5221aae59bb84a38ca Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 25 Dec 2024 23:01:35 +0530 Subject: [PATCH 07/53] fix: revert --- maxun-core/src/browserSide/scraper.js | 244 +++++++------------------- 1 file changed, 62 insertions(+), 182 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 4fafd25b..2135311c 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,203 +262,83 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - // Helper function to extract values from elements - function extractValue(element, attribute) { - if (!element) return null; - - if (attribute === 'innerText') { - return element.innerText.trim(); - } else if (attribute === 'innerHTML') { - return element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - return attrValue ? new URL(attrValue, window.location.origin).href : null; + const tableFields = {}; + const nonTableFields = {}; + + for (const [label, field] of Object.entries(fields)) { + if (['TD', 'TH', 'TR'].includes(field.tag)) { + tableFields[label] = field; + } else { + nonTableFields[label] = field; } - return element.getAttribute(attribute); } - // Helper function to find table ancestors - function findTableAncestor(element) { - let currentElement = element; - const MAX_DEPTH = 5; - let depth = 0; - - while (currentElement && depth < MAX_DEPTH) { - if (currentElement.tagName === 'TD') { - return { type: 'TD', element: currentElement }; - } else if (currentElement.tagName === 'TR') { - return { type: 'TR', element: currentElement }; - } - currentElement = currentElement.parentElement; - depth++; - } - return null; - } - - function getCellIndex(td) { - let index = 0; - let sibling = td; - while (sibling = sibling.previousElementSibling) { - index++; - } - return index; - } - - function hasThElement(row, tableFields) { - for (const [label, { selector }] of Object.entries(tableFields)) { - const element = row.querySelector(selector); - if (element) { - let current = element; - while (current && current !== row) { - if (current.tagName === 'TH') { - return true; - } - current = current.parentElement; - } - } - } - return false; - } - - function filterRowsBasedOnTag(rows, tableFields) { - for (const row of rows) { - if (hasThElement(row, tableFields)) { - return rows; - } - } - return rows.filter(row => row.getElementsByTagName('TH').length === 0); - } - - // Get all containers that match the listSelector - const containers = Array.from(document.querySelectorAll(listSelector)); - if (containers.length === 0) return []; - - // Initialize arrays to store field classifications for each container - const containerFields = containers.map(() => ({ - tableFields: {}, - nonTableFields: {} - })); - - // Analyze field types for each container - containers.forEach((container, containerIndex) => { - for (const [label, field] of Object.entries(fields)) { - const sampleElement = container.querySelector(field.selector); - - if (sampleElement) { - const ancestor = findTableAncestor(sampleElement); - if (ancestor) { - containerFields[containerIndex].tableFields[label] = { - ...field, - tableContext: ancestor.type, - cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 - }; - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } - } else { - containerFields[containerIndex].nonTableFields[label] = field; - } - } - }); - + const parentElements = Array.from(document.querySelectorAll(listSelector)); const scrapedData = []; - // Process each container - containers.forEach((container, containerIndex) => { - const { tableFields, nonTableFields } = containerFields[containerIndex]; - - // Handle table fields - if (Object.keys(tableFields).length > 0) { - // Find the common table ancestor - const firstField = Object.values(tableFields)[0]; - const firstElement = container.querySelector(firstField.selector); - let tableContext = firstElement; + for (const parent of parentElements) { + // Get the first field's elements to determine how many items we have + const firstField = Object.values(fields)[0]; + const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); + + // Process each item up to the limit + for (let i = 0; i < Math.min(baseElements.length, limit); i++) { + const record = {}; - while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { - tableContext = tableContext.parentElement; - } - - if (tableContext) { - const rows = Array.from(tableContext.getElementsByTagName('TR')); - const processedRows = filterRowsBasedOnTag(rows, tableFields); - - for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { - const record = {}; - const currentRow = processedRows[rowIndex]; - - for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { - let element = null; - - if (cellIndex >= 0) { - const td = currentRow.children[cellIndex]; - if (td) { - element = td.querySelector(selector); - - if (!element && selector.split(">").pop().includes('td:nth-child')) { - element = td; - } - - if (!element) { - const tagOnlySelector = selector.split('.')[0]; - element = td.querySelector(tagOnlySelector); - } - - if (!element) { - let currentElement = td; - while (currentElement && currentElement.children.length > 0) { - let foundContentChild = false; - for (const child of currentElement.children) { - if (extractValue(child, attribute)) { - currentElement = child; - foundContentChild = true; - break; - } - } - if (!foundContentChild) break; - } - element = currentElement; - } - } - } else { - element = currentRow.querySelector(selector); - } - - if (element) { - record[label] = extractValue(element, attribute); - } - } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } - } - } - } - - // Handle non-table fields - if (Object.keys(nonTableFields).length > 0) { - const firstField = Object.values(nonTableFields)[0]; - const baseElements = Array.from(container.querySelectorAll(firstField.selector)); - - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; - - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + // Process table fields + for (const [label, { selector, attribute }] of Object.entries(tableFields)) { const elements = Array.from(parent.querySelectorAll(selector)); // Use the same index to maintain correspondence between fields const element = elements[i]; if (element) { - record[label] = extractValue(element, attribute); + let value; + if (attribute === 'innerText') { + value = element.innerText.trim(); + } else if (attribute === 'innerHTML') { + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; + } else { + value = element.getAttribute(attribute); + } + record[label] = value; } - } + } + + // Process non-table fields + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const elements = Array.from(parent.querySelectorAll(selector)); + // Use the same index to maintain correspondence between fields + const element = elements[i]; - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } + if (element) { + let value; + if (attribute === 'innerText') { + value = element.innerText.trim(); + } else if (attribute === 'innerHTML') { + value = element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + value = attrValue ? new URL(attrValue, window.location.origin).href : null; + } else { + value = element.getAttribute(attribute); + } + record[label] = value; + } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); } } - }); + + if (scrapedData.length >= limit) { + scrapedData.length = limit; + break; + } + } return scrapedData; }; From 991b55446480347db4cafe4f1c3901ac07c10ad2 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Wed, 25 Dec 2024 23:09:27 +0530 Subject: [PATCH 08/53] feat: revert non-table scraping logic --- maxun-core/src/browserSide/scraper.js | 29 ++++++++++----------------- 1 file changed, 11 insertions(+), 18 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 4fafd25b..91feaef0 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -437,26 +437,19 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, // Handle non-table fields if (Object.keys(nonTableFields).length > 0) { - const firstField = Object.values(nonTableFields)[0]; - const baseElements = Array.from(container.querySelectorAll(firstField.selector)); + const record = {}; - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; - - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - record[label] = extractValue(element, attribute); - } - } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const element = container.querySelector(selector); + + if (element) { + record[label] = extractValue(element, attribute); } + } + + if (Object.keys(record).length > 0) { + scrapedData.push(record); + } } }); From 33de0f14898feb90d659f0b2a9d0412604884eaf Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 01:16:29 +0530 Subject: [PATCH 09/53] feat: separate table and non-table data logic --- maxun-core/src/browserSide/scraper.js | 272 +++++++++++++++++++------- 1 file changed, 197 insertions(+), 75 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 2135311c..c12267d3 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -262,84 +262,206 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, * @returns {Array.>} Array of arrays of scraped items, one sub-array per list */ window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { - const tableFields = {}; - const nonTableFields = {}; - - for (const [label, field] of Object.entries(fields)) { - if (['TD', 'TH', 'TR'].includes(field.tag)) { - tableFields[label] = field; - } else { - nonTableFields[label] = field; - } - } - - const parentElements = Array.from(document.querySelectorAll(listSelector)); - const scrapedData = []; - - for (const parent of parentElements) { - // Get the first field's elements to determine how many items we have - const firstField = Object.values(fields)[0]; - const baseElements = Array.from(parent.querySelectorAll(firstField.selector)); + // Helper function to extract values from elements + function extractValue(element, attribute) { + if (!element) return null; - // Process each item up to the limit - for (let i = 0; i < Math.min(baseElements.length, limit); i++) { - const record = {}; - - // Process table fields - for (const [label, { selector, attribute }] of Object.entries(tableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); - } - record[label] = value; - } - } - - // Process non-table fields - for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { - const elements = Array.from(parent.querySelectorAll(selector)); - // Use the same index to maintain correspondence between fields - const element = elements[i]; - - if (element) { - let value; - if (attribute === 'innerText') { - value = element.innerText.trim(); - } else if (attribute === 'innerHTML') { - value = element.innerHTML.trim(); - } else if (attribute === 'src' || attribute === 'href') { - const attrValue = element.getAttribute(attribute); - value = attrValue ? new URL(attrValue, window.location.origin).href : null; - } else { - value = element.getAttribute(attribute); - } - record[label] = value; - } - } - - if (Object.keys(record).length > 0) { - scrapedData.push(record); - } - } - - if (scrapedData.length >= limit) { - scrapedData.length = limit; - break; + if (attribute === 'innerText') { + return element.innerText.trim(); + } else if (attribute === 'innerHTML') { + return element.innerHTML.trim(); + } else if (attribute === 'src' || attribute === 'href') { + const attrValue = element.getAttribute(attribute); + return attrValue ? new URL(attrValue, window.location.origin).href : null; } + return element.getAttribute(attribute); } - + + // Helper function to find table ancestors + function findTableAncestor(element) { + let currentElement = element; + const MAX_DEPTH = 5; + let depth = 0; + + while (currentElement && depth < MAX_DEPTH) { + if (currentElement.tagName === 'TD') { + return { type: 'TD', element: currentElement }; + } else if (currentElement.tagName === 'TR') { + return { type: 'TR', element: currentElement }; + } + currentElement = currentElement.parentElement; + depth++; + } + return null; + } + + function getCellIndex(td) { + let index = 0; + let sibling = td; + while (sibling = sibling.previousElementSibling) { + index++; + } + return index; + } + + function hasThElement(row, tableFields) { + for (const [label, { selector }] of Object.entries(tableFields)) { + const element = row.querySelector(selector); + if (element) { + let current = element; + while (current && current !== row) { + if (current.tagName === 'TH') { + return true; + } + current = current.parentElement; + } + } + } + return false; + } + + function filterRowsBasedOnTag(rows, tableFields) { + for (const row of rows) { + if (hasThElement(row, tableFields)) { + return rows; + } + } + return rows.filter(row => row.getElementsByTagName('TH').length === 0); + } + + // Get all containers that match the listSelector + const containers = Array.from(document.querySelectorAll(listSelector)); + if (containers.length === 0) return []; + + // Initialize arrays to store field classifications for each container + const containerFields = containers.map(() => ({ + tableFields: {}, + nonTableFields: {} + })); + + // Analyze field types for each container + containers.forEach((container, containerIndex) => { + for (const [label, field] of Object.entries(fields)) { + const sampleElement = container.querySelector(field.selector); + + if (sampleElement) { + const ancestor = findTableAncestor(sampleElement); + if (ancestor) { + containerFields[containerIndex].tableFields[label] = { + ...field, + tableContext: ancestor.type, + cellIndex: ancestor.type === 'TD' ? getCellIndex(ancestor.element) : -1 + }; + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } else { + containerFields[containerIndex].nonTableFields[label] = field; + } + } + }); + + const tableData = []; + const nonTableData = []; + + // Process table fields across all containers + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + const container = containers[containerIndex]; + const { tableFields } = containerFields[containerIndex]; + + if (Object.keys(tableFields).length > 0) { + const firstField = Object.values(tableFields)[0]; + const firstElement = container.querySelector(firstField.selector); + let tableContext = firstElement; + + while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { + tableContext = tableContext.parentElement; + } + + if (tableContext) { + const rows = Array.from(tableContext.getElementsByTagName('TR')); + const processedRows = filterRowsBasedOnTag(rows, tableFields); + + for (let rowIndex = 0; rowIndex < processedRows.length; rowIndex++) { + const record = {}; + const currentRow = processedRows[rowIndex]; + + for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) { + let element = null; + + if (cellIndex >= 0) { + const td = currentRow.children[cellIndex]; + if (td) { + element = td.querySelector(selector); + + if (!element && selector.split(">").pop().includes('td:nth-child')) { + element = td; + } + + if (!element) { + const tagOnlySelector = selector.split('.')[0]; + element = td.querySelector(tagOnlySelector); + } + + if (!element) { + let currentElement = td; + while (currentElement && currentElement.children.length > 0) { + let foundContentChild = false; + for (const child of currentElement.children) { + if (extractValue(child, attribute)) { + currentElement = child; + foundContentChild = true; + break; + } + } + if (!foundContentChild) break; + } + element = currentElement; + } + } + } else { + element = currentRow.querySelector(selector); + } + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + tableData.push(record); + } + } + } + } + } + + // Process non-table fields across all containers + for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { + if (nonTableData.length >= limit) break; + + const container = containers[containerIndex]; + const { nonTableFields } = containerFields[containerIndex]; + + if (Object.keys(nonTableFields).length > 0) { + const record = {}; + + for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) { + const element = container.querySelector(selector); + + if (element) { + record[label] = extractValue(element, attribute); + } + } + + if (Object.keys(record).length > 0) { + nonTableData.push(record); + } + } + } + + // Merge and limit the results + const scrapedData = [...tableData, ...nonTableData]; return scrapedData; }; From 31d973023741111ac033102466ed848927031c97 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 03:17:18 +0530 Subject: [PATCH 10/53] feat: add logic to match 70% classes --- maxun-core/src/browserSide/scraper.js | 29 +++++++++++++++++++++++++-- 1 file changed, 27 insertions(+), 2 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index c12267d3..e109839b 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -330,9 +330,34 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } // Get all containers that match the listSelector - const containers = Array.from(document.querySelectorAll(listSelector)); + let containers = Array.from(document.querySelectorAll(listSelector)); if (containers.length === 0) return []; + if (limit > 1 && containers.length <= 1) { + const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); + const container = document.querySelector(containerSelector); + + if (container) { + const allChildren = Array.from(container.children); + + const firstMatch = document.querySelector(listSelector); + if (firstMatch) { + // Get classes from the first matching element + const firstMatchClasses = Array.from(firstMatch.classList); + + // Find similar elements by matching most of their classes + containers = allChildren.filter(element => { + const elementClasses = Array.from(element.classList); + + // Element should share at least 70% of classes with the first match + const commonClasses = firstMatchClasses.filter(cls => + elementClasses.includes(cls)); + return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); + }); + } + } + } + // Initialize arrays to store field classifications for each container const containerFields = containers.map(() => ({ tableFields: {}, @@ -436,7 +461,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, } } - // Process non-table fields across all containers + // Process non-table fields across all containers for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { if (nonTableData.length >= limit) break; From fcc71e08413f2f586815b404c5fab3faaa8d4ba5 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 03:38:40 +0530 Subject: [PATCH 11/53] feat: add limit to table scraping --- maxun-core/src/browserSide/scraper.js | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index e109839b..89d5abe4 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -407,7 +407,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const rows = Array.from(tableContext.getElementsByTagName('TR')); const processedRows = filterRowsBasedOnTag(rows, tableFields); - for (let rowIndex = 0; rowIndex < processedRows.length; rowIndex++) { + for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) { const record = {}; const currentRow = processedRows[rowIndex]; From 97efd156318549d9289f9b8bed89de1962a73299 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 19:16:44 +0530 Subject: [PATCH 12/53] feat: add check for table highlighting --- server/src/workflow-management/selector.ts | 158 ++++++++++++--------- 1 file changed, 91 insertions(+), 67 deletions(-) diff --git a/server/src/workflow-management/selector.ts b/server/src/workflow-management/selector.ts index 9f64b418..1f3eac76 100644 --- a/server/src/workflow-management/selector.ts +++ b/server/src/workflow-management/selector.ts @@ -84,36 +84,44 @@ export const getElementInformation = async ( if (originalEl) { let element = originalEl; - while (element.parentElement) { - if (element.tagName.toLowerCase() === 'body' || - element.tagName.toLowerCase() === 'html') { - break; + if (element.tagName === 'TD' || element.tagName === 'TH') { + const tableParent = element.closest('table'); + if (tableParent) { + element = tableParent; } + } - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); + if (element.tagName !== 'TABLE') { + while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; - if (fullyContained && significantOverlap) { - // Only traverse up if next parent isn't body or html - const nextParent = element.parentElement; - if (nextParent.tagName.toLowerCase() !== 'body' && - nextParent.tagName.toLowerCase() !== 'html') { - element = nextParent; + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } - } else { - break; } } @@ -212,36 +220,44 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector if (originalEl) { let element = originalEl; - while (element.parentElement) { - if (element.tagName.toLowerCase() === 'body' || - element.tagName.toLowerCase() === 'html') { - break; + if (element.tagName === 'TD' || element.tagName === 'TH') { + const tableParent = element.closest('table'); + if (tableParent) { + element = tableParent; } + } - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); + if (element.tagName !== 'TABLE') { + while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; - if (fullyContained && significantOverlap) { - // Only traverse up if next parent isn't body or html - const nextParent = element.parentElement; - if (nextParent.tagName.toLowerCase() !== 'body' && - nextParent.tagName.toLowerCase() !== 'html') { - element = nextParent; + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } - } else { - break; } } @@ -933,37 +949,45 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates let element = originalEl; - // if (listSelector === '') { - while (element.parentElement) { - if (element.tagName.toLowerCase() === 'body' || - element.tagName.toLowerCase() === 'html') { - break; + if (element.tagName === 'TD' || element.tagName === 'TH') { + const tableParent = element.closest('table'); + if (tableParent) { + element = tableParent; } + } - const parentRect = element.parentElement.getBoundingClientRect(); - const childRect = element.getBoundingClientRect(); + // if (listSelector === '') { + if (element.tagName !== 'TABLE') { + while (element.parentElement) { + if (element.tagName.toLowerCase() === 'body' || + element.tagName.toLowerCase() === 'html') { + break; + } - const fullyContained = - parentRect.left <= childRect.left && - parentRect.right >= childRect.right && - parentRect.top <= childRect.top && - parentRect.bottom >= childRect.bottom; + const parentRect = element.parentElement.getBoundingClientRect(); + const childRect = element.getBoundingClientRect(); - const significantOverlap = - (childRect.width * childRect.height) / - (parentRect.width * parentRect.height) > 0.5; + const fullyContained = + parentRect.left <= childRect.left && + parentRect.right >= childRect.right && + parentRect.top <= childRect.top && + parentRect.bottom >= childRect.bottom; - if (fullyContained && significantOverlap) { - // Only traverse up if next parent isn't body or html - const nextParent = element.parentElement; - if (nextParent.tagName.toLowerCase() !== 'body' && - nextParent.tagName.toLowerCase() !== 'html') { - element = nextParent; + const significantOverlap = + (childRect.width * childRect.height) / + (parentRect.width * parentRect.height) > 0.5; + + if (fullyContained && significantOverlap) { + const nextParent = element.parentElement; + if (nextParent.tagName.toLowerCase() !== 'body' && + nextParent.tagName.toLowerCase() !== 'html') { + element = nextParent; + } else { + break; + } } else { break; } - } else { - break; } } // } From fe38f5a581694561dccb852d7199af0066c75069 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Thu, 26 Dec 2024 19:27:59 +0530 Subject: [PATCH 13/53] feat: improve 70% class matching logic for scraping --- maxun-core/src/browserSide/scraper.js | 67 ++++++++++++++++++--------- 1 file changed, 44 insertions(+), 23 deletions(-) diff --git a/maxun-core/src/browserSide/scraper.js b/maxun-core/src/browserSide/scraper.js index 89d5abe4..0c5d74ac 100644 --- a/maxun-core/src/browserSide/scraper.js +++ b/maxun-core/src/browserSide/scraper.js @@ -329,32 +329,53 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, return rows.filter(row => row.getElementsByTagName('TH').length === 0); } - // Get all containers that match the listSelector + function calculateClassSimilarity(classList1, classList2) { + const set1 = new Set(classList1); + const set2 = new Set(classList2); + + // Calculate intersection + const intersection = new Set([...set1].filter(x => set2.has(x))); + + // Calculate union + const union = new Set([...set1, ...set2]); + + // Return Jaccard similarity coefficient + return intersection.size / union.size; + } + + // New helper function to find elements with similar classes + function findSimilarElements(baseElement, similarityThreshold = 0.7) { + const baseClasses = Array.from(baseElement.classList); + + if (baseClasses.length === 0) return []; + + const potentialElements = document.getElementsByTagName(baseElement.tagName); + + return Array.from(potentialElements).filter(element => { + if (element === baseElement) return false; + + const similarity = calculateClassSimilarity( + baseClasses, + Array.from(element.classList) + ); + + return similarity >= similarityThreshold; + }); + } + let containers = Array.from(document.querySelectorAll(listSelector)); if (containers.length === 0) return []; - if (limit > 1 && containers.length <= 1) { - const [containerSelector, _] = listSelector.split('>').map(s => s.trim()); - const container = document.querySelector(containerSelector); + if (limit > 1 && containers.length === 1) { + const baseContainer = containers[0]; + const similarContainers = findSimilarElements(baseContainer); - if (container) { - const allChildren = Array.from(container.children); - - const firstMatch = document.querySelector(listSelector); - if (firstMatch) { - // Get classes from the first matching element - const firstMatchClasses = Array.from(firstMatch.classList); - - // Find similar elements by matching most of their classes - containers = allChildren.filter(element => { - const elementClasses = Array.from(element.classList); - - // Element should share at least 70% of classes with the first match - const commonClasses = firstMatchClasses.filter(cls => - elementClasses.includes(cls)); - return commonClasses.length >= Math.floor(firstMatchClasses.length * 0.7); - }); - } + if (similarContainers.length > 0) { + const newContainers = similarContainers.filter(container => + !container.matches(listSelector) + ); + + containers = [...containers, ...newContainers]; } } @@ -389,7 +410,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, const tableData = []; const nonTableData = []; - // Process table fields across all containers + // Process table fields across all containers for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { const container = containers[containerIndex]; const { tableFields } = containerFields[containerIndex]; From ae6b7bc8be301235880a964a6514ec4b254574ee Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:37:24 +0530 Subject: [PATCH 14/53] feat: english work email login --- public/locales/en.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/en.json b/public/locales/en.json index 9dcad514..48f1dd05 100644 --- a/public/locales/en.json +++ b/public/locales/en.json @@ -1,7 +1,7 @@ { "login": { "title": "Welcome Back!", - "email": "Email", + "email": "Enter Work Email", "password": "Password", "button": "Login", "loading": "Loading", From f8575ac1973454443142f20f48051cd3abbb9cb4 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:38:00 +0530 Subject: [PATCH 15/53] feat: english work email register --- public/locales/en.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/en.json b/public/locales/en.json index 48f1dd05..c5a2ff4c 100644 --- a/public/locales/en.json +++ b/public/locales/en.json @@ -12,7 +12,7 @@ }, "register": { "title": "Register Account", - "email": "Email", + "email": "Enter Work Email", "password": "Password", "button": "Register", "loading": "Loading", From f9d0fe97e9c72214f323d9ccc204dcb447f849b5 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:40:23 +0530 Subject: [PATCH 16/53] feat: german work email login --- public/locales/de.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/de.json b/public/locales/de.json index 90beaa14..95ea260e 100644 --- a/public/locales/de.json +++ b/public/locales/de.json @@ -1,7 +1,7 @@ { "login": { "title": "Willkommen zurück!", - "email": "E-Mail", + "email": "Geben Sie Ihre geschäftliche E-Mail-Adresse ein", "password": "Passwort", "button": "Einloggen", "loading": "Lädt", From fa128679dc83613feb07c79663bf2d80e34d44a2 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:40:36 +0530 Subject: [PATCH 17/53] feat: german work email register --- public/locales/de.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/de.json b/public/locales/de.json index 95ea260e..411d8f22 100644 --- a/public/locales/de.json +++ b/public/locales/de.json @@ -12,7 +12,7 @@ }, "register": { "title": "Konto registrieren", - "email": "E-Mail", + "email": "Geben Sie Ihre geschäftliche E-Mail-Adresse ein", "password": "Passwort", "button": "Registrieren", "loading": "Lädt", From 0d51e7530f82a9fd7b1ca9b327d3ca9cdd4b0b0b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:41:30 +0530 Subject: [PATCH 18/53] feat: spanish work email login --- public/locales/es.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/es.json b/public/locales/es.json index 00fa379e..9ab6ec3a 100644 --- a/public/locales/es.json +++ b/public/locales/es.json @@ -1,7 +1,7 @@ { "login": { "title": "¡Bienvenido de nuevo!", - "email": "Correo electrónico", + "email": "Introducir correo electrónico de trabajo", "password": "Contraseña", "button": "Iniciar sesión", "loading": "Cargando", From ca76a792c08e879f66ea73c421ac367b9689c1d0 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:41:58 +0530 Subject: [PATCH 19/53] feat: spanish work email register --- public/locales/es.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/es.json b/public/locales/es.json index 9ab6ec3a..6e52cc6f 100644 --- a/public/locales/es.json +++ b/public/locales/es.json @@ -12,7 +12,7 @@ }, "register": { "title": "Crear cuenta", - "email": "Correo electrónico", + "email": "Introducir correo electrónico de trabajo", "password": "Contraseña", "button": "Registrarse", "loading": "Cargando", From 800e193cfde0745573611b5ddd8ec512143ac1ec Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:43:11 +0530 Subject: [PATCH 20/53] feat: japanese work email login --- public/locales/ja.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/ja.json b/public/locales/ja.json index b444c81a..0416184e 100644 --- a/public/locales/ja.json +++ b/public/locales/ja.json @@ -1,7 +1,7 @@ { "login": { "title": "お帰りなさい!", - "email": "メールアドレス", + "email": "勤務先メールアドレスを入力", "password": "パスワード", "button": "ログイン", "loading": "読み込み中", From 705408c014f9b1d54f0d086ecffd636ca825ad23 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:43:29 +0530 Subject: [PATCH 21/53] feat: japanese work email register --- public/locales/ja.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/ja.json b/public/locales/ja.json index 0416184e..9d2d9a89 100644 --- a/public/locales/ja.json +++ b/public/locales/ja.json @@ -12,7 +12,7 @@ }, "register": { "title": "アカウントを登録する", - "email": "メールアドレス", + "email": "勤務先メールアドレスを入力", "password": "パスワード", "button": "登録する", "loading": "読み込み中", From 230464d4901cd613cfdb33eb96fd7425560b5399 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:46:07 +0530 Subject: [PATCH 22/53] feat: chinese simplified work email register --- public/locales/zh.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/zh.json b/public/locales/zh.json index 27455ebe..61ec2f3d 100644 --- a/public/locales/zh.json +++ b/public/locales/zh.json @@ -1,7 +1,7 @@ { "login": { "title": "欢迎回来!", - "email": "电子邮箱", + "email": "输入工作电子邮箱", "password": "密码", "button": "登录", "loading": "加载中", From 25931aab4f31ff94538db5dd7ab0368f2dfd2725 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Thu, 26 Dec 2024 20:46:31 +0530 Subject: [PATCH 23/53] feat(translation): chinese simplified work email login --- public/locales/zh.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/zh.json b/public/locales/zh.json index 61ec2f3d..69561d5c 100644 --- a/public/locales/zh.json +++ b/public/locales/zh.json @@ -12,7 +12,7 @@ }, "register": { "title": "注册账号", - "email": "电子邮箱", + "email": "输入工作电子邮箱", "password": "密码", "button": "注册", "loading": "加载中", From 6ac3e19b82cbeada5030755afff4fdb5c1f82a83 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 28 Dec 2024 17:35:57 +0530 Subject: [PATCH 24/53] feat: add del functionality for text and list steps --- src/components/organisms/RightSidePanel.tsx | 56 ++++++++++++++++++++- 1 file changed, 54 insertions(+), 2 deletions(-) diff --git a/src/components/organisms/RightSidePanel.tsx b/src/components/organisms/RightSidePanel.tsx index 12f75028..403f78f6 100644 --- a/src/components/organisms/RightSidePanel.tsx +++ b/src/components/organisms/RightSidePanel.tsx @@ -169,6 +169,22 @@ export const RightSidePanel: React.FC = ({ onFinishCapture }); }; + const handleTextStepDelete = (id: number) => { + deleteBrowserStep(id); + setTextLabels(prevLabels => { + const { [id]: _, ...rest } = prevLabels; + return rest; + }); + setConfirmedTextSteps(prev => { + const { [id]: _, ...rest } = prev; + return rest; + }); + setErrors(prevErrors => { + const { [id]: _, ...rest } = prevErrors; + return rest; + }); + }; + const handleListTextFieldConfirm = (listId: number, fieldKey: string) => { setConfirmedListTextFields(prev => ({ ...prev, @@ -195,6 +211,22 @@ export const RightSidePanel: React.FC = ({ onFinishCapture }); }; + const handleListTextFieldDelete = (listId: number, fieldKey: string) => { + removeListTextField(listId, fieldKey); + setConfirmedListTextFields(prev => { + const updatedListFields = { ...(prev[listId] || {}) }; + delete updatedListFields[fieldKey]; + return { + ...prev, + [listId]: updatedListFields + }; + }); + setErrors(prev => { + const { [fieldKey]: _, ...rest } = prev; + return rest; + }); + }; + const getTextSettingsObject = useCallback(() => { const settings: Record = {}; browserSteps.forEach(step => { @@ -526,11 +558,21 @@ export const RightSidePanel: React.FC = ({ onFinishCapture ) }} /> - {!confirmedTextSteps[step.id] && ( + {!confirmedTextSteps[step.id] ? ( + ) : ( + + + )} )} @@ -578,7 +620,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture ) }} /> - {!confirmedListTextFields[step.id]?.[key] && ( + {!confirmedListTextFields[step.id]?.[key] ? ( + ) : ( + + + )} ))} From 3cf0b858933f0a9aa0541a7b714677600861939f Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 28 Dec 2024 17:39:04 +0530 Subject: [PATCH 25/53] feat: add lang translation for delete button --- public/locales/de.json | 3 ++- public/locales/en.json | 3 ++- public/locales/es.json | 3 ++- public/locales/ja.json | 3 ++- public/locales/zh.json | 3 ++- 5 files changed, 10 insertions(+), 5 deletions(-) diff --git a/public/locales/de.json b/public/locales/de.json index 411d8f22..db0ce562 100644 --- a/public/locales/de.json +++ b/public/locales/de.json @@ -162,7 +162,8 @@ "confirm_limit": "Limit bestätigen", "finish_capture": "Erfassung abschließen", "finish": "Fertig", - "cancel": "Abbrechen" + "cancel": "Abbrechen", + "delete": "Löschen" }, "screenshot": { "capture_fullpage": "Vollständige Seite erfassen", diff --git a/public/locales/en.json b/public/locales/en.json index c5a2ff4c..9b4defbc 100644 --- a/public/locales/en.json +++ b/public/locales/en.json @@ -163,7 +163,8 @@ "confirm_limit": "Confirm Limit", "finish_capture": "Finish Capture", "finish": "Finish", - "cancel": "Cancel" + "cancel": "Cancel", + "delete": "Delete" }, "screenshot": { "capture_fullpage": "Capture Fullpage", diff --git a/public/locales/es.json b/public/locales/es.json index 6e52cc6f..e897914e 100644 --- a/public/locales/es.json +++ b/public/locales/es.json @@ -163,7 +163,8 @@ "confirm_limit": "Confirmar Límite", "finish_capture": "Finalizar Captura", "finish": "Finalizar", - "cancel": "Cancelar" + "cancel": "Cancelar", + "delete": "Eliminar" }, "screenshot": { "capture_fullpage": "Capturar Página Completa", diff --git a/public/locales/ja.json b/public/locales/ja.json index 9d2d9a89..9ae226dc 100644 --- a/public/locales/ja.json +++ b/public/locales/ja.json @@ -163,7 +163,8 @@ "confirm_limit": "制限を確認", "finish_capture": "取得を完了", "finish": "完了", - "cancel": "キャンセル" + "cancel": "キャンセル", + "delete": "削除" }, "screenshot": { "capture_fullpage": "フルページを取得", diff --git a/public/locales/zh.json b/public/locales/zh.json index 69561d5c..344a58a7 100644 --- a/public/locales/zh.json +++ b/public/locales/zh.json @@ -163,7 +163,8 @@ "confirm_limit": "确认限制", "finish_capture": "完成捕获", "finish": "完成", - "cancel": "取消" + "cancel": "取消", + "delete": "删除" }, "screenshot": { "capture_fullpage": "捕获整页", From fd7e4ab626fe2b862de7fea14819be18c40012d8 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sat, 28 Dec 2024 18:11:24 +0530 Subject: [PATCH 26/53] feat: check confirm capture and render delete button --- src/components/organisms/RightSidePanel.tsx | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/src/components/organisms/RightSidePanel.tsx b/src/components/organisms/RightSidePanel.tsx index 403f78f6..c6b3479f 100644 --- a/src/components/organisms/RightSidePanel.tsx +++ b/src/components/organisms/RightSidePanel.tsx @@ -56,6 +56,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture const [showCaptureText, setShowCaptureText] = useState(true); const [hoverStates, setHoverStates] = useState<{ [id: string]: boolean }>({}); const [browserStepIdList, setBrowserStepIdList] = useState([]); + const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false); const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog } = useGlobalInfoStore(); const { getText, startGetText, stopGetText, getScreenshot, startGetScreenshot, stopGetScreenshot, getList, startGetList, stopGetList, startPaginationMode, stopPaginationMode, paginationType, updatePaginationType, limitType, customLimit, updateLimitType, updateCustomLimit, stopLimitMode, startLimitMode, captureStage, setCaptureStage } = useActionContext(); @@ -130,6 +131,11 @@ export const RightSidePanel: React.FC = ({ onFinishCapture const handlePairDelete = () => { } + const handleStartGetText = () => { + setIsCaptureTextConfirmed(false); + startGetText(); + } + const handleTextLabelChange = (id: number, label: string, listId?: number, fieldKey?: string) => { if (listId !== undefined && fieldKey !== undefined) { // Prevent editing if the field is confirmed @@ -256,6 +262,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture if (hasTextSteps) { socket?.emit('action', { action: 'scrapeSchema', settings }); } + setIsCaptureTextConfirmed(true); resetInterpretationLog(); onFinishCapture(); }, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog]); @@ -502,7 +509,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture )} - {!getText && !getScreenshot && !getList && showCaptureText && } + {!getText && !getScreenshot && !getList && showCaptureText && } {getText && <> @@ -563,7 +570,7 @@ export const RightSidePanel: React.FC = ({ onFinishCapture - ) : ( + ) : !isCaptureTextConfirmed && ( - ) : ( + ) : !isCaptureListConfirmed && ( + )} - + )} From dead389e480cae29978242047a554e3a3f780f30 Mon Sep 17 00:00:00 2001 From: RohitR311 Date: Sun, 29 Dec 2024 17:55:01 +0530 Subject: [PATCH 30/53] feat: add translation for back button of capture list action --- public/locales/de.json | 1 + public/locales/en.json | 1 + public/locales/es.json | 1 + public/locales/ja.json | 1 + public/locales/zh.json | 1 + 5 files changed, 5 insertions(+) diff --git a/public/locales/de.json b/public/locales/de.json index 411d8f22..c43b46a2 100644 --- a/public/locales/de.json +++ b/public/locales/de.json @@ -161,6 +161,7 @@ "confirm_pagination": "Paginierung bestätigen", "confirm_limit": "Limit bestätigen", "finish_capture": "Erfassung abschließen", + "back": "Zurück", "finish": "Fertig", "cancel": "Abbrechen" }, diff --git a/public/locales/en.json b/public/locales/en.json index c5a2ff4c..1a68faed 100644 --- a/public/locales/en.json +++ b/public/locales/en.json @@ -162,6 +162,7 @@ "confirm_pagination": "Confirm Pagination", "confirm_limit": "Confirm Limit", "finish_capture": "Finish Capture", + "back": "Back", "finish": "Finish", "cancel": "Cancel" }, diff --git a/public/locales/es.json b/public/locales/es.json index 6e52cc6f..b2fb55a4 100644 --- a/public/locales/es.json +++ b/public/locales/es.json @@ -162,6 +162,7 @@ "confirm_pagination": "Confirmar Paginación", "confirm_limit": "Confirmar Límite", "finish_capture": "Finalizar Captura", + "back": "Atrás", "finish": "Finalizar", "cancel": "Cancelar" }, diff --git a/public/locales/ja.json b/public/locales/ja.json index 9d2d9a89..e4073814 100644 --- a/public/locales/ja.json +++ b/public/locales/ja.json @@ -162,6 +162,7 @@ "confirm_pagination": "ページネーションを確認", "confirm_limit": "制限を確認", "finish_capture": "取得を完了", + "back": "戻る", "finish": "完了", "cancel": "キャンセル" }, diff --git a/public/locales/zh.json b/public/locales/zh.json index 69561d5c..d171c2c9 100644 --- a/public/locales/zh.json +++ b/public/locales/zh.json @@ -162,6 +162,7 @@ "confirm_pagination": "确认分页", "confirm_limit": "确认限制", "finish_capture": "完成捕获", + "back": "返回", "finish": "完成", "cancel": "取消" }, From 4031ded27947f7ac72f40c9203cb1498f0ac4460 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:26:53 +0530 Subject: [PATCH 31/53] feat: confirm instead of confirm pagination --- public/locales/en.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/en.json b/public/locales/en.json index 70ded861..cb3f2789 100644 --- a/public/locales/en.json +++ b/public/locales/en.json @@ -159,7 +159,7 @@ "confirm": "Confirm", "discard": "Discard", "confirm_capture": "Confirm Capture", - "confirm_pagination": "Confirm Pagination", + "confirm_pagination": "Confirm", "confirm_limit": "Confirm Limit", "finish_capture": "Finish Capture", "back": "Back", From 8baad8d1f90b9b36594206e91dc37304c68d0a9b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:27:23 +0530 Subject: [PATCH 32/53] feat: confirm instead of confirm limit --- public/locales/en.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/en.json b/public/locales/en.json index cb3f2789..bd8acce3 100644 --- a/public/locales/en.json +++ b/public/locales/en.json @@ -160,7 +160,7 @@ "discard": "Discard", "confirm_capture": "Confirm Capture", "confirm_pagination": "Confirm", - "confirm_limit": "Confirm Limit", + "confirm_limit": "Confirm", "finish_capture": "Finish Capture", "back": "Back", "finish": "Finish", From 663a4fd69c0f13998a61f13b1564cb8565bf942b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:29:24 +0530 Subject: [PATCH 33/53] feat(spanish): confirm instead of confirm pagination --- public/locales/es.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/es.json b/public/locales/es.json index 5cde0c70..089c10cd 100644 --- a/public/locales/es.json +++ b/public/locales/es.json @@ -159,7 +159,7 @@ "confirm": "Confirmar", "discard": "Descartar", "confirm_capture": "Confirmar Captura", - "confirm_pagination": "Confirmar Paginación", + "confirm_pagination": "Confirmar", "confirm_limit": "Confirmar Límite", "finish_capture": "Finalizar Captura", "back": "Atrás", From aded4dfebb42e765f8b22d1b01111c2f82fc73f3 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:29:41 +0530 Subject: [PATCH 34/53] feat(spanish): confirm instead of confirm limit --- public/locales/es.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/es.json b/public/locales/es.json index 089c10cd..94210880 100644 --- a/public/locales/es.json +++ b/public/locales/es.json @@ -160,7 +160,7 @@ "discard": "Descartar", "confirm_capture": "Confirmar Captura", "confirm_pagination": "Confirmar", - "confirm_limit": "Confirmar Límite", + "confirm_limit": "Confirmar", "finish_capture": "Finalizar Captura", "back": "Atrás", "finish": "Finalizar", From 09b974ca782e574240beab052c1a1e78e0316eac Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:30:38 +0530 Subject: [PATCH 35/53] feat(japanese): confirm instead of confirm pagination --- public/locales/ja.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/ja.json b/public/locales/ja.json index a0d18c67..e2204e14 100644 --- a/public/locales/ja.json +++ b/public/locales/ja.json @@ -159,7 +159,7 @@ "confirm": "確認", "discard": "破棄", "confirm_capture": "取得を確認", - "confirm_pagination": "ページネーションを確認", + "confirm_pagination": "確認", "confirm_limit": "制限を確認", "finish_capture": "取得を完了", "back": "戻る", From 20b31f36d99fe327075e9c104637b79b282edf87 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:30:57 +0530 Subject: [PATCH 36/53] feat(japanese): confirm instead of confirm limit --- public/locales/ja.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/ja.json b/public/locales/ja.json index e2204e14..0bcba967 100644 --- a/public/locales/ja.json +++ b/public/locales/ja.json @@ -160,7 +160,7 @@ "discard": "破棄", "confirm_capture": "取得を確認", "confirm_pagination": "確認", - "confirm_limit": "制限を確認", + "confirm_limit": "確認", "finish_capture": "取得を完了", "back": "戻る", "finish": "完了", From e78a61139d7fa828cf772ee4bc6c63889f77f3f8 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:31:50 +0530 Subject: [PATCH 37/53] feat(german): confirm instead of confirm pagination --- public/locales/de.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/de.json b/public/locales/de.json index debf80f6..e06b784a 100644 --- a/public/locales/de.json +++ b/public/locales/de.json @@ -158,7 +158,7 @@ "confirm": "Bestätigen", "discard": "Verwerfen", "confirm_capture": "Erfassung bestätigen", - "confirm_pagination": "Paginierung bestätigen", + "confirm_pagination": "Bestätigen", "confirm_limit": "Limit bestätigen", "finish_capture": "Erfassung abschließen", "back": "Zurück", From c753ce551200fa4b6431ff034694c6dcd343e516 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:32:16 +0530 Subject: [PATCH 38/53] feat(german): confirm instead of confirm limit --- public/locales/de.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/de.json b/public/locales/de.json index e06b784a..b9b4185b 100644 --- a/public/locales/de.json +++ b/public/locales/de.json @@ -159,7 +159,7 @@ "discard": "Verwerfen", "confirm_capture": "Erfassung bestätigen", "confirm_pagination": "Bestätigen", - "confirm_limit": "Limit bestätigen", + "confirm_limit": "Bestätigen", "finish_capture": "Erfassung abschließen", "back": "Zurück", "finish": "Fertig", From 634daeecf595cd2418913d9ae21689e55e5c2b39 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:32:56 +0530 Subject: [PATCH 39/53] feat(chinese): confirm instead of confirm pagination --- public/locales/zh.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/zh.json b/public/locales/zh.json index e55565f8..805396dc 100644 --- a/public/locales/zh.json +++ b/public/locales/zh.json @@ -159,7 +159,7 @@ "confirm": "确认", "discard": "放弃", "confirm_capture": "确认捕获", - "confirm_pagination": "确认分页", + "confirm_pagination": "确认", "confirm_limit": "确认限制", "finish_capture": "完成捕获", "back": "返回", From cd7f38f561a1ce2e4b596a25adeb48c5b6342f0f Mon Sep 17 00:00:00 2001 From: amhsirak Date: Mon, 30 Dec 2024 19:33:12 +0530 Subject: [PATCH 40/53] feat(chinese): confirm instead of confirm limit --- public/locales/zh.json | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/public/locales/zh.json b/public/locales/zh.json index 805396dc..a19fe439 100644 --- a/public/locales/zh.json +++ b/public/locales/zh.json @@ -160,7 +160,7 @@ "discard": "放弃", "confirm_capture": "确认捕获", "confirm_pagination": "确认", - "confirm_limit": "确认限制", + "confirm_limit": "确认", "finish_capture": "完成捕获", "back": "返回", "finish": "完成", From 4c0ad3ceed6a8b192d8e672b36908fd4db3871cd Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 31 Dec 2024 21:26:53 +0530 Subject: [PATCH 41/53] fix: avoid ui shift on api key reveal --- src/components/organisms/ApiKey.tsx | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/components/organisms/ApiKey.tsx b/src/components/organisms/ApiKey.tsx index 37a72764..0af27934 100644 --- a/src/components/organisms/ApiKey.tsx +++ b/src/components/organisms/ApiKey.tsx @@ -124,7 +124,11 @@ const ApiKeyManager = () => { {apiKeyName} - {showKey ? `${apiKey?.substring(0, 10)}...` : '***************'} + + + {showKey ? `${apiKey?.substring(0, 10)}...` : '**********'} + + From a3337d7fcc08a8436edadcb33b0214401a63e28d Mon Sep 17 00:00:00 2001 From: amhsirak Date: Tue, 31 Dec 2024 21:27:12 +0530 Subject: [PATCH 42/53] fix: format --- src/components/organisms/ApiKey.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/organisms/ApiKey.tsx b/src/components/organisms/ApiKey.tsx index 0af27934..9d54fe5c 100644 --- a/src/components/organisms/ApiKey.tsx +++ b/src/components/organisms/ApiKey.tsx @@ -126,7 +126,7 @@ const ApiKeyManager = () => { {apiKeyName} - {showKey ? `${apiKey?.substring(0, 10)}...` : '**********'} + {showKey ? `${apiKey?.substring(0, 10)}...` : '**********'} From 24915a93410aa1f309f27a7e18dd1bb0729f1b6f Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 22:55:33 +0530 Subject: [PATCH 43/53] feat: get notify from global info store --- package.json | 1 - src/components/molecules/ScheduleSettings.tsx | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/package.json b/package.json index e89f13de..fc5e9edb 100644 --- a/package.json +++ b/package.json @@ -46,7 +46,6 @@ "jwt-decode": "^4.0.0", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", - "maxun-core": "^0.0.7", "minio": "^8.0.1", "moment-timezone": "^0.5.45", "node-cron": "^3.0.3", diff --git a/src/components/molecules/ScheduleSettings.tsx b/src/components/molecules/ScheduleSettings.tsx index 3af0072f..ea78720c 100644 --- a/src/components/molecules/ScheduleSettings.tsx +++ b/src/components/molecules/ScheduleSettings.tsx @@ -79,7 +79,7 @@ export const ScheduleSettingsModal = ({ isOpen, handleStart, handleClose, initia 'SUNDAY' ]; - const { recordingId } = useGlobalInfoStore(); + const { recordingId, notify } = useGlobalInfoStore(); const deleteRobotSchedule = () => { if (recordingId) { From 8c2b92483b1a86996d970293f2a167ea05157af6 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 22:57:23 +0530 Subject: [PATCH 44/53] feat: notify on schedule delete --- src/components/molecules/ScheduleSettings.tsx | 1 + 1 file changed, 1 insertion(+) diff --git a/src/components/molecules/ScheduleSettings.tsx b/src/components/molecules/ScheduleSettings.tsx index ea78720c..917696c9 100644 --- a/src/components/molecules/ScheduleSettings.tsx +++ b/src/components/molecules/ScheduleSettings.tsx @@ -85,6 +85,7 @@ export const ScheduleSettingsModal = ({ isOpen, handleStart, handleClose, initia if (recordingId) { deleteSchedule(recordingId); setSchedule(null); + notify('success', t('Schedule deleted successfully')); } else { console.error('No recording id provided'); } From e61798855564427223f4c0177f45fedcbf854814 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:02:17 +0530 Subject: [PATCH 45/53] fix: revert local maxun-core changes --- package.json | 1 + 1 file changed, 1 insertion(+) diff --git a/package.json b/package.json index fc5e9edb..e89f13de 100644 --- a/package.json +++ b/package.json @@ -46,6 +46,7 @@ "jwt-decode": "^4.0.0", "loglevel": "^1.8.0", "loglevel-plugin-remote": "^0.6.8", + "maxun-core": "^0.0.7", "minio": "^8.0.1", "moment-timezone": "^0.5.45", "node-cron": "^3.0.3", From 9b2ea1f5353466f63a92fefc8921bc1271619339 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:29:06 +0530 Subject: [PATCH 46/53] chore: cleanup space --- src/components/molecules/RecordingsTable.tsx | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/components/molecules/RecordingsTable.tsx b/src/components/molecules/RecordingsTable.tsx index 01bc524b..ecda3c8d 100644 --- a/src/components/molecules/RecordingsTable.tsx +++ b/src/components/molecules/RecordingsTable.tsx @@ -33,10 +33,6 @@ interface Column { format?: (value: string) => string; } - - - - interface Data { id: string; name: string; From cc6cc8ff8d03b17e942335bcd3770ad6d12f49aa Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:30:00 +0530 Subject: [PATCH 47/53] fix: format --- src/components/molecules/RecordingsTable.tsx | 1 - 1 file changed, 1 deletion(-) diff --git a/src/components/molecules/RecordingsTable.tsx b/src/components/molecules/RecordingsTable.tsx index ecda3c8d..f8a0ba37 100644 --- a/src/components/molecules/RecordingsTable.tsx +++ b/src/components/molecules/RecordingsTable.tsx @@ -437,7 +437,6 @@ const OptionsButton = ({ handleEdit, handleDelete, handleDuplicate }: OptionsBut {t('recordingtable.duplicate')} - ); From 6d2507982077daceebcba4eba02d44c0c28bab58 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:30:46 +0530 Subject: [PATCH 48/53] fix: format --- src/api/storage.ts | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/api/storage.ts b/src/api/storage.ts index 4b2f4e80..9ae3bc47 100644 --- a/src/api/storage.ts +++ b/src/api/storage.ts @@ -5,11 +5,6 @@ import { ScheduleSettings } from "../components/molecules/ScheduleSettings"; import { CreateRunResponse, ScheduleRunResponse } from "../pages/MainPage"; import { apiUrl } from "../apiConfig"; - - - - - export const getStoredRecordings = async (): Promise => { try { const response = await axios.get(`${apiUrl}/storage/recordings`); From 3b9e30ddae52efa4e450a948c393132dee883b67 Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:31:21 +0530 Subject: [PATCH 49/53] fix: format --- src/api/storage.ts | 20 +++++--------------- 1 file changed, 5 insertions(+), 15 deletions(-) diff --git a/src/api/storage.ts b/src/api/storage.ts index 9ae3bc47..18c793c0 100644 --- a/src/api/storage.ts +++ b/src/api/storage.ts @@ -77,11 +77,7 @@ export const getStoredRecording = async (id: string) => { } } - - export const checkRunsForRecording = async (id: string): Promise => { - - try { const response = await axios.get(`${apiUrl}/storage/recordings/${id}/runs`); @@ -94,32 +90,26 @@ export const checkRunsForRecording = async (id: string): Promise => { } }; - export const deleteRecordingFromStorage = async (id: string): Promise => { - const hasRuns = await checkRunsForRecording(id); - + if (hasRuns) { - + return false; } try { const response = await axios.delete(`${apiUrl}/storage/recordings/${id}`); if (response.status === 200) { - + return true; } else { throw new Error(`Couldn't delete stored recording ${id}`); } } catch (error: any) { console.log(error); - + return false; } - - - - }; export const deleteRunFromStorage = async (id: string): Promise => { @@ -154,7 +144,7 @@ export const createRunForStoredRecording = async (id: string, settings: RunSetti try { const response = await axios.put( `${apiUrl}/storage/runs/${id}`, - { ...settings }); + { ...settings }); if (response.status === 200) { return response.data; } else { From 52aefd1c0f674cb0cac87e25124dd8c56027712b Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:31:37 +0530 Subject: [PATCH 50/53] fix: format --- src/api/workflow.ts | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/api/workflow.ts b/src/api/workflow.ts index 03b677b1..40ac0d99 100644 --- a/src/api/workflow.ts +++ b/src/api/workflow.ts @@ -3,7 +3,7 @@ import { emptyWorkflow } from "../shared/constants"; import { default as axios, AxiosResponse } from "axios"; import { apiUrl } from "../apiConfig"; -export const getActiveWorkflow = async(id: string) : Promise => { +export const getActiveWorkflow = async (id: string): Promise => { try { const response = await axios.get(`${apiUrl}/workflow/${id}`) if (response.status === 200) { @@ -11,13 +11,13 @@ export const getActiveWorkflow = async(id: string) : Promise => { } else { throw new Error('Something went wrong when fetching a recorded workflow'); } - } catch(error: any) { + } catch (error: any) { console.log(error); return emptyWorkflow; } }; -export const getParamsOfActiveWorkflow = async(id: string) : Promise => { +export const getParamsOfActiveWorkflow = async (id: string): Promise => { try { const response = await axios.get(`${apiUrl}/workflow/params/${id}`) if (response.status === 200) { @@ -25,15 +25,15 @@ export const getParamsOfActiveWorkflow = async(id: string) : Promise => { +export const deletePair = async (index: number): Promise => { try { - const response = await axios.delete(`${apiUrl}/workflow/pair/${index}`); + const response = await axios.delete(`${apiUrl}/workflow/pair/${index}`); if (response.status === 200) { return response.data; } else { @@ -45,11 +45,11 @@ export const deletePair = async(index: number): Promise => { } }; -export const AddPair = async(index: number, pair: WhereWhatPair): Promise => { +export const AddPair = async (index: number, pair: WhereWhatPair): Promise => { try { const response = await axios.post(`${apiUrl}/workflow/pair/${index}`, { pair, - }, {headers: {'Content-Type': 'application/json'}}); + }, { headers: { 'Content-Type': 'application/json' } }); if (response.status === 200) { return response.data; } else { @@ -61,11 +61,11 @@ export const AddPair = async(index: number, pair: WhereWhatPair): Promise => { +export const UpdatePair = async (index: number, pair: WhereWhatPair): Promise => { try { const response = await axios.put(`${apiUrl}/workflow/pair/${index}`, { pair, - }, {headers: {'Content-Type': 'application/json'}}); + }, { headers: { 'Content-Type': 'application/json' } }); if (response.status === 200) { return response.data; } else { From 735b33b84727439c4c354021a06016166661902c Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:53:58 +0530 Subject: [PATCH 51/53] fix: typo --- src/components/molecules/NavBar.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/molecules/NavBar.tsx b/src/components/molecules/NavBar.tsx index 142d45ab..8577f30e 100644 --- a/src/components/molecules/NavBar.tsx +++ b/src/components/molecules/NavBar.tsx @@ -318,7 +318,7 @@ export const NavBar: React.FC = ({ { window.open('https://x.com/maxun_io?ref=app', '_blank'); }}> - Twiiter (X) + Twiter (X) {t('navbar.menu_items.language')} From 22a99ff8b56788635581517bc863a3115b6b79db Mon Sep 17 00:00:00 2001 From: amhsirak Date: Wed, 1 Jan 2025 23:54:28 +0530 Subject: [PATCH 52/53] fix: twitter typo --- src/components/molecules/NavBar.tsx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/components/molecules/NavBar.tsx b/src/components/molecules/NavBar.tsx index 8577f30e..8aeeb05d 100644 --- a/src/components/molecules/NavBar.tsx +++ b/src/components/molecules/NavBar.tsx @@ -318,7 +318,7 @@ export const NavBar: React.FC = ({ { window.open('https://x.com/maxun_io?ref=app', '_blank'); }}> - Twiter (X) + Twitter (X) {t('navbar.menu_items.language')} From ec0bc75097c287a9ffce1b0fcc47600a96f781c8 Mon Sep 17 00:00:00 2001 From: Karishma Shukla Date: Thu, 2 Jan 2025 18:12:10 +0530 Subject: [PATCH 53/53] docs: update website to maxun.dev --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index cebcedd2..47e170b5 100644 --- a/README.md +++ b/README.md @@ -15,7 +15,7 @@ Maxun lets you train a robot in 2 minutes and scrape the web on auto-pilot. Web

- Website | + Website | Discord | Twitter | Join Maxun Cloud |