Merge pull request #462 from getmaxun/improve-extract

feat: improve highlighting and data extraction
This commit is contained in:
Karishma Shukla
2025-03-05 02:33:37 +05:30
committed by GitHub
4 changed files with 1157 additions and 431 deletions

View File

@@ -210,7 +210,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return Array.from(document.querySelectorAll(config.selector)); return Array.from(document.querySelectorAll(config.selector));
} }
// First handle iframe traversal if present
if (config.selector.includes(':>>')) { if (config.selector.includes(':>>')) {
const parts = config.selector.split(':>>').map(s => s.trim()); const parts = config.selector.split(':>>').map(s => s.trim());
let currentElements = [document]; let currentElements = [document];
@@ -223,23 +222,44 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (const element of currentElements) { for (const element of currentElements) {
try { try {
// For document or iframe document
const doc = element.contentDocument || element || element.contentWindow?.document; const doc = element.contentDocument || element || element.contentWindow?.document;
if (!doc) continue; if (!doc) continue;
// Query elements in current context if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
if (nameMatch && nameMatch[1]) {
const frameName = nameMatch[1];
let foundFrames = [];
if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
foundFrames = Array.from(doc.getElementsByName(frameName))
.filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
}
if (foundFrames.length === 0) {
const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
foundFrames = framesBySelector;
}
if (isLast) {
nextElements.push(...foundFrames);
} else {
nextElements.push(...foundFrames);
}
continue;
}
}
const found = Array.from(doc.querySelectorAll(part)); const found = Array.from(doc.querySelectorAll(part));
if (isLast) { if (isLast) {
// If it's the last part, keep all matching elements
nextElements.push(...found); nextElements.push(...found);
} else { } else {
// If not last, only keep iframes for next iteration const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
const iframes = found.filter(el => el.tagName === 'IFRAME'); nextElements.push(...frames);
nextElements.push(...iframes);
} }
} catch (error) { } catch (error) {
console.warn('Cannot access iframe content:', error, { console.warn('Cannot access iframe/frame content:', error, {
part, part,
element, element,
index: i index: i
@@ -285,12 +305,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return []; return [];
} }
// Modified to handle iframe context for URL resolution
function getElementValue(element, attribute) { function getElementValue(element, attribute) {
if (!element) return null; if (!element) return null;
// Get the base URL for resolving relative URLs let baseURL;
const baseURL = element.ownerDocument?.location?.href || window.location.origin; try {
baseURL = element.ownerDocument?.location?.href ||
element.ownerDocument?.baseURI ||
window.location.origin;
} catch (e) {
baseURL = window.location.origin;
}
switch (attribute) { switch (attribute) {
case 'href': { case 'href': {
@@ -305,6 +330,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return element.innerText?.trim(); return element.innerText?.trim();
case 'textContent': case 'textContent':
return element.textContent?.trim(); return element.textContent?.trim();
case 'innerHTML':
return element.innerHTML;
case 'outerHTML':
return element.outerHTML;
default: default:
return element.getAttribute(attribute) || element.innerText?.trim(); return element.getAttribute(attribute) || element.innerText?.trim();
} }
@@ -394,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list * @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/ */
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) { window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
// Enhanced query function to handle both iframe and shadow DOM // Enhanced query function to handle iframe, frame and shadow DOM
const queryElement = (rootElement, selector) => { const queryElement = (rootElement, selector) => {
if (!selector.includes('>>') && !selector.includes(':>>')) { if (!selector.includes('>>') && !selector.includes(':>>')) {
return rootElement.querySelector(selector); return rootElement.querySelector(selector);
@@ -406,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (let i = 0; i < parts.length; i++) { for (let i = 0; i < parts.length; i++) {
if (!currentElement) return null; if (!currentElement) return null;
// Handle iframe traversal // Handle iframe and frame traversal
if (currentElement.tagName === 'IFRAME') { if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
try { try {
const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document; const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
currentElement = iframeDoc.querySelector(parts[i]); currentElement = frameDoc.querySelector(parts[i]);
continue; continue;
} catch (e) { } catch (e) {
console.warn('Cannot access iframe content:', e); console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
return null; return null;
} }
} }
@@ -456,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const nextElements = []; const nextElements = [];
for (const element of currentElements) { for (const element of currentElements) {
// Handle iframe traversal // Handle iframe and frame traversal
if (element.tagName === 'IFRAME') { if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
try { try {
const iframeDoc = element.contentDocument || element.contentWindow.document; const frameDoc = element.contentDocument || element.contentWindow.document;
nextElements.push(...iframeDoc.querySelectorAll(part)); nextElements.push(...frameDoc.querySelectorAll(part));
} catch (e) { } catch (e) {
console.warn('Cannot access iframe content:', e); console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
continue; continue;
} }
} else { } else {
@@ -537,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return { type: 'TR', element: currentElement }; return { type: 'TR', element: currentElement };
} }
// Handle iframe crossing // Handle iframe and frame crossing
if (currentElement.tagName === 'IFRAME') { if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
try { try {
currentElement = currentElement.contentDocument.body; currentElement = currentElement.contentDocument.body;
} catch (e) { } catch (e) {
@@ -582,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (current.tagName === 'TH') return true; if (current.tagName === 'TH') return true;
if (current.tagName === 'IFRAME') { if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
try { try {
current = current.contentDocument.body; current = current.contentDocument.body;
} catch (e) { } catch (e) {
@@ -638,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName)); allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
} }
// Get elements from iframes // Get elements from iframes and frames
const iframes = document.getElementsByTagName('iframe'); const frames = [
for (const iframe of iframes) { ...Array.from(document.getElementsByTagName('iframe')),
...Array.from(document.getElementsByTagName('frame'))
];
for (const frame of frames) {
try { try {
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document; const frameDoc = frame.contentDocument || frame.contentWindow.document;
allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName)); allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
} catch (e) { } catch (e) {
console.warn('Cannot access iframe content:', e); console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
} }
} }
@@ -707,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const tableData = []; const tableData = [];
const nonTableData = []; const nonTableData = [];
// Process table data with both iframe and shadow DOM support // Process table data with support for iframes, frames, and shadow DOM
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
const container = containers[containerIndex]; const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex]; const { tableFields } = containerFields[containerIndex];
@@ -717,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const firstElement = queryElement(container, firstField.selector); const firstElement = queryElement(container, firstField.selector);
let tableContext = firstElement; let tableContext = firstElement;
// Find table context including both iframe and shadow DOM // Find table context including iframe, frame and shadow DOM
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) { while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
if (tableContext.getRootNode() instanceof ShadowRoot) { if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = tableContext.getRootNode().host; tableContext = tableContext.getRootNode().host;
continue; continue;
} }
if (tableContext.tagName === 'IFRAME') { if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
try { try {
tableContext = tableContext.contentDocument.body; tableContext = tableContext.contentDocument.body;
} catch (e) { } catch (e) {
@@ -747,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR')); rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
} }
// Get rows from iframes // Get rows from iframes and frames
if (tableContext.tagName === 'IFRAME') { if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
try { try {
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document; const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
rows.push(...iframeDoc.getElementsByTagName('TR')); rows.push(...frameDoc.getElementsByTagName('TR'));
} catch (e) { } catch (e) {
console.warn('Cannot access iframe rows:', e); console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
} }
} }
@@ -823,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
} }
} }
// Process non-table data with both contexts support // Process non-table data with all contexts support
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) { for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
if (nonTableData.length >= limit) break; if (nonTableData.length >= limit) break;

View File

@@ -286,6 +286,12 @@ export default class Interpreter extends EventEmitter {
? arrayToObject(<any>superset[key]) ? arrayToObject(<any>superset[key])
: superset[key]; : superset[key];
if ((key === 'url' || key === 'selectors') &&
Array.isArray(value) && Array.isArray(superset[key]) &&
value.length === 0 && (superset[key] as any[]).length === 0) {
return true;
}
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) { if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
return value.some(selector => return value.some(selector =>
(superset[key] as any[]).includes(selector) (superset[key] as any[]).includes(selector)
@@ -592,33 +598,52 @@ export default class Interpreter extends EventEmitter {
}; };
// Enhanced button finder with retry mechanism // Enhanced button finder with retry mechanism
const findWorkingButton = async (selectors: string[], retryCount = 0): Promise<{ const findWorkingButton = async (selectors: string[]): Promise<{
button: ElementHandle | null, button: ElementHandle | null,
workingSelector: string | null workingSelector: string | null,
updatedSelectors: string[]
}> => { }> => {
for (const selector of selectors) { let updatedSelectors = [...selectors];
try {
const button = await page.waitForSelector(selector, { for (let i = 0; i < selectors.length; i++) {
state: 'attached', const selector = selectors[i];
timeout: 10000 // Reduced timeout for faster checks let retryCount = 0;
}); let selectorSuccess = false;
if (button) {
debugLog('Found working selector:', selector); while (retryCount < MAX_RETRIES && !selectorSuccess) {
return { button, workingSelector: selector }; try {
const button = await page.waitForSelector(selector, {
state: 'attached',
timeout: 10000
});
if (button) {
debugLog('Found working selector:', selector);
return {
button,
workingSelector: selector,
updatedSelectors
};
}
} catch (error) {
retryCount++;
debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
if (retryCount < MAX_RETRIES) {
await page.waitForTimeout(RETRY_DELAY);
} else {
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
updatedSelectors = updatedSelectors.filter(s => s !== selector);
}
} }
} catch (error) {
debugLog(`Selector failed: ${selector}`);
} }
} }
// Implement retry mechanism when no selectors work return {
if (selectors.length > 0 && retryCount < MAX_RETRIES) { button: null,
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`); workingSelector: null,
await page.waitForTimeout(RETRY_DELAY); updatedSelectors
return findWorkingButton(selectors, retryCount + 1); };
}
return { button: null, workingSelector: null };
}; };
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => { const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
@@ -680,7 +705,10 @@ export default class Interpreter extends EventEmitter {
await scrapeCurrentPage(); await scrapeCurrentPage();
if (checkLimit()) return allResults; if (checkLimit()) return allResults;
const { button, workingSelector } = await findWorkingButton(availableSelectors); const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
availableSelectors = updatedSelectors;
if (!button || !workingSelector) { if (!button || !workingSelector) {
// Final retry for navigation when no selectors work // Final retry for navigation when no selectors work
const success = await retryOperation(async () => { const success = await retryOperation(async () => {
@@ -697,10 +725,6 @@ export default class Interpreter extends EventEmitter {
break; break;
} }
availableSelectors = availableSelectors.slice(
availableSelectors.indexOf(workingSelector)
);
let retryCount = 0; let retryCount = 0;
let navigationSuccess = false; let navigationSuccess = false;
@@ -768,22 +792,25 @@ export default class Interpreter extends EventEmitter {
} }
case 'clickLoadMore': { case 'clickLoadMore': {
await scrapeCurrentPage();
if (checkLimit()) return allResults;
let loadMoreCounter = 0;
let previousResultCount = allResults.length;
let noNewItemsCounter = 0;
const MAX_NO_NEW_ITEMS = 2;
while (true) { while (true) {
// Find working button with retry mechanism, consistent with clickNext // Find working button with retry mechanism
const { button: loadMoreButton, workingSelector } = await findWorkingButton(availableSelectors); const { button: loadMoreButton, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
availableSelectors = updatedSelectors;
if (!workingSelector || !loadMoreButton) { if (!workingSelector || !loadMoreButton) {
debugLog('No working Load More selector found after retries'); debugLog('No working Load More selector found after retries');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults; return allResults;
} }
// Update available selectors to start from the working one
availableSelectors = availableSelectors.slice(
availableSelectors.indexOf(workingSelector)
);
// Implement retry mechanism for clicking the button // Implement retry mechanism for clicking the button
let retryCount = 0; let retryCount = 0;
let clickSuccess = false; let clickSuccess = false;
@@ -808,6 +835,8 @@ export default class Interpreter extends EventEmitter {
if (clickSuccess) { if (clickSuccess) {
await page.waitForTimeout(1000); await page.waitForTimeout(1000);
loadMoreCounter++;
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
} }
} catch (error) { } catch (error) {
debugLog(`Click attempt ${retryCount + 1} failed completely.`); debugLog(`Click attempt ${retryCount + 1} failed completely.`);
@@ -822,8 +851,6 @@ export default class Interpreter extends EventEmitter {
if (!clickSuccess) { if (!clickSuccess) {
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`); debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults; return allResults;
} }
@@ -833,20 +860,34 @@ export default class Interpreter extends EventEmitter {
await page.waitForTimeout(2000); await page.waitForTimeout(2000);
const currentHeight = await page.evaluate(() => document.body.scrollHeight); const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) { const heightChanged = currentHeight !== previousHeight;
debugLog('No more items loaded after Load More');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
previousHeight = currentHeight; previousHeight = currentHeight;
if (config.limit && allResults.length >= config.limit) { await scrapeCurrentPage();
allResults = allResults.slice(0, config.limit);
break; const currentResultCount = allResults.length;
const newItemsAdded = currentResultCount > previousResultCount;
if (!newItemsAdded) {
noNewItemsCounter++;
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
return allResults;
}
} else {
noNewItemsCounter = 0;
previousResultCount = currentResultCount;
}
if (checkLimit()) return allResults;
if (!heightChanged) {
debugLog('No more items loaded after Load More');
return allResults;
} }
} }
break;
} }
default: { default: {

View File

@@ -825,6 +825,7 @@ export class WorkflowGenerator {
selectors?.testIdSelector, selectors?.testIdSelector,
selectors?.id, selectors?.id,
selectors?.hrefSelector, selectors?.hrefSelector,
selectors?.relSelector,
selectors?.accessibilitySelector, selectors?.accessibilitySelector,
selectors?.attrSelector selectors?.attrSelector
] ]

File diff suppressed because it is too large Load Diff