Merge pull request #462 from getmaxun/improve-extract

feat: improve highlighting and data extraction
This commit is contained in:
Karishma Shukla
2025-03-05 02:33:37 +05:30
committed by GitHub
4 changed files with 1157 additions and 431 deletions

View File

@@ -210,7 +210,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return Array.from(document.querySelectorAll(config.selector));
}
// First handle iframe traversal if present
if (config.selector.includes(':>>')) {
const parts = config.selector.split(':>>').map(s => s.trim());
let currentElements = [document];
@@ -223,23 +222,44 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (const element of currentElements) {
try {
// For document or iframe document
const doc = element.contentDocument || element || element.contentWindow?.document;
if (!doc) continue;
// Query elements in current context
if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
if (nameMatch && nameMatch[1]) {
const frameName = nameMatch[1];
let foundFrames = [];
if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
foundFrames = Array.from(doc.getElementsByName(frameName))
.filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
}
if (foundFrames.length === 0) {
const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
foundFrames = framesBySelector;
}
if (isLast) {
nextElements.push(...foundFrames);
} else {
nextElements.push(...foundFrames);
}
continue;
}
}
const found = Array.from(doc.querySelectorAll(part));
if (isLast) {
// If it's the last part, keep all matching elements
nextElements.push(...found);
} else {
// If not last, only keep iframes for next iteration
const iframes = found.filter(el => el.tagName === 'IFRAME');
nextElements.push(...iframes);
const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
nextElements.push(...frames);
}
} catch (error) {
console.warn('Cannot access iframe content:', error, {
console.warn('Cannot access iframe/frame content:', error, {
part,
element,
index: i
@@ -285,12 +305,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return [];
}
// Modified to handle iframe context for URL resolution
function getElementValue(element, attribute) {
if (!element) return null;
// Get the base URL for resolving relative URLs
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
let baseURL;
try {
baseURL = element.ownerDocument?.location?.href ||
element.ownerDocument?.baseURI ||
window.location.origin;
} catch (e) {
baseURL = window.location.origin;
}
switch (attribute) {
case 'href': {
@@ -305,6 +330,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return element.innerText?.trim();
case 'textContent':
return element.textContent?.trim();
case 'innerHTML':
return element.innerHTML;
case 'outerHTML':
return element.outerHTML;
default:
return element.getAttribute(attribute) || element.innerText?.trim();
}
@@ -394,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
*/
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
// Enhanced query function to handle both iframe and shadow DOM
// Enhanced query function to handle iframe, frame and shadow DOM
const queryElement = (rootElement, selector) => {
if (!selector.includes('>>') && !selector.includes(':>>')) {
return rootElement.querySelector(selector);
@@ -406,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
for (let i = 0; i < parts.length; i++) {
if (!currentElement) return null;
// Handle iframe traversal
if (currentElement.tagName === 'IFRAME') {
// Handle iframe and frame traversal
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
try {
const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
currentElement = iframeDoc.querySelector(parts[i]);
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
currentElement = frameDoc.querySelector(parts[i]);
continue;
} catch (e) {
console.warn('Cannot access iframe content:', e);
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
return null;
}
}
@@ -456,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const nextElements = [];
for (const element of currentElements) {
// Handle iframe traversal
if (element.tagName === 'IFRAME') {
// Handle iframe and frame traversal
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
try {
const iframeDoc = element.contentDocument || element.contentWindow.document;
nextElements.push(...iframeDoc.querySelectorAll(part));
const frameDoc = element.contentDocument || element.contentWindow.document;
nextElements.push(...frameDoc.querySelectorAll(part));
} catch (e) {
console.warn('Cannot access iframe content:', e);
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
continue;
}
} else {
@@ -537,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
return { type: 'TR', element: currentElement };
}
// Handle iframe crossing
if (currentElement.tagName === 'IFRAME') {
// Handle iframe and frame crossing
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
try {
currentElement = currentElement.contentDocument.body;
} catch (e) {
@@ -582,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
if (current.tagName === 'TH') return true;
if (current.tagName === 'IFRAME') {
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
try {
current = current.contentDocument.body;
} catch (e) {
@@ -638,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
}
// Get elements from iframes
const iframes = document.getElementsByTagName('iframe');
for (const iframe of iframes) {
// Get elements from iframes and frames
const frames = [
...Array.from(document.getElementsByTagName('iframe')),
...Array.from(document.getElementsByTagName('frame'))
];
for (const frame of frames) {
try {
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
const frameDoc = frame.contentDocument || frame.contentWindow.document;
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
} catch (e) {
console.warn('Cannot access iframe content:', e);
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
}
}
@@ -707,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const tableData = [];
const nonTableData = [];
// Process table data with both iframe and shadow DOM support
// Process table data with support for iframes, frames, and shadow DOM
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex];
@@ -717,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
const firstElement = queryElement(container, firstField.selector);
let tableContext = firstElement;
// Find table context including both iframe and shadow DOM
// Find table context including iframe, frame and shadow DOM
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = tableContext.getRootNode().host;
continue;
}
if (tableContext.tagName === 'IFRAME') {
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
try {
tableContext = tableContext.contentDocument.body;
} catch (e) {
@@ -747,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
}
// Get rows from iframes
if (tableContext.tagName === 'IFRAME') {
// Get rows from iframes and frames
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
try {
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
rows.push(...iframeDoc.getElementsByTagName('TR'));
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
rows.push(...frameDoc.getElementsByTagName('TR'));
} catch (e) {
console.warn('Cannot access iframe rows:', e);
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
}
}
@@ -823,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
}
}
// Process non-table data with both contexts support
// Process non-table data with all contexts support
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
if (nonTableData.length >= limit) break;

View File

@@ -286,6 +286,12 @@ export default class Interpreter extends EventEmitter {
? arrayToObject(<any>superset[key])
: superset[key];
if ((key === 'url' || key === 'selectors') &&
Array.isArray(value) && Array.isArray(superset[key]) &&
value.length === 0 && (superset[key] as any[]).length === 0) {
return true;
}
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
return value.some(selector =>
(superset[key] as any[]).includes(selector)
@@ -592,33 +598,52 @@ export default class Interpreter extends EventEmitter {
};
// Enhanced button finder with retry mechanism
const findWorkingButton = async (selectors: string[], retryCount = 0): Promise<{
button: ElementHandle | null,
workingSelector: string | null
const findWorkingButton = async (selectors: string[]): Promise<{
button: ElementHandle | null,
workingSelector: string | null,
updatedSelectors: string[]
}> => {
for (const selector of selectors) {
try {
const button = await page.waitForSelector(selector, {
state: 'attached',
timeout: 10000 // Reduced timeout for faster checks
});
if (button) {
debugLog('Found working selector:', selector);
return { button, workingSelector: selector };
let updatedSelectors = [...selectors];
for (let i = 0; i < selectors.length; i++) {
const selector = selectors[i];
let retryCount = 0;
let selectorSuccess = false;
while (retryCount < MAX_RETRIES && !selectorSuccess) {
try {
const button = await page.waitForSelector(selector, {
state: 'attached',
timeout: 10000
});
if (button) {
debugLog('Found working selector:', selector);
return {
button,
workingSelector: selector,
updatedSelectors
};
}
} catch (error) {
retryCount++;
debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
if (retryCount < MAX_RETRIES) {
await page.waitForTimeout(RETRY_DELAY);
} else {
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
updatedSelectors = updatedSelectors.filter(s => s !== selector);
}
}
} catch (error) {
debugLog(`Selector failed: ${selector}`);
}
}
// Implement retry mechanism when no selectors work
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
await page.waitForTimeout(RETRY_DELAY);
return findWorkingButton(selectors, retryCount + 1);
}
return { button: null, workingSelector: null };
return {
button: null,
workingSelector: null,
updatedSelectors
};
};
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
@@ -680,7 +705,10 @@ export default class Interpreter extends EventEmitter {
await scrapeCurrentPage();
if (checkLimit()) return allResults;
const { button, workingSelector } = await findWorkingButton(availableSelectors);
const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
availableSelectors = updatedSelectors;
if (!button || !workingSelector) {
// Final retry for navigation when no selectors work
const success = await retryOperation(async () => {
@@ -697,10 +725,6 @@ export default class Interpreter extends EventEmitter {
break;
}
availableSelectors = availableSelectors.slice(
availableSelectors.indexOf(workingSelector)
);
let retryCount = 0;
let navigationSuccess = false;
@@ -768,22 +792,25 @@ export default class Interpreter extends EventEmitter {
}
case 'clickLoadMore': {
await scrapeCurrentPage();
if (checkLimit()) return allResults;
let loadMoreCounter = 0;
let previousResultCount = allResults.length;
let noNewItemsCounter = 0;
const MAX_NO_NEW_ITEMS = 2;
while (true) {
// Find working button with retry mechanism, consistent with clickNext
const { button: loadMoreButton, workingSelector } = await findWorkingButton(availableSelectors);
// Find working button with retry mechanism
const { button: loadMoreButton, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
availableSelectors = updatedSelectors;
if (!workingSelector || !loadMoreButton) {
debugLog('No working Load More selector found after retries');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
// Update available selectors to start from the working one
availableSelectors = availableSelectors.slice(
availableSelectors.indexOf(workingSelector)
);
// Implement retry mechanism for clicking the button
let retryCount = 0;
let clickSuccess = false;
@@ -808,6 +835,8 @@ export default class Interpreter extends EventEmitter {
if (clickSuccess) {
await page.waitForTimeout(1000);
loadMoreCounter++;
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
}
} catch (error) {
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
@@ -822,8 +851,6 @@ export default class Interpreter extends EventEmitter {
if (!clickSuccess) {
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
@@ -833,20 +860,34 @@ export default class Interpreter extends EventEmitter {
await page.waitForTimeout(2000);
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
if (currentHeight === previousHeight) {
debugLog('No more items loaded after Load More');
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
allResults = allResults.concat(finalResults);
return allResults;
}
const heightChanged = currentHeight !== previousHeight;
previousHeight = currentHeight;
if (config.limit && allResults.length >= config.limit) {
allResults = allResults.slice(0, config.limit);
break;
await scrapeCurrentPage();
const currentResultCount = allResults.length;
const newItemsAdded = currentResultCount > previousResultCount;
if (!newItemsAdded) {
noNewItemsCounter++;
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
return allResults;
}
} else {
noNewItemsCounter = 0;
previousResultCount = currentResultCount;
}
if (checkLimit()) return allResults;
if (!heightChanged) {
debugLog('No more items loaded after Load More');
return allResults;
}
}
break;
}
default: {

View File

@@ -825,6 +825,7 @@ export class WorkflowGenerator {
selectors?.testIdSelector,
selectors?.id,
selectors?.hrefSelector,
selectors?.relSelector,
selectors?.accessibilitySelector,
selectors?.attrSelector
]

File diff suppressed because it is too large Load Diff