Merge pull request #462 from getmaxun/improve-extract
feat: improve highlighting and data extraction
This commit is contained in:
@@ -210,7 +210,6 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return Array.from(document.querySelectorAll(config.selector));
|
||||
}
|
||||
|
||||
// First handle iframe traversal if present
|
||||
if (config.selector.includes(':>>')) {
|
||||
const parts = config.selector.split(':>>').map(s => s.trim());
|
||||
let currentElements = [document];
|
||||
@@ -223,23 +222,44 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
for (const element of currentElements) {
|
||||
try {
|
||||
// For document or iframe document
|
||||
const doc = element.contentDocument || element || element.contentWindow?.document;
|
||||
if (!doc) continue;
|
||||
|
||||
// Query elements in current context
|
||||
if (part.startsWith('frame[name=') || part.startsWith('iframe[name=')) {
|
||||
const nameMatch = part.match(/\[name=['"]([^'"]+)['"]\]/);
|
||||
if (nameMatch && nameMatch[1]) {
|
||||
const frameName = nameMatch[1];
|
||||
let foundFrames = [];
|
||||
|
||||
if (doc.getElementsByName && typeof doc.getElementsByName === 'function') {
|
||||
foundFrames = Array.from(doc.getElementsByName(frameName))
|
||||
.filter(el => el.tagName === 'FRAME' || el.tagName === 'IFRAME');
|
||||
}
|
||||
|
||||
if (foundFrames.length === 0) {
|
||||
const framesBySelector = Array.from(doc.querySelectorAll(`frame[name="${frameName}"], iframe[name="${frameName}"]`));
|
||||
foundFrames = framesBySelector;
|
||||
}
|
||||
|
||||
if (isLast) {
|
||||
nextElements.push(...foundFrames);
|
||||
} else {
|
||||
nextElements.push(...foundFrames);
|
||||
}
|
||||
continue;
|
||||
}
|
||||
}
|
||||
|
||||
const found = Array.from(doc.querySelectorAll(part));
|
||||
|
||||
if (isLast) {
|
||||
// If it's the last part, keep all matching elements
|
||||
nextElements.push(...found);
|
||||
} else {
|
||||
// If not last, only keep iframes for next iteration
|
||||
const iframes = found.filter(el => el.tagName === 'IFRAME');
|
||||
nextElements.push(...iframes);
|
||||
const frames = found.filter(el => el.tagName === 'IFRAME' || el.tagName === 'FRAME');
|
||||
nextElements.push(...frames);
|
||||
}
|
||||
} catch (error) {
|
||||
console.warn('Cannot access iframe content:', error, {
|
||||
console.warn('Cannot access iframe/frame content:', error, {
|
||||
part,
|
||||
element,
|
||||
index: i
|
||||
@@ -285,12 +305,17 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return [];
|
||||
}
|
||||
|
||||
// Modified to handle iframe context for URL resolution
|
||||
function getElementValue(element, attribute) {
|
||||
if (!element) return null;
|
||||
|
||||
// Get the base URL for resolving relative URLs
|
||||
const baseURL = element.ownerDocument?.location?.href || window.location.origin;
|
||||
let baseURL;
|
||||
try {
|
||||
baseURL = element.ownerDocument?.location?.href ||
|
||||
element.ownerDocument?.baseURI ||
|
||||
window.location.origin;
|
||||
} catch (e) {
|
||||
baseURL = window.location.origin;
|
||||
}
|
||||
|
||||
switch (attribute) {
|
||||
case 'href': {
|
||||
@@ -305,6 +330,10 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return element.innerText?.trim();
|
||||
case 'textContent':
|
||||
return element.textContent?.trim();
|
||||
case 'innerHTML':
|
||||
return element.innerHTML;
|
||||
case 'outerHTML':
|
||||
return element.outerHTML;
|
||||
default:
|
||||
return element.getAttribute(attribute) || element.innerText?.trim();
|
||||
}
|
||||
@@ -394,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
||||
*/
|
||||
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
||||
// Enhanced query function to handle both iframe and shadow DOM
|
||||
// Enhanced query function to handle iframe, frame and shadow DOM
|
||||
const queryElement = (rootElement, selector) => {
|
||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
||||
return rootElement.querySelector(selector);
|
||||
@@ -406,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
for (let i = 0; i < parts.length; i++) {
|
||||
if (!currentElement) return null;
|
||||
|
||||
// Handle iframe traversal
|
||||
if (currentElement.tagName === 'IFRAME') {
|
||||
// Handle iframe and frame traversal
|
||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||
try {
|
||||
const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
||||
currentElement = iframeDoc.querySelector(parts[i]);
|
||||
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
||||
currentElement = frameDoc.querySelector(parts[i]);
|
||||
continue;
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe content:', e);
|
||||
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -456,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const nextElements = [];
|
||||
|
||||
for (const element of currentElements) {
|
||||
// Handle iframe traversal
|
||||
if (element.tagName === 'IFRAME') {
|
||||
// Handle iframe and frame traversal
|
||||
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
||||
try {
|
||||
const iframeDoc = element.contentDocument || element.contentWindow.document;
|
||||
nextElements.push(...iframeDoc.querySelectorAll(part));
|
||||
const frameDoc = element.contentDocument || element.contentWindow.document;
|
||||
nextElements.push(...frameDoc.querySelectorAll(part));
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe content:', e);
|
||||
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
@@ -537,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return { type: 'TR', element: currentElement };
|
||||
}
|
||||
|
||||
// Handle iframe crossing
|
||||
if (currentElement.tagName === 'IFRAME') {
|
||||
// Handle iframe and frame crossing
|
||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||
try {
|
||||
currentElement = currentElement.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -582,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
if (current.tagName === 'TH') return true;
|
||||
|
||||
if (current.tagName === 'IFRAME') {
|
||||
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
||||
try {
|
||||
current = current.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -638,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
||||
}
|
||||
|
||||
// Get elements from iframes
|
||||
const iframes = document.getElementsByTagName('iframe');
|
||||
for (const iframe of iframes) {
|
||||
// Get elements from iframes and frames
|
||||
const frames = [
|
||||
...Array.from(document.getElementsByTagName('iframe')),
|
||||
...Array.from(document.getElementsByTagName('frame'))
|
||||
];
|
||||
|
||||
for (const frame of frames) {
|
||||
try {
|
||||
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
|
||||
allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
|
||||
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
||||
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe content:', e);
|
||||
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -707,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const tableData = [];
|
||||
const nonTableData = [];
|
||||
|
||||
// Process table data with both iframe and shadow DOM support
|
||||
// Process table data with support for iframes, frames, and shadow DOM
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
const container = containers[containerIndex];
|
||||
const { tableFields } = containerFields[containerIndex];
|
||||
@@ -717,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const firstElement = queryElement(container, firstField.selector);
|
||||
let tableContext = firstElement;
|
||||
|
||||
// Find table context including both iframe and shadow DOM
|
||||
// Find table context including iframe, frame and shadow DOM
|
||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
||||
tableContext = tableContext.getRootNode().host;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tableContext.tagName === 'IFRAME') {
|
||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||
try {
|
||||
tableContext = tableContext.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -747,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
||||
}
|
||||
|
||||
// Get rows from iframes
|
||||
if (tableContext.tagName === 'IFRAME') {
|
||||
// Get rows from iframes and frames
|
||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||
try {
|
||||
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
||||
rows.push(...iframeDoc.getElementsByTagName('TR'));
|
||||
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
||||
rows.push(...frameDoc.getElementsByTagName('TR'));
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe rows:', e);
|
||||
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -823,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
}
|
||||
}
|
||||
|
||||
// Process non-table data with both contexts support
|
||||
// Process non-table data with all contexts support
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
if (nonTableData.length >= limit) break;
|
||||
|
||||
|
||||
@@ -286,6 +286,12 @@ export default class Interpreter extends EventEmitter {
|
||||
? arrayToObject(<any>superset[key])
|
||||
: superset[key];
|
||||
|
||||
if ((key === 'url' || key === 'selectors') &&
|
||||
Array.isArray(value) && Array.isArray(superset[key]) &&
|
||||
value.length === 0 && (superset[key] as any[]).length === 0) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
|
||||
return value.some(selector =>
|
||||
(superset[key] as any[]).includes(selector)
|
||||
@@ -592,33 +598,52 @@ export default class Interpreter extends EventEmitter {
|
||||
};
|
||||
|
||||
// Enhanced button finder with retry mechanism
|
||||
const findWorkingButton = async (selectors: string[], retryCount = 0): Promise<{
|
||||
button: ElementHandle | null,
|
||||
workingSelector: string | null
|
||||
const findWorkingButton = async (selectors: string[]): Promise<{
|
||||
button: ElementHandle | null,
|
||||
workingSelector: string | null,
|
||||
updatedSelectors: string[]
|
||||
}> => {
|
||||
for (const selector of selectors) {
|
||||
try {
|
||||
const button = await page.waitForSelector(selector, {
|
||||
state: 'attached',
|
||||
timeout: 10000 // Reduced timeout for faster checks
|
||||
});
|
||||
if (button) {
|
||||
debugLog('Found working selector:', selector);
|
||||
return { button, workingSelector: selector };
|
||||
let updatedSelectors = [...selectors];
|
||||
|
||||
for (let i = 0; i < selectors.length; i++) {
|
||||
const selector = selectors[i];
|
||||
let retryCount = 0;
|
||||
let selectorSuccess = false;
|
||||
|
||||
while (retryCount < MAX_RETRIES && !selectorSuccess) {
|
||||
try {
|
||||
const button = await page.waitForSelector(selector, {
|
||||
state: 'attached',
|
||||
timeout: 10000
|
||||
});
|
||||
|
||||
if (button) {
|
||||
debugLog('Found working selector:', selector);
|
||||
return {
|
||||
button,
|
||||
workingSelector: selector,
|
||||
updatedSelectors
|
||||
};
|
||||
}
|
||||
} catch (error) {
|
||||
retryCount++;
|
||||
debugLog(`Selector "${selector}" failed: attempt ${retryCount}/${MAX_RETRIES}`);
|
||||
|
||||
if (retryCount < MAX_RETRIES) {
|
||||
await page.waitForTimeout(RETRY_DELAY);
|
||||
} else {
|
||||
debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
|
||||
updatedSelectors = updatedSelectors.filter(s => s !== selector);
|
||||
}
|
||||
}
|
||||
} catch (error) {
|
||||
debugLog(`Selector failed: ${selector}`);
|
||||
}
|
||||
}
|
||||
|
||||
// Implement retry mechanism when no selectors work
|
||||
if (selectors.length > 0 && retryCount < MAX_RETRIES) {
|
||||
debugLog(`Retry attempt ${retryCount + 1} of ${MAX_RETRIES}`);
|
||||
await page.waitForTimeout(RETRY_DELAY);
|
||||
return findWorkingButton(selectors, retryCount + 1);
|
||||
}
|
||||
|
||||
return { button: null, workingSelector: null };
|
||||
|
||||
return {
|
||||
button: null,
|
||||
workingSelector: null,
|
||||
updatedSelectors
|
||||
};
|
||||
};
|
||||
|
||||
const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
|
||||
@@ -680,7 +705,10 @@ export default class Interpreter extends EventEmitter {
|
||||
await scrapeCurrentPage();
|
||||
if (checkLimit()) return allResults;
|
||||
|
||||
const { button, workingSelector } = await findWorkingButton(availableSelectors);
|
||||
const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
|
||||
|
||||
availableSelectors = updatedSelectors;
|
||||
|
||||
if (!button || !workingSelector) {
|
||||
// Final retry for navigation when no selectors work
|
||||
const success = await retryOperation(async () => {
|
||||
@@ -697,10 +725,6 @@ export default class Interpreter extends EventEmitter {
|
||||
break;
|
||||
}
|
||||
|
||||
availableSelectors = availableSelectors.slice(
|
||||
availableSelectors.indexOf(workingSelector)
|
||||
);
|
||||
|
||||
let retryCount = 0;
|
||||
let navigationSuccess = false;
|
||||
|
||||
@@ -768,22 +792,25 @@ export default class Interpreter extends EventEmitter {
|
||||
}
|
||||
|
||||
case 'clickLoadMore': {
|
||||
await scrapeCurrentPage();
|
||||
if (checkLimit()) return allResults;
|
||||
|
||||
let loadMoreCounter = 0;
|
||||
let previousResultCount = allResults.length;
|
||||
let noNewItemsCounter = 0;
|
||||
const MAX_NO_NEW_ITEMS = 2;
|
||||
|
||||
while (true) {
|
||||
// Find working button with retry mechanism, consistent with clickNext
|
||||
const { button: loadMoreButton, workingSelector } = await findWorkingButton(availableSelectors);
|
||||
// Find working button with retry mechanism
|
||||
const { button: loadMoreButton, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);
|
||||
|
||||
availableSelectors = updatedSelectors;
|
||||
|
||||
if (!workingSelector || !loadMoreButton) {
|
||||
debugLog('No working Load More selector found after retries');
|
||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||
allResults = allResults.concat(finalResults);
|
||||
return allResults;
|
||||
}
|
||||
|
||||
// Update available selectors to start from the working one
|
||||
availableSelectors = availableSelectors.slice(
|
||||
availableSelectors.indexOf(workingSelector)
|
||||
);
|
||||
|
||||
// Implement retry mechanism for clicking the button
|
||||
let retryCount = 0;
|
||||
let clickSuccess = false;
|
||||
@@ -808,6 +835,8 @@ export default class Interpreter extends EventEmitter {
|
||||
|
||||
if (clickSuccess) {
|
||||
await page.waitForTimeout(1000);
|
||||
loadMoreCounter++;
|
||||
debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
|
||||
}
|
||||
} catch (error) {
|
||||
debugLog(`Click attempt ${retryCount + 1} failed completely.`);
|
||||
@@ -822,8 +851,6 @@ export default class Interpreter extends EventEmitter {
|
||||
|
||||
if (!clickSuccess) {
|
||||
debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
|
||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||
allResults = allResults.concat(finalResults);
|
||||
return allResults;
|
||||
}
|
||||
|
||||
@@ -833,20 +860,34 @@ export default class Interpreter extends EventEmitter {
|
||||
await page.waitForTimeout(2000);
|
||||
|
||||
const currentHeight = await page.evaluate(() => document.body.scrollHeight);
|
||||
if (currentHeight === previousHeight) {
|
||||
debugLog('No more items loaded after Load More');
|
||||
const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||
allResults = allResults.concat(finalResults);
|
||||
return allResults;
|
||||
}
|
||||
const heightChanged = currentHeight !== previousHeight;
|
||||
previousHeight = currentHeight;
|
||||
|
||||
if (config.limit && allResults.length >= config.limit) {
|
||||
allResults = allResults.slice(0, config.limit);
|
||||
break;
|
||||
await scrapeCurrentPage();
|
||||
|
||||
const currentResultCount = allResults.length;
|
||||
const newItemsAdded = currentResultCount > previousResultCount;
|
||||
|
||||
if (!newItemsAdded) {
|
||||
noNewItemsCounter++;
|
||||
debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);
|
||||
|
||||
if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
|
||||
debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
|
||||
return allResults;
|
||||
}
|
||||
} else {
|
||||
noNewItemsCounter = 0;
|
||||
previousResultCount = currentResultCount;
|
||||
}
|
||||
|
||||
if (checkLimit()) return allResults;
|
||||
|
||||
if (!heightChanged) {
|
||||
debugLog('No more items loaded after Load More');
|
||||
return allResults;
|
||||
}
|
||||
}
|
||||
break;
|
||||
}
|
||||
|
||||
default: {
|
||||
|
||||
@@ -825,6 +825,7 @@ export class WorkflowGenerator {
|
||||
selectors?.testIdSelector,
|
||||
selectors?.id,
|
||||
selectors?.hrefSelector,
|
||||
selectors?.relSelector,
|
||||
selectors?.accessibilitySelector,
|
||||
selectors?.attrSelector
|
||||
]
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
Reference in New Issue
Block a user