feat: add frame element support for scrapeList action
This commit is contained in:
@@ -423,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
||||
*/
|
||||
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
||||
// Enhanced query function to handle both iframe and shadow DOM
|
||||
// Enhanced query function to handle iframe, frame and shadow DOM
|
||||
const queryElement = (rootElement, selector) => {
|
||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
||||
return rootElement.querySelector(selector);
|
||||
@@ -435,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
for (let i = 0; i < parts.length; i++) {
|
||||
if (!currentElement) return null;
|
||||
|
||||
// Handle iframe traversal
|
||||
if (currentElement.tagName === 'IFRAME') {
|
||||
// Handle iframe and frame traversal
|
||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||
try {
|
||||
const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
||||
currentElement = iframeDoc.querySelector(parts[i]);
|
||||
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
||||
currentElement = frameDoc.querySelector(parts[i]);
|
||||
continue;
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe content:', e);
|
||||
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
||||
return null;
|
||||
}
|
||||
}
|
||||
@@ -485,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const nextElements = [];
|
||||
|
||||
for (const element of currentElements) {
|
||||
// Handle iframe traversal
|
||||
if (element.tagName === 'IFRAME') {
|
||||
// Handle iframe and frame traversal
|
||||
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
||||
try {
|
||||
const iframeDoc = element.contentDocument || element.contentWindow.document;
|
||||
nextElements.push(...iframeDoc.querySelectorAll(part));
|
||||
const frameDoc = element.contentDocument || element.contentWindow.document;
|
||||
nextElements.push(...frameDoc.querySelectorAll(part));
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe content:', e);
|
||||
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
||||
continue;
|
||||
}
|
||||
} else {
|
||||
@@ -566,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return { type: 'TR', element: currentElement };
|
||||
}
|
||||
|
||||
// Handle iframe crossing
|
||||
if (currentElement.tagName === 'IFRAME') {
|
||||
// Handle iframe and frame crossing
|
||||
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||
try {
|
||||
currentElement = currentElement.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -611,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
if (current.tagName === 'TH') return true;
|
||||
|
||||
if (current.tagName === 'IFRAME') {
|
||||
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
||||
try {
|
||||
current = current.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -667,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
||||
}
|
||||
|
||||
// Get elements from iframes
|
||||
const iframes = document.getElementsByTagName('iframe');
|
||||
for (const iframe of iframes) {
|
||||
// Get elements from iframes and frames
|
||||
const frames = [
|
||||
...Array.from(document.getElementsByTagName('iframe')),
|
||||
...Array.from(document.getElementsByTagName('frame'))
|
||||
];
|
||||
|
||||
for (const frame of frames) {
|
||||
try {
|
||||
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
|
||||
allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
|
||||
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
||||
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe content:', e);
|
||||
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -736,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const tableData = [];
|
||||
const nonTableData = [];
|
||||
|
||||
// Process table data with both iframe and shadow DOM support
|
||||
// Process table data with support for iframes, frames, and shadow DOM
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
const container = containers[containerIndex];
|
||||
const { tableFields } = containerFields[containerIndex];
|
||||
@@ -746,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
const firstElement = queryElement(container, firstField.selector);
|
||||
let tableContext = firstElement;
|
||||
|
||||
// Find table context including both iframe and shadow DOM
|
||||
// Find table context including iframe, frame and shadow DOM
|
||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
||||
tableContext = tableContext.getRootNode().host;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (tableContext.tagName === 'IFRAME') {
|
||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||
try {
|
||||
tableContext = tableContext.contentDocument.body;
|
||||
} catch (e) {
|
||||
@@ -776,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
||||
}
|
||||
|
||||
// Get rows from iframes
|
||||
if (tableContext.tagName === 'IFRAME') {
|
||||
// Get rows from iframes and frames
|
||||
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||
try {
|
||||
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
||||
rows.push(...iframeDoc.getElementsByTagName('TR'));
|
||||
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
||||
rows.push(...frameDoc.getElementsByTagName('TR'));
|
||||
} catch (e) {
|
||||
console.warn('Cannot access iframe rows:', e);
|
||||
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
||||
}
|
||||
}
|
||||
|
||||
@@ -852,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
}
|
||||
}
|
||||
|
||||
// Process non-table data with both contexts support
|
||||
// Process non-table data with all contexts support
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
if (nonTableData.length >= limit) break;
|
||||
|
||||
|
||||
Reference in New Issue
Block a user