feat: add frame element support for scrapeList action
This commit is contained in:
@@ -423,7 +423,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
||||||
*/
|
*/
|
||||||
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
||||||
// Enhanced query function to handle both iframe and shadow DOM
|
// Enhanced query function to handle iframe, frame and shadow DOM
|
||||||
const queryElement = (rootElement, selector) => {
|
const queryElement = (rootElement, selector) => {
|
||||||
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
if (!selector.includes('>>') && !selector.includes(':>>')) {
|
||||||
return rootElement.querySelector(selector);
|
return rootElement.querySelector(selector);
|
||||||
@@ -435,14 +435,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
for (let i = 0; i < parts.length; i++) {
|
for (let i = 0; i < parts.length; i++) {
|
||||||
if (!currentElement) return null;
|
if (!currentElement) return null;
|
||||||
|
|
||||||
// Handle iframe traversal
|
// Handle iframe and frame traversal
|
||||||
if (currentElement.tagName === 'IFRAME') {
|
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
const frameDoc = currentElement.contentDocument || currentElement.contentWindow.document;
|
||||||
currentElement = iframeDoc.querySelector(parts[i]);
|
currentElement = frameDoc.querySelector(parts[i]);
|
||||||
continue;
|
continue;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe content:', e);
|
console.warn(`Cannot access ${currentElement.tagName.toLowerCase()} content:`, e);
|
||||||
return null;
|
return null;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@@ -485,13 +485,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const nextElements = [];
|
const nextElements = [];
|
||||||
|
|
||||||
for (const element of currentElements) {
|
for (const element of currentElements) {
|
||||||
// Handle iframe traversal
|
// Handle iframe and frame traversal
|
||||||
if (element.tagName === 'IFRAME') {
|
if (element.tagName === 'IFRAME' || element.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = element.contentDocument || element.contentWindow.document;
|
const frameDoc = element.contentDocument || element.contentWindow.document;
|
||||||
nextElements.push(...iframeDoc.querySelectorAll(part));
|
nextElements.push(...frameDoc.querySelectorAll(part));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe content:', e);
|
console.warn(`Cannot access ${element.tagName.toLowerCase()} content:`, e);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
@@ -566,8 +566,8 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
return { type: 'TR', element: currentElement };
|
return { type: 'TR', element: currentElement };
|
||||||
}
|
}
|
||||||
|
|
||||||
// Handle iframe crossing
|
// Handle iframe and frame crossing
|
||||||
if (currentElement.tagName === 'IFRAME') {
|
if (currentElement.tagName === 'IFRAME' || currentElement.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
currentElement = currentElement.contentDocument.body;
|
currentElement = currentElement.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -611,7 +611,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
|
|
||||||
if (current.tagName === 'TH') return true;
|
if (current.tagName === 'TH') return true;
|
||||||
|
|
||||||
if (current.tagName === 'IFRAME') {
|
if (current.tagName === 'IFRAME' || current.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
current = current.contentDocument.body;
|
current = current.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -667,14 +667,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
allElements.push(...shadowHost.getElementsByTagName(baseElement.tagName));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get elements from iframes
|
// Get elements from iframes and frames
|
||||||
const iframes = document.getElementsByTagName('iframe');
|
const frames = [
|
||||||
for (const iframe of iframes) {
|
...Array.from(document.getElementsByTagName('iframe')),
|
||||||
|
...Array.from(document.getElementsByTagName('frame'))
|
||||||
|
];
|
||||||
|
|
||||||
|
for (const frame of frames) {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = iframe.contentDocument || iframe.contentWindow.document;
|
const frameDoc = frame.contentDocument || frame.contentWindow.document;
|
||||||
allElements.push(...iframeDoc.getElementsByTagName(baseElement.tagName));
|
allElements.push(...frameDoc.getElementsByTagName(baseElement.tagName));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe content:', e);
|
console.warn(`Cannot access ${frame.tagName.toLowerCase()} content:`, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -736,7 +740,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const tableData = [];
|
const tableData = [];
|
||||||
const nonTableData = [];
|
const nonTableData = [];
|
||||||
|
|
||||||
// Process table data with both iframe and shadow DOM support
|
// Process table data with support for iframes, frames, and shadow DOM
|
||||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||||
const container = containers[containerIndex];
|
const container = containers[containerIndex];
|
||||||
const { tableFields } = containerFields[containerIndex];
|
const { tableFields } = containerFields[containerIndex];
|
||||||
@@ -746,14 +750,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
const firstElement = queryElement(container, firstField.selector);
|
const firstElement = queryElement(container, firstField.selector);
|
||||||
let tableContext = firstElement;
|
let tableContext = firstElement;
|
||||||
|
|
||||||
// Find table context including both iframe and shadow DOM
|
// Find table context including iframe, frame and shadow DOM
|
||||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
||||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
||||||
tableContext = tableContext.getRootNode().host;
|
tableContext = tableContext.getRootNode().host;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
|
|
||||||
if (tableContext.tagName === 'IFRAME') {
|
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
tableContext = tableContext.contentDocument.body;
|
tableContext = tableContext.contentDocument.body;
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
@@ -776,13 +780,13 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get rows from iframes
|
// Get rows from iframes and frames
|
||||||
if (tableContext.tagName === 'IFRAME') {
|
if (tableContext.tagName === 'IFRAME' || tableContext.tagName === 'FRAME') {
|
||||||
try {
|
try {
|
||||||
const iframeDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
const frameDoc = tableContext.contentDocument || tableContext.contentWindow.document;
|
||||||
rows.push(...iframeDoc.getElementsByTagName('TR'));
|
rows.push(...frameDoc.getElementsByTagName('TR'));
|
||||||
} catch (e) {
|
} catch (e) {
|
||||||
console.warn('Cannot access iframe rows:', e);
|
console.warn(`Cannot access ${tableContext.tagName.toLowerCase()} rows:`, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -852,7 +856,7 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Process non-table data with both contexts support
|
// Process non-table data with all contexts support
|
||||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||||
if (nonTableData.length >= limit) break;
|
if (nonTableData.length >= limit) break;
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user