Merge pull request #296 from getmaxun/shadow-dom
feat: shadow dom selection
This commit is contained in:
@@ -188,69 +188,154 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
* @param {Object.<string, {selector: string, tag: string}>} lists The named lists of HTML elements.
|
||||
* @returns {Array.<Object.<string, string>>}
|
||||
*/
|
||||
window.scrapeSchema = function (lists) {
  // Separator between the light-DOM host selector and each nested
  // shadow-root selector, e.g. "my-card >> .title".
  const SHADOW_SEPARATOR = '>>';

  /**
   * Returns every element matching a list config.
   *
   * Plain selectors are queried from `document`. When `config.shadow` is set
   * and the selector contains ">>", each segment after the first is queried
   * inside the open shadow roots of the elements matched by the previous
   * segment.
   *
   * @param {{selector: string, shadow?: *}} config
   * @returns {Array} Matching elements (empty when any segment matches nothing).
   */
  function findAllElements(config) {
    if (!config.shadow || !config.selector.includes(SHADOW_SEPARATOR)) {
      return Array.from(document.querySelectorAll(config.selector));
    }

    const [firstPart, ...shadowParts] = config.selector
      .split(SHADOW_SEPARATOR)
      .map((part) => part.trim());

    let current = Array.from(document.querySelectorAll(firstPart));

    for (const part of shadowParts) {
      const next = [];
      for (const host of current) {
        const root = host.shadowRoot;
        // Only open shadow roots can be traversed from page script.
        if (!root || root.mode !== 'open') continue;
        for (const match of root.querySelectorAll(part)) {
          next.push(match);
        }
      }
      if (next.length === 0) return [];
      current = next;
    }

    return current;
  }

  /**
   * Resolves a possibly-relative URL against the document base URL.
   * Returns null for an empty value, and the raw value when it cannot be
   * parsed, so one malformed attribute does not abort the whole scrape.
   */
  function resolveUrl(rawUrl) {
    if (!rawUrl) return null;
    try {
      // document.baseURI (not location.origin) so that path-relative URLs
      // such as "page2.html" resolve the same way the browser resolves them.
      return new URL(rawUrl, document.baseURI).href;
    } catch {
      return rawUrl;
    }
  }

  /**
   * Extracts the requested value from an element.
   *
   * @param {Element|null|undefined} element
   * @param {string} attribute 'href', 'src', 'innerText', 'textContent', or
   *   any attribute name (falls back to trimmed innerText when absent).
   * @returns {string|null|undefined}
   */
  function getElementValue(element, attribute) {
    if (!element) return null;

    switch (attribute) {
      case 'href':
      case 'src':
        return resolveUrl(element.getAttribute(attribute));
      case 'innerText':
        return element.innerText?.trim();
      case 'textContent':
        return element.textContent?.trim();
      default:
        return element.getAttribute(attribute) || element.innerText?.trim();
    }
  }

  /**
   * Finds the minimal bounding element of each seed element: the highest
   * ancestor whose parent still contains only that one seed element.
   */
  function getMBEs(elements) {
    return elements.map((element) => {
      let candidate = element;
      const isUniqueChild = (e) => elements
        .filter((elem) => e.parentNode?.contains(elem))
        .length === 1;

      while (candidate && isUniqueChild(candidate)) {
        candidate = candidate.parentNode;
      }

      return candidate;
    });
  }

  const entries = Object.entries(lists);
  if (entries.length === 0) return [];

  // Query each selector once and reuse the result for seed selection, MBE
  // matching and the independent fallback below.
  const elementsByKey = new Map(
    entries.map(([key, config]) => [key, findAllElements(config)]),
  );

  // Seed list = the first key with the largest number of matched elements.
  let seedName = entries[0][0];
  for (const [key] of entries) {
    if (elementsByKey.get(key).length > elementsByKey.get(seedName).length) {
      seedName = key;
    }
  }

  // First try the MBE approach: one record per seed element, each field taken
  // from the first matching element inside that seed's bounding element.
  const MBEs = getMBEs(elementsByKey.get(seedName));

  const mbeResults = MBEs.map((mbe) => Object.fromEntries(
    entries.map(([key, config]) => {
      const elem = elementsByKey.get(key).find((el) => mbe.contains(el));
      return [key, elem ? getElementValue(elem, config.attribute) : undefined];
    }),
  ));

  const hasMissingValue = mbeResults.some(
    (result) => Object.values(result).some((value) => value === undefined),
  );
  if (!hasMissingValue) return mbeResults;

  // Fall back to independent scraping: pair up the n-th match of every
  // selector regardless of where it sits in the DOM.
  const results = [];
  for (const [key, config] of entries) {
    elementsByKey.get(key).forEach((element, index) => {
      if (!results[index]) {
        results[index] = {};
      }
      results[index][key] = getElementValue(element, config.attribute);
    });
  }

  return results.filter((result) => Object.keys(result).length > 0);
};
|
||||
|
||||
/**
|
||||
* Scrapes multiple lists of similar items based on a template item.
|
||||
@@ -262,10 +347,91 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
* @returns {Array.<Array.<Object>>} Array of arrays of scraped items, one sub-array per list
|
||||
*/
|
||||
window.scrapeList = async function ({ listSelector, fields, limit = 10 }) {
|
||||
// Helper function to extract values from elements
|
||||
// Shadow DOM query functions remain unchanged
|
||||
const queryShadowDOM = (rootElement, selector) => {
  // Plain selector: a single scoped lookup is enough.
  if (selector.indexOf('>>') === -1) {
    return rootElement.querySelector(selector);
  }

  // Looks for `part` in the node's own tree, then in its shadow root, then in
  // the shadow roots of its direct children. Returns the first hit, or the
  // last (falsy) lookup result when nothing matched.
  const searchOneLevel = (node, part) => {
    let match = node.querySelector(part);

    if (!match && node.shadowRoot) {
      match = node.shadowRoot.querySelector(part);
    }
    if (match) return match;

    for (const child of Array.from(node.children || [])) {
      if (!child.shadowRoot) continue;
      match = child.shadowRoot.querySelector(part);
      if (match) return match;
    }
    return match;
  };

  // Walk the ">>"-separated segments, narrowing to one element per segment.
  let node = rootElement;
  for (const part of selector.split('>>').map((segment) => segment.trim())) {
    if (!node) return null;

    // A node that can neither be queried nor has a shadow root cannot scope
    // the lookup, so this segment is resolved from the document instead.
    const canScopeLookup = node.querySelector || node.shadowRoot;
    node = canScopeLookup ? searchOneLevel(node, part) : document.querySelector(part);
  }

  return node;
};
|
||||
|
||||
const queryShadowDOMAll = (rootElement, selector) => {
  // Plain selector: return the scoped query result unchanged.
  if (!selector.includes('>>')) {
    return rootElement.querySelectorAll(selector);
  }

  const parts = selector.split('>>').map((part) => part.trim());
  let currentElements = [rootElement];

  for (const part of parts) {
    // A Set keeps first-seen order while dropping repeats. Without it the
    // same shadow root can be searched twice in one pass (once through its
    // host and once through the host's parent scanning its children), which
    // would yield duplicate matches and, downstream, duplicate records.
    const matches = new Set();
    const collect = (scope) => {
      for (const match of scope.querySelectorAll(part)) {
        matches.add(match);
      }
    };

    for (const element of currentElements) {
      // The element's own tree.
      if (element.querySelectorAll) {
        collect(element);
      }

      // The element's own shadow root.
      if (element.shadowRoot) {
        collect(element.shadowRoot);
      }

      // Shadow roots attached to the element's direct children.
      for (const child of Array.from(element.children || [])) {
        if (child.shadowRoot) {
          collect(child.shadowRoot);
        }
      }
    }

    currentElements = [...matches];
  }

  return currentElements;
};
|
||||
|
||||
// Enhanced table processing helper functions with shadow DOM support
|
||||
function extractValue(element, attribute) {
|
||||
if (!element) return null;
|
||||
|
||||
// Check for shadow root first
|
||||
if (element.shadowRoot) {
|
||||
const shadowContent = element.shadowRoot.textContent;
|
||||
if (shadowContent && shadowContent.trim()) {
|
||||
return shadowContent.trim();
|
||||
}
|
||||
}
|
||||
|
||||
if (attribute === 'innerText') {
|
||||
return element.innerText.trim();
|
||||
} else if (attribute === 'innerHTML') {
|
||||
@@ -277,13 +443,18 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return element.getAttribute(attribute);
|
||||
}
|
||||
|
||||
// Helper function to find table ancestors
|
||||
function findTableAncestor(element) {
|
||||
let currentElement = element;
|
||||
const MAX_DEPTH = 5;
|
||||
let depth = 0;
|
||||
|
||||
while (currentElement && depth < MAX_DEPTH) {
|
||||
// Check if current element is in shadow DOM
|
||||
if (currentElement.getRootNode() instanceof ShadowRoot) {
|
||||
currentElement = currentElement.getRootNode().host;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (currentElement.tagName === 'TD') {
|
||||
return { type: 'TD', element: currentElement };
|
||||
} else if (currentElement.tagName === 'TR') {
|
||||
@@ -298,6 +469,14 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
function getCellIndex(td) {
|
||||
let index = 0;
|
||||
let sibling = td;
|
||||
|
||||
// Handle shadow DOM case
|
||||
if (td.getRootNode() instanceof ShadowRoot) {
|
||||
const shadowRoot = td.getRootNode();
|
||||
const allCells = Array.from(shadowRoot.querySelectorAll('td'));
|
||||
return allCells.indexOf(td);
|
||||
}
|
||||
|
||||
while (sibling = sibling.previousElementSibling) {
|
||||
index++;
|
||||
}
|
||||
@@ -306,10 +485,16 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
function hasThElement(row, tableFields) {
|
||||
for (const [label, { selector }] of Object.entries(tableFields)) {
|
||||
const element = row.querySelector(selector);
|
||||
const element = queryShadowDOM(row, selector);
|
||||
if (element) {
|
||||
let current = element;
|
||||
while (current && current !== row) {
|
||||
// Check if we're in shadow DOM
|
||||
if (current.getRootNode() instanceof ShadowRoot) {
|
||||
current = current.getRootNode().host;
|
||||
continue;
|
||||
}
|
||||
|
||||
if (current.tagName === 'TH') {
|
||||
return true;
|
||||
}
|
||||
@@ -326,69 +511,65 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
return rows;
|
||||
}
|
||||
}
|
||||
return rows.filter(row => row.getElementsByTagName('TH').length === 0);
|
||||
// Include shadow DOM in TH search
|
||||
return rows.filter(row => {
|
||||
const directTH = row.getElementsByTagName('TH').length === 0;
|
||||
const shadowTH = row.shadowRoot ?
|
||||
row.shadowRoot.querySelector('th') === null : true;
|
||||
return directTH && shadowTH;
|
||||
});
|
||||
}
|
||||
|
||||
// Class similarity functions remain unchanged
|
||||
/**
 * Jaccard similarity of two class lists: |intersection| / |union|.
 *
 * @param {Iterable<string>} classList1
 * @param {Iterable<string>} classList2
 * @returns {number} Value in [0, 1]; 0 when both lists are empty (instead of
 *   the NaN that 0 / 0 would produce).
 */
function calculateClassSimilarity(classList1, classList2) {
  const set1 = new Set(classList1);
  const set2 = new Set(classList2);
  const union = new Set([...set1, ...set2]);

  if (union.size === 0) return 0;

  let sharedCount = 0;
  for (const className of set1) {
    if (set2.has(className)) sharedCount += 1;
  }

  return sharedCount / union.size;
}
|
||||
|
||||
let containers = Array.from(document.querySelectorAll(listSelector));
|
||||
/**
 * Collects the other elements in the document that share the base element's
 * tag name and whose class list is similar enough to the base element's.
 *
 * @param {Element} baseElement Element to compare against.
 * @param {number} [similarityThreshold=0.7] Minimum class similarity score.
 * @returns {Element[]} Similar elements, excluding the base element itself;
 *   empty when the base element has no classes.
 */
function findSimilarElements(baseElement, similarityThreshold = 0.7) {
  const referenceClasses = Array.from(baseElement.classList);
  if (!referenceClasses.length) {
    return [];
  }

  const sameTagElements = Array.from(
    document.getElementsByTagName(baseElement.tagName)
  );

  const similar = [];
  for (const candidate of sameTagElements) {
    if (candidate === baseElement) continue;

    const score = calculateClassSimilarity(
      referenceClasses,
      Array.from(candidate.classList)
    );
    if (score >= similarityThreshold) {
      similar.push(candidate);
    }
  }
  return similar;
}
|
||||
|
||||
// Main scraping logic with shadow DOM support
|
||||
let containers = queryShadowDOMAll(document, listSelector);
|
||||
containers = Array.from(containers);
|
||||
|
||||
if (containers.length === 0) return [];
|
||||
|
||||
if (limit > 1 && containers.length === 1) {
|
||||
const baseContainer = containers[0];
|
||||
const similarContainers = findSimilarElements(baseContainer);
|
||||
|
||||
if (similarContainers.length > 0) {
|
||||
const newContainers = similarContainers.filter(container =>
|
||||
!container.matches(listSelector)
|
||||
);
|
||||
|
||||
containers = [...containers, ...newContainers];
|
||||
}
|
||||
const baseContainer = containers[0];
|
||||
const similarContainers = findSimilarElements(baseContainer);
|
||||
|
||||
if (similarContainers.length > 0) {
|
||||
const newContainers = similarContainers.filter(container =>
|
||||
!container.matches(listSelector)
|
||||
);
|
||||
containers = [...containers, ...newContainers];
|
||||
}
|
||||
}
|
||||
|
||||
// Initialize arrays to store field classifications for each container
|
||||
const containerFields = containers.map(() => ({
|
||||
tableFields: {},
|
||||
nonTableFields: {}
|
||||
}));
|
||||
|
||||
// Analyze field types for each container
|
||||
// Classify fields
|
||||
containers.forEach((container, containerIndex) => {
|
||||
for (const [label, field] of Object.entries(fields)) {
|
||||
const sampleElement = container.querySelector(field.selector);
|
||||
const sampleElement = queryShadowDOM(container, field.selector);
|
||||
|
||||
if (sampleElement) {
|
||||
const ancestor = findTableAncestor(sampleElement);
|
||||
@@ -409,101 +590,122 @@ function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3,
|
||||
|
||||
const tableData = [];
|
||||
const nonTableData = [];
|
||||
|
||||
// Process table fields across all containers
|
||||
|
||||
// Process table data with shadow DOM support
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
const container = containers[containerIndex];
|
||||
const { tableFields } = containerFields[containerIndex];
|
||||
const container = containers[containerIndex];
|
||||
const { tableFields } = containerFields[containerIndex];
|
||||
|
||||
if (Object.keys(tableFields).length > 0) {
|
||||
const firstField = Object.values(tableFields)[0];
|
||||
const firstElement = container.querySelector(firstField.selector);
|
||||
let tableContext = firstElement;
|
||||
if (Object.keys(tableFields).length > 0) {
|
||||
const firstField = Object.values(tableFields)[0];
|
||||
const firstElement = queryShadowDOM(container, firstField.selector);
|
||||
let tableContext = firstElement;
|
||||
|
||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
||||
tableContext = tableContext.parentElement;
|
||||
}
|
||||
|
||||
if (tableContext) {
|
||||
const rows = Array.from(tableContext.getElementsByTagName('TR'));
|
||||
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
||||
|
||||
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
||||
const record = {};
|
||||
const currentRow = processedRows[rowIndex];
|
||||
|
||||
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
|
||||
let element = null;
|
||||
|
||||
if (cellIndex >= 0) {
|
||||
const td = currentRow.children[cellIndex];
|
||||
if (td) {
|
||||
element = td.querySelector(selector);
|
||||
|
||||
if (!element && selector.split(">").pop().includes('td:nth-child')) {
|
||||
element = td;
|
||||
}
|
||||
|
||||
if (!element) {
|
||||
const tagOnlySelector = selector.split('.')[0];
|
||||
element = td.querySelector(tagOnlySelector);
|
||||
}
|
||||
|
||||
if (!element) {
|
||||
let currentElement = td;
|
||||
while (currentElement && currentElement.children.length > 0) {
|
||||
let foundContentChild = false;
|
||||
for (const child of currentElement.children) {
|
||||
if (extractValue(child, attribute)) {
|
||||
currentElement = child;
|
||||
foundContentChild = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!foundContentChild) break;
|
||||
}
|
||||
element = currentElement;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
element = currentRow.querySelector(selector);
|
||||
}
|
||||
|
||||
if (element) {
|
||||
record[label] = extractValue(element, attribute);
|
||||
}
|
||||
// Find table context including shadow DOM
|
||||
while (tableContext && tableContext.tagName !== 'TABLE' && tableContext !== container) {
|
||||
if (tableContext.getRootNode() instanceof ShadowRoot) {
|
||||
tableContext = tableContext.getRootNode().host;
|
||||
} else {
|
||||
tableContext = tableContext.parentElement;
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.keys(record).length > 0) {
|
||||
tableData.push(record);
|
||||
if (tableContext) {
|
||||
// Get rows from both regular DOM and shadow DOM
|
||||
const rows = [];
|
||||
if (tableContext.shadowRoot) {
|
||||
rows.push(...tableContext.shadowRoot.getElementsByTagName('TR'));
|
||||
}
|
||||
rows.push(...tableContext.getElementsByTagName('TR'));
|
||||
|
||||
const processedRows = filterRowsBasedOnTag(rows, tableFields);
|
||||
|
||||
for (let rowIndex = 0; rowIndex < Math.min(processedRows.length, limit); rowIndex++) {
|
||||
const record = {};
|
||||
const currentRow = processedRows[rowIndex];
|
||||
|
||||
for (const [label, { selector, attribute, cellIndex }] of Object.entries(tableFields)) {
|
||||
let element = null;
|
||||
|
||||
if (cellIndex >= 0) {
|
||||
let td = currentRow.children[cellIndex];
|
||||
|
||||
// Check shadow DOM for td
|
||||
if (!td && currentRow.shadowRoot) {
|
||||
const shadowCells = currentRow.shadowRoot.children;
|
||||
if (shadowCells && shadowCells.length > cellIndex) {
|
||||
td = shadowCells[cellIndex];
|
||||
}
|
||||
}
|
||||
|
||||
if (td) {
|
||||
element = queryShadowDOM(td, selector);
|
||||
|
||||
if (!element && selector.split(">").pop().includes('td:nth-child')) {
|
||||
element = td;
|
||||
}
|
||||
|
||||
if (!element) {
|
||||
const tagOnlySelector = selector.split('.')[0];
|
||||
element = queryShadowDOM(td, tagOnlySelector);
|
||||
}
|
||||
|
||||
if (!element) {
|
||||
let currentElement = td;
|
||||
while (currentElement && currentElement.children.length > 0) {
|
||||
let foundContentChild = false;
|
||||
for (const child of currentElement.children) {
|
||||
if (extractValue(child, attribute)) {
|
||||
currentElement = child;
|
||||
foundContentChild = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (!foundContentChild) break;
|
||||
}
|
||||
element = currentElement;
|
||||
}
|
||||
}
|
||||
} else {
|
||||
element = queryShadowDOM(currentRow, selector);
|
||||
}
|
||||
|
||||
if (element) {
|
||||
record[label] = extractValue(element, attribute);
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.keys(record).length > 0) {
|
||||
tableData.push(record);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Process non-table fields across all containers
|
||||
|
||||
// Non-table data scraping remains unchanged
|
||||
for (let containerIndex = 0; containerIndex < containers.length; containerIndex++) {
|
||||
if (nonTableData.length >= limit) break;
|
||||
if (nonTableData.length >= limit) break;
|
||||
|
||||
const container = containers[containerIndex];
|
||||
const { nonTableFields } = containerFields[containerIndex];
|
||||
const container = containers[containerIndex];
|
||||
const { nonTableFields } = containerFields[containerIndex];
|
||||
|
||||
if (Object.keys(nonTableFields).length > 0) {
|
||||
const record = {};
|
||||
if (Object.keys(nonTableFields).length > 0) {
|
||||
const record = {};
|
||||
|
||||
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
||||
const element = container.querySelector(selector);
|
||||
|
||||
if (element) {
|
||||
record[label] = extractValue(element, attribute);
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.keys(record).length > 0) {
|
||||
nonTableData.push(record);
|
||||
}
|
||||
}
|
||||
for (const [label, { selector, attribute }] of Object.entries(nonTableFields)) {
|
||||
const relativeSelector = selector.split('>>').slice(-1)[0];
|
||||
const element = queryShadowDOM(container, relativeSelector);
|
||||
|
||||
if (element) {
|
||||
record[label] = extractValue(element, attribute);
|
||||
}
|
||||
}
|
||||
|
||||
if (Object.keys(record).length > 0) {
|
||||
nonTableData.push(record);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Merge and limit the results
|
||||
|
||||
@@ -403,7 +403,7 @@ export default class Interpreter extends EventEmitter {
|
||||
await this.options.serializableCallback(scrapeResults);
|
||||
},
|
||||
|
||||
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; }>) => {
|
||||
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; shadow: string}>) => {
|
||||
await this.ensureScriptsLoaded(page);
|
||||
|
||||
const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
|
||||
@@ -663,11 +663,28 @@ export default class Interpreter extends EventEmitter {
|
||||
if (isApplicable) {
|
||||
return actionId;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Drops every selector containing ">>" from each step's `where.selectors`
 * list. Steps without a `where` clause or without a selectors array are left
 * untouched. The workflow is modified in place and also returned.
 */
private removeShadowSelectors(workflow: Workflow) {
  for (const step of workflow) {
    const where = step.where;
    if (!where || !Array.isArray(where.selectors)) {
      continue;
    }

    where.selectors = where.selectors.filter(selector => !selector.includes('>>'));
  }

  return workflow;
}
|
||||
|
||||
private async runLoop(p: Page, workflow: Workflow) {
|
||||
const workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow));
|
||||
let workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow));
|
||||
|
||||
// remove shadow selectors
|
||||
workflowCopy = this.removeShadowSelectors(workflowCopy);
|
||||
|
||||
// apply ad-blocker to the current page
|
||||
try {
|
||||
|
||||
@@ -129,6 +129,11 @@ export interface BaseActionInfo {
|
||||
hasOnlyText: boolean;
|
||||
}
|
||||
|
||||
/**
 * Selector information for an element located inside a shadow DOM tree.
 * NOTE(review): the field meanings below are inferred from their names only;
 * confirm against the code that produces this object.
 */
interface ShadowSelector {
|
||||
// presumably the complete selector path, with shadow hops joined by ">>" — TODO confirm
full: string;
|
||||
// presumably the mode of the containing shadow root ("open" / "closed") — TODO confirm
mode: string;
|
||||
}
|
||||
|
||||
/**
|
||||
* Holds all the possible css selectors that has been found for an element.
|
||||
* @category Types
|
||||
@@ -143,6 +148,7 @@ export interface Selectors {
|
||||
hrefSelector: string|null;
|
||||
accessibilitySelector: string|null;
|
||||
formSelector: string|null;
|
||||
shadowSelector: ShadowSelector | null;
|
||||
}
|
||||
|
||||
/**
|
||||
@@ -156,7 +162,7 @@ export interface BaseAction extends BaseActionInfo{
|
||||
associatedActions: ActionType[];
|
||||
inputType: string | undefined;
|
||||
value: string | undefined;
|
||||
selectors: { [key: string]: string | null };
|
||||
selectors: Selectors;
|
||||
timestamp: number;
|
||||
isPassword: boolean;
|
||||
/**
|
||||
|
||||
@@ -730,15 +730,26 @@ export class WorkflowGenerator {
|
||||
const displaySelector = await this.generateSelector(page, coordinates, ActionType.Click);
|
||||
const elementInfo = await getElementInformation(page, coordinates, this.listSelector, this.getList);
|
||||
if (rect) {
|
||||
const highlighterData = {
|
||||
rect,
|
||||
selector: displaySelector,
|
||||
elementInfo,
|
||||
// Include shadow DOM specific information
|
||||
shadowInfo: elementInfo?.isShadowRoot ? {
|
||||
mode: elementInfo.shadowRootMode,
|
||||
content: elementInfo.shadowRootContent
|
||||
} : null
|
||||
};
|
||||
|
||||
if (this.getList === true) {
|
||||
if (this.listSelector !== '') {
|
||||
const childSelectors = await getChildSelectors(page, this.listSelector || '');
|
||||
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo, childSelectors })
|
||||
this.socket.emit('highlighter', { ...highlighterData, childSelectors })
|
||||
} else {
|
||||
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo });
|
||||
this.socket.emit('highlighter', { ...highlighterData });
|
||||
}
|
||||
} else {
|
||||
this.socket.emit('highlighter', { rect, selector: displaySelector, elementInfo });
|
||||
this.socket.emit('highlighter', { ...highlighterData });
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -23,10 +23,41 @@ export const getElementInformation = async (
|
||||
if (!getList || listSelector !== '') {
|
||||
const elementInfo = await page.evaluate(
|
||||
async ({ x, y }) => {
|
||||
const el = document.elementFromPoint(x, y) as HTMLElement;
|
||||
// Enhanced helper function to get element from point including shadow DOM
|
||||
const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => {
  // Start from whatever the document reports at the point.
  const topmost = document.elementFromPoint(x, y) as HTMLElement;
  if (!topmost) return null;

  // While the current hit exposes a shadow root, hit-test again inside it.
  // Stop when the shadow root reports nothing or the same element again.
  let deepest = topmost;
  for (let root = deepest.shadowRoot; root; root = deepest.shadowRoot) {
    const inner = root.elementFromPoint(x, y) as HTMLElement;
    if (!inner || inner === deepest) break;
    deepest = inner;
  }

  return deepest;
};
|
||||
|
||||
const el = getDeepestElementFromPoint(x, y);
|
||||
if (el) {
|
||||
const { parentElement } = el;
|
||||
const element = parentElement?.tagName === 'A' ? parentElement : el;
|
||||
|
||||
// Get the containing shadow root if any
|
||||
const containingShadowRoot = element.getRootNode() as ShadowRoot;
|
||||
const isShadowRoot = containingShadowRoot instanceof ShadowRoot;
|
||||
|
||||
let info: {
|
||||
tagName: string;
|
||||
hasOnlyText?: boolean;
|
||||
@@ -36,9 +67,21 @@ export const getElementInformation = async (
|
||||
attributes?: Record<string, string>;
|
||||
innerHTML?: string;
|
||||
outerHTML?: string;
|
||||
isShadowRoot?: boolean;
|
||||
shadowRootMode?: string;
|
||||
shadowRootContent?: string;
|
||||
} = {
|
||||
tagName: element?.tagName ?? '',
|
||||
isShadowRoot: isShadowRoot
|
||||
};
|
||||
|
||||
if (isShadowRoot) {
|
||||
// Include shadow root specific information
|
||||
info.shadowRootMode = containingShadowRoot.mode;
|
||||
info.shadowRootContent = containingShadowRoot.innerHTML;
|
||||
}
|
||||
|
||||
// Get attributes including those from shadow DOM context
|
||||
if (element) {
|
||||
info.attributes = Array.from(element.attributes).reduce(
|
||||
(acc, attr) => {
|
||||
@@ -47,40 +90,53 @@ export const getElementInformation = async (
|
||||
},
|
||||
{} as Record<string, string>
|
||||
);
|
||||
|
||||
// Get text content considering shadow DOM context
|
||||
info.innerText = element.textContent ?? '';
|
||||
info.innerHTML = element.innerHTML;
|
||||
info.outerHTML = element.outerHTML;
|
||||
info.hasOnlyText = element.children.length === 0 &&
|
||||
(element.textContent !== null &&
|
||||
element.textContent.trim().length > 0);
|
||||
}
|
||||
// Gather specific information based on the tag
|
||||
if (element?.tagName === 'A') {
|
||||
info.url = (element as HTMLAnchorElement).href;
|
||||
info.innerText = element.innerText ?? '';
|
||||
} else if (element?.tagName === 'IMG') {
|
||||
info.imageUrl = (element as HTMLImageElement).src;
|
||||
} else if (element?.tagName === 'SELECT') {
|
||||
const selectElement = element as HTMLSelectElement;
|
||||
info.innerText = selectElement.options[selectElement.selectedIndex]?.text ?? '';
|
||||
info.attributes = {
|
||||
...info.attributes,
|
||||
selectedValue: selectElement.value,
|
||||
};
|
||||
} else if (element?.tagName === 'INPUT' && (element as HTMLInputElement).type === 'time' || (element as HTMLInputElement).type === 'date') {
|
||||
info.innerText = (element as HTMLInputElement).value;
|
||||
} else {
|
||||
info.hasOnlyText = element?.children?.length === 0 &&
|
||||
element?.innerText?.length > 0;
|
||||
info.innerText = element?.innerText ?? '';
|
||||
}
|
||||
info.innerHTML = element.innerHTML;
|
||||
info.outerHTML = element.outerHTML;
|
||||
|
||||
return info;
|
||||
}
|
||||
return null;
|
||||
},
|
||||
{ x: coordinates.x, y: coordinates.y },
|
||||
{ x: coordinates.x, y: coordinates.y }
|
||||
);
|
||||
return elementInfo;
|
||||
} else {
|
||||
const elementInfo = await page.evaluate(
|
||||
async ({ x, y }) => {
|
||||
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
|
||||
// Enhanced helper function to get element from point including shadow DOM
|
||||
const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => {
  // Start from whatever the document reports at the point.
  const topmost = document.elementFromPoint(x, y) as HTMLElement;
  if (!topmost) return null;

  // While the current hit exposes a shadow root, hit-test again inside it.
  // Stop when the shadow root reports nothing or the same element again.
  let deepest = topmost;
  for (let root = deepest.shadowRoot; root; root = deepest.shadowRoot) {
    const inner = root.elementFromPoint(x, y) as HTMLElement;
    if (!inner || inner === deepest) break;
    deepest = inner;
  }

  return deepest;
};
|
||||
|
||||
const originalEl = getDeepestElementFromPoint(x, y);
|
||||
if (originalEl) {
|
||||
let element = originalEl;
|
||||
|
||||
@@ -124,7 +180,11 @@ export const getElementInformation = async (
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
// Get the containing shadow root if any
|
||||
const containingShadowRoot = element.getRootNode() as ShadowRoot;
|
||||
const isShadowRoot = containingShadowRoot instanceof ShadowRoot;
|
||||
|
||||
let info: {
|
||||
tagName: string;
|
||||
hasOnlyText?: boolean;
|
||||
@@ -134,11 +194,22 @@ export const getElementInformation = async (
|
||||
attributes?: Record<string, string>;
|
||||
innerHTML?: string;
|
||||
outerHTML?: string;
|
||||
isShadowRoot?: boolean;
|
||||
shadowRootMode?: string;
|
||||
shadowRootContent?: string;
|
||||
} = {
|
||||
tagName: element?.tagName ?? '',
|
||||
isShadowRoot: isShadowRoot
|
||||
};
|
||||
|
||||
|
||||
if (isShadowRoot) {
|
||||
// Include shadow root specific information
|
||||
info.shadowRootMode = containingShadowRoot.mode;
|
||||
info.shadowRootContent = containingShadowRoot.innerHTML;
|
||||
}
|
||||
|
||||
if (element) {
|
||||
// Get attributes including those from shadow DOM context
|
||||
info.attributes = Array.from(element.attributes).reduce(
|
||||
(acc, attr) => {
|
||||
acc[attr.name] = attr.value;
|
||||
@@ -146,21 +217,25 @@ export const getElementInformation = async (
|
||||
},
|
||||
{} as Record<string, string>
|
||||
);
|
||||
|
||||
// Handle specific element types
|
||||
if (element.tagName === 'A') {
|
||||
info.url = (element as HTMLAnchorElement).href;
|
||||
info.innerText = element.textContent ?? '';
|
||||
} else if (element.tagName === 'IMG') {
|
||||
info.imageUrl = (element as HTMLImageElement).src;
|
||||
} else {
|
||||
// Handle text content with proper null checking
|
||||
info.hasOnlyText = element.children.length === 0 &&
|
||||
(element.textContent !== null &&
|
||||
element.textContent.trim().length > 0);
|
||||
info.innerText = element.textContent ?? '';
|
||||
}
|
||||
|
||||
info.innerHTML = element.innerHTML;
|
||||
info.outerHTML = element.outerHTML;
|
||||
}
|
||||
|
||||
if (element?.tagName === 'A') {
|
||||
info.url = (element as HTMLAnchorElement).href;
|
||||
info.innerText = element.innerText ?? '';
|
||||
} else if (element?.tagName === 'IMG') {
|
||||
info.imageUrl = (element as HTMLImageElement).src;
|
||||
} else {
|
||||
info.hasOnlyText = element?.children?.length === 0 &&
|
||||
element?.innerText?.length > 0;
|
||||
info.innerText = element?.innerText ?? '';
|
||||
}
|
||||
|
||||
info.innerHTML = element.innerHTML;
|
||||
info.outerHTML = element.outerHTML;
|
||||
|
||||
return info;
|
||||
}
|
||||
return null;
|
||||
@@ -176,24 +251,40 @@ export const getElementInformation = async (
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Returns a {@link Rectangle} object representing
|
||||
* the coordinates, width, height and corner points of the element.
|
||||
* If an element is not found, returns null.
|
||||
* @param page The page instance.
|
||||
* @param coordinates Coordinates of an element.
|
||||
* @category WorkflowManagement-Selectors
|
||||
* @returns {Promise<Rectangle|undefined|null>}
|
||||
*/
|
||||
export const getRect = async (page: Page, coordinates: Coordinates, listSelector: string, getList: boolean) => {
|
||||
try {
|
||||
if (!getList || listSelector !== '') {
|
||||
const rect = await page.evaluate(
|
||||
async ({ x, y }) => {
|
||||
const el = document.elementFromPoint(x, y) as HTMLElement;
|
||||
/**
 * Finds the element rendered at viewport point (x, y), descending into
 * shadow DOM where possible.
 *
 * Starts from document.elementFromPoint and, for as long as the current
 * element exposes a shadowRoot, re-queries the same point inside that root.
 * Descent stops when the shadow root yields nothing or yields the element
 * we are already on.
 *
 * @returns the deepest element reached, or null when nothing is at the point.
 */
const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => {
  const topmost = document.elementFromPoint(x, y) as HTMLElement;
  if (!topmost) return null;

  let deepest = topmost;
  for (let root = deepest.shadowRoot; root; root = deepest.shadowRoot) {
    const inner = root.elementFromPoint(x, y) as HTMLElement;
    // No hit, or no progress: the current element is as deep as we can go.
    if (!inner || inner === deepest) break;
    deepest = inner;
  }

  return deepest;
};
|
||||
|
||||
const el = getDeepestElementFromPoint(x, y);
|
||||
if (el) {
|
||||
const { parentElement } = el;
|
||||
// Match the logic in recorder.ts for link clicks
|
||||
const element = parentElement?.tagName === 'A' ? parentElement : el;
|
||||
const rectangle = element?.getBoundingClientRect();
|
||||
if (rectangle) {
|
||||
@@ -209,14 +300,41 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
|
||||
};
|
||||
}
|
||||
}
|
||||
return null;
|
||||
},
|
||||
{ x: coordinates.x, y: coordinates.y },
|
||||
{ x: coordinates.x, y: coordinates.y }
|
||||
);
|
||||
return rect;
|
||||
} else {
|
||||
const rect = await page.evaluate(
|
||||
async ({ x, y }) => {
|
||||
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
|
||||
/**
 * Finds the element rendered at viewport point (x, y), descending into
 * shadow DOM where possible.
 *
 * Starts from document.elementFromPoint and, for as long as the current
 * element exposes a shadowRoot, re-queries the same point inside that root.
 * Descent stops when the shadow root yields nothing or yields the element
 * we are already on.
 *
 * @returns the deepest element reached, or null when nothing is at the point.
 */
const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => {
  const topmost = document.elementFromPoint(x, y) as HTMLElement;
  if (!topmost) return null;

  let deepest = topmost;
  for (let root = deepest.shadowRoot; root; root = deepest.shadowRoot) {
    const inner = root.elementFromPoint(x, y) as HTMLElement;
    // No hit, or no progress: the current element is as deep as we can go.
    if (!inner || inner === deepest) break;
    deepest = inner;
  }

  return deepest;
};
|
||||
|
||||
const originalEl = getDeepestElementFromPoint(x, y);
|
||||
if (originalEl) {
|
||||
let element = originalEl;
|
||||
|
||||
@@ -262,7 +380,6 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
|
||||
}
|
||||
|
||||
const rectangle = element?.getBoundingClientRect();
|
||||
|
||||
if (rectangle) {
|
||||
return {
|
||||
x: rectangle.x,
|
||||
@@ -278,18 +395,17 @@ export const getRect = async (page: Page, coordinates: Coordinates, listSelector
|
||||
}
|
||||
return null;
|
||||
},
|
||||
{ x: coordinates.x, y: coordinates.y },
|
||||
{ x: coordinates.x, y: coordinates.y }
|
||||
);
|
||||
return rect;
|
||||
}
|
||||
} catch (error) {
|
||||
const { message, stack } = error as Error;
|
||||
logger.log('error', `Error while retrieving selector: ${message}`);
|
||||
logger.log('error', `Stack: ${stack}`);
|
||||
console.error('Error while retrieving selector:', message);
|
||||
console.error('Stack:', stack);
|
||||
}
|
||||
};
|
||||
|
||||
|
||||
/**
|
||||
* Returns the best and unique css {@link Selectors} for the element on the page.
|
||||
* Internally uses a finder function from https://github.com/antonmedv/finder/blob/master/finder.ts
|
||||
@@ -760,6 +876,92 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
|
||||
return output;
|
||||
}
|
||||
|
||||
// const MAX_DEPTH = 10;
|
||||
|
||||
/**
 * Finds the element rendered at viewport point (x, y), descending through
 * at most MAX_DEPTH nested shadow roots.
 *
 * At each level the same point is re-queried inside the current element's
 * shadowRoot. Descent stops when there is no shadowRoot, the shadow root
 * yields nothing, it yields the element we are already on, or the depth
 * limit is reached.
 *
 * @returns the deepest element reached, or null when nothing is at the point.
 */
const getDeepestElementFromPoint = (x: number, y: number): HTMLElement | null => {
  // Upper bound on how many shadow boundaries are crossed (4 levels).
  const MAX_DEPTH = 4;

  const topmost = document.elementFromPoint(x, y) as HTMLElement;
  if (!topmost) return null;

  let deepest = topmost;
  for (let depth = 0; depth < MAX_DEPTH; depth++) {
    const shadowRoot = deepest.shadowRoot;
    if (!shadowRoot) break;

    const inner = shadowRoot.elementFromPoint(x, y) as HTMLElement;
    // No hit, or no progress: the current element is as deep as we can go.
    if (!inner || inner === deepest) break;

    deepest = inner;
  }

  return deepest;
};
|
||||
|
||||
/**
 * Builds a `>>`-delimited selector for an element that lives inside shadow DOM.
 *
 * Each enclosing shadow host contributes one `finder` selector, scoped to
 * document.body for the outermost host and to the previous shadow root for
 * nested hosts. The final segment is the target element's selector scoped to
 * its own shadow root.
 *
 * @returns `{ fullSelector, mode }` where mode is the innermost shadow root's
 *          mode, or null when the element is not inside a shadow root or when
 *          selector generation throws.
 */
const genSelectorForShadowDOM = (element: HTMLElement) => {
  // Collects the enclosing shadow contexts, outermost first, walking up from
  // `el` through at most MAX_DEPTH shadow hosts.
  const getShadowPath = (el: HTMLElement) => {
    const contexts = [];
    const MAX_DEPTH = 4;
    let node = el;

    for (let depth = 0; node && depth < MAX_DEPTH; depth++) {
      const rootNode = node.getRootNode();
      if (!(rootNode instanceof ShadowRoot)) break;

      contexts.unshift({
        host: rootNode.host as HTMLElement,
        root: rootNode,
        element: node
      });
      node = rootNode.host as HTMLElement;
    }

    return contexts;
  };

  const shadowPath = getShadowPath(element);
  if (shadowPath.length === 0) return null;

  try {
    const lastIndex = shadowPath.length - 1;

    // One segment per shadow boundary; the innermost one also carries the target.
    const selectorParts: string[] = shadowPath.map((context, index) => {
      const hostSelector = finder(context.host, {
        root: index === 0 ? document.body : (shadowPath[index - 1].root as unknown as Element)
      });

      if (index !== lastIndex) return hostSelector;

      const elementSelector = finder(element, {
        root: context.root as unknown as Element
      });
      return `${hostSelector} >> ${elementSelector}`;
    });

    return {
      fullSelector: selectorParts.join(' >> '),
      mode: shadowPath[lastIndex].root.mode
    };
  } catch (e) {
    console.warn('Error generating shadow DOM selector:', e);
    return null;
  }
};
|
||||
|
||||
const genSelectors = (element: HTMLElement | null) => {
|
||||
if (element == null) {
|
||||
return null;
|
||||
@@ -779,6 +981,9 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
|
||||
} catch (e) {
|
||||
}
|
||||
|
||||
// Generate shadow DOM specific selector
|
||||
const shadowSelector = genSelectorForShadowDOM(element);
|
||||
|
||||
const hrefSelector = genSelectorForAttributes(element, ['href']);
|
||||
const formSelector = genSelectorForAttributes(element, [
|
||||
'name',
|
||||
@@ -825,9 +1030,16 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
|
||||
hrefSelector,
|
||||
accessibilitySelector,
|
||||
formSelector,
|
||||
// Shadow DOM selector
|
||||
shadowSelector: shadowSelector ? {
|
||||
full: shadowSelector.fullSelector,
|
||||
mode: shadowSelector.mode
|
||||
} : null
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
|
||||
function genAttributeSet(element: HTMLElement, attributes: string[]) {
|
||||
return new Set(
|
||||
attributes.filter((attr) => {
|
||||
@@ -867,7 +1079,7 @@ export const getSelectors = async (page: Page, coordinates: Coordinates) => {
|
||||
return char.length === 1 && char.match(/[0-9]/);
|
||||
}
|
||||
|
||||
const hoveredElement = document.elementFromPoint(x, y) as HTMLElement;
|
||||
const hoveredElement = getDeepestElementFromPoint(x, y);
|
||||
if (
|
||||
hoveredElement != null &&
|
||||
!hoveredElement.closest('#overlay-controls') != null
|
||||
@@ -902,9 +1114,41 @@ interface SelectorResult {
|
||||
*/
|
||||
|
||||
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates, listSelector: string): Promise<SelectorResult> => {
|
||||
// One shadow boundary on the path from the document down to a target element.
interface ShadowContext {
|
||||
// Element that hosts the shadow root (the root node's `host`).
host: HTMLElement;
|
||||
// The shadow root itself.
root: ShadowRoot;
|
||||
// The node inside `root` from which this boundary was discovered.
element: HTMLElement;
|
||||
}
|
||||
|
||||
try {
|
||||
if (!listSelector) {
|
||||
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
|
||||
/**
 * Finds the element rendered at viewport point (x, y), descending through
 * at most MAX_DEPTH nested shadow roots.
 *
 * At each level the same point is re-queried inside the current element's
 * shadowRoot; descent stops when there is no shadowRoot, no hit, or no progress.
 *
 * @returns the deepest element reached, or null when nothing is at the point.
 */
function getDeepestElementFromPoint(x: number, y: number): HTMLElement | null {
  const MAX_DEPTH = 4; // Limit shadow DOM traversal depth

  const topmost = document.elementFromPoint(x, y) as HTMLElement;
  if (!topmost) return null;

  let deepest = topmost;
  for (let depth = 0; depth < MAX_DEPTH; depth++) {
    const shadowRoot = deepest.shadowRoot;
    if (!shadowRoot) break;

    const inner = shadowRoot.elementFromPoint(x, y) as HTMLElement;
    if (!inner || inner === deepest) break;

    deepest = inner;
  }

  return deepest;
}
|
||||
|
||||
// Generate basic selector from element's tag and classes
|
||||
function getNonUniqueSelector(element: HTMLElement): string {
|
||||
let selector = element.tagName.toLowerCase();
|
||||
|
||||
@@ -928,22 +1172,77 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
||||
return selector;
|
||||
}
|
||||
|
||||
function getSelectorPath(element: HTMLElement | null): string {
|
||||
const path: string[] = [];
|
||||
/**
 * Collects the shadow boundaries that enclose `element`, outermost first.
 *
 * Walks upward: while the current node's root is a ShadowRoot, records
 * { host, root, element } and continues from the host. Stops at the first
 * node whose root is not a ShadowRoot, or after MAX_DEPTH boundaries.
 *
 * @returns the recorded contexts; empty when `element` is not in shadow DOM.
 */
function getShadowPath(element: HTMLElement): ShadowContext[] {
  const MAX_DEPTH = 4;
  const path: ShadowContext[] = [];
  let current = element;
  let depth = 0;

  while (current && depth < MAX_DEPTH) {
    const rootNode = current.getRootNode();
    if (!(rootNode instanceof ShadowRoot)) break;

    // unshift keeps the outermost boundary at index 0.
    path.unshift({
      host: rootNode.host as HTMLElement,
      root: rootNode,
      element: current
    });
    current = rootNode.host as HTMLElement;
    depth++;
  }

  return path;
}
|
||||
|
||||
while (element && element !== document.body && depth < maxDepth) {
|
||||
const selector = getNonUniqueSelector(element);
|
||||
/**
 * Builds a non-unique selector path for `element`.
 *
 * - Inside shadow DOM: joins each shadow host's selector with ' >> ', and
 *   appends the target element's own selector to the innermost host.
 * - Otherwise: joins the element and up to MAX_DEPTH - 1 ancestors
 *   (stopping before document.body) with ' > '.
 *
 * @returns the selector path, or '' when `element` is null.
 */
function getSelectorPath(element: HTMLElement | null): string {
  if (!element) return '';
  const target = element;

  // Check for shadow DOM path first
  const shadowPath = getShadowPath(target);
  if (shadowPath.length > 0) {
    const lastIndex = shadowPath.length - 1;
    return shadowPath
      .map((context, index) => {
        const hostSelector = getNonUniqueSelector(context.host);
        // Only the deepest shadow context carries the target element.
        return index === lastIndex
          ? `${hostSelector} >> ${getNonUniqueSelector(target)}`
          : hostSelector;
      })
      .join(' >> ');
  }

  // Regular DOM path generation
  const MAX_DEPTH = 2;
  const path: string[] = [];
  let currentElement: HTMLElement | null = target;

  for (
    let depth = 0;
    currentElement && currentElement !== document.body && depth < MAX_DEPTH;
    depth++
  ) {
    path.unshift(getNonUniqueSelector(currentElement));
    // A null parent ends the walk via the loop condition.
    currentElement = currentElement.parentElement;
  }

  return path.join(' > ');
}
|
||||
|
||||
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
|
||||
// Main logic to get element and generate selector
|
||||
const originalEl = getDeepestElementFromPoint(x, y);
|
||||
if (!originalEl) return null;
|
||||
|
||||
let element = originalEl;
|
||||
@@ -989,16 +1288,41 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
||||
}
|
||||
}
|
||||
}
|
||||
// }
|
||||
|
||||
const generalSelector = getSelectorPath(element);
|
||||
return {
|
||||
generalSelector,
|
||||
};
|
||||
return { generalSelector };
|
||||
}, coordinates);
|
||||
|
||||
return selectors || { generalSelector: '' };
|
||||
} else {
|
||||
// When we have a list selector, we need special handling while maintaining shadow DOM support
|
||||
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
|
||||
/**
 * Finds the element rendered at viewport point (x, y), descending through
 * at most MAX_DEPTH nested shadow roots.
 *
 * At each level the same point is re-queried inside the current element's
 * shadowRoot; descent stops when there is no shadowRoot, no hit, or no progress.
 *
 * @returns the deepest element reached, or null when nothing is at the point.
 */
function getDeepestElementFromPoint(x: number, y: number): HTMLElement | null {
  const MAX_DEPTH = 4;

  const topmost = document.elementFromPoint(x, y) as HTMLElement;
  if (!topmost) return null;

  let deepest = topmost;
  for (let depth = 0; depth < MAX_DEPTH; depth++) {
    const shadowRoot = deepest.shadowRoot;
    if (!shadowRoot) break;

    const inner = shadowRoot.elementFromPoint(x, y) as HTMLElement;
    if (!inner || inner === deepest) break;

    deepest = inner;
  }

  return deepest;
}
|
||||
|
||||
// Generate basic selector from element's tag and classes
|
||||
function getNonUniqueSelector(element: HTMLElement): string {
|
||||
let selector = element.tagName.toLowerCase();
|
||||
|
||||
@@ -1021,34 +1345,83 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
||||
return selector;
|
||||
}
|
||||
|
||||
function getSelectorPath(element: HTMLElement | null): string {
|
||||
const path: string[] = [];
|
||||
/**
 * Collects the shadow boundaries that enclose `element`, outermost first.
 *
 * Walks upward: while the current node's root is a ShadowRoot, records
 * { host, root, element } and continues from the host. Stops at the first
 * node whose root is not a ShadowRoot, or after MAX_DEPTH boundaries.
 *
 * @returns the recorded contexts; empty when `element` is not in shadow DOM.
 */
function getShadowPath(element: HTMLElement): ShadowContext[] {
  const MAX_DEPTH = 4;
  const path: ShadowContext[] = [];
  let current = element;
  let depth = 0;

  while (current && depth < MAX_DEPTH) {
    const rootNode = current.getRootNode();
    if (!(rootNode instanceof ShadowRoot)) break;

    // unshift keeps the outermost boundary at index 0.
    path.unshift({
      host: rootNode.host as HTMLElement,
      root: rootNode,
      element: current
    });
    current = rootNode.host as HTMLElement;
    depth++;
  }

  return path;
}
|
||||
|
||||
while (element && element !== document.body && depth < maxDepth) {
|
||||
const selector = getNonUniqueSelector(element);
|
||||
/**
 * Builds a non-unique selector path for a list item.
 *
 * - Inside shadow DOM: joins each shadow host's selector with ' >> ', and
 *   appends the target element's own selector to the innermost host.
 * - Otherwise: joins the element and up to MAX_LIST_DEPTH - 1 ancestors
 *   (stopping before document.body) with ' > '.
 *
 * @returns the selector path, or '' when `element` is null.
 */
function getListItemSelectorPath(element: HTMLElement | null): string {
  if (!element) return '';
  const target = element;

  // Check for shadow DOM path first
  const shadowPath = getShadowPath(target);
  if (shadowPath.length > 0) {
    const lastIndex = shadowPath.length - 1;
    return shadowPath
      .map((context, index) => {
        const hostSelector = getNonUniqueSelector(context.host);
        // Only the deepest shadow context carries the target element.
        return index === lastIndex
          ? `${hostSelector} >> ${getNonUniqueSelector(target)}`
          : hostSelector;
      })
      .join(' >> ');
  }

  // For list items, we want a shallower path to better match list patterns
  const MAX_LIST_DEPTH = 2;
  const path: string[] = [];
  let currentElement: HTMLElement | null = target;

  for (
    let depth = 0;
    currentElement && currentElement !== document.body && depth < MAX_LIST_DEPTH;
    depth++
  ) {
    path.unshift(getNonUniqueSelector(currentElement));
    // A null parent ends the walk via the loop condition.
    currentElement = currentElement.parentElement;
  }

  return path.join(' > ');
}
|
||||
|
||||
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
|
||||
if (!originalEl) return null;
|
||||
// Main logic for list item selection
|
||||
const originalEl = getDeepestElementFromPoint(x, y);
|
||||
if (!originalEl) return { generalSelector: '' };
|
||||
|
||||
let element = originalEl;
|
||||
|
||||
const generalSelector = getSelectorPath(element);
|
||||
return {
|
||||
generalSelector,
|
||||
};
|
||||
}, coordinates);
|
||||
return selectors || { generalSelector: '' };
|
||||
}
|
||||
const generalSelector = getListItemSelectorPath(element);
|
||||
return { generalSelector };
|
||||
}, coordinates);
|
||||
|
||||
return selectors || { generalSelector: '' };
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error in getNonUniqueSelectors:', error);
|
||||
return { generalSelector: '' };
|
||||
@@ -1083,42 +1456,110 @@ export const getChildSelectors = async (page: Page, parentSelector: string): Pro
|
||||
}
|
||||
|
||||
// Function to generate selector path from an element to its parent
|
||||
function getSelectorPath(element: HTMLElement | null): string {
|
||||
/**
 * Builds a two-segment selector for `element` relative to its container.
 *
 * - Inside a shadow root: `<host> >> <element>`.
 * - In the regular DOM: `<parent> > <element>`.
 *
 * The shadow-root check runs before the parentElement guard because a
 * direct child of a shadow root has a null parentElement. With the guard
 * first, those elements returned '' and never got a selector.
 *
 * @returns the selector, or '' when `element` is falsy or is a regular-DOM
 *          node without a parent element.
 */
function getSelectorPath(element: HTMLElement): string {
  if (!element) return '';

  const elementSelector = getNonUniqueSelector(element);

  // Check if element is in shadow DOM
  const rootNode = element.getRootNode();
  if (rootNode instanceof ShadowRoot) {
    const hostSelector = getNonUniqueSelector(rootNode.host as HTMLElement);
    return `${hostSelector} >> ${elementSelector}`;
  }

  if (!element.parentElement) return '';

  const parentSelector = getNonUniqueSelector(element.parentElement);
  return `${parentSelector} > ${elementSelector}`;
}
|
||||
|
||||
// Function to recursively get all descendant selectors
|
||||
/**
 * Lists every element inside `element`'s shadow root (all depths, via
 * querySelectorAll('*')).
 *
 * @returns the shadow-tree elements, or an empty array when `element`
 *          exposes no shadowRoot.
 */
function getShadowChildren(element: HTMLElement): HTMLElement[] {
  const { shadowRoot } = element;
  if (!shadowRoot) return [];

  return Array.from(shadowRoot.querySelectorAll('*')) as HTMLElement[];
}
|
||||
|
||||
/**
 * Recursively collects selector paths for everything beneath `element`,
 * covering both regular DOM children and elements in attached shadow roots.
 *
 * Each child's path is recorded once and its subtree is visited once.
 * Recording it twice, as before, doubled the recursion at every level.
 *
 * The returned array may still contain repeated selectors, because shadow
 * trees are reached both from the parent's loop and from the child's own
 * call. NOTE(review): the visible caller de-duplicates through a Set, so the
 * repeats look harmless there. Confirm no other caller relies on the array.
 *
 * @returns selector paths for all reachable descendants with a non-empty path.
 */
function getAllDescendantSelectors(element: HTMLElement): string[] {
  let selectors: string[] = [];

  // Records `node`'s path and its descendants; reports whether a path existed.
  const collect = (node: HTMLElement): boolean => {
    const path = getSelectorPath(node);
    if (!path) return false;

    selectors.push(path);
    selectors = selectors.concat(getAllDescendantSelectors(node));
    return true;
  };

  // Handle regular DOM children
  for (const child of Array.from(element.children) as HTMLElement[]) {
    // Children without a path are skipped together with their shadow tree.
    if (!collect(child)) continue;

    // Check for shadow DOM in this child
    for (const shadowChild of getShadowChildren(child)) {
      collect(shadowChild);
    }
  }

  // Handle direct shadow DOM children of the current element
  for (const shadowChild of getShadowChildren(element)) {
    collect(shadowChild);
  }

  return selectors;
}
|
||||
|
||||
// Find all occurrences of the parent selector in the DOM
|
||||
const parentElements = Array.from(document.querySelectorAll(parentSelector)) as HTMLElement[];
|
||||
const allChildSelectors = new Set<string>(); // Use a set to ensure uniqueness
|
||||
// Split the parent selector if it contains shadow DOM parts
|
||||
const selectorParts = parentSelector.split('>>').map(part => part.trim());
|
||||
let parentElements: HTMLElement[] = [];
|
||||
|
||||
// Handle shadow DOM traversal if needed
|
||||
if (selectorParts.length > 1) {
|
||||
// Start with the host elements
|
||||
parentElements = Array.from(document.querySelectorAll(selectorParts[0])) as HTMLElement[];
|
||||
|
||||
// Traverse through shadow DOM parts
|
||||
for (let i = 1; i < selectorParts.length; i++) {
|
||||
const newParentElements: HTMLElement[] = [];
|
||||
for (const element of parentElements) {
|
||||
if (element.shadowRoot) {
|
||||
const shadowChildren = Array.from(element.shadowRoot.querySelectorAll(selectorParts[i])) as HTMLElement[];
|
||||
newParentElements.push(...shadowChildren);
|
||||
}
|
||||
}
|
||||
parentElements = newParentElements;
|
||||
}
|
||||
} else {
|
||||
// Regular DOM selector
|
||||
parentElements = Array.from(document.querySelectorAll(parentSelector)) as HTMLElement[];
|
||||
}
|
||||
|
||||
const allChildSelectors = new Set<string>();
|
||||
|
||||
// Process each parent element and its descendants
|
||||
parentElements.forEach((parentElement) => {
|
||||
const descendantSelectors = getAllDescendantSelectors(parentElement);
|
||||
descendantSelectors.forEach((selector) => allChildSelectors.add(selector)); // Add selectors to the set
|
||||
descendantSelectors.forEach((selector) => allChildSelectors.add(selector));
|
||||
});
|
||||
|
||||
return Array.from(allChildSelectors); // Convert the set back to an array
|
||||
return Array.from(allChildSelectors);
|
||||
}, parentSelector);
|
||||
|
||||
return childSelectors || [];
|
||||
|
||||
@@ -12,6 +12,11 @@ export const getBestSelectorForAction = (action: Action) => {
|
||||
case ActionType.Hover:
|
||||
case ActionType.DragAndDrop: {
|
||||
const selectors = action.selectors;
|
||||
|
||||
if (selectors?.shadowSelector?.full) {
|
||||
return selectors.shadowSelector.full;
|
||||
}
|
||||
|
||||
// less than 25 characters, and element only has text inside
|
||||
const textSelector =
|
||||
selectors?.text?.length != null &&
|
||||
@@ -75,6 +80,11 @@ export const getBestSelectorForAction = (action: Action) => {
|
||||
case ActionType.Input:
|
||||
case ActionType.Keydown: {
|
||||
const selectors = action.selectors;
|
||||
|
||||
if (selectors?.shadowSelector?.full) {
|
||||
return selectors.shadowSelector.full;
|
||||
}
|
||||
|
||||
return (
|
||||
selectors.testIdSelector ??
|
||||
selectors?.id ??
|
||||
|
||||
@@ -9,10 +9,10 @@ import { useBrowserSteps, TextStep } from '../../context/browserSteps';
|
||||
import { useGlobalInfoStore } from '../../context/globalInfo';
|
||||
import { useTranslation } from 'react-i18next';
|
||||
|
||||
|
||||
interface ElementInfo {
|
||||
tagName: string;
|
||||
hasOnlyText?: boolean;
|
||||
isShadowRoot?: boolean;
|
||||
innerText?: string;
|
||||
url?: string;
|
||||
imageUrl?: string;
|
||||
@@ -120,6 +120,9 @@ export const BrowserWindow = () => {
|
||||
if (getList === true) {
|
||||
if (listSelector) {
|
||||
socket?.emit('listSelector', { selector: listSelector });
|
||||
|
||||
const hasValidChildSelectors = Array.isArray(data.childSelectors) && data.childSelectors.length > 0;
|
||||
|
||||
if (limitMode) {
|
||||
setHighlighterData(null);
|
||||
} else if (paginationMode) {
|
||||
@@ -132,7 +135,29 @@ export const BrowserWindow = () => {
|
||||
} else if (data.childSelectors && data.childSelectors.includes(data.selector)) {
|
||||
// highlight only valid child elements within the listSelector
|
||||
setHighlighterData(data);
|
||||
} else {
|
||||
} else if (data.elementInfo?.isShadowRoot && data.childSelectors) {
|
||||
// New case: Handle pure Shadow DOM elements
|
||||
// Check if the selector matches any shadow root child selectors
|
||||
const isShadowChild = data.childSelectors.some(childSelector =>
|
||||
data.selector.includes('>>') && // Shadow DOM uses >> for piercing
|
||||
childSelector.split('>>').some(part =>
|
||||
data.selector.includes(part.trim())
|
||||
)
|
||||
);
|
||||
setHighlighterData(isShadowChild ? data : null);
|
||||
} else if (data.selector.includes('>>') && hasValidChildSelectors) {
|
||||
// New case: Handle mixed DOM cases
|
||||
// Split the selector into parts and check each against child selectors
|
||||
const selectorParts = data.selector.split('>>').map(part => part.trim());
|
||||
const isValidMixedSelector = selectorParts.some(part =>
|
||||
// Now we know data.childSelectors is defined
|
||||
data.childSelectors!.some(childSelector =>
|
||||
childSelector.includes(part)
|
||||
)
|
||||
);
|
||||
setHighlighterData(isValidMixedSelector ? data : null);
|
||||
}
|
||||
else {
|
||||
// if !valid child in normal mode, clear the highlighter
|
||||
setHighlighterData(null);
|
||||
}
|
||||
@@ -192,6 +217,7 @@ export const BrowserWindow = () => {
|
||||
addTextStep('', data, {
|
||||
selector: highlighterData.selector,
|
||||
tag: highlighterData.elementInfo?.tagName,
|
||||
shadow: highlighterData.elementInfo?.isShadowRoot,
|
||||
attribute
|
||||
});
|
||||
} else {
|
||||
@@ -199,7 +225,7 @@ export const BrowserWindow = () => {
|
||||
setAttributeOptions(options);
|
||||
setSelectedElement({
|
||||
selector: highlighterData.selector,
|
||||
info: highlighterData.elementInfo
|
||||
info: highlighterData.elementInfo,
|
||||
});
|
||||
setShowAttributeModal(true);
|
||||
}
|
||||
@@ -236,6 +262,7 @@ export const BrowserWindow = () => {
|
||||
selectorObj: {
|
||||
selector: highlighterData.selector,
|
||||
tag: highlighterData.elementInfo?.tagName,
|
||||
shadow: highlighterData.elementInfo?.isShadowRoot,
|
||||
attribute
|
||||
}
|
||||
};
|
||||
@@ -283,6 +310,7 @@ export const BrowserWindow = () => {
|
||||
addTextStep('', data, {
|
||||
selector: selectedElement.selector,
|
||||
tag: selectedElement.info?.tagName,
|
||||
shadow: selectedElement.info?.isShadowRoot,
|
||||
attribute: attribute
|
||||
});
|
||||
}
|
||||
@@ -295,6 +323,7 @@ export const BrowserWindow = () => {
|
||||
selectorObj: {
|
||||
selector: selectedElement.selector,
|
||||
tag: selectedElement.info?.tagName,
|
||||
shadow: selectedElement.info?.isShadowRoot,
|
||||
attribute: attribute
|
||||
}
|
||||
};
|
||||
|
||||
@@ -32,6 +32,7 @@ export interface SelectorObject {
|
||||
selector: string;
|
||||
tag?: string;
|
||||
attribute?: string;
|
||||
shadow?: boolean;
|
||||
[key: string]: any;
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user