feat: always group table rows
This commit is contained in:
@@ -185,7 +185,6 @@ class ClientSelectorGenerator {
|
|||||||
if (element.nodeType !== Node.ELEMENT_NODE) return null;
|
if (element.nodeType !== Node.ELEMENT_NODE) return null;
|
||||||
|
|
||||||
const tagName = element.tagName.toLowerCase();
|
const tagName = element.tagName.toLowerCase();
|
||||||
|
|
||||||
const isCustomElement = tagName.includes("-");
|
const isCustomElement = tagName.includes("-");
|
||||||
|
|
||||||
const standardExcludeSelectors = [
|
const standardExcludeSelectors = [
|
||||||
@@ -203,38 +202,55 @@ class ClientSelectorGenerator {
|
|||||||
if (this.groupingConfig.excludeSelectors.includes(tagName)) return null;
|
if (this.groupingConfig.excludeSelectors.includes(tagName)) return null;
|
||||||
|
|
||||||
const children = Array.from(element.children);
|
const children = Array.from(element.children);
|
||||||
const childrenStructure = children.map((child) => ({
|
let childrenStructureString: string;
|
||||||
tag: child.tagName.toLowerCase(),
|
|
||||||
classes: this.normalizeClasses(child.classList),
|
if (tagName === 'table') {
|
||||||
hasText: (child.textContent ?? "").trim().length > 0,
|
// For tables, the fingerprint is based on the header or first row's structure.
|
||||||
}));
|
const thead = element.querySelector('thead');
|
||||||
|
const representativeRow = thead ? thead.querySelector('tr') : element.querySelector('tr');
|
||||||
|
|
||||||
|
if (representativeRow) {
|
||||||
|
const structure = Array.from(representativeRow.children).map(child => ({
|
||||||
|
tag: child.tagName.toLowerCase(),
|
||||||
|
classes: this.normalizeClasses(child.classList),
|
||||||
|
}));
|
||||||
|
childrenStructureString = JSON.stringify(structure);
|
||||||
|
} else {
|
||||||
|
childrenStructureString = JSON.stringify([]);
|
||||||
|
}
|
||||||
|
} else if (tagName === 'tr') {
|
||||||
|
// For rows, the fingerprint is based on the cell structure, ignoring the cell's inner content.
|
||||||
|
const structure = children.map((child) => ({
|
||||||
|
tag: child.tagName.toLowerCase(),
|
||||||
|
classes: this.normalizeClasses(child.classList),
|
||||||
|
}));
|
||||||
|
childrenStructureString = JSON.stringify(structure);
|
||||||
|
} else {
|
||||||
|
// Original logic for all other elements.
|
||||||
|
const structure = children.map((child) => ({
|
||||||
|
tag: child.tagName.toLowerCase(),
|
||||||
|
classes: this.normalizeClasses(child.classList),
|
||||||
|
hasText: (child.textContent ?? "").trim().length > 0,
|
||||||
|
}));
|
||||||
|
childrenStructureString = JSON.stringify(structure);
|
||||||
|
}
|
||||||
|
|
||||||
const normalizedClasses = this.normalizeClasses(element.classList);
|
const normalizedClasses = this.normalizeClasses(element.classList);
|
||||||
|
|
||||||
const relevantAttributes = Array.from(element.attributes)
|
const relevantAttributes = Array.from(element.attributes)
|
||||||
.filter((attr) => {
|
.filter((attr) => {
|
||||||
if (isCustomElement) {
|
if (isCustomElement) {
|
||||||
return ![
|
return !["id", "style", "data-reactid", "data-react-checksum"].includes(attr.name.toLowerCase());
|
||||||
"id",
|
|
||||||
"style",
|
|
||||||
"data-reactid",
|
|
||||||
"data-react-checksum",
|
|
||||||
].includes(attr.name.toLowerCase());
|
|
||||||
} else {
|
} else {
|
||||||
return (
|
return (
|
||||||
!["id", "style", "data-reactid", "data-react-checksum"].includes(
|
!["id", "style", "data-reactid", "data-react-checksum"].includes(attr.name.toLowerCase()) &&
|
||||||
attr.name.toLowerCase()
|
(!attr.name.startsWith("data-") || attr.name === "data-type" || attr.name === "data-role")
|
||||||
) &&
|
|
||||||
(!attr.name.startsWith("data-") ||
|
|
||||||
attr.name === "data-type" ||
|
|
||||||
attr.name === "data-role")
|
|
||||||
);
|
);
|
||||||
}
|
}
|
||||||
})
|
})
|
||||||
.map((attr) => `${attr.name}=${attr.value}`)
|
.map((attr) => `${attr.name}=${attr.value}`)
|
||||||
.sort();
|
.sort();
|
||||||
|
|
||||||
// Calculate element depth
|
|
||||||
let depth = 0;
|
let depth = 0;
|
||||||
let parent = element.parentElement;
|
let parent = element.parentElement;
|
||||||
while (parent && depth < 20) {
|
while (parent && depth < 20) {
|
||||||
@@ -242,27 +258,22 @@ class ClientSelectorGenerator {
|
|||||||
parent = parent.parentElement;
|
parent = parent.parentElement;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Get text content characteristics
|
|
||||||
const textContent = (element.textContent ?? "").trim();
|
const textContent = (element.textContent ?? "").trim();
|
||||||
const textCharacteristics = {
|
const textCharacteristics = {
|
||||||
hasText: textContent.length > 0,
|
hasText: textContent.length > 0,
|
||||||
textLength: Math.floor(textContent.length / 20) * 20,
|
textLength: Math.floor(textContent.length / 20) * 20,
|
||||||
hasLinks: element.querySelectorAll("a").length,
|
hasLinks: element.querySelectorAll("a").length,
|
||||||
hasImages: element.querySelectorAll("img").length,
|
hasImages: element.querySelectorAll("img").length,
|
||||||
hasButtons: element.querySelectorAll(
|
hasButtons: element.querySelectorAll('button, input[type="button"], input[type="submit"]').length,
|
||||||
'button, input[type="button"], input[type="submit"]'
|
|
||||||
).length,
|
|
||||||
};
|
};
|
||||||
|
|
||||||
const signature = `${tagName}::${normalizedClasses}::${
|
const signature = `${tagName}::${normalizedClasses}::${children.length}::${childrenStructureString}::${relevantAttributes.join("|")}`;
|
||||||
children.length
|
|
||||||
}::${JSON.stringify(childrenStructure)}::${relevantAttributes.join("|")}`;
|
|
||||||
|
|
||||||
return {
|
return {
|
||||||
tagName,
|
tagName,
|
||||||
normalizedClasses,
|
normalizedClasses,
|
||||||
childrenCount: children.length,
|
childrenCount: children.length,
|
||||||
childrenStructure: JSON.stringify(childrenStructure),
|
childrenStructure: childrenStructureString,
|
||||||
attributes: relevantAttributes.join("|"),
|
attributes: relevantAttributes.join("|"),
|
||||||
depth,
|
depth,
|
||||||
textCharacteristics,
|
textCharacteristics,
|
||||||
@@ -387,79 +398,78 @@ class ClientSelectorGenerator {
|
|||||||
|
|
||||||
// Get all visible elements INCLUDING shadow DOM
|
// Get all visible elements INCLUDING shadow DOM
|
||||||
const allElements = this.getAllVisibleElementsWithShadow(iframeDoc);
|
const allElements = this.getAllVisibleElementsWithShadow(iframeDoc);
|
||||||
|
const processedInTables = new Set<HTMLElement>();
|
||||||
|
|
||||||
// Create fingerprints for all elements
|
// 1. Specifically find and group rows within each table, bypassing normal similarity checks.
|
||||||
|
const tables = allElements.filter(el => el.tagName === 'TABLE');
|
||||||
|
|
||||||
|
tables.forEach(table => {
|
||||||
|
const rows = Array.from(table.querySelectorAll('tbody > tr')).filter(row => {
|
||||||
|
const parent = row.parentElement;
|
||||||
|
if (!parent || !table.contains(parent)) return false; // Ensure row belongs to this table
|
||||||
|
|
||||||
|
const rect = row.getBoundingClientRect();
|
||||||
|
return rect.width > 0 && rect.height > 0;
|
||||||
|
}) as HTMLElement[];
|
||||||
|
|
||||||
|
// If the table has enough rows, force them into a single group.
|
||||||
|
if (rows.length >= this.groupingConfig.minGroupSize) {
|
||||||
|
const representativeFingerprint = this.getStructuralFingerprint(rows[0]);
|
||||||
|
if (!representativeFingerprint) return;
|
||||||
|
|
||||||
|
const group: ElementGroup = {
|
||||||
|
elements: rows,
|
||||||
|
fingerprint: representativeFingerprint,
|
||||||
|
representative: rows[0],
|
||||||
|
};
|
||||||
|
|
||||||
|
rows.forEach(row => {
|
||||||
|
this.elementGroups.set(row, group);
|
||||||
|
this.groupedElements.add(row);
|
||||||
|
processedInTables.add(row);
|
||||||
|
});
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
// 2. Group all other elements, excluding table rows that were already grouped.
|
||||||
|
const remainingElements = allElements.filter(el => !processedInTables.has(el));
|
||||||
const elementFingerprints = new Map<HTMLElement, ElementFingerprint>();
|
const elementFingerprints = new Map<HTMLElement, ElementFingerprint>();
|
||||||
|
remainingElements.forEach((element) => {
|
||||||
allElements.forEach((element) => {
|
|
||||||
const fingerprint = this.getStructuralFingerprint(element);
|
const fingerprint = this.getStructuralFingerprint(element);
|
||||||
if (fingerprint) {
|
if (fingerprint) {
|
||||||
elementFingerprints.set(element, fingerprint);
|
elementFingerprints.set(element, fingerprint);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Find similar groups using similarity scoring
|
|
||||||
const similarGroups: ElementGroup[] = [];
|
|
||||||
const processedElements = new Set<HTMLElement>();
|
const processedElements = new Set<HTMLElement>();
|
||||||
|
|
||||||
elementFingerprints.forEach((fingerprint, element) => {
|
elementFingerprints.forEach((fingerprint, element) => {
|
||||||
if (processedElements.has(element)) return;
|
if (processedElements.has(element)) return;
|
||||||
|
|
||||||
const currentGroup = [element];
|
const currentGroup = [element];
|
||||||
processedElements.add(element);
|
processedElements.add(element);
|
||||||
|
|
||||||
// Find similar elements
|
|
||||||
elementFingerprints.forEach((otherFingerprint, otherElement) => {
|
elementFingerprints.forEach((otherFingerprint, otherElement) => {
|
||||||
if (processedElements.has(otherElement)) return;
|
if (processedElements.has(otherElement)) return;
|
||||||
|
|
||||||
const similarity = this.calculateSimilarity(
|
const similarity = this.calculateSimilarity(fingerprint, otherFingerprint);
|
||||||
fingerprint,
|
|
||||||
otherFingerprint
|
|
||||||
);
|
|
||||||
|
|
||||||
if (similarity >= this.groupingConfig.similarityThreshold) {
|
if (similarity >= this.groupingConfig.similarityThreshold) {
|
||||||
currentGroup.push(otherElement);
|
currentGroup.push(otherElement);
|
||||||
processedElements.add(otherElement);
|
processedElements.add(otherElement);
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Add group if it has enough members AND has meaningful children
|
if (currentGroup.length >= this.groupingConfig.minGroupSize && this.hasAnyMeaningfulChildren(element)) {
|
||||||
if (currentGroup.length >= this.groupingConfig.minGroupSize) {
|
const group: ElementGroup = {
|
||||||
// Check if the representative element has meaningful children
|
elements: currentGroup,
|
||||||
const hasChildren = this.hasAnyMeaningfulChildren(element);
|
fingerprint,
|
||||||
|
representative: element,
|
||||||
if (hasChildren) {
|
};
|
||||||
const group: ElementGroup = {
|
currentGroup.forEach((el) => {
|
||||||
elements: currentGroup,
|
this.elementGroups.set(el, group);
|
||||||
fingerprint,
|
this.groupedElements.add(el);
|
||||||
representative: element,
|
});
|
||||||
};
|
|
||||||
similarGroups.push(group);
|
|
||||||
|
|
||||||
// Map each element to its group
|
|
||||||
currentGroup.forEach((el) => {
|
|
||||||
this.elementGroups.set(el, group);
|
|
||||||
this.groupedElements.add(el);
|
|
||||||
});
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
// Sort groups by size and relevance
|
|
||||||
similarGroups.sort((a, b) => {
|
|
||||||
// Prioritize by size first
|
|
||||||
if (b.elements.length !== a.elements.length)
|
|
||||||
return b.elements.length - a.elements.length;
|
|
||||||
|
|
||||||
// Then by element size
|
|
||||||
const aSize =
|
|
||||||
a.representative.getBoundingClientRect().width *
|
|
||||||
a.representative.getBoundingClientRect().height;
|
|
||||||
const bSize =
|
|
||||||
b.representative.getBoundingClientRect().width *
|
|
||||||
b.representative.getBoundingClientRect().height;
|
|
||||||
return bSize - aSize;
|
|
||||||
});
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
|||||||
Reference in New Issue
Block a user