feat: always group table rows

This commit is contained in:
Rohit
2025-08-05 23:16:10 +05:30
parent 3fa00c8add
commit c62390b0d8

View File

@@ -185,7 +185,6 @@ class ClientSelectorGenerator {
if (element.nodeType !== Node.ELEMENT_NODE) return null; if (element.nodeType !== Node.ELEMENT_NODE) return null;
const tagName = element.tagName.toLowerCase(); const tagName = element.tagName.toLowerCase();
const isCustomElement = tagName.includes("-"); const isCustomElement = tagName.includes("-");
const standardExcludeSelectors = [ const standardExcludeSelectors = [
@@ -203,38 +202,55 @@ class ClientSelectorGenerator {
if (this.groupingConfig.excludeSelectors.includes(tagName)) return null; if (this.groupingConfig.excludeSelectors.includes(tagName)) return null;
const children = Array.from(element.children); const children = Array.from(element.children);
const childrenStructure = children.map((child) => ({ let childrenStructureString: string;
tag: child.tagName.toLowerCase(),
classes: this.normalizeClasses(child.classList), if (tagName === 'table') {
hasText: (child.textContent ?? "").trim().length > 0, // For tables, the fingerprint is based on the header or first row's structure.
})); const thead = element.querySelector('thead');
const representativeRow = thead ? thead.querySelector('tr') : element.querySelector('tr');
if (representativeRow) {
const structure = Array.from(representativeRow.children).map(child => ({
tag: child.tagName.toLowerCase(),
classes: this.normalizeClasses(child.classList),
}));
childrenStructureString = JSON.stringify(structure);
} else {
childrenStructureString = JSON.stringify([]);
}
} else if (tagName === 'tr') {
// For rows, the fingerprint is based on the cell structure, ignoring the cell's inner content.
const structure = children.map((child) => ({
tag: child.tagName.toLowerCase(),
classes: this.normalizeClasses(child.classList),
}));
childrenStructureString = JSON.stringify(structure);
} else {
// Original logic for all other elements.
const structure = children.map((child) => ({
tag: child.tagName.toLowerCase(),
classes: this.normalizeClasses(child.classList),
hasText: (child.textContent ?? "").trim().length > 0,
}));
childrenStructureString = JSON.stringify(structure);
}
const normalizedClasses = this.normalizeClasses(element.classList); const normalizedClasses = this.normalizeClasses(element.classList);
const relevantAttributes = Array.from(element.attributes) const relevantAttributes = Array.from(element.attributes)
.filter((attr) => { .filter((attr) => {
if (isCustomElement) { if (isCustomElement) {
return ![ return !["id", "style", "data-reactid", "data-react-checksum"].includes(attr.name.toLowerCase());
"id",
"style",
"data-reactid",
"data-react-checksum",
].includes(attr.name.toLowerCase());
} else { } else {
return ( return (
!["id", "style", "data-reactid", "data-react-checksum"].includes( !["id", "style", "data-reactid", "data-react-checksum"].includes(attr.name.toLowerCase()) &&
attr.name.toLowerCase() (!attr.name.startsWith("data-") || attr.name === "data-type" || attr.name === "data-role")
) &&
(!attr.name.startsWith("data-") ||
attr.name === "data-type" ||
attr.name === "data-role")
); );
} }
}) })
.map((attr) => `${attr.name}=${attr.value}`) .map((attr) => `${attr.name}=${attr.value}`)
.sort(); .sort();
// Calculate element depth
let depth = 0; let depth = 0;
let parent = element.parentElement; let parent = element.parentElement;
while (parent && depth < 20) { while (parent && depth < 20) {
@@ -242,27 +258,22 @@ class ClientSelectorGenerator {
parent = parent.parentElement; parent = parent.parentElement;
} }
// Get text content characteristics
const textContent = (element.textContent ?? "").trim(); const textContent = (element.textContent ?? "").trim();
const textCharacteristics = { const textCharacteristics = {
hasText: textContent.length > 0, hasText: textContent.length > 0,
textLength: Math.floor(textContent.length / 20) * 20, textLength: Math.floor(textContent.length / 20) * 20,
hasLinks: element.querySelectorAll("a").length, hasLinks: element.querySelectorAll("a").length,
hasImages: element.querySelectorAll("img").length, hasImages: element.querySelectorAll("img").length,
hasButtons: element.querySelectorAll( hasButtons: element.querySelectorAll('button, input[type="button"], input[type="submit"]').length,
'button, input[type="button"], input[type="submit"]'
).length,
}; };
const signature = `${tagName}::${normalizedClasses}::${ const signature = `${tagName}::${normalizedClasses}::${children.length}::${childrenStructureString}::${relevantAttributes.join("|")}`;
children.length
}::${JSON.stringify(childrenStructure)}::${relevantAttributes.join("|")}`;
return { return {
tagName, tagName,
normalizedClasses, normalizedClasses,
childrenCount: children.length, childrenCount: children.length,
childrenStructure: JSON.stringify(childrenStructure), childrenStructure: childrenStructureString,
attributes: relevantAttributes.join("|"), attributes: relevantAttributes.join("|"),
depth, depth,
textCharacteristics, textCharacteristics,
@@ -379,87 +390,86 @@ class ClientSelectorGenerator {
) { ) {
return; return;
} }
// Clear previous analysis // Clear previous analysis
this.elementGroups.clear(); this.elementGroups.clear();
this.groupedElements.clear(); this.groupedElements.clear();
this.lastAnalyzedDocument = iframeDoc; this.lastAnalyzedDocument = iframeDoc;
// Get all visible elements INCLUDING shadow DOM // Get all visible elements INCLUDING shadow DOM
const allElements = this.getAllVisibleElementsWithShadow(iframeDoc); const allElements = this.getAllVisibleElementsWithShadow(iframeDoc);
const processedInTables = new Set<HTMLElement>();
// 1. Specifically find and group rows within each table, bypassing normal similarity checks.
const tables = allElements.filter(el => el.tagName === 'TABLE');
tables.forEach(table => {
const rows = Array.from(table.querySelectorAll('tbody > tr')).filter(row => {
const parent = row.parentElement;
if (!parent || !table.contains(parent)) return false; // Ensure row belongs to this table
const rect = row.getBoundingClientRect();
return rect.width > 0 && rect.height > 0;
}) as HTMLElement[];
// If the table has enough rows, force them into a single group.
if (rows.length >= this.groupingConfig.minGroupSize) {
const representativeFingerprint = this.getStructuralFingerprint(rows[0]);
if (!representativeFingerprint) return;
// Create fingerprints for all elements const group: ElementGroup = {
elements: rows,
fingerprint: representativeFingerprint,
representative: rows[0],
};
rows.forEach(row => {
this.elementGroups.set(row, group);
this.groupedElements.add(row);
processedInTables.add(row);
});
}
});
// 2. Group all other elements, excluding table rows that were already grouped.
const remainingElements = allElements.filter(el => !processedInTables.has(el));
const elementFingerprints = new Map<HTMLElement, ElementFingerprint>(); const elementFingerprints = new Map<HTMLElement, ElementFingerprint>();
remainingElements.forEach((element) => {
allElements.forEach((element) => {
const fingerprint = this.getStructuralFingerprint(element); const fingerprint = this.getStructuralFingerprint(element);
if (fingerprint) { if (fingerprint) {
elementFingerprints.set(element, fingerprint); elementFingerprints.set(element, fingerprint);
} }
}); });
// Find similar groups using similarity scoring
const similarGroups: ElementGroup[] = [];
const processedElements = new Set<HTMLElement>(); const processedElements = new Set<HTMLElement>();
elementFingerprints.forEach((fingerprint, element) => { elementFingerprints.forEach((fingerprint, element) => {
if (processedElements.has(element)) return; if (processedElements.has(element)) return;
const currentGroup = [element]; const currentGroup = [element];
processedElements.add(element); processedElements.add(element);
// Find similar elements
elementFingerprints.forEach((otherFingerprint, otherElement) => { elementFingerprints.forEach((otherFingerprint, otherElement) => {
if (processedElements.has(otherElement)) return; if (processedElements.has(otherElement)) return;
const similarity = this.calculateSimilarity( const similarity = this.calculateSimilarity(fingerprint, otherFingerprint);
fingerprint,
otherFingerprint
);
if (similarity >= this.groupingConfig.similarityThreshold) { if (similarity >= this.groupingConfig.similarityThreshold) {
currentGroup.push(otherElement); currentGroup.push(otherElement);
processedElements.add(otherElement); processedElements.add(otherElement);
} }
}); });
// Add group if it has enough members AND has meaningful children if (currentGroup.length >= this.groupingConfig.minGroupSize && this.hasAnyMeaningfulChildren(element)) {
if (currentGroup.length >= this.groupingConfig.minGroupSize) { const group: ElementGroup = {
// Check if the representative element has meaningful children elements: currentGroup,
const hasChildren = this.hasAnyMeaningfulChildren(element); fingerprint,
representative: element,
if (hasChildren) { };
const group: ElementGroup = { currentGroup.forEach((el) => {
elements: currentGroup, this.elementGroups.set(el, group);
fingerprint, this.groupedElements.add(el);
representative: element, });
};
similarGroups.push(group);
// Map each element to its group
currentGroup.forEach((el) => {
this.elementGroups.set(el, group);
this.groupedElements.add(el);
});
}
} }
}); });
// Sort groups by size and relevance
similarGroups.sort((a, b) => {
// Prioritize by size first
if (b.elements.length !== a.elements.length)
return b.elements.length - a.elements.length;
// Then by element size
const aSize =
a.representative.getBoundingClientRect().width *
a.representative.getBoundingClientRect().height;
const bSize =
b.representative.getBoundingClientRect().width *
b.representative.getBoundingClientRect().height;
return bSize - aSize;
});
} }
/** /**