fix: url auto extraction

This commit is contained in:
Rohit Rajan
2026-01-04 21:50:27 +05:30
parent e546d5d469
commit 670fa9b29f

View File

@@ -321,81 +321,12 @@ export const BrowserWindow = () => {
const uniqueChildSelectors = [...new Set(childSelectors)]; const uniqueChildSelectors = [...new Set(childSelectors)];
/**
 * Filters candidate child selectors down to those that recur across list items.
 *
 * Up to the first 10 elements matched by `listSelector` are sampled. A selector
 * is kept when either:
 *  - a plain `document.evaluate` probe cannot resolve it (no node found, or the
 *    call throws) — it is kept unverified, presumably because the target lives
 *    in shadow DOM where plain XPath cannot reach it; or
 *  - its matches are found inside at least two of the sampled list elements.
 *
 * When fewer than two list elements are found, or an unexpected error is
 * thrown, the input array is returned unchanged (best-effort validation).
 *
 * @param selectors Child selectors to validate.
 * @returns The selectors that passed validation, in their original order.
 */
const validateChildSelectors = (selectors: string[]): string[] => {
  const MAX_SAMPLED_LIST_ELEMENTS = 10;
  const MIN_LIST_ELEMENT_OCCURRENCES = 2;

  try {
    const doc = iframeElement.contentDocument!;

    // Sample only the first few list elements to keep validation cheap.
    const listElements = evaluateXPathAllWithShadowSupport(
      doc,
      listSelector,
      listSelector.includes(">>") || listSelector.startsWith("//")
    ).slice(0, MAX_SAMPLED_LIST_ELEMENTS);

    // With fewer than two list items there is nothing to compare against.
    if (listElements.length < MIN_LIST_ELEMENT_OCCURRENCES) {
      return selectors;
    }

    const validSelectors: string[] = [];

    for (const selector of selectors) {
      // Probe with plain XPath first; anything it cannot resolve is kept as-is.
      try {
        const testElement = doc.evaluate(
          selector,
          doc,
          null,
          XPathResult.FIRST_ORDERED_NODE_TYPE,
          null
        ).singleNodeValue;

        if (!testElement) {
          validSelectors.push(selector);
          continue;
        }
      } catch {
        validSelectors.push(selector);
        continue;
      }

      // All elements matching this child selector.
      const childElements = evaluateXPathAllWithShadowSupport(
        doc,
        selector,
        selector.includes(">>") || selector.startsWith("//")
      );

      // Count list elements that contain a match, not the matches themselves:
      // several matches inside a single list item must not satisfy the
      // "occurs in at least 2 list elements" requirement.
      const containingListElements = listElements.filter((listElement) =>
        childElements.some((childElement) => listElement.contains(childElement))
      );

      if (containingListElements.length >= MIN_LIST_ELEMENT_OCCURRENCES) {
        validSelectors.push(selector);
      }
    }

    return validSelectors;
  } catch (error) {
    console.warn("Failed to validate child selectors:", error);
    return selectors;
  }
};
const evaluateXPathAllWithShadowSupport = ( const evaluateXPathAllWithShadowSupport = (
document: Document, document: Document,
xpath: string, xpath: string,
isShadow: boolean = false isShadow: boolean = false
): Element[] => { ): Element[] => {
try { try {
// First try regular XPath evaluation
const result = document.evaluate( const result = document.evaluate(
xpath, xpath,
document, document,
@@ -478,9 +409,51 @@ export const BrowserWindow = () => {
}; };
}; };
const validatedChildSelectors = validateChildSelectors(uniqueChildSelectors); try {
const listElements = evaluateXPathAllWithShadowSupport(
iframeElement.contentDocument!,
listSelector,
listSelector.includes(">>") || listSelector.startsWith("//")
);
validatedChildSelectors.forEach((selector, index) => { if (listElements.length > 0) {
const firstListElement = listElements[0] as HTMLElement;
const listTagName = firstListElement.tagName.toLowerCase();
if (listTagName === 'a' && isElementVisible(firstListElement)) {
const href = firstListElement.getAttribute('href');
if (href && href !== '#' && !href.startsWith('javascript:') && isValidData(href)) {
const rect = firstListElement.getBoundingClientRect();
const fieldId = Date.now();
candidateFields.push({
id: fieldId,
element: firstListElement,
isLeaf: true,
depth: 0,
position: { x: rect.left, y: rect.top },
field: {
id: fieldId,
type: "text",
label: "Label 1",
data: href,
selectorObj: {
selector: listSelector,
attribute: 'href',
tag: 'A',
isShadow: firstListElement.getRootNode() instanceof ShadowRoot
}
}
});
}
}
}
} catch (error) {
console.warn('Failed to extract list container data:', error);
}
uniqueChildSelectors.forEach((selector, index) => {
try { try {
const listElements = evaluateXPathAllWithShadowSupport( const listElements = evaluateXPathAllWithShadowSupport(
iframeElement.contentDocument!, iframeElement.contentDocument!,
@@ -501,13 +474,16 @@ export const BrowserWindow = () => {
const matchRatio = allMatches.length / listElements.length; const matchRatio = allMatches.length / listElements.length;
if (matchRatio < 0.6) { const isLinkOrImage = allMatches.length > 0 &&
(allMatches[0].tagName === 'A' || allMatches[0].tagName === 'IMG');
if (!isLinkOrImage && matchRatio < 0.6) {
return; return;
} }
} }
const firstListElement = listElements[0]; const firstListElement = listElements[0];
const elements = evaluateXPathAllWithShadowSupport( const elements = evaluateXPathAllWithShadowSupport(
iframeElement.contentDocument!, iframeElement.contentDocument!,
selector, selector,
@@ -617,8 +593,13 @@ export const BrowserWindow = () => {
selectorObj: fieldData.selectorObj selectorObj: fieldData.selectorObj
} }
}); });
const anchorParent = element.closest('a'); }
if (anchorParent) {
const anchorParent = element.closest('a');
if (anchorParent) {
const isListContainer = listElements.some(listEl => listEl === anchorParent);
if (!isListContainer) {
const href = anchorParent.getAttribute('href'); const href = anchorParent.getAttribute('href');
if (href && href !== '#' && !href.startsWith('javascript:') && isValidData(href)) { if (href && href !== '#' && !href.startsWith('javascript:') && isValidData(href)) {
let anchorSelector = selector; let anchorSelector = selector;
@@ -645,7 +626,7 @@ export const BrowserWindow = () => {
selector: anchorSelector, selector: anchorSelector,
attribute: 'href', attribute: 'href',
tag: 'A', tag: 'A',
isShadow: anchorParent.getRootNode() instanceof ShadowRoot, isShadow: anchorParent.getRootNode() instanceof ShadowRoot
} }
} }
}); });
@@ -659,54 +640,63 @@ export const BrowserWindow = () => {
} }
}); });
// Sort candidates by visual position (top-to-bottom, then left-to-right)
candidateFields.sort((a, b) => { candidateFields.sort((a, b) => {
const yDiff = a.position.y - b.position.y; const yDiff = a.position.y - b.position.y;
// If elements are roughly on the same horizontal line (within 5px tolerance)
if (Math.abs(yDiff) <= 5) { if (Math.abs(yDiff) <= 5) {
return a.position.x - b.position.x; return a.position.x - b.position.x; // Sort by x-position (left to right)
} }
return yDiff; return yDiff; // Sort by y-position (top to bottom)
}); });
const filteredCandidates = removeParentChildDuplicates(candidateFields); const filteredCandidates = removeParentChildDuplicates(candidateFields);
const cleanedCandidates = filteredCandidates.filter((candidate) => { const cleanedCandidates = filteredCandidates.filter((candidate) => {
const data = candidate.field.data.trim(); const data = candidate.field.data.trim();
const textChildren = Array.from(candidate.element.children).filter(child => const isHrefField = candidate.field.selectorObj?.attribute === 'href';
(child.textContent || '').trim().length > 0 if (isHrefField) {
); return true;
if (textChildren.length === 0) {
return true;
}
const childCandidates = filteredCandidates.filter((other) => {
if (other === candidate) return false;
return candidate.element.contains(other.element);
});
if (childCandidates.length === 0) {
return true;
}
let coveredLength = 0;
childCandidates.forEach(child => {
const childText = child.field.data.trim();
if (data.includes(childText)) {
coveredLength += childText.length;
} }
const textChildren = Array.from(candidate.element.children).filter(child =>
(child.textContent || '').trim().length > 0
);
if (textChildren.length === 0) {
return true;
}
const childCandidates = filteredCandidates.filter((other) => {
if (other === candidate) return false;
return candidate.element.contains(other.element);
});
if (childCandidates.length === 0) {
return true;
}
let coveredLength = 0;
childCandidates.forEach(child => {
const childText = child.field.data.trim();
if (data.includes(childText)) {
coveredLength += childText.length;
}
});
const coverageRatio = coveredLength / data.length;
const hasMultipleChildTexts = childCandidates.length >= 2;
const highCoverage = coverageRatio > 0.7;
return !(hasMultipleChildTexts && highCoverage);
}); });
const coverageRatio = coveredLength / data.length; const finalFields = removeDuplicateContent(cleanedCandidates);
const hasMultipleChildTexts = childCandidates.length >= 2;
const highCoverage = coverageRatio > 0.7;
return !(hasMultipleChildTexts && highCoverage); return finalFields;
});
const finalFields = removeDuplicateContent(cleanedCandidates);
return finalFields;
}, },
[currentSnapshot] [currentSnapshot]
); );
@@ -738,6 +728,14 @@ export const BrowserWindow = () => {
}> = []; }> = [];
for (const candidate of candidates) { for (const candidate of candidates) {
const isAnchorWithHref = candidate.element.tagName.toLowerCase() === "a" &&
candidate.field.selectorObj?.attribute === 'href';
if (isAnchorWithHref) {
filtered.push(candidate);
continue;
}
let shouldInclude = true; let shouldInclude = true;
for (const existing of filtered) { for (const existing of filtered) {
@@ -745,16 +743,17 @@ export const BrowserWindow = () => {
shouldInclude = false; shouldInclude = false;
break; break;
} else if (existing.element.contains(candidate.element)) { } else if (existing.element.contains(candidate.element)) {
const existingIndex = filtered.indexOf(existing); const existingIsAnchorWithHref = existing.element.tagName.toLowerCase() === "a" &&
filtered.splice(existingIndex, 1); existing.field.selectorObj?.attribute === 'href';
if (!existingIsAnchorWithHref) {
const existingIndex = filtered.indexOf(existing);
filtered.splice(existingIndex, 1);
}
break; break;
} }
} }
if (candidate.element.tagName.toLowerCase() === "a") {
shouldInclude = true;
}
if (shouldInclude) { if (shouldInclude) {
filtered.push(candidate); filtered.push(candidate);
} }