fix: url auto extraction
This commit is contained in:
@@ -321,81 +321,12 @@ export const BrowserWindow = () => {
|
|||||||
|
|
||||||
const uniqueChildSelectors = [...new Set(childSelectors)];
|
const uniqueChildSelectors = [...new Set(childSelectors)];
|
||||||
|
|
||||||
const validateChildSelectors = (selectors: string[]): string[] => {
|
|
||||||
try {
|
|
||||||
// Get first 10 list elements
|
|
||||||
const listElements = evaluateXPathAllWithShadowSupport(
|
|
||||||
iframeElement.contentDocument!,
|
|
||||||
listSelector,
|
|
||||||
listSelector.includes(">>") || listSelector.startsWith("//")
|
|
||||||
).slice(0, 10);
|
|
||||||
|
|
||||||
if (listElements.length < 2) {
|
|
||||||
return selectors;
|
|
||||||
}
|
|
||||||
|
|
||||||
const validSelectors: string[] = [];
|
|
||||||
|
|
||||||
for (const selector of selectors) {
|
|
||||||
// First, try to access the element directly
|
|
||||||
try {
|
|
||||||
const testElement = iframeElement.contentDocument!.evaluate(
|
|
||||||
selector,
|
|
||||||
iframeElement.contentDocument!,
|
|
||||||
null,
|
|
||||||
XPathResult.FIRST_ORDERED_NODE_TYPE,
|
|
||||||
null
|
|
||||||
).singleNodeValue;
|
|
||||||
|
|
||||||
// If we can't access the element, it's likely in shadow DOM - include it
|
|
||||||
if (!testElement) {
|
|
||||||
validSelectors.push(selector);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
} catch (accessError) {
|
|
||||||
validSelectors.push(selector);
|
|
||||||
continue;
|
|
||||||
}
|
|
||||||
|
|
||||||
let occurrenceCount = 0;
|
|
||||||
|
|
||||||
// Get all elements that match this child selector
|
|
||||||
const childElements = evaluateXPathAllWithShadowSupport(
|
|
||||||
iframeElement.contentDocument!,
|
|
||||||
selector,
|
|
||||||
selector.includes(">>") || selector.startsWith("//")
|
|
||||||
);
|
|
||||||
|
|
||||||
// Check how many of these child elements are contained within our list elements
|
|
||||||
for (const childElement of childElements) {
|
|
||||||
for (const listElement of listElements) {
|
|
||||||
if (listElement.contains(childElement)) {
|
|
||||||
occurrenceCount++;
|
|
||||||
break;
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
// Only include selectors that occur in at least 2 list elements
|
|
||||||
if (occurrenceCount >= 2) {
|
|
||||||
validSelectors.push(selector);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
return validSelectors;
|
|
||||||
} catch (error) {
|
|
||||||
console.warn("Failed to validate child selectors:", error);
|
|
||||||
return selectors;
|
|
||||||
}
|
|
||||||
};
|
|
||||||
|
|
||||||
const evaluateXPathAllWithShadowSupport = (
|
const evaluateXPathAllWithShadowSupport = (
|
||||||
document: Document,
|
document: Document,
|
||||||
xpath: string,
|
xpath: string,
|
||||||
isShadow: boolean = false
|
isShadow: boolean = false
|
||||||
): Element[] => {
|
): Element[] => {
|
||||||
try {
|
try {
|
||||||
// First try regular XPath evaluation
|
|
||||||
const result = document.evaluate(
|
const result = document.evaluate(
|
||||||
xpath,
|
xpath,
|
||||||
document,
|
document,
|
||||||
@@ -478,9 +409,51 @@ export const BrowserWindow = () => {
|
|||||||
};
|
};
|
||||||
};
|
};
|
||||||
|
|
||||||
const validatedChildSelectors = validateChildSelectors(uniqueChildSelectors);
|
try {
|
||||||
|
const listElements = evaluateXPathAllWithShadowSupport(
|
||||||
|
iframeElement.contentDocument!,
|
||||||
|
listSelector,
|
||||||
|
listSelector.includes(">>") || listSelector.startsWith("//")
|
||||||
|
);
|
||||||
|
|
||||||
validatedChildSelectors.forEach((selector, index) => {
|
if (listElements.length > 0) {
|
||||||
|
const firstListElement = listElements[0] as HTMLElement;
|
||||||
|
const listTagName = firstListElement.tagName.toLowerCase();
|
||||||
|
|
||||||
|
if (listTagName === 'a' && isElementVisible(firstListElement)) {
|
||||||
|
const href = firstListElement.getAttribute('href');
|
||||||
|
|
||||||
|
if (href && href !== '#' && !href.startsWith('javascript:') && isValidData(href)) {
|
||||||
|
const rect = firstListElement.getBoundingClientRect();
|
||||||
|
const fieldId = Date.now();
|
||||||
|
|
||||||
|
candidateFields.push({
|
||||||
|
id: fieldId,
|
||||||
|
element: firstListElement,
|
||||||
|
isLeaf: true,
|
||||||
|
depth: 0,
|
||||||
|
position: { x: rect.left, y: rect.top },
|
||||||
|
field: {
|
||||||
|
id: fieldId,
|
||||||
|
type: "text",
|
||||||
|
label: "Label 1",
|
||||||
|
data: href,
|
||||||
|
selectorObj: {
|
||||||
|
selector: listSelector,
|
||||||
|
attribute: 'href',
|
||||||
|
tag: 'A',
|
||||||
|
isShadow: firstListElement.getRootNode() instanceof ShadowRoot
|
||||||
|
}
|
||||||
|
}
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (error) {
|
||||||
|
console.warn('Failed to extract list container data:', error);
|
||||||
|
}
|
||||||
|
|
||||||
|
uniqueChildSelectors.forEach((selector, index) => {
|
||||||
try {
|
try {
|
||||||
const listElements = evaluateXPathAllWithShadowSupport(
|
const listElements = evaluateXPathAllWithShadowSupport(
|
||||||
iframeElement.contentDocument!,
|
iframeElement.contentDocument!,
|
||||||
@@ -501,13 +474,16 @@ export const BrowserWindow = () => {
|
|||||||
|
|
||||||
const matchRatio = allMatches.length / listElements.length;
|
const matchRatio = allMatches.length / listElements.length;
|
||||||
|
|
||||||
if (matchRatio < 0.6) {
|
const isLinkOrImage = allMatches.length > 0 &&
|
||||||
|
(allMatches[0].tagName === 'A' || allMatches[0].tagName === 'IMG');
|
||||||
|
|
||||||
|
if (!isLinkOrImage && matchRatio < 0.6) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
const firstListElement = listElements[0];
|
const firstListElement = listElements[0];
|
||||||
|
|
||||||
const elements = evaluateXPathAllWithShadowSupport(
|
const elements = evaluateXPathAllWithShadowSupport(
|
||||||
iframeElement.contentDocument!,
|
iframeElement.contentDocument!,
|
||||||
selector,
|
selector,
|
||||||
@@ -617,8 +593,13 @@ export const BrowserWindow = () => {
|
|||||||
selectorObj: fieldData.selectorObj
|
selectorObj: fieldData.selectorObj
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
const anchorParent = element.closest('a');
|
}
|
||||||
if (anchorParent) {
|
|
||||||
|
const anchorParent = element.closest('a');
|
||||||
|
if (anchorParent) {
|
||||||
|
const isListContainer = listElements.some(listEl => listEl === anchorParent);
|
||||||
|
|
||||||
|
if (!isListContainer) {
|
||||||
const href = anchorParent.getAttribute('href');
|
const href = anchorParent.getAttribute('href');
|
||||||
if (href && href !== '#' && !href.startsWith('javascript:') && isValidData(href)) {
|
if (href && href !== '#' && !href.startsWith('javascript:') && isValidData(href)) {
|
||||||
let anchorSelector = selector;
|
let anchorSelector = selector;
|
||||||
@@ -645,7 +626,7 @@ export const BrowserWindow = () => {
|
|||||||
selector: anchorSelector,
|
selector: anchorSelector,
|
||||||
attribute: 'href',
|
attribute: 'href',
|
||||||
tag: 'A',
|
tag: 'A',
|
||||||
isShadow: anchorParent.getRootNode() instanceof ShadowRoot,
|
isShadow: anchorParent.getRootNode() instanceof ShadowRoot
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -659,54 +640,63 @@ export const BrowserWindow = () => {
|
|||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Sort candidates by visual position (top-to-bottom, then left-to-right)
|
||||||
candidateFields.sort((a, b) => {
|
candidateFields.sort((a, b) => {
|
||||||
const yDiff = a.position.y - b.position.y;
|
const yDiff = a.position.y - b.position.y;
|
||||||
|
|
||||||
|
// If elements are roughly on the same horizontal line (within 5px tolerance)
|
||||||
if (Math.abs(yDiff) <= 5) {
|
if (Math.abs(yDiff) <= 5) {
|
||||||
return a.position.x - b.position.x;
|
return a.position.x - b.position.x; // Sort by x-position (left to right)
|
||||||
}
|
}
|
||||||
|
|
||||||
return yDiff;
|
return yDiff; // Sort by y-position (top to bottom)
|
||||||
});
|
});
|
||||||
|
|
||||||
const filteredCandidates = removeParentChildDuplicates(candidateFields);
|
const filteredCandidates = removeParentChildDuplicates(candidateFields);
|
||||||
|
|
||||||
const cleanedCandidates = filteredCandidates.filter((candidate) => {
|
const cleanedCandidates = filteredCandidates.filter((candidate) => {
|
||||||
const data = candidate.field.data.trim();
|
const data = candidate.field.data.trim();
|
||||||
|
|
||||||
const textChildren = Array.from(candidate.element.children).filter(child =>
|
const isHrefField = candidate.field.selectorObj?.attribute === 'href';
|
||||||
(child.textContent || '').trim().length > 0
|
if (isHrefField) {
|
||||||
);
|
return true;
|
||||||
|
|
||||||
if (textChildren.length === 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
const childCandidates = filteredCandidates.filter((other) => {
|
|
||||||
if (other === candidate) return false;
|
|
||||||
return candidate.element.contains(other.element);
|
|
||||||
});
|
|
||||||
|
|
||||||
if (childCandidates.length === 0) {
|
|
||||||
return true;
|
|
||||||
}
|
|
||||||
|
|
||||||
let coveredLength = 0;
|
|
||||||
childCandidates.forEach(child => {
|
|
||||||
const childText = child.field.data.trim();
|
|
||||||
if (data.includes(childText)) {
|
|
||||||
coveredLength += childText.length;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const textChildren = Array.from(candidate.element.children).filter(child =>
|
||||||
|
(child.textContent || '').trim().length > 0
|
||||||
|
);
|
||||||
|
|
||||||
|
if (textChildren.length === 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
const childCandidates = filteredCandidates.filter((other) => {
|
||||||
|
if (other === candidate) return false;
|
||||||
|
return candidate.element.contains(other.element);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (childCandidates.length === 0) {
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
let coveredLength = 0;
|
||||||
|
childCandidates.forEach(child => {
|
||||||
|
const childText = child.field.data.trim();
|
||||||
|
if (data.includes(childText)) {
|
||||||
|
coveredLength += childText.length;
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
const coverageRatio = coveredLength / data.length;
|
||||||
|
const hasMultipleChildTexts = childCandidates.length >= 2;
|
||||||
|
const highCoverage = coverageRatio > 0.7;
|
||||||
|
|
||||||
|
return !(hasMultipleChildTexts && highCoverage);
|
||||||
});
|
});
|
||||||
|
|
||||||
const coverageRatio = coveredLength / data.length;
|
const finalFields = removeDuplicateContent(cleanedCandidates);
|
||||||
const hasMultipleChildTexts = childCandidates.length >= 2;
|
|
||||||
const highCoverage = coverageRatio > 0.7;
|
|
||||||
|
|
||||||
return !(hasMultipleChildTexts && highCoverage);
|
return finalFields;
|
||||||
});
|
|
||||||
|
|
||||||
const finalFields = removeDuplicateContent(cleanedCandidates);
|
|
||||||
return finalFields;
|
|
||||||
},
|
},
|
||||||
[currentSnapshot]
|
[currentSnapshot]
|
||||||
);
|
);
|
||||||
@@ -738,6 +728,14 @@ export const BrowserWindow = () => {
|
|||||||
}> = [];
|
}> = [];
|
||||||
|
|
||||||
for (const candidate of candidates) {
|
for (const candidate of candidates) {
|
||||||
|
const isAnchorWithHref = candidate.element.tagName.toLowerCase() === "a" &&
|
||||||
|
candidate.field.selectorObj?.attribute === 'href';
|
||||||
|
|
||||||
|
if (isAnchorWithHref) {
|
||||||
|
filtered.push(candidate);
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
|
||||||
let shouldInclude = true;
|
let shouldInclude = true;
|
||||||
|
|
||||||
for (const existing of filtered) {
|
for (const existing of filtered) {
|
||||||
@@ -745,16 +743,17 @@ export const BrowserWindow = () => {
|
|||||||
shouldInclude = false;
|
shouldInclude = false;
|
||||||
break;
|
break;
|
||||||
} else if (existing.element.contains(candidate.element)) {
|
} else if (existing.element.contains(candidate.element)) {
|
||||||
const existingIndex = filtered.indexOf(existing);
|
const existingIsAnchorWithHref = existing.element.tagName.toLowerCase() === "a" &&
|
||||||
filtered.splice(existingIndex, 1);
|
existing.field.selectorObj?.attribute === 'href';
|
||||||
|
|
||||||
|
if (!existingIsAnchorWithHref) {
|
||||||
|
const existingIndex = filtered.indexOf(existing);
|
||||||
|
filtered.splice(existingIndex, 1);
|
||||||
|
}
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (candidate.element.tagName.toLowerCase() === "a") {
|
|
||||||
shouldInclude = true;
|
|
||||||
}
|
|
||||||
|
|
||||||
if (shouldInclude) {
|
if (shouldInclude) {
|
||||||
filtered.push(candidate);
|
filtered.push(candidate);
|
||||||
}
|
}
|
||||||
|
|||||||
Reference in New Issue
Block a user