Merge branch 'develop' into perfect-ui

This commit is contained in:
Rohit
2025-07-07 01:24:45 +05:30
committed by GitHub
10 changed files with 3074 additions and 3013 deletions

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@@ -492,7 +492,8 @@ const handleChangeUrl = async (activeBrowser: RemoteBrowser, page: Page, url: st
await generator.onChangeUrl(url, page); await generator.onChangeUrl(url, page);
try { try {
await page.goto(url, { waitUntil: "networkidle", timeout: 100000 }); await page.goto(url, { waitUntil: "domcontentloaded", timeout: 30000 });
await page.waitForTimeout(2000);
logger.log("debug", `Went to ${url}`); logger.log("debug", `Went to ${url}`);
} catch (e) { } catch (e) {
const { message } = e as Error; const { message } = e as Error;

View File

@@ -464,7 +464,6 @@ export class WorkflowGenerator {
public onClick = async (coordinates: Coordinates, page: Page) => { public onClick = async (coordinates: Coordinates, page: Page) => {
let where: WhereWhatPair["where"] = { url: this.getBestUrl(page.url()) }; let where: WhereWhatPair["where"] = { url: this.getBestUrl(page.url()) };
const selector = await this.generateSelector(page, coordinates, ActionType.Click); const selector = await this.generateSelector(page, coordinates, ActionType.Click);
console.log("COOORDINATES: ", coordinates);
logger.log('debug', `Element's selector: ${selector}`); logger.log('debug', `Element's selector: ${selector}`);
const elementInfo = await getElementInformation(page, coordinates, '', false); const elementInfo = await getElementInformation(page, coordinates, '', false);
@@ -999,6 +998,7 @@ export class WorkflowGenerator {
rect, rect,
selector: displaySelector, selector: displaySelector,
elementInfo, elementInfo,
isDOMMode: this.isDOMMode,
// Include shadow DOM specific information // Include shadow DOM specific information
shadowInfo: elementInfo?.isShadowRoot ? { shadowInfo: elementInfo?.isShadowRoot ? {
mode: elementInfo.shadowRootMode, mode: elementInfo.shadowRootMode,

File diff suppressed because it is too large Load Diff

View File

@@ -98,6 +98,7 @@ interface RRWebDOMBrowserRendererProps {
getList?: boolean; getList?: boolean;
getText?: boolean; getText?: boolean;
listSelector?: string | null; listSelector?: string | null;
cachedChildSelectors?: string[];
paginationMode?: boolean; paginationMode?: boolean;
paginationType?: string; paginationType?: string;
limitMode?: boolean; limitMode?: boolean;
@@ -106,12 +107,14 @@ interface RRWebDOMBrowserRendererProps {
selector: string; selector: string;
elementInfo: ElementInfo | null; elementInfo: ElementInfo | null;
childSelectors?: string[]; childSelectors?: string[];
groupInfo?: any;
}) => void; }) => void;
onElementSelect?: (data: { onElementSelect?: (data: {
rect: DOMRect; rect: DOMRect;
selector: string; selector: string;
elementInfo: ElementInfo | null; elementInfo: ElementInfo | null;
childSelectors?: string[]; childSelectors?: string[];
groupInfo?: any;
}) => void; }) => void;
onShowDatePicker?: (info: { onShowDatePicker?: (info: {
coordinates: { x: number; y: number }; coordinates: { x: number; y: number };
@@ -144,6 +147,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
getList = false, getList = false,
getText = false, getText = false,
listSelector = null, listSelector = null,
cachedChildSelectors = [],
paginationMode = false, paginationMode = false,
paginationType = "", paginationType = "",
limitMode = false, limitMode = false,
@@ -205,11 +209,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const handleDOMHighlighting = useCallback( const handleDOMHighlighting = useCallback(
(x: number, y: number, iframeDoc: Document) => { (x: number, y: number, iframeDoc: Document) => {
try { try {
if (!getText && !getList) {
setCurrentHighlight(null);
if (onHighlight) {
onHighlight({
rect: new DOMRect(0, 0, 0, 0),
selector: "",
elementInfo: null,
});
}
return;
}
const highlighterData = const highlighterData =
clientSelectorGenerator.generateDataForHighlighter( clientSelectorGenerator.generateDataForHighlighter(
{ x, y }, { x, y },
iframeDoc, iframeDoc,
true true,
cachedChildSelectors
); );
if (!highlighterData) { if (!highlighterData) {
@@ -224,70 +241,40 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return; return;
} }
const { rect, selector, elementInfo, childSelectors } = highlighterData; const { rect, selector, elementInfo, childSelectors, groupInfo } =
highlighterData;
let shouldHighlight = false; let shouldHighlight = false;
if (getList) { if (getList) {
if (listSelector) { // First phase: Allow any group to be highlighted for selection
const hasValidChildSelectors = if (!listSelector && groupInfo?.isGroupElement) {
Array.isArray(childSelectors) && childSelectors.length > 0; shouldHighlight = true;
}
// Second phase: Show valid children within selected group
else if (listSelector) {
if (limitMode) { if (limitMode) {
shouldHighlight = false; shouldHighlight = false;
} else if (paginationMode) { } else if (
if ( paginationMode &&
paginationType !== "" && paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType) !["none", "scrollDown", "scrollUp"].includes(paginationType)
) { ) {
shouldHighlight = true; shouldHighlight = true;
} else { } else if (childSelectors && childSelectors.length > 0) {
shouldHighlight = false; console.log("✅ Child selectors present, highlighting enabled");
}
} else if (childSelectors && childSelectors.includes(selector)) {
shouldHighlight = true; shouldHighlight = true;
} else if (elementInfo?.isIframeContent && childSelectors) {
const isIframeChild = childSelectors.some(
(childSelector: string) =>
selector.includes(":>>") &&
childSelector
.split(":>>")
.some((part) => selector.includes(part.trim()))
);
shouldHighlight = isIframeChild;
} else if (selector.includes(":>>") && hasValidChildSelectors) {
const selectorParts = selector
.split(":>>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else if (elementInfo?.isShadowRoot && childSelectors) {
const isShadowChild = childSelectors.some(
(childSelector: string) =>
selector.includes(">>") &&
childSelector
.split(">>")
.some((part) => selector.includes(part.trim()))
);
} else if (selector.includes(">>") && hasValidChildSelectors) {
const selectorParts = selector
.split(">>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else { } else {
console.log("❌ No child selectors available");
shouldHighlight = false; shouldHighlight = false;
} }
} else { }
// No list selector - show regular highlighting
else {
shouldHighlight = true; shouldHighlight = true;
} }
} else { } else {
// getText mode - always highlight
shouldHighlight = true; shouldHighlight = true;
} }
@@ -316,6 +303,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
}, },
selector, selector,
childSelectors, childSelectors,
groupInfo,
}); });
} }
} }
@@ -335,9 +323,11 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
} }
}, },
[ [
getText,
getList, getList,
listSelector, listSelector,
paginationMode, paginationMode,
cachedChildSelectors,
paginationType, paginationType,
limitMode, limitMode,
onHighlight, onHighlight,
@@ -363,6 +353,10 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return; return;
} }
if (!isInCaptureMode) {
return;
}
const now = performance.now(); const now = performance.now();
if (now - lastMouseMoveTime.current < MOUSE_MOVE_THROTTLE) { if (now - lastMouseMoveTime.current < MOUSE_MOVE_THROTTLE) {
return; return;
@@ -401,11 +395,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
e.stopPropagation(); e.stopPropagation();
if (currentHighlight && onElementSelect) { if (currentHighlight && onElementSelect) {
// Get the group info for the current highlight
const highlighterData =
clientSelectorGenerator.generateDataForHighlighter(
{ x: iframeX, y: iframeY },
iframeDoc,
true,
cachedChildSelectors
);
onElementSelect({ onElementSelect({
rect: currentHighlight.rect, rect: currentHighlight.rect,
selector: currentHighlight.selector, selector: currentHighlight.selector,
elementInfo: currentHighlight.elementInfo, elementInfo: currentHighlight.elementInfo,
childSelectors: currentHighlight.childSelectors || [], childSelectors:
cachedChildSelectors.length > 0
? cachedChildSelectors
: highlighterData?.childSelectors || [],
groupInfo: highlighterData?.groupInfo,
}); });
} }
notifyLastAction("select element"); notifyLastAction("select element");

View File

@@ -22,6 +22,7 @@ import { useThemeMode } from '../../context/theme-provider';
import { useTranslation } from 'react-i18next'; import { useTranslation } from 'react-i18next';
import { useBrowserDimensionsStore } from '../../context/browserDimensions'; import { useBrowserDimensionsStore } from '../../context/browserDimensions';
import { clientListExtractor } from '../../helpers/clientListExtractor'; import { clientListExtractor } from '../../helpers/clientListExtractor';
import { clientSelectorGenerator } from '../../helpers/clientSelectorGenerator';
const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => { const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => {
getActiveWorkflow(id).then( getActiveWorkflow(id).then(
@@ -52,10 +53,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false); const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false);
const [isCaptureListConfirmed, setIsCaptureListConfirmed] = useState(false); const [isCaptureListConfirmed, setIsCaptureListConfirmed] = useState(false);
const { panelHeight } = useBrowserDimensionsStore(); const { panelHeight } = useBrowserDimensionsStore();
const [isDOMMode, setIsDOMMode] = useState(false);
const [currentSnapshot, setCurrentSnapshot] = useState<any>(null);
const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId } = useGlobalInfoStore(); const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId, updateDOMMode, currentSnapshot, isDOMMode } = useGlobalInfoStore();
const { const {
getText, startGetText, stopGetText, getText, startGetText, stopGetText,
getList, startGetList, stopGetList, getList, startGetList, stopGetList,
@@ -86,22 +85,20 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
if (socket) { if (socket) {
const domModeHandler = (data: any) => { const domModeHandler = (data: any) => {
if (!data.userId || data.userId === id) { if (!data.userId || data.userId === id) {
setIsDOMMode(true); updateDOMMode(true);
} }
}; };
const screenshotModeHandler = (data: any) => { const screenshotModeHandler = (data: any) => {
if (!data.userId || data.userId === id) { if (!data.userId || data.userId === id) {
setIsDOMMode(false); updateDOMMode(false);
setCurrentSnapshot(null);
} }
}; };
const domcastHandler = (data: any) => { const domcastHandler = (data: any) => {
if (!data.userId || data.userId === id) { if (!data.userId || data.userId === id) {
if (data.snapshotData && data.snapshotData.snapshot) { if (data.snapshotData && data.snapshotData.snapshot) {
setCurrentSnapshot(data.snapshotData); updateDOMMode(true, data.snapshotData);
setIsDOMMode(true);
} }
} }
}; };
@@ -116,7 +113,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
socket.off("domcast", domcastHandler); socket.off("domcast", domcastHandler);
}; };
} }
}, [socket, id]); }, [socket, id, updateDOMMode]);
useEffect(() => { useEffect(() => {
if (socket) { if (socket) {
@@ -214,7 +211,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
) => { ) => {
if (isDOMMode && currentSnapshot) { if (isDOMMode && currentSnapshot) {
try { try {
// Find the DOM iframe element
let iframeElement = document.querySelector( let iframeElement = document.querySelector(
"#dom-browser-iframe" "#dom-browser-iframe"
) as HTMLIFrameElement; ) as HTMLIFrameElement;
@@ -247,22 +243,42 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
return; return;
} }
// Use client-side extraction Object.entries(fields).forEach(([key, field]) => {
if (field.selectorObj?.selector) {
const isFieldXPath =
field.selectorObj.selector.startsWith("//") ||
field.selectorObj.selector.startsWith("/");
console.log(
`Field "${key}" selector:`,
field.selectorObj.selector,
`(XPath: ${isFieldXPath})`
);
}
});
const extractedData = clientListExtractor.extractListData( const extractedData = clientListExtractor.extractListData(
iframeDoc, iframeDoc,
listSelector, listSelector,
fields, fields,
5 // limit for preview 5
); );
updateListStepData(currentListId, extractedData); updateListStepData(currentListId, extractedData);
console.log("✅ UI extraction completed:");
if (extractedData.length === 0) {
console.warn(
"⚠️ No data extracted - this might indicate selector issues"
);
notify(
"warning",
"No data was extracted. Please verify your selections."
);
}
} catch (error) { } catch (error) {
console.error("Error in client-side data extraction:", error); console.error("Error in client-side data extraction:", error);
notify("error", "Failed to extract data client-side"); notify("error", "Failed to extract data client-side");
} }
} else { } else {
// Fallback to socket-based extraction for screenshot mode
if (!socket) { if (!socket) {
console.error("Socket not available for backend extraction"); console.error("Socket not available for backend extraction");
return; return;
@@ -275,8 +291,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
currentListId, currentListId,
pagination: { type: "", selector: "" }, pagination: { type: "", selector: "" },
}); });
console.log("📤 Sent extraction request to server");
} catch (error) { } catch (error) {
console.error("Error in backend data extraction:", error); console.error("Error in backend data extraction:", error);
} }
@@ -443,6 +457,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog(); resetInterpretationLog();
finishAction('text'); finishAction('text');
onFinishCapture(); onFinishCapture();
clientSelectorGenerator.cleanup();
}, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog, finishAction, notify, onFinishCapture, t]); }, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog, finishAction, notify, onFinishCapture, t]);
const getListSettingsObject = useCallback(() => { const getListSettingsObject = useCallback(() => {
@@ -494,6 +509,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const stopCaptureAndEmitGetListSettings = useCallback(() => { const stopCaptureAndEmitGetListSettings = useCallback(() => {
const settings = getListSettingsObject(); const settings = getListSettingsObject();
console.log("rrwebSnapshotHandler", settings);
const latestListStep = getLatestListStep(browserSteps); const latestListStep = getLatestListStep(browserSteps);
if (latestListStep && settings) { if (latestListStep && settings) {
@@ -509,6 +526,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog(); resetInterpretationLog();
finishAction('list'); finishAction('list');
onFinishCapture(); onFinishCapture();
clientSelectorGenerator.cleanup();
}, [getListSettingsObject, socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide]); }, [getListSettingsObject, socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide]);
const hasUnconfirmedListTextFields = browserSteps.some(step => const hasUnconfirmedListTextFields = browserSteps.some(step =>
@@ -638,6 +656,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCurrentTextActionId(''); setCurrentTextActionId('');
setIsCaptureTextConfirmed(false); setIsCaptureTextConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_text_discarded')); notify('error', t('right_panel.errors.capture_text_discarded'));
}, [currentTextActionId, browserSteps, stopGetText, deleteStepsByActionId, notify, t]); }, [currentTextActionId, browserSteps, stopGetText, deleteStepsByActionId, notify, t]);
@@ -668,6 +687,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCaptureStage('initial'); setCaptureStage('initial');
setCurrentListActionId(''); setCurrentListActionId('');
setIsCaptureListConfirmed(false); setIsCaptureListConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_list_discarded')); notify('error', t('right_panel.errors.capture_list_discarded'));
}, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t]); }, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t]);
@@ -686,6 +706,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
stopGetScreenshot(); stopGetScreenshot();
resetInterpretationLog(); resetInterpretationLog();
finishAction('screenshot'); finishAction('screenshot');
clientSelectorGenerator.cleanup();
onFinishCapture(); onFinishCapture();
}; };

View File

@@ -27,6 +27,41 @@ interface ScheduleConfig {
cronExpression?: string; cronExpression?: string;
} }
/**
 * A captured page snapshot together with the resources collected for it.
 *
 * NOTE(review): the `snapshot` and `processingStats` payloads are typed `any`
 * here, so their exact shape cannot be stated from this declaration — confirm
 * against the producer before relying on specific fields.
 */
interface ProcessedSnapshot {
  /** Opaque snapshot payload (untyped). */
  snapshot: any;
  /** Resources gathered alongside the snapshot, grouped by kind. */
  resources: {
    stylesheets: { href: string; content: string; media?: string }[];
    images: { src: string; dataUrl: string; alt?: string }[];
    fonts: { url: string; dataUrl: string; format?: string }[];
    scripts: { src: string; content: string; type?: string }[];
    media: { src: string; dataUrl: string; type: string }[];
  };
  /** Base URL associated with the snapshot. */
  baseUrl: string;
  /** Viewport dimensions recorded with the snapshot. */
  viewport: { width: number; height: number };
  /** Numeric timestamp recorded with the snapshot. */
  timestamp: number;
  /** Opaque processing statistics (untyped). */
  processingStats: any;
}
export interface RobotSettings { export interface RobotSettings {
id: string; id: string;
userId?: number; userId?: number;
@@ -86,6 +121,11 @@ interface GlobalInfo {
setCurrentListActionId: (actionId: string) => void; setCurrentListActionId: (actionId: string) => void;
currentScreenshotActionId: string; currentScreenshotActionId: string;
setCurrentScreenshotActionId: (actionId: string) => void; setCurrentScreenshotActionId: (actionId: string) => void;
isDOMMode: boolean;
setIsDOMMode: (isDOMMode: boolean) => void;
currentSnapshot: ProcessedSnapshot | null;
setCurrentSnapshot: (snapshot: ProcessedSnapshot | null) => void;
updateDOMMode: (isDOMMode: boolean, snapshot?: ProcessedSnapshot | null) => void;
}; };
class GlobalInfoStore implements Partial<GlobalInfo> { class GlobalInfoStore implements Partial<GlobalInfo> {
@@ -115,6 +155,8 @@ class GlobalInfoStore implements Partial<GlobalInfo> {
currentTextActionId = ''; currentTextActionId = '';
currentListActionId = ''; currentListActionId = '';
currentScreenshotActionId = ''; currentScreenshotActionId = '';
isDOMMode = false;
currentSnapshot = null;
}; };
const globalInfoStore = new GlobalInfoStore(); const globalInfoStore = new GlobalInfoStore();
@@ -141,6 +183,8 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
const [currentTextActionId, setCurrentTextActionId] = useState<string>(''); const [currentTextActionId, setCurrentTextActionId] = useState<string>('');
const [currentListActionId, setCurrentListActionId] = useState<string>(''); const [currentListActionId, setCurrentListActionId] = useState<string>('');
const [currentScreenshotActionId, setCurrentScreenshotActionId] = useState<string>(''); const [currentScreenshotActionId, setCurrentScreenshotActionId] = useState<string>('');
const [isDOMMode, setIsDOMMode] = useState<boolean>(globalInfoStore.isDOMMode);
const [currentSnapshot, setCurrentSnapshot] = useState<ProcessedSnapshot | null>(globalInfoStore.currentSnapshot);
const notify = (severity: 'error' | 'warning' | 'info' | 'success', message: string) => { const notify = (severity: 'error' | 'warning' | 'info' | 'success', message: string) => {
setNotification({ severity, message, isOpen: true }); setNotification({ severity, message, isOpen: true });
@@ -165,6 +209,18 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
}, 100); }, 100);
} }
/**
 * Sets the DOM-mode flag and keeps the stored snapshot consistent with it.
 *
 * - `mode === false`: the snapshot is always cleared; any `snapshot` argument
 *   is ignored.
 * - `mode === true` with `snapshot` provided (including `null`): the snapshot
 *   is replaced with that value.
 * - `mode === true` with `snapshot` omitted: the current snapshot is left as is.
 *
 * @param mode - Whether DOM mode is active.
 * @param snapshot - Optional snapshot to store when DOM mode is active.
 */
const updateDOMMode = (mode: boolean, snapshot?: ProcessedSnapshot | null) => {
  setIsDOMMode(mode);

  if (!mode) {
    // Clear exactly once. Storing the incoming snapshot first and then nulling
    // it would be a redundant state write that can surface as an extra render
    // with a stale snapshot when updates are not batched.
    setCurrentSnapshot(null);
    return;
  }

  if (snapshot !== undefined) {
    setCurrentSnapshot(snapshot);
  }
};
return ( return (
<globalInfoContext.Provider <globalInfoContext.Provider
value={{ value={{
@@ -205,6 +261,11 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
setCurrentListActionId, setCurrentListActionId,
currentScreenshotActionId, currentScreenshotActionId,
setCurrentScreenshotActionId, setCurrentScreenshotActionId,
isDOMMode,
setIsDOMMode,
currentSnapshot,
setCurrentSnapshot,
updateDOMMode,
}} }}
> >
{children} {children}

View File

@@ -15,30 +15,89 @@ interface ExtractedListData {
[key: string]: string; [key: string]: string;
} }
interface TableField { interface Field {
selector: string; selector: string;
attribute: string; attribute: string;
tableContext?: string;
cellIndex?: number;
}
interface NonTableField {
selector: string;
attribute: string;
}
interface ContainerFields {
tableFields: Record<string, TableField>;
nonTableFields: Record<string, NonTableField>;
} }
class ClientListExtractor { class ClientListExtractor {
/**
 * Evaluates `xpath` with `rootElement` as the context node and returns the
 * single node reported by a FIRST_ORDERED_NODE_TYPE query.
 *
 * The document used for evaluation is `rootElement` itself when it is a
 * document node, otherwise its `ownerDocument`. The result is cast to
 * `Element` without a node-type check. Any error thrown during evaluation is
 * logged as a warning and reported as `null`.
 *
 * @param rootElement - Context node for the XPath evaluation.
 * @param xpath - XPath expression to evaluate.
 * @returns The matched node, or `null` when there is no owner document, no
 *   match, or evaluation throws.
 */
private evaluateXPath = (
  rootElement: Element | Document,
  xpath: string
): Element | null => {
  try {
    let contextDocument: Document | null;
    if (rootElement.nodeType === Node.DOCUMENT_NODE) {
      contextDocument = rootElement as Document;
    } else {
      contextDocument = rootElement.ownerDocument;
    }

    if (!contextDocument) {
      return null;
    }

    const { singleNodeValue } = contextDocument.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );
    return singleNodeValue as Element | null;
  } catch (error) {
    console.warn("XPath evaluation failed:", xpath, error);
    return null;
  }
};
/**
 * Evaluates `xpath` with `rootElement` as the context node and returns every
 * matching element, in the order of the ordered node snapshot.
 *
 * The document used for evaluation is `rootElement` itself when it is a
 * document node, otherwise its `ownerDocument`. Snapshot entries that are
 * missing or are not element nodes are dropped. Any error thrown during
 * evaluation is logged as a warning and reported as an empty array.
 *
 * @param rootElement - Context node for the XPath evaluation.
 * @param xpath - XPath expression to evaluate.
 * @returns The matched elements; empty when there is no owner document, no
 *   match, or evaluation throws.
 */
private evaluateXPathAll = (
  rootElement: Element | Document,
  xpath: string
): Element[] => {
  try {
    let contextDocument: Document | null;
    if (rootElement.nodeType === Node.DOCUMENT_NODE) {
      contextDocument = rootElement as Document;
    } else {
      contextDocument = rootElement.ownerDocument;
    }

    if (!contextDocument) {
      return [];
    }

    const snapshot = contextDocument.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    // Materialise the snapshot in index order, then keep element nodes only.
    return Array.from({ length: snapshot.snapshotLength }, (_, position) =>
      snapshot.snapshotItem(position)
    ).filter(
      (node): node is Element =>
        node !== null && node.nodeType === Node.ELEMENT_NODE
    );
  } catch (error) {
    console.warn("XPath evaluation failed:", xpath, error);
    return [];
  }
};
private queryElement = ( private queryElement = (
rootElement: Element | Document, rootElement: Element | Document,
selector: string selector: string
): Element | null => { ): Element | null => {
if (!selector.includes(">>") && !selector.includes(":>>")) { if (!selector.includes(">>") && !selector.includes(":>>")) {
return rootElement.querySelector(selector); // Check if it's an XPath selector (starts with // or / or ./)
if (
selector.startsWith("//") ||
selector.startsWith("/") ||
selector.startsWith("./")
) {
return this.evaluateXPath(rootElement, selector);
} else {
return rootElement.querySelector(selector);
}
} }
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
@@ -59,7 +118,17 @@ class ClientListExtractor {
frameElement.contentDocument || frameElement.contentDocument ||
frameElement.contentWindow?.document; frameElement.contentWindow?.document;
if (!frameDoc) return null; if (!frameDoc) return null;
currentElement = frameDoc.querySelector(parts[i]);
// Handle XPath in iframe context
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
currentElement = this.evaluateXPath(frameDoc, parts[i]);
} else {
currentElement = frameDoc.querySelector(parts[i]);
}
continue; continue;
} catch (e) { } catch (e) {
console.warn( console.warn(
@@ -75,7 +144,16 @@ class ClientListExtractor {
let nextElement: Element | null = null; let nextElement: Element | null = null;
if ("querySelector" in currentElement) { if ("querySelector" in currentElement) {
nextElement = currentElement.querySelector(parts[i]); // Handle XPath vs CSS selector
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(currentElement, parts[i]);
} else {
nextElement = currentElement.querySelector(parts[i]);
}
} }
if ( if (
@@ -83,9 +161,20 @@ class ClientListExtractor {
"shadowRoot" in currentElement && "shadowRoot" in currentElement &&
(currentElement as Element).shadowRoot (currentElement as Element).shadowRoot
) { ) {
nextElement = (currentElement as Element).shadowRoot!.querySelector( if (
parts[i] parts[i].startsWith("//") ||
); parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
(currentElement as Element).shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = (currentElement as Element).shadowRoot!.querySelector(
parts[i]
);
}
} }
if (!nextElement && "children" in currentElement) { if (!nextElement && "children" in currentElement) {
@@ -94,7 +183,18 @@ class ClientListExtractor {
); );
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
nextElement = child.shadowRoot.querySelector(parts[i]); if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
child.shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = child.shadowRoot.querySelector(parts[i]);
}
if (nextElement) break; if (nextElement) break;
} }
} }
@@ -111,7 +211,12 @@ class ClientListExtractor {
selector: string selector: string
): Element[] => { ): Element[] => {
if (!selector.includes(">>") && !selector.includes(":>>")) { if (!selector.includes(">>") && !selector.includes(":>>")) {
return Array.from(rootElement.querySelectorAll(selector)); // Check if it's an XPath selector (starts with // or /)
if (selector.startsWith("//") || selector.startsWith("/")) {
return this.evaluateXPathAll(rootElement, selector);
} else {
return Array.from(rootElement.querySelectorAll(selector));
}
} }
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim()); const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
@@ -133,7 +238,14 @@ class ClientListExtractor {
frameElement.contentDocument || frameElement.contentDocument ||
frameElement.contentWindow?.document; frameElement.contentWindow?.document;
if (frameDoc) { if (frameDoc) {
nextElements.push(...Array.from(frameDoc.querySelectorAll(part))); // Handle XPath in iframe context
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(frameDoc, part));
} else {
nextElements.push(
...Array.from(frameDoc.querySelectorAll(part))
);
}
} }
} catch (e) { } catch (e) {
console.warn( console.warn(
@@ -146,24 +258,47 @@ class ClientListExtractor {
} }
} else { } else {
if ("querySelectorAll" in element) { if ("querySelectorAll" in element) {
nextElements.push(...Array.from(element.querySelectorAll(part))); // Handle XPath vs CSS selector
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(element, part));
} else {
nextElements.push(...Array.from(element.querySelectorAll(part)));
}
} }
if ("shadowRoot" in element && (element as Element).shadowRoot) { if ("shadowRoot" in element && (element as Element).shadowRoot) {
nextElements.push( if (part.startsWith("//") || part.startsWith("/")) {
...Array.from( nextElements.push(
(element as Element).shadowRoot!.querySelectorAll(part) ...this.evaluateXPathAll(
) (element as Element).shadowRoot as unknown as Document,
); part
)
);
} else {
nextElements.push(
...Array.from(
(element as Element).shadowRoot!.querySelectorAll(part)
)
);
}
} }
if ("children" in element) { if ("children" in element) {
const children = Array.from((element as Element).children || []); const children = Array.from((element as Element).children || []);
for (const child of children) { for (const child of children) {
if (child.shadowRoot) { if (child.shadowRoot) {
nextElements.push( if (part.startsWith("//") || part.startsWith("/")) {
...Array.from(child.shadowRoot.querySelectorAll(part)) nextElements.push(
); ...this.evaluateXPathAll(
child.shadowRoot as unknown as Document,
part
)
);
} else {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
}
} }
} }
} }
@@ -193,35 +328,66 @@ class ClientListExtractor {
} }
if (attribute === "innerText") { if (attribute === "innerText") {
return (element as HTMLElement).innerText?.trim() || null; // First try standard innerText/textContent
} else if (attribute === "innerHTML") { let textContent =
return element.innerHTML?.trim() || null; (element as HTMLElement).innerText?.trim() ||
} else if (attribute === "src" || attribute === "href") { (element as HTMLElement).textContent?.trim();
if (attribute === "href" && element.tagName !== "A") {
const parentElement = element.parentElement; // If empty, check for common data attributes that might contain the text
if (parentElement && parentElement.tagName === "A") { if (!textContent) {
const parentHref = parentElement.getAttribute("href"); // Check for data-* attributes that commonly contain text values
if (parentHref) { const dataAttributes = [
try { "data-600",
return new URL(parentHref, baseURL).href; "data-text",
} catch (e) { "data-label",
return parentHref; "data-value",
} "data-content",
];
for (const attr of dataAttributes) {
const dataValue = element.getAttribute(attr);
if (dataValue && dataValue.trim()) {
textContent = dataValue.trim();
break;
} }
} }
} }
return textContent || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "href") {
// For href, we need to find the anchor tag if the current element isn't one
let anchorElement = element;
// If current element is not an anchor, look for parent anchor
if (element.tagName !== "A") {
anchorElement =
element.closest("a") ||
element.parentElement?.closest("a") ||
element;
}
const hrefValue = anchorElement.getAttribute("href");
if (!hrefValue || hrefValue.trim() === "") {
return null;
}
try {
return new URL(hrefValue, baseURL).href;
} catch (e) {
console.warn("Error creating URL from", hrefValue, e);
return hrefValue;
}
} else if (attribute === "src") {
const attrValue = element.getAttribute(attribute); const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute("data-" + attribute); const dataAttr = attrValue || element.getAttribute("data-" + attribute);
if (!dataAttr || dataAttr.trim() === "") { if (!dataAttr || dataAttr.trim() === "") {
if (attribute === "src") { const style = window.getComputedStyle(element as HTMLElement);
const style = window.getComputedStyle(element as HTMLElement); const bgImage = style.backgroundImage;
const bgImage = style.backgroundImage; if (bgImage && bgImage !== "none") {
if (bgImage && bgImage !== "none") { const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/); return matches ? new URL(matches[1], baseURL).href : null;
return matches ? new URL(matches[1], baseURL).href : null;
}
} }
return null; return null;
} }
@@ -236,187 +402,8 @@ class ClientListExtractor {
return element.getAttribute(attribute); return element.getAttribute(attribute);
}; };
private findTableAncestor = ( private convertFields = (fields: any): Record<string, Field> => {
element: Element const convertedFields: Record<string, Field> = {};
): { type: string; element: Element } | null => {
let currentElement: Element | null = element;
const MAX_DEPTH = 5;
let depth = 0;
while (currentElement && depth < MAX_DEPTH) {
if (currentElement.getRootNode() instanceof ShadowRoot) {
currentElement = (currentElement.getRootNode() as ShadowRoot).host;
continue;
}
if (currentElement.tagName === "TD") {
return { type: "TD", element: currentElement };
} else if (currentElement.tagName === "TR") {
return { type: "TR", element: currentElement };
}
if (
currentElement.tagName === "IFRAME" ||
currentElement.tagName === "FRAME"
) {
try {
const frameElement = currentElement as
| HTMLIFrameElement
| HTMLFrameElement;
currentElement = frameElement.contentDocument?.body || null;
} catch (e) {
return null;
}
} else {
currentElement = currentElement.parentElement;
}
depth++;
}
return null;
};
/**
 * Returns the zero-based position of a cell.
 *
 * When the cell's root node is a ShadowRoot, the position is the cell's index
 * among all `td` elements found in that shadow root (-1 if it is not among
 * them). Otherwise it is the number of element siblings that precede the cell.
 *
 * @param td - The cell element to locate.
 * @returns The computed index.
 */
private getCellIndex = (td: Element): number => {
  const root = td.getRootNode();
  if (root instanceof ShadowRoot) {
    const shadowCells = Array.from(root.querySelectorAll("td"));
    return shadowCells.indexOf(td as HTMLTableCellElement);
  }

  // Count how many element siblings come before this one.
  let position = 0;
  for (
    let previous = td.previousElementSibling;
    previous;
    previous = previous.previousElementSibling
  ) {
    position += 1;
  }
  return position;
};
/**
 * Reports whether any table field resolves, within `row`, to an element that
 * is a `TH` or has a `TH` between itself and `row`.
 *
 * For each field selector the matched element is walked upward until `row`
 * (or the top of the tree) is reached. A node that lives in a shadow root is
 * replaced by that root's host before its tag is inspected; an IFRAME/FRAME
 * node is replaced by the body of its content document.
 *
 * @param row - The row element used as the query root and as the stop point.
 * @param tableFields - Field definitions; only `selector` is read here.
 * @returns `true` as soon as a `TH` is encountered, otherwise `false`.
 */
private hasThElement = (
  row: Element,
  tableFields: Record<string, TableField>
): boolean => {
  for (const { selector } of Object.values(tableFields)) {
    const match = this.queryElement(row, selector);
    if (!match) continue;

    let node: Element | null = match;
    while (node && node !== row) {
      const root = node.getRootNode();
      if (root instanceof ShadowRoot) {
        // Leave the shadow tree through its host before looking at tags.
        node = root.host;
        continue;
      }

      const tag = node.tagName;
      if (tag === "TH") return true;

      if (tag !== "IFRAME" && tag !== "FRAME") {
        node = node.parentElement;
        continue;
      }

      try {
        const frame = node as HTMLIFrameElement | HTMLFrameElement;
        node = frame.contentDocument?.body || null;
      } catch (e) {
        // Frame content is not accessible: give up on this field only.
        break;
      }
    }
  }
  return false;
};
/**
 * Drops header-like rows unless the selected fields themselves sit under a TH.
 *
 * If `hasThElement` is true for at least one row, the input array is returned
 * unchanged. Otherwise only rows that contain no `TH` descendant, and whose
 * shadow root (when present) contains no `th`, are kept.
 *
 * @param rows - Candidate row elements.
 * @param tableFields - Field definitions forwarded to `hasThElement`.
 * @returns The original array or a filtered copy.
 */
private filterRowsBasedOnTag = (
  rows: Element[],
  tableFields: Record<string, TableField>
): Element[] => {
  const fieldsLiveUnderTh = rows.some((row) =>
    this.hasThElement(row, tableFields)
  );
  if (fieldsLiveUnderTh) {
    return rows;
  }

  return rows.filter((row) => {
    if (row.getElementsByTagName("TH").length !== 0) {
      return false;
    }
    const shadow = row.shadowRoot;
    return !shadow || shadow.querySelector("th") === null;
  });
};
/**
 * Computes the Jaccard similarity of two class lists: the number of distinct
 * class names present in both, divided by the number of distinct class names
 * present in either.
 *
 * @param classList1 - Class names of the first element (duplicates ignored).
 * @param classList2 - Class names of the second element (duplicates ignored).
 * @returns A ratio in [0, 1]. When both lists are empty the result is 0
 *          rather than the NaN that a plain 0 / 0 division would produce.
 */
private calculateClassSimilarity = (
  classList1: string[],
  classList2: string[]
): number => {
  const set1 = new Set(classList1);
  const set2 = new Set(classList2);

  let sharedCount = 0;
  for (const className of set1) {
    if (set2.has(className)) sharedCount += 1;
  }

  // |A ∪ B| = |A| + |B| - |A ∩ B|; no need to build a third set.
  const unionSize = set1.size + set2.size - sharedCount;
  if (unionSize === 0) return 0;

  return sharedCount / unionSize;
};
/**
 * Finds elements with the same tag as `baseElement` whose class lists are at
 * least `similarityThreshold` similar to it (per `calculateClassSimilarity`).
 *
 * Candidates are collected from:
 *  - the given document,
 *  - when `baseElement` lives in a shadow root: that shadow root itself and the
 *    light-DOM descendants of its host,
 *  - the documents of every accessible `iframe` / `frame` in the given document.
 *
 * Each candidate is considered once, even if several of the searches above
 * return it, and `baseElement` itself is never part of the result.
 *
 * @param baseElement - The reference element.
 * @param document - The document to search.
 * @param similarityThreshold - Minimum similarity to include an element (default 0.7).
 * @returns Matching elements, or an empty array when `baseElement` has no classes.
 */
private findSimilarElements = (
  baseElement: Element,
  document: Document,
  similarityThreshold: number = 0.7
): Element[] => {
  const baseClasses = Array.from(baseElement.classList);
  if (baseClasses.length === 0) return [];

  const tagName = baseElement.tagName;

  // A Set keeps insertion order and removes elements reported by more than
  // one search (e.g. host descendants that are also in the document).
  const candidates = new Set<Element>(
    Array.from(document.getElementsByTagName(tagName))
  );
  const addCandidates = (elements: Element[]): void => {
    for (const element of elements) candidates.add(element);
  };

  const root = baseElement.getRootNode();
  if (root instanceof ShadowRoot) {
    addCandidates(Array.from(root.host.getElementsByTagName(tagName)));
    // getElementsByTagName does not descend into shadow trees, and ShadowRoot
    // has no such method, so query the shadow root with a type selector.
    addCandidates(
      Array.from(root.querySelectorAll(CSS.escape(baseElement.localName)))
    );
  }

  const frames = [
    ...Array.from(document.getElementsByTagName("iframe")),
    ...Array.from(document.getElementsByTagName("frame")),
  ];

  for (const frame of frames) {
    try {
      const frameElement = frame as HTMLIFrameElement | HTMLFrameElement;
      const frameDoc =
        frameElement.contentDocument || frameElement.contentWindow?.document;
      if (frameDoc) {
        addCandidates(Array.from(frameDoc.getElementsByTagName(tagName)));
      }
    } catch (e) {
      // Cross-origin frames throw on access; skip them but leave a trace.
      console.warn(
        `Cannot access ${frame.tagName.toLowerCase()} content:`,
        e
      );
    }
  }

  candidates.delete(baseElement);

  return Array.from(candidates).filter(
    (element) =>
      this.calculateClassSimilarity(
        baseClasses,
        Array.from(element.classList)
      ) >= similarityThreshold
  );
};
private convertFields = (
fields: any
): Record<string, { selector: string; attribute: string }> => {
const convertedFields: Record<
string,
{ selector: string; attribute: string }
> = {};
for (const [key, field] of Object.entries(fields)) { for (const [key, field] of Object.entries(fields)) {
const typedField = field as TextStep; const typedField = field as TextStep;
@@ -439,285 +426,134 @@ class ClientListExtractor {
// Convert fields to the format expected by the extraction logic // Convert fields to the format expected by the extraction logic
const convertedFields = this.convertFields(fields); const convertedFields = this.convertFields(fields);
// Get all container elements matching the list selector // Step 1: Get all container elements matching the list selector
let containers = this.queryElementAll(iframeDocument, listSelector); const containers = this.queryElementAll(iframeDocument, listSelector);
if (containers.length === 0) { if (containers.length === 0) {
console.warn("No containers found for listSelector:", listSelector); console.warn("No containers found for listSelector:", listSelector);
return []; return [];
} }
// Enhanced container discovery: find similar elements if we need more containers // Step 2: Extract data from each container up to the limit
if (limit > 1 && containers.length === 1) { const extractedData: ExtractedListData[] = [];
const baseContainer = containers[0]; const containersToProcess = Math.min(containers.length, limit);
const similarContainers = this.findSimilarElements(
baseContainer,
iframeDocument,
0.7
);
if (similarContainers.length > 0) { for (
const newContainers = similarContainers.filter( let containerIndex = 0;
(container) => !container.matches(listSelector) containerIndex < containersToProcess;
); containerIndex++
containers = [...containers, ...newContainers]; ) {
} const container = containers[containerIndex];
} const record: ExtractedListData = {};
// Analyze fields for table vs non-table context // Step 3: For each field, extract data from the current container
const containerFields: ContainerFields[] = containers.map(() => ({ for (const [label, { selector, attribute }] of Object.entries(
tableFields: {}, convertedFields
nonTableFields: {}, )) {
})); let element: Element | null = null;
containers.forEach((container, containerIndex) => { // CORRECT APPROACH: Create indexed absolute XPath
for (const [label, field] of Object.entries(convertedFields)) { if (selector.startsWith("//")) {
const sampleElement = this.queryElement(container, field.selector); // Convert the absolute selector to target the specific container instance
const indexedSelector = this.createIndexedXPath(
selector,
listSelector,
containerIndex + 1
);
if (sampleElement) { element = this.evaluateXPathSingle(iframeDocument, indexedSelector);
const ancestor = this.findTableAncestor(sampleElement);
if (ancestor) {
containerFields[containerIndex].tableFields[label] = {
...field,
tableContext: ancestor.type,
cellIndex:
ancestor.type === "TD"
? this.getCellIndex(ancestor.element)
: -1,
};
} else {
containerFields[containerIndex].nonTableFields[label] = field;
}
} else { } else {
containerFields[containerIndex].nonTableFields[label] = field; // Fallback for non-XPath selectors
element = this.queryElement(container, selector);
} }
}
});
// Extract table data // Step 4: Extract the value from the found element
const tableData: ExtractedListData[] = []; if (element) {
for ( const value = this.extractValue(element, attribute);
let containerIndex = 0; if (value !== null && value !== "") {
containerIndex < containers.length; record[label] = value;
containerIndex++
) {
const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex];
if (Object.keys(tableFields).length > 0) {
const firstField = Object.values(tableFields)[0];
const firstElement = this.queryElement(
container,
firstField.selector
);
let tableContext: Element | null = firstElement;
// Find the table context
while (
tableContext &&
tableContext.tagName !== "TABLE" &&
tableContext !== container
) {
if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = (tableContext.getRootNode() as ShadowRoot).host;
continue;
}
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
tableContext = frameElement.contentDocument?.body || null;
} catch (e) {
break;
}
} else { } else {
tableContext = tableContext.parentElement; console.warn(` ⚠️ Empty value for "${label}"`);
}
}
if (tableContext) {
const rows: Element[] = [];
rows.push(...Array.from(tableContext.getElementsByTagName("TR")));
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
const frameDoc =
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (frameDoc) {
rows.push(...Array.from(frameDoc.getElementsByTagName("TR")));
}
} catch (e) {
console.warn(
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
e
);
}
}
const processedRows = this.filterRowsBasedOnTag(rows, tableFields);
for (
let rowIndex = 0;
rowIndex < Math.min(processedRows.length, limit);
rowIndex++
) {
const record: ExtractedListData = {};
const currentRow = processedRows[rowIndex];
for (const [
label,
{ selector, attribute, cellIndex },
] of Object.entries(tableFields)) {
let element: Element | null = null;
if (cellIndex !== undefined && cellIndex >= 0) {
let td: Element | null =
currentRow.children[cellIndex] || null;
if (!td && currentRow.shadowRoot) {
const shadowCells = currentRow.shadowRoot.children;
if (shadowCells && shadowCells.length > cellIndex) {
td = shadowCells[cellIndex];
}
}
if (td) {
element = this.queryElement(td, selector);
if (
!element &&
selector
.split(/(?:>>|:>>)/)
.pop()
?.includes("td:nth-child")
) {
element = td;
}
if (!element) {
const tagOnlySelector = selector.split(".")[0];
element = this.queryElement(td, tagOnlySelector);
}
if (!element) {
let currentElement: Element | null = td;
while (
currentElement &&
currentElement.children.length > 0
) {
let foundContentChild = false;
for (const child of Array.from(
currentElement.children
)) {
if (this.extractValue(child, attribute)) {
currentElement = child;
foundContentChild = true;
break;
}
}
if (!foundContentChild) break;
}
element = currentElement;
}
}
} else {
element = this.queryElement(currentRow, selector);
}
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
console.warn(
`❌ No value for ${label} in row ${rowIndex + 1}`
);
record[label] = "";
}
} else {
console.warn(
`❌ Element not found for ${label} with selector:`,
selector
);
record[label] = "";
}
}
if (Object.values(record).some((value) => value !== "")) {
tableData.push(record);
}
}
}
}
}
// Extract non-table data
const nonTableData: ExtractedListData[] = [];
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
if (nonTableData.length >= limit) break;
const container = containers[containerIndex];
const { nonTableFields } = containerFields[containerIndex];
if (Object.keys(nonTableFields).length > 0) {
const record: ExtractedListData = {};
for (const [label, { selector, attribute }] of Object.entries(
nonTableFields
)) {
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = this.queryElement(container, relativeSelector);
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
console.warn(
`❌ No value for ${label} in container ${containerIndex + 1}`
);
record[label] = "";
}
} else {
console.warn(
`❌ Element not found for ${label} with selector:`,
selector
);
record[label] = ""; record[label] = "";
} }
} } else {
console.warn(` ❌ Element not found for "${label}"`);
if (Object.values(record).some((value) => value !== "")) { record[label] = "";
nonTableData.push(record);
} }
} }
}
// Combine and limit results // Step 5: Add record if it has any non-empty values
const extractedData = [...tableData, ...nonTableData].slice(0, limit); if (Object.values(record).some((value) => value !== "")) {
extractedData.push(record);
} else {
console.warn(
` ⚠️ Skipping empty record for container ${containerIndex + 1}`
);
}
}
return extractedData; return extractedData;
} catch (error) { } catch (error) {
console.error("Error in client-side extractListData:", error); console.error("💥 Error in client-side extractListData:", error);
return []; return [];
} }
}; };
/**
 * Rewrites a field XPath so that it targets one specific list container.
 *
 * If `childSelector` contains the list selector as a `//…` step sequence, the
 * first such occurrence is replaced by `(listSelector)[containerIndex]`.
 * Otherwise the child selector is appended to the indexed list selector with
 * its first `//` turned into `/`.
 *
 * @param childSelector - XPath of the field.
 * @param listSelector - XPath that matches every list container.
 * @param containerIndex - 1-based position of the container among the
 *                         `listSelector` matches (XPath positions start at 1).
 * @returns The indexed XPath.
 */
private createIndexedXPath(
  childSelector: string,
  listSelector: string,
  containerIndex: number
): string {
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;

  // The text that is actually substituted: the list selector with a single
  // leading "//".
  const listPattern = listSelector.replace("//", "");
  const needle = `//${listPattern}`;

  // Test for exactly what will be replaced. Checking only for `listPattern`
  // could succeed while the replacement found nothing, silently returning the
  // un-indexed selector (which then resolves to the first container's data
  // for every row).
  const matchStart = childSelector.indexOf(needle);
  if (matchStart !== -1) {
    // Splice by position instead of String.replace so that "$&", "$'", "$`"
    // or "$$" inside the XPath are not treated as replacement patterns.
    return (
      childSelector.slice(0, matchStart) +
      indexedListSelector +
      childSelector.slice(matchStart + needle.length)
    );
  }

  // Fallback: treat the child selector as relative to the indexed container.
  console.warn(`   ⚠️ Pattern doesn't match, using fallback approach`);
  return `${indexedListSelector}${childSelector.replace("//", "/")}`;
}
/**
 * Evaluates an XPath against a document and returns the first matching element.
 *
 * @param document - The document used both to evaluate the expression and as
 *                   its context node.
 * @param xpath - The XPath expression.
 * @returns The first matching element in document order, or `null` when
 *          nothing matches, the match is not an element (e.g. a text or
 *          attribute node), or evaluation throws.
 */
private evaluateXPathSingle = (
  document: Document,
  xpath: string
): Element | null => {
  try {
    const result = document.evaluate(
      xpath,
      document,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );

    const node = result.singleNodeValue;
    if (!node) {
      console.warn(`❌ XPath found no element for: ${xpath}`);
      return null;
    }

    // Compare nodeType rather than using `instanceof Element`: the node may
    // belong to another document (such as an iframe's), where `instanceof`
    // against this window's constructors is false.
    if (node.nodeType !== Node.ELEMENT_NODE) {
      console.warn(`❌ XPath matched a non-element node for: ${xpath}`);
      return null;
    }

    return node as Element;
  } catch (error) {
    console.error("❌ XPath evaluation failed:", xpath, error);
    return null;
  }
};
} }
export const clientListExtractor = new ClientListExtractor(); export const clientListExtractor = new ClientListExtractor();

File diff suppressed because it is too large Load Diff