Merge pull request #675 from getmaxun/smart-list

feat: better, faster, smarter capture list
This commit is contained in:
Rohit
2025-07-07 01:23:53 +05:30
committed by GitHub
9 changed files with 3079 additions and 2305 deletions

File diff suppressed because it is too large Load Diff

View File

@@ -243,7 +243,7 @@ export class RemoteBrowser {
scripts: [] as Array<{ src: string; content: string; type?: string }>,
media: [] as Array<{ src: string; dataUrl: string; type: string }>,
};
const viewport = (await this.currentPage?.viewportSize()) || {
width: 1280,
height: 720,
@@ -617,7 +617,7 @@ export class RemoteBrowser {
);
await this.context.addInitScript({ path: './server/src/browser-management/classes/rrweb-bundle.js' });
this.currentPage = await this.context.newPage();
await this.setupPageEventListeners(this.currentPage);
@@ -1286,129 +1286,181 @@ export class RemoteBrowser {
* @returns void
*/
public registerEditorEvents = (): void => {
// For each event, include userId to make sure events are handled for the correct browser
logger.log('debug', `Registering editor events for user: ${this.userId}`);
// For each event, include userId to make sure events are handled for the correct browser
logger.log("debug", `Registering editor events for user: ${this.userId}`);
this.socket.on(`captureDirectScreenshot:${this.userId}`, async (settings) => {
logger.debug(`Direct screenshot capture requested for user ${this.userId}`);
this.socket.on(
`captureDirectScreenshot:${this.userId}`,
async (settings) => {
logger.debug(
`Direct screenshot capture requested for user ${this.userId}`
);
await this.captureDirectScreenshot(settings);
});
}
);
// For backward compatibility
this.socket.on('captureDirectScreenshot', async (settings) => {
await this.captureDirectScreenshot(settings);
});
// Listen for specific events for this user
this.socket.on(`rerender:${this.userId}`, async () => {
logger.debug(`Rerender event received for user ${this.userId}`);
await this.makeAndEmitScreenshot();
});
// For backward compatibility, also listen to the general event
this.socket.on('rerender', async () => {
logger.debug(`General rerender event received, checking if for user ${this.userId}`);
await this.makeAndEmitScreenshot();
});
this.socket.on(`settings:${this.userId}`, (settings) => {
this.interpreterSettings = settings;
logger.debug(`Settings updated for user ${this.userId}`);
});
this.socket.on(`changeTab:${this.userId}`, async (tabIndex) => {
logger.debug(`Tab change to ${tabIndex} requested for user ${this.userId}`);
await this.changeTab(tabIndex);
});
this.socket.on(`addTab:${this.userId}`, async () => {
logger.debug(`New tab requested for user ${this.userId}`);
await this.currentPage?.context().newPage();
const lastTabIndex = this.currentPage ? this.currentPage.context().pages().length - 1 : 0;
await this.changeTab(lastTabIndex);
});
this.socket.on(`closeTab:${this.userId}`, async (tabInfo) => {
logger.debug(`Close tab ${tabInfo.index} requested for user ${this.userId}`);
const page = this.currentPage?.context().pages()[tabInfo.index];
if (page) {
if (tabInfo.isCurrent) {
if (this.currentPage?.context().pages()[tabInfo.index + 1]) {
// next tab
await this.changeTab(tabInfo.index + 1);
} else {
//previous tab
await this.changeTab(tabInfo.index - 1);
}
}
await page.close();
logger.log(
'debug',
`Tab ${tabInfo.index} was closed for user ${this.userId}, new tab count: ${this.currentPage?.context().pages().length}`
);
// For backward compatibility
this.socket.on("captureDirectScreenshot", async (settings) => {
await this.captureDirectScreenshot(settings);
});
// Listen for specific events for this user
this.socket.on(`rerender:${this.userId}`, async () => {
logger.debug(`Rerender event received for user ${this.userId}`);
if (this.renderingMode === "dom") {
await this.makeAndEmitDOMSnapshot();
} else {
await this.makeAndEmitScreenshot();
}
});
this.socket.on("rerender", async () => {
logger.debug(
`General rerender event received, checking if for user ${this.userId}`
);
if (this.renderingMode === "dom") {
await this.makeAndEmitDOMSnapshot();
} else {
await this.makeAndEmitScreenshot();
}
});
this.socket.on(`settings:${this.userId}`, (settings) => {
this.interpreterSettings = settings;
logger.debug(`Settings updated for user ${this.userId}`);
});
this.socket.on(`changeTab:${this.userId}`, async (tabIndex) => {
logger.debug(
`Tab change to ${tabIndex} requested for user ${this.userId}`
);
await this.changeTab(tabIndex);
});
this.socket.on(`addTab:${this.userId}`, async () => {
logger.debug(`New tab requested for user ${this.userId}`);
await this.currentPage?.context().newPage();
const lastTabIndex = this.currentPage
? this.currentPage.context().pages().length - 1
: 0;
await this.changeTab(lastTabIndex);
});
this.socket.on(`closeTab:${this.userId}`, async (tabInfo) => {
logger.debug(
`Close tab ${tabInfo.index} requested for user ${this.userId}`
);
const page = this.currentPage?.context().pages()[tabInfo.index];
if (page) {
if (tabInfo.isCurrent) {
if (this.currentPage?.context().pages()[tabInfo.index + 1]) {
// next tab
await this.changeTab(tabInfo.index + 1);
} else {
logger.log('error', `Tab index ${tabInfo.index} out of range for user ${this.userId}`);
//previous tab
await this.changeTab(tabInfo.index - 1);
}
});
this.socket.on(`setViewportSize:${this.userId}`, async (data: { width: number, height: number }) => {
const { width, height } = data;
logger.log('debug', `Viewport size change to width=${width}, height=${height} requested for user ${this.userId}`);
}
await page.close();
logger.log(
"debug",
`Tab ${tabInfo.index} was closed for user ${
this.userId
}, new tab count: ${this.currentPage?.context().pages().length}`
);
} else {
logger.log(
"error",
`Tab index ${tabInfo.index} out of range for user ${this.userId}`
);
}
});
// Update the browser context's viewport dynamically
if (this.context && this.browser) {
this.context = await this.browser.newContext({ viewport: { width, height } });
logger.log('debug', `Viewport size updated to width=${width}, height=${height} for user ${this.userId}`);
}
});
// For backward compatibility, also register the standard events
this.socket.on('settings', (settings) => this.interpreterSettings = settings);
this.socket.on('changeTab', async (tabIndex) => await this.changeTab(tabIndex));
this.socket.on('addTab', async () => {
await this.currentPage?.context().newPage();
const lastTabIndex = this.currentPage ? this.currentPage.context().pages().length - 1 : 0;
await this.changeTab(lastTabIndex);
});
this.socket.on('closeTab', async (tabInfo) => {
const page = this.currentPage?.context().pages()[tabInfo.index];
if (page) {
if (tabInfo.isCurrent) {
if (this.currentPage?.context().pages()[tabInfo.index + 1]) {
await this.changeTab(tabInfo.index + 1);
} else {
await this.changeTab(tabInfo.index - 1);
}
}
await page.close();
}
});
this.socket.on('setViewportSize', async (data: { width: number, height: number }) => {
const { width, height } = data;
if (this.context && this.browser) {
this.context = await this.browser.newContext({ viewport: { width, height } });
}
});
this.socket.on(
`setViewportSize:${this.userId}`,
async (data: { width: number; height: number }) => {
const { width, height } = data;
logger.log(
"debug",
`Viewport size change to width=${width}, height=${height} requested for user ${this.userId}`
);
this.socket.on('extractListData', async (data: {
listSelector: string,
fields: Record<string, any>,
currentListId: number,
pagination: any
// Update the browser context's viewport dynamically
if (this.context && this.browser) {
this.context = await this.browser.newContext({
viewport: { width, height },
});
logger.log(
"debug",
`Viewport size updated to width=${width}, height=${height} for user ${this.userId}`
);
}
}
);
// For backward compatibility, also register the standard events
this.socket.on(
"settings",
(settings) => (this.interpreterSettings = settings)
);
this.socket.on(
"changeTab",
async (tabIndex) => await this.changeTab(tabIndex)
);
this.socket.on("addTab", async () => {
await this.currentPage?.context().newPage();
const lastTabIndex = this.currentPage
? this.currentPage.context().pages().length - 1
: 0;
await this.changeTab(lastTabIndex);
});
this.socket.on("closeTab", async (tabInfo) => {
const page = this.currentPage?.context().pages()[tabInfo.index];
if (page) {
if (tabInfo.isCurrent) {
if (this.currentPage?.context().pages()[tabInfo.index + 1]) {
await this.changeTab(tabInfo.index + 1);
} else {
await this.changeTab(tabInfo.index - 1);
}
}
await page.close();
}
});
this.socket.on(
"setViewportSize",
async (data: { width: number; height: number }) => {
const { width, height } = data;
if (this.context && this.browser) {
this.context = await this.browser.newContext({
viewport: { width, height },
});
}
}
);
this.socket.on(
"extractListData",
async (data: {
listSelector: string;
fields: Record<string, any>;
currentListId: number;
pagination: any;
}) => {
if (this.currentPage) {
const extractedData = await this.extractListData(
this.currentPage,
data.listSelector,
data.fields
);
this.socket.emit('listDataExtracted', {
currentListId: data.currentListId,
data: extractedData
});
}
});
if (this.currentPage) {
const extractedData = await this.extractListData(
this.currentPage,
data.listSelector,
data.fields
);
this.socket.emit("listDataExtracted", {
currentListId: data.currentListId,
data: extractedData,
});
}
}
);
};
/**
* Subscribes the remote browser for a screencast session
@@ -1476,15 +1528,12 @@ export class RemoteBrowser {
this.isDOMStreamingActive = false;
}
}
/**
* CDP-based DOM snapshot creation using captured network resources
*/
public async makeAndEmitDOMSnapshot(): Promise<void> {
if (
!this.currentPage ||
!this.isDOMStreamingActive
) {
if (!this.currentPage || !this.isDOMStreamingActive) {
return;
}
@@ -1537,10 +1586,11 @@ export class RemoteBrowser {
if (typeof window.rrwebSnapshot === "undefined") {
throw new Error("rrweb-snapshot library not available");
}
return window.rrwebSnapshot.snapshot(document, {
inlineImages: true,
collectFonts: true,
});
return window.rrwebSnapshot.snapshot(document, {
inlineImages: true,
collectFonts: true,
});
});
// Process the snapshot to proxy resources
@@ -1557,10 +1607,12 @@ export class RemoteBrowser {
this.emitRRWebSnapshot(enhancedSnapshot);
} catch (error) {
// Handle navigation context destruction gracefully
if (error instanceof Error &&
(error.message.includes("Execution context was destroyed") ||
if (
error instanceof Error &&
(error.message.includes("Execution context was destroyed") ||
error.message.includes("most likely because of a navigation") ||
error.message.includes("Target closed"))) {
error.message.includes("Target closed"))
) {
logger.debug("DOM snapshot skipped due to page navigation or closure");
return; // Don't emit error for navigation - this is expected
}
@@ -1622,35 +1674,35 @@ export class RemoteBrowser {
* @returns {Promise<void>}
*/
public async switchOff(): Promise<void> {
try {
this.isScreencastActive = false;
this.isDOMStreamingActive = false;
try {
this.isScreencastActive = false;
this.isDOMStreamingActive = false;
await this.interpreter.stopInterpretation();
await this.interpreter.stopInterpretation();
if (this.screencastInterval) {
clearInterval(this.screencastInterval);
}
if (this.domUpdateInterval) {
clearInterval(this.domUpdateInterval);
}
if (this.client) {
await this.stopScreencast();
await this.stopDOM();
}
if (this.browser) {
await this.browser.close();
}
this.screenshotQueue = [];
//this.performanceMonitor.reset();
} catch (error) {
logger.error('Error during browser shutdown:', error);
if (this.screencastInterval) {
clearInterval(this.screencastInterval);
}
if (this.domUpdateInterval) {
clearInterval(this.domUpdateInterval);
}
if (this.client) {
await this.stopScreencast();
await this.stopDOM();
}
if (this.browser) {
await this.browser.close();
}
this.screenshotQueue = [];
//this.performanceMonitor.reset();
} catch (error) {
logger.error('Error during browser shutdown:', error);
}
}
private async optimizeScreenshot(screenshot: Buffer): Promise<Buffer> {
@@ -1772,6 +1824,7 @@ export class RemoteBrowser {
const page = this.currentPage?.context().pages()[tabIndex];
if (page) {
await this.stopScreencast();
await this.stopDOM();
this.currentPage = page;
await this.setupPageEventListeners(this.currentPage);
@@ -1783,8 +1836,13 @@ export class RemoteBrowser {
url: this.currentPage.url(),
userId: this.userId
});
await this.makeAndEmitScreenshot();
await this.subscribeToScreencast();
if (this.isDOMStreamingActive) {
await this.makeAndEmitDOMSnapshot();
await this.subscribeToDOM();
} else {
await this.makeAndEmitScreenshot();
await this.subscribeToScreencast();
}
} else {
logger.log('error', `${tabIndex} index out of range of pages`)
}

View File

@@ -464,7 +464,6 @@ export class WorkflowGenerator {
public onClick = async (coordinates: Coordinates, page: Page) => {
let where: WhereWhatPair["where"] = { url: this.getBestUrl(page.url()) };
const selector = await this.generateSelector(page, coordinates, ActionType.Click);
console.log("COOORDINATES: ", coordinates);
logger.log('debug', `Element's selector: ${selector}`);
const elementInfo = await getElementInformation(page, coordinates, '', false);
@@ -999,6 +998,7 @@ export class WorkflowGenerator {
rect,
selector: displaySelector,
elementInfo,
isDOMMode: this.isDOMMode,
// Include shadow DOM specific information
shadowInfo: elementInfo?.isShadowRoot ? {
mode: elementInfo.shadowRootMode,

File diff suppressed because it is too large Load Diff

View File

@@ -98,6 +98,7 @@ interface RRWebDOMBrowserRendererProps {
getList?: boolean;
getText?: boolean;
listSelector?: string | null;
cachedChildSelectors?: string[];
paginationMode?: boolean;
paginationType?: string;
limitMode?: boolean;
@@ -106,12 +107,14 @@ interface RRWebDOMBrowserRendererProps {
selector: string;
elementInfo: ElementInfo | null;
childSelectors?: string[];
groupInfo?: any;
}) => void;
onElementSelect?: (data: {
rect: DOMRect;
selector: string;
elementInfo: ElementInfo | null;
childSelectors?: string[];
groupInfo?: any;
}) => void;
onShowDatePicker?: (info: {
coordinates: { x: number; y: number };
@@ -144,6 +147,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
getList = false,
getText = false,
listSelector = null,
cachedChildSelectors = [],
paginationMode = false,
paginationType = "",
limitMode = false,
@@ -205,11 +209,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
const handleDOMHighlighting = useCallback(
(x: number, y: number, iframeDoc: Document) => {
try {
if (!getText && !getList) {
setCurrentHighlight(null);
if (onHighlight) {
onHighlight({
rect: new DOMRect(0, 0, 0, 0),
selector: "",
elementInfo: null,
});
}
return;
}
const highlighterData =
clientSelectorGenerator.generateDataForHighlighter(
{ x, y },
iframeDoc,
true
true,
cachedChildSelectors
);
if (!highlighterData) {
@@ -224,70 +241,40 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return;
}
const { rect, selector, elementInfo, childSelectors } = highlighterData;
const { rect, selector, elementInfo, childSelectors, groupInfo } =
highlighterData;
let shouldHighlight = false;
if (getList) {
if (listSelector) {
const hasValidChildSelectors =
Array.isArray(childSelectors) && childSelectors.length > 0;
// First phase: Allow any group to be highlighted for selection
if (!listSelector && groupInfo?.isGroupElement) {
shouldHighlight = true;
}
// Second phase: Show valid children within selected group
else if (listSelector) {
if (limitMode) {
shouldHighlight = false;
} else if (paginationMode) {
if (
paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType)
) {
shouldHighlight = true;
} else {
shouldHighlight = false;
}
} else if (childSelectors && childSelectors.includes(selector)) {
} else if (
paginationMode &&
paginationType !== "" &&
!["none", "scrollDown", "scrollUp"].includes(paginationType)
) {
shouldHighlight = true;
} else if (childSelectors && childSelectors.length > 0) {
console.log("✅ Child selectors present, highlighting enabled");
shouldHighlight = true;
} else if (elementInfo?.isIframeContent && childSelectors) {
const isIframeChild = childSelectors.some(
(childSelector: string) =>
selector.includes(":>>") &&
childSelector
.split(":>>")
.some((part) => selector.includes(part.trim()))
);
shouldHighlight = isIframeChild;
} else if (selector.includes(":>>") && hasValidChildSelectors) {
const selectorParts = selector
.split(":>>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else if (elementInfo?.isShadowRoot && childSelectors) {
const isShadowChild = childSelectors.some(
(childSelector: string) =>
selector.includes(">>") &&
childSelector
.split(">>")
.some((part) => selector.includes(part.trim()))
);
} else if (selector.includes(">>") && hasValidChildSelectors) {
const selectorParts = selector
.split(">>")
.map((part: string) => part.trim());
const isValidMixedSelector = selectorParts.some((part: any) =>
childSelectors!.some((childSelector) =>
childSelector.includes(part)
)
);
} else {
console.log("❌ No child selectors available");
shouldHighlight = false;
}
} else {
}
// No list selector - show regular highlighting
else {
shouldHighlight = true;
}
} else {
// getText mode - always highlight
shouldHighlight = true;
}
@@ -316,6 +303,7 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
},
selector,
childSelectors,
groupInfo,
});
}
}
@@ -335,9 +323,11 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
}
},
[
getText,
getList,
listSelector,
paginationMode,
cachedChildSelectors,
paginationType,
limitMode,
onHighlight,
@@ -363,6 +353,10 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
return;
}
if (!isInCaptureMode) {
return;
}
const now = performance.now();
if (now - lastMouseMoveTime.current < MOUSE_MOVE_THROTTLE) {
return;
@@ -401,11 +395,24 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
e.stopPropagation();
if (currentHighlight && onElementSelect) {
// Get the group info for the current highlight
const highlighterData =
clientSelectorGenerator.generateDataForHighlighter(
{ x: iframeX, y: iframeY },
iframeDoc,
true,
cachedChildSelectors
);
onElementSelect({
rect: currentHighlight.rect,
selector: currentHighlight.selector,
elementInfo: currentHighlight.elementInfo,
childSelectors: currentHighlight.childSelectors || [],
childSelectors:
cachedChildSelectors.length > 0
? cachedChildSelectors
: highlighterData?.childSelectors || [],
groupInfo: highlighterData?.groupInfo,
});
}
notifyLastAction("select element");
@@ -790,12 +797,41 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
rebuiltHTML = "<!DOCTYPE html>\n" + rebuiltHTML;
const additionalCSS = [];
if (snapshotData.resources.fonts?.length > 0) {
const fontCSS = snapshotData.resources.fonts
.map((font) => {
const format = font.format || "woff2";
return `
@font-face {
font-family: 'ProxiedFont-${
font.url.split("/").pop()?.split(".")[0] || "unknown"
}';
src: url("${font.dataUrl}") format("${format}");
font-display: swap;
}
`;
})
.join("\n");
additionalCSS.push(fontCSS);
}
if (snapshotData.resources.stylesheets?.length > 0) {
const externalCSS = snapshotData.resources.stylesheets
.map((stylesheet) => stylesheet.content)
.join("\n\n");
additionalCSS.push(externalCSS);
}
const enhancedCSS = `
/* rrweb rebuilt content styles */
html, body {
margin: 0 !important;
padding: 8px !important;
overflow-x: hidden !important;
margin: 0 !important;
padding: 8px !important;
font-family: system-ui, -apple-system, BlinkMacSystemFont, sans-serif !important;
background: white !important;
overflow-x: hidden !important;
}
html::-webkit-scrollbar,
@@ -818,12 +854,22 @@ export const DOMBrowserRenderer: React.FC<RRWebDOMBrowserRendererProps> = ({
scrollbar-width: none !important; /* Firefox */
-ms-overflow-style: none !important; /* Internet Explorer 10+ */
}
img {
max-width: 100% !important;
height: auto !important;
}
/* Make everything interactive */
* {
/* cursor takes a bare keyword; a quoted string is an invalid value and the whole declaration would be ignored */
cursor: pointer !important;
}
`;
/* Additional CSS from resources */
${additionalCSS.join("\n\n")}
`;
const headTagRegex = /<head[^>]*>/i;
const cssInjection = `

View File

@@ -22,6 +22,7 @@ import { useThemeMode } from '../../context/theme-provider';
import { useTranslation } from 'react-i18next';
import { useBrowserDimensionsStore } from '../../context/browserDimensions';
import { clientListExtractor } from '../../helpers/clientListExtractor';
import { clientSelectorGenerator } from '../../helpers/clientSelectorGenerator';
const fetchWorkflow = (id: string, callback: (response: WorkflowFile) => void) => {
getActiveWorkflow(id).then(
@@ -52,10 +53,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const [isCaptureTextConfirmed, setIsCaptureTextConfirmed] = useState(false);
const [isCaptureListConfirmed, setIsCaptureListConfirmed] = useState(false);
const { panelHeight } = useBrowserDimensionsStore();
const [isDOMMode, setIsDOMMode] = useState(false);
const [currentSnapshot, setCurrentSnapshot] = useState<any>(null);
const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId } = useGlobalInfoStore();
const { lastAction, notify, currentWorkflowActionsState, setCurrentWorkflowActionsState, resetInterpretationLog, currentListActionId, setCurrentListActionId, currentTextActionId, setCurrentTextActionId, currentScreenshotActionId, setCurrentScreenshotActionId, updateDOMMode, currentSnapshot, isDOMMode } = useGlobalInfoStore();
const {
getText, startGetText, stopGetText,
getList, startGetList, stopGetList,
@@ -86,22 +85,20 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
if (socket) {
const domModeHandler = (data: any) => {
if (!data.userId || data.userId === id) {
setIsDOMMode(true);
updateDOMMode(true);
}
};
const screenshotModeHandler = (data: any) => {
if (!data.userId || data.userId === id) {
setIsDOMMode(false);
setCurrentSnapshot(null);
updateDOMMode(false);
}
};
const domcastHandler = (data: any) => {
if (!data.userId || data.userId === id) {
if (data.snapshotData && data.snapshotData.snapshot) {
setCurrentSnapshot(data.snapshotData);
setIsDOMMode(true);
updateDOMMode(true, data.snapshotData);
}
}
};
@@ -116,7 +113,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
socket.off("domcast", domcastHandler);
};
}
}, [socket, id]);
}, [socket, id, updateDOMMode]);
useEffect(() => {
if (socket) {
@@ -214,7 +211,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
) => {
if (isDOMMode && currentSnapshot) {
try {
// Find the DOM iframe element
let iframeElement = document.querySelector(
"#dom-browser-iframe"
) as HTMLIFrameElement;
@@ -247,22 +243,42 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
return;
}
// Use client-side extraction
Object.entries(fields).forEach(([key, field]) => {
if (field.selectorObj?.selector) {
const isFieldXPath =
field.selectorObj.selector.startsWith("//") ||
field.selectorObj.selector.startsWith("/");
console.log(
`Field "${key}" selector:`,
field.selectorObj.selector,
`(XPath: ${isFieldXPath})`
);
}
});
const extractedData = clientListExtractor.extractListData(
iframeDoc,
listSelector,
fields,
5 // limit for preview
5
);
updateListStepData(currentListId, extractedData);
console.log("✅ UI extraction completed:");
if (extractedData.length === 0) {
console.warn(
"⚠️ No data extracted - this might indicate selector issues"
);
notify(
"warning",
"No data was extracted. Please verify your selections."
);
}
} catch (error) {
console.error("Error in client-side data extraction:", error);
notify("error", "Failed to extract data client-side");
}
} else {
// Fallback to socket-based extraction for screenshot mode
if (!socket) {
console.error("Socket not available for backend extraction");
return;
@@ -275,8 +291,6 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
currentListId,
pagination: { type: "", selector: "" },
});
console.log("📤 Sent extraction request to server");
} catch (error) {
console.error("Error in backend data extraction:", error);
}
@@ -443,6 +457,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog();
finishAction('text');
onFinishCapture();
clientSelectorGenerator.cleanup();
}, [stopGetText, getTextSettingsObject, socket, browserSteps, confirmedTextSteps, resetInterpretationLog, finishAction, notify, onFinishCapture, t]);
const getListSettingsObject = useCallback(() => {
@@ -494,6 +509,8 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
const stopCaptureAndEmitGetListSettings = useCallback(() => {
const settings = getListSettingsObject();
console.log("rrwebSnapshotHandler", settings);
const latestListStep = getLatestListStep(browserSteps);
if (latestListStep && settings) {
@@ -509,6 +526,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
resetInterpretationLog();
finishAction('list');
onFinishCapture();
clientSelectorGenerator.cleanup();
}, [getListSettingsObject, socket, notify, handleStopGetList, resetInterpretationLog, finishAction, onFinishCapture, t, browserSteps, extractDataClientSide]);
const hasUnconfirmedListTextFields = browserSteps.some(step =>
@@ -638,6 +656,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCurrentTextActionId('');
setIsCaptureTextConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_text_discarded'));
}, [currentTextActionId, browserSteps, stopGetText, deleteStepsByActionId, notify, t]);
@@ -668,6 +687,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
setCaptureStage('initial');
setCurrentListActionId('');
setIsCaptureListConfirmed(false);
clientSelectorGenerator.cleanup();
notify('error', t('right_panel.errors.capture_list_discarded'));
}, [currentListActionId, browserSteps, stopGetList, deleteStepsByActionId, resetListState, setShowPaginationOptions, setShowLimitOptions, setCaptureStage, notify, t]);
@@ -686,6 +706,7 @@ export const RightSidePanel: React.FC<RightSidePanelProps> = ({ onFinishCapture
stopGetScreenshot();
resetInterpretationLog();
finishAction('screenshot');
clientSelectorGenerator.cleanup();
onFinishCapture();
};

View File

@@ -27,6 +27,41 @@ interface ScheduleConfig {
cronExpression?: string;
}
/**
 * Shape of a processed DOM snapshot together with the page resources that
 * were captured alongside it. Held in the global store as `currentSnapshot`.
 */
interface ProcessedSnapshot {
  /** The serialized snapshot itself; left untyped here. */
  snapshot: any;
  /** Captured page resources, grouped by kind. */
  resources: {
    /** Stylesheets with their source location and text content. */
    stylesheets: { href: string; content: string; media?: string }[];
    /** Images with their original source and a data-URL copy. */
    images: { src: string; dataUrl: string; alt?: string }[];
    /** Fonts with their original URL, a data-URL copy and an optional format name. */
    fonts: { url: string; dataUrl: string; format?: string }[];
    /** Scripts with their source location and text content. */
    scripts: { src: string; content: string; type?: string }[];
    /** Other media with their original source, a data-URL copy and a type. */
    media: { src: string; dataUrl: string; type: string }[];
  };
  baseUrl: string;
  /** Viewport dimensions recorded with the snapshot. */
  viewport: { width: number; height: number };
  timestamp: number;
  /** Untyped processing statistics. */
  processingStats: any;
}
export interface RobotSettings {
id: string;
userId?: number;
@@ -86,6 +121,11 @@ interface GlobalInfo {
setCurrentListActionId: (actionId: string) => void;
currentScreenshotActionId: string;
setCurrentScreenshotActionId: (actionId: string) => void;
isDOMMode: boolean;
setIsDOMMode: (isDOMMode: boolean) => void;
currentSnapshot: ProcessedSnapshot | null;
setCurrentSnapshot: (snapshot: ProcessedSnapshot | null) => void;
updateDOMMode: (isDOMMode: boolean, snapshot?: ProcessedSnapshot | null) => void;
};
class GlobalInfoStore implements Partial<GlobalInfo> {
@@ -115,6 +155,8 @@ class GlobalInfoStore implements Partial<GlobalInfo> {
currentTextActionId = '';
currentListActionId = '';
currentScreenshotActionId = '';
isDOMMode = false;
currentSnapshot = null;
};
const globalInfoStore = new GlobalInfoStore();
@@ -141,6 +183,8 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
const [currentTextActionId, setCurrentTextActionId] = useState<string>('');
const [currentListActionId, setCurrentListActionId] = useState<string>('');
const [currentScreenshotActionId, setCurrentScreenshotActionId] = useState<string>('');
const [isDOMMode, setIsDOMMode] = useState<boolean>(globalInfoStore.isDOMMode);
const [currentSnapshot, setCurrentSnapshot] = useState<ProcessedSnapshot | null>(globalInfoStore.currentSnapshot);
const notify = (severity: 'error' | 'warning' | 'info' | 'success', message: string) => {
setNotification({ severity, message, isOpen: true });
@@ -165,6 +209,18 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
}, 100);
}
/**
 * Sets the DOM-mode flag and, optionally, the current snapshot in one call.
 *
 * - `snapshot` omitted (undefined): the stored snapshot is left as-is.
 * - `snapshot` given (including null): it replaces the stored snapshot.
 * - `mode` false: the stored snapshot is cleared afterwards regardless.
 */
const updateDOMMode = (mode: boolean, snapshot?: ProcessedSnapshot | null) => {
  setIsDOMMode(mode);

  if (typeof snapshot !== "undefined") {
    setCurrentSnapshot(snapshot);
  }

  // Leaving DOM mode always drops whatever snapshot is stored.
  if (mode === false) {
    setCurrentSnapshot(null);
  }
}
return (
<globalInfoContext.Provider
value={{
@@ -205,6 +261,11 @@ export const GlobalInfoProvider = ({ children }: { children: JSX.Element }) => {
setCurrentListActionId,
currentScreenshotActionId,
setCurrentScreenshotActionId,
isDOMMode,
setIsDOMMode,
currentSnapshot,
setCurrentSnapshot,
updateDOMMode,
}}
>
{children}

View File

@@ -15,30 +15,89 @@ interface ExtractedListData {
[key: string]: string;
}
interface TableField {
interface Field {
selector: string;
attribute: string;
tableContext?: string;
cellIndex?: number;
}
interface NonTableField {
selector: string;
attribute: string;
}
interface ContainerFields {
tableFields: Record<string, TableField>;
nonTableFields: Record<string, NonTableField>;
}
class ClientListExtractor {
/**
 * Evaluates an XPath expression with `rootElement` as the context node and
 * returns the first match in document order.
 *
 * @param rootElement Context node for the expression. When it is not itself
 *   a document, its `ownerDocument` performs the evaluation.
 * @param xpath XPath expression to evaluate.
 * @returns The first matching element, or null when there is no owning
 *   document, nothing matches, the first match is not an element node, or
 *   evaluation throws (the failure is logged as a warning).
 */
private evaluateXPath = (
  rootElement: Element | Document,
  xpath: string
): Element | null => {
  try {
    const ownerDoc =
      rootElement.nodeType === Node.DOCUMENT_NODE
        ? (rootElement as Document)
        : rootElement.ownerDocument;
    if (!ownerDoc) return null;

    const result = ownerDoc.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    );

    // An XPath can select text or attribute nodes; only hand back real
    // elements, matching the ELEMENT_NODE filter used by evaluateXPathAll.
    const node = result.singleNodeValue;
    if (node === null || node.nodeType !== Node.ELEMENT_NODE) {
      return null;
    }
    return node as Element;
  } catch (error) {
    console.warn("XPath evaluation failed:", xpath, error);
    return null;
  }
};
/**
 * Evaluates an XPath expression with `rootElement` as the context node and
 * collects every matching element in document order.
 *
 * @param rootElement Context node for the expression. When it is not itself
 *   a document, its `ownerDocument` performs the evaluation.
 * @param xpath XPath expression to evaluate.
 * @returns All matching element nodes (non-element matches are skipped);
 *   an empty array when there is no owning document or evaluation throws
 *   (the failure is logged as a warning).
 */
private evaluateXPathAll = (
  rootElement: Element | Document,
  xpath: string
): Element[] => {
  try {
    const contextDocument =
      rootElement.nodeType !== Node.DOCUMENT_NODE
        ? rootElement.ownerDocument
        : (rootElement as Document);
    if (!contextDocument) {
      return [];
    }

    const snapshot = contextDocument.evaluate(
      xpath,
      rootElement,
      null,
      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
      null
    );

    // Materialize the snapshot by index, then keep only element nodes.
    return Array.from({ length: snapshot.snapshotLength }, (_, index) =>
      snapshot.snapshotItem(index)
    ).filter(
      (candidate): candidate is Element =>
        candidate !== null && candidate.nodeType === Node.ELEMENT_NODE
    );
  } catch (error) {
    console.warn("XPath evaluation failed:", xpath, error);
    return [];
  }
};
private queryElement = (
rootElement: Element | Document,
selector: string
): Element | null => {
if (!selector.includes(">>") && !selector.includes(":>>")) {
return rootElement.querySelector(selector);
// Check if it's an XPath selector (starts with // or / or ./)
if (
selector.startsWith("//") ||
selector.startsWith("/") ||
selector.startsWith("./")
) {
return this.evaluateXPath(rootElement, selector);
} else {
return rootElement.querySelector(selector);
}
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
@@ -59,7 +118,17 @@ class ClientListExtractor {
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (!frameDoc) return null;
currentElement = frameDoc.querySelector(parts[i]);
// Handle XPath in iframe context
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
currentElement = this.evaluateXPath(frameDoc, parts[i]);
} else {
currentElement = frameDoc.querySelector(parts[i]);
}
continue;
} catch (e) {
console.warn(
@@ -75,7 +144,16 @@ class ClientListExtractor {
let nextElement: Element | null = null;
if ("querySelector" in currentElement) {
nextElement = currentElement.querySelector(parts[i]);
// Handle XPath vs CSS selector
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(currentElement, parts[i]);
} else {
nextElement = currentElement.querySelector(parts[i]);
}
}
if (
@@ -83,9 +161,20 @@ class ClientListExtractor {
"shadowRoot" in currentElement &&
(currentElement as Element).shadowRoot
) {
nextElement = (currentElement as Element).shadowRoot!.querySelector(
parts[i]
);
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
(currentElement as Element).shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = (currentElement as Element).shadowRoot!.querySelector(
parts[i]
);
}
}
if (!nextElement && "children" in currentElement) {
@@ -94,7 +183,18 @@ class ClientListExtractor {
);
for (const child of children) {
if (child.shadowRoot) {
nextElement = child.shadowRoot.querySelector(parts[i]);
if (
parts[i].startsWith("//") ||
parts[i].startsWith("/") ||
parts[i].startsWith("./")
) {
nextElement = this.evaluateXPath(
child.shadowRoot as unknown as Document,
parts[i]
);
} else {
nextElement = child.shadowRoot.querySelector(parts[i]);
}
if (nextElement) break;
}
}
@@ -111,7 +211,12 @@ class ClientListExtractor {
selector: string
): Element[] => {
if (!selector.includes(">>") && !selector.includes(":>>")) {
return Array.from(rootElement.querySelectorAll(selector));
// Check if it's an XPath selector (starts with // or /)
if (selector.startsWith("//") || selector.startsWith("/")) {
return this.evaluateXPathAll(rootElement, selector);
} else {
return Array.from(rootElement.querySelectorAll(selector));
}
}
const parts = selector.split(/(?:>>|:>>)/).map((part) => part.trim());
@@ -133,7 +238,14 @@ class ClientListExtractor {
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (frameDoc) {
nextElements.push(...Array.from(frameDoc.querySelectorAll(part)));
// Handle XPath in iframe context
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(frameDoc, part));
} else {
nextElements.push(
...Array.from(frameDoc.querySelectorAll(part))
);
}
}
} catch (e) {
console.warn(
@@ -146,24 +258,47 @@ class ClientListExtractor {
}
} else {
if ("querySelectorAll" in element) {
nextElements.push(...Array.from(element.querySelectorAll(part)));
// Handle XPath vs CSS selector
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(...this.evaluateXPathAll(element, part));
} else {
nextElements.push(...Array.from(element.querySelectorAll(part)));
}
}
if ("shadowRoot" in element && (element as Element).shadowRoot) {
nextElements.push(
...Array.from(
(element as Element).shadowRoot!.querySelectorAll(part)
)
);
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(
...this.evaluateXPathAll(
(element as Element).shadowRoot as unknown as Document,
part
)
);
} else {
nextElements.push(
...Array.from(
(element as Element).shadowRoot!.querySelectorAll(part)
)
);
}
}
if ("children" in element) {
const children = Array.from((element as Element).children || []);
for (const child of children) {
if (child.shadowRoot) {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
if (part.startsWith("//") || part.startsWith("/")) {
nextElements.push(
...this.evaluateXPathAll(
child.shadowRoot as unknown as Document,
part
)
);
} else {
nextElements.push(
...Array.from(child.shadowRoot.querySelectorAll(part))
);
}
}
}
}
@@ -193,35 +328,66 @@ class ClientListExtractor {
}
if (attribute === "innerText") {
return (element as HTMLElement).innerText?.trim() || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "src" || attribute === "href") {
if (attribute === "href" && element.tagName !== "A") {
const parentElement = element.parentElement;
if (parentElement && parentElement.tagName === "A") {
const parentHref = parentElement.getAttribute("href");
if (parentHref) {
try {
return new URL(parentHref, baseURL).href;
} catch (e) {
return parentHref;
}
// First try standard innerText/textContent
let textContent =
(element as HTMLElement).innerText?.trim() ||
(element as HTMLElement).textContent?.trim();
// If empty, check for common data attributes that might contain the text
if (!textContent) {
// Check for data-* attributes that commonly contain text values
const dataAttributes = [
"data-600",
"data-text",
"data-label",
"data-value",
"data-content",
];
for (const attr of dataAttributes) {
const dataValue = element.getAttribute(attr);
if (dataValue && dataValue.trim()) {
textContent = dataValue.trim();
break;
}
}
}
return textContent || null;
} else if (attribute === "innerHTML") {
return element.innerHTML?.trim() || null;
} else if (attribute === "href") {
// For href, we need to find the anchor tag if the current element isn't one
let anchorElement = element;
// If current element is not an anchor, look for parent anchor
if (element.tagName !== "A") {
anchorElement =
element.closest("a") ||
element.parentElement?.closest("a") ||
element;
}
const hrefValue = anchorElement.getAttribute("href");
if (!hrefValue || hrefValue.trim() === "") {
return null;
}
try {
return new URL(hrefValue, baseURL).href;
} catch (e) {
console.warn("Error creating URL from", hrefValue, e);
return hrefValue;
}
} else if (attribute === "src") {
const attrValue = element.getAttribute(attribute);
const dataAttr = attrValue || element.getAttribute("data-" + attribute);
if (!dataAttr || dataAttr.trim() === "") {
if (attribute === "src") {
const style = window.getComputedStyle(element as HTMLElement);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
const style = window.getComputedStyle(element as HTMLElement);
const bgImage = style.backgroundImage;
if (bgImage && bgImage !== "none") {
const matches = bgImage.match(/url\(['"]?([^'")]+)['"]?\)/);
return matches ? new URL(matches[1], baseURL).href : null;
}
return null;
}
@@ -236,187 +402,8 @@ class ClientListExtractor {
return element.getAttribute(attribute);
};
/**
 * Walks from `element` towards the document root looking for a table cell
 * (`TD`) or table row (`TR`).
 *
 * At most five counted steps are taken. A node that lives inside a shadow
 * tree is replaced by its shadow host before its own tag is inspected, and
 * that hop does not count as a step. An `IFRAME`/`FRAME` node is replaced by
 * the body of its content document.
 *
 * @param element - Node at which the search starts (it is inspected too).
 * @returns The matching node together with its tag name (`"TD"` or `"TR"`),
 *   or `null` when nothing is found within the step budget or the frame
 *   content cannot be read.
 */
private findTableAncestor = (
  element: Element
): { type: string; element: Element } | null => {
  const MAX_STEPS = 5;
  let node: Element | null = element;
  let steps = 0;

  while (node && steps < MAX_STEPS) {
    const root = node.getRootNode();
    if (root instanceof ShadowRoot) {
      // Leave the shadow tree through its host; not counted against MAX_STEPS
      node = root.host;
      continue;
    }

    const tag = node.tagName;
    if (tag === "TD" || tag === "TR") {
      return { type: tag, element: node };
    }

    if (tag === "IFRAME" || tag === "FRAME") {
      try {
        const frame = node as HTMLIFrameElement | HTMLFrameElement;
        node = frame.contentDocument?.body || null;
      } catch (e) {
        // Frame content is not accessible; give up on the whole search
        return null;
      }
    } else {
      node = node.parentElement;
    }
    steps += 1;
  }

  return null;
};
/**
 * Computes the position of a table cell.
 *
 * When the cell's root node is a shadow root, the result is the cell's index
 * among all `td` elements of that shadow root. Otherwise it is the number of
 * element siblings that precede the cell.
 *
 * @param td - The cell whose index is wanted.
 * @returns Zero-based index; `-1` if the cell is not found among the shadow
 *   root's `td` elements.
 */
private getCellIndex = (td: Element): number => {
  const root = td.getRootNode();
  if (root instanceof ShadowRoot) {
    const shadowCells = Array.from(root.querySelectorAll("td"));
    return shadowCells.indexOf(td as HTMLTableCellElement);
  }

  // Count preceding element siblings
  let position = 0;
  for (
    let previous = td.previousElementSibling;
    previous;
    previous = previous.previousElementSibling
  ) {
    position += 1;
  }
  return position;
};
/**
 * Reports whether any configured field resolves, inside `row`, to an element
 * that is a header cell (`TH`) or sits beneath one.
 *
 * For each field the selector is resolved with `queryElement` relative to
 * `row`, then the chain from the match up to (but excluding) `row` is
 * inspected. Nodes inside a shadow tree are swapped for their shadow host
 * before their tag is looked at; `IFRAME`/`FRAME` nodes are swapped for the
 * body of their content document.
 *
 * @param row - Row element the selectors are resolved against.
 * @param tableFields - Field definitions; only `selector` is read here.
 * @returns `true` as soon as a `TH` is met on any field's chain, else `false`.
 */
private hasThElement = (
  row: Element,
  tableFields: Record<string, TableField>
): boolean => {
  for (const { selector } of Object.values(tableFields)) {
    let node: Element | null = this.queryElement(row, selector);

    while (node && node !== row) {
      const root = node.getRootNode();
      if (root instanceof ShadowRoot) {
        node = root.host;
        continue;
      }

      const tag = node.tagName;
      if (tag === "TH") return true;

      if (tag === "IFRAME" || tag === "FRAME") {
        try {
          const frame = node as HTMLIFrameElement | HTMLFrameElement;
          node = frame.contentDocument?.body || null;
        } catch (e) {
          // Frame content unreadable: stop walking this field's chain
          break;
        }
      } else {
        node = node.parentElement;
      }
    }
  }

  return false;
};
/**
 * Decides whether header rows should be kept in the row set.
 *
 * If `hasThElement` is true for at least one row, the rows are returned
 * untouched. Otherwise every row that contains a `TH` descendant, or whose
 * own shadow root contains a `th`, is removed.
 *
 * @param rows - Candidate table rows.
 * @param tableFields - Field definitions forwarded to `hasThElement`.
 * @returns The original array or a filtered copy without header rows.
 */
private filterRowsBasedOnTag = (
  rows: Element[],
  tableFields: Record<string, TableField>
): Element[] => {
  const fieldsTargetHeader = rows.some((row) =>
    this.hasThElement(row, tableFields)
  );
  if (fieldsTargetHeader) {
    return rows;
  }

  return rows.filter((row) => {
    if (row.getElementsByTagName("TH").length !== 0) return false;
    const shadow = row.shadowRoot;
    return !shadow || shadow.querySelector("th") === null;
  });
};
/**
 * Computes the Jaccard similarity of two class-name lists: the number of
 * distinct names present in both, divided by the number of distinct names
 * present in either.
 *
 * @param classList1 - First list of class names (duplicates are ignored).
 * @param classList2 - Second list of class names (duplicates are ignored).
 * @returns A ratio in `[0, 1]`. When both lists are empty the result is `0`
 *   rather than the `NaN` that a plain `0 / 0` division would yield.
 */
private calculateClassSimilarity = (
  classList1: string[],
  classList2: string[]
): number => {
  const first = new Set(classList1);
  const second = new Set(classList2);

  let shared = 0;
  first.forEach((name) => {
    if (second.has(name)) shared += 1;
  });

  // |A ∪ B| = |A| + |B| - |A ∩ B|
  const unionSize = first.size + second.size - shared;
  if (unionSize === 0) return 0;

  return shared / unionSize;
};
/**
 * Collects elements that share `baseElement`'s tag name and whose class list
 * is similar enough to it.
 *
 * Candidates are gathered, in this order, from: `document`; the shadow host
 * of `baseElement` (only when `baseElement`'s root node is a shadow root);
 * and the content documents of every `iframe` and then every `frame` in
 * `document`. Frames whose content cannot be read are skipped with a warning.
 * Candidates are not de-duplicated.
 *
 * @param baseElement - Reference element; it is excluded from the result.
 * @param document - Document to search.
 * @param similarityThreshold - Minimum `calculateClassSimilarity` score for a
 *   candidate to be kept.
 * @returns Matching candidates; an empty array when `baseElement` has no
 *   classes.
 */
private findSimilarElements = (
  baseElement: Element,
  document: Document,
  similarityThreshold: number = 0.7
): Element[] => {
  const referenceClasses = Array.from(baseElement.classList);
  if (referenceClasses.length === 0) return [];

  const candidates: Element[] = Array.from(
    document.getElementsByTagName(baseElement.tagName)
  );

  const root = baseElement.getRootNode();
  if (root instanceof ShadowRoot) {
    candidates.push(
      ...Array.from(root.host.getElementsByTagName(baseElement.tagName))
    );
  }

  const frames = [
    ...Array.from(document.getElementsByTagName("iframe")),
    ...Array.from(document.getElementsByTagName("frame")),
  ];
  frames.forEach((frame) => {
    try {
      const frameElement = frame as HTMLIFrameElement | HTMLFrameElement;
      const frameDoc =
        frameElement.contentDocument || frameElement.contentWindow?.document;
      if (!frameDoc) return;
      candidates.push(
        ...Array.from(frameDoc.getElementsByTagName(baseElement.tagName))
      );
    } catch (e) {
      console.warn(
        `Cannot access ${frame.tagName.toLowerCase()} content:`,
        e
      );
    }
  });

  return candidates.filter(
    (candidate) =>
      candidate !== baseElement &&
      this.calculateClassSimilarity(
        referenceClasses,
        Array.from(candidate.classList)
      ) >= similarityThreshold
  );
};
private convertFields = (
fields: any
): Record<string, { selector: string; attribute: string }> => {
const convertedFields: Record<
string,
{ selector: string; attribute: string }
> = {};
private convertFields = (fields: any): Record<string, Field> => {
const convertedFields: Record<string, Field> = {};
for (const [key, field] of Object.entries(fields)) {
const typedField = field as TextStep;
@@ -439,285 +426,134 @@ class ClientListExtractor {
// Convert fields to the format expected by the extraction logic
const convertedFields = this.convertFields(fields);
// Get all container elements matching the list selector
let containers = this.queryElementAll(iframeDocument, listSelector);
// Step 1: Get all container elements matching the list selector
const containers = this.queryElementAll(iframeDocument, listSelector);
if (containers.length === 0) {
console.warn("No containers found for listSelector:", listSelector);
console.warn("No containers found for listSelector:", listSelector);
return [];
}
// Enhanced container discovery: find similar elements if we need more containers
if (limit > 1 && containers.length === 1) {
const baseContainer = containers[0];
const similarContainers = this.findSimilarElements(
baseContainer,
iframeDocument,
0.7
);
// Step 2: Extract data from each container up to the limit
const extractedData: ExtractedListData[] = [];
const containersToProcess = Math.min(containers.length, limit);
if (similarContainers.length > 0) {
const newContainers = similarContainers.filter(
(container) => !container.matches(listSelector)
);
containers = [...containers, ...newContainers];
}
}
for (
let containerIndex = 0;
containerIndex < containersToProcess;
containerIndex++
) {
const container = containers[containerIndex];
const record: ExtractedListData = {};
// Analyze fields for table vs non-table context
const containerFields: ContainerFields[] = containers.map(() => ({
tableFields: {},
nonTableFields: {},
}));
// Step 3: For each field, extract data from the current container
for (const [label, { selector, attribute }] of Object.entries(
convertedFields
)) {
let element: Element | null = null;
containers.forEach((container, containerIndex) => {
for (const [label, field] of Object.entries(convertedFields)) {
const sampleElement = this.queryElement(container, field.selector);
// CORRECT APPROACH: Create indexed absolute XPath
if (selector.startsWith("//")) {
// Convert the absolute selector to target the specific container instance
const indexedSelector = this.createIndexedXPath(
selector,
listSelector,
containerIndex + 1
);
if (sampleElement) {
const ancestor = this.findTableAncestor(sampleElement);
if (ancestor) {
containerFields[containerIndex].tableFields[label] = {
...field,
tableContext: ancestor.type,
cellIndex:
ancestor.type === "TD"
? this.getCellIndex(ancestor.element)
: -1,
};
} else {
containerFields[containerIndex].nonTableFields[label] = field;
}
element = this.evaluateXPathSingle(iframeDocument, indexedSelector);
} else {
containerFields[containerIndex].nonTableFields[label] = field;
// Fallback for non-XPath selectors
element = this.queryElement(container, selector);
}
}
});
// Extract table data
const tableData: ExtractedListData[] = [];
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
const container = containers[containerIndex];
const { tableFields } = containerFields[containerIndex];
if (Object.keys(tableFields).length > 0) {
const firstField = Object.values(tableFields)[0];
const firstElement = this.queryElement(
container,
firstField.selector
);
let tableContext: Element | null = firstElement;
// Find the table context
while (
tableContext &&
tableContext.tagName !== "TABLE" &&
tableContext !== container
) {
if (tableContext.getRootNode() instanceof ShadowRoot) {
tableContext = (tableContext.getRootNode() as ShadowRoot).host;
continue;
}
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
tableContext = frameElement.contentDocument?.body || null;
} catch (e) {
break;
}
// Step 4: Extract the value from the found element
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
tableContext = tableContext.parentElement;
}
}
if (tableContext) {
const rows: Element[] = [];
rows.push(...Array.from(tableContext.getElementsByTagName("TR")));
if (
tableContext.tagName === "IFRAME" ||
tableContext.tagName === "FRAME"
) {
try {
const frameElement = tableContext as
| HTMLIFrameElement
| HTMLFrameElement;
const frameDoc =
frameElement.contentDocument ||
frameElement.contentWindow?.document;
if (frameDoc) {
rows.push(...Array.from(frameDoc.getElementsByTagName("TR")));
}
} catch (e) {
console.warn(
`Cannot access ${tableContext.tagName.toLowerCase()} rows:`,
e
);
}
}
const processedRows = this.filterRowsBasedOnTag(rows, tableFields);
for (
let rowIndex = 0;
rowIndex < Math.min(processedRows.length, limit);
rowIndex++
) {
const record: ExtractedListData = {};
const currentRow = processedRows[rowIndex];
for (const [
label,
{ selector, attribute, cellIndex },
] of Object.entries(tableFields)) {
let element: Element | null = null;
if (cellIndex !== undefined && cellIndex >= 0) {
let td: Element | null =
currentRow.children[cellIndex] || null;
if (!td && currentRow.shadowRoot) {
const shadowCells = currentRow.shadowRoot.children;
if (shadowCells && shadowCells.length > cellIndex) {
td = shadowCells[cellIndex];
}
}
if (td) {
element = this.queryElement(td, selector);
if (
!element &&
selector
.split(/(?:>>|:>>)/)
.pop()
?.includes("td:nth-child")
) {
element = td;
}
if (!element) {
const tagOnlySelector = selector.split(".")[0];
element = this.queryElement(td, tagOnlySelector);
}
if (!element) {
let currentElement: Element | null = td;
while (
currentElement &&
currentElement.children.length > 0
) {
let foundContentChild = false;
for (const child of Array.from(
currentElement.children
)) {
if (this.extractValue(child, attribute)) {
currentElement = child;
foundContentChild = true;
break;
}
}
if (!foundContentChild) break;
}
element = currentElement;
}
}
} else {
element = this.queryElement(currentRow, selector);
}
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
console.warn(
`❌ No value for ${label} in row ${rowIndex + 1}`
);
record[label] = "";
}
} else {
console.warn(
`❌ Element not found for ${label} with selector:`,
selector
);
record[label] = "";
}
}
if (Object.values(record).some((value) => value !== "")) {
tableData.push(record);
}
}
}
}
}
// Extract non-table data
const nonTableData: ExtractedListData[] = [];
for (
let containerIndex = 0;
containerIndex < containers.length;
containerIndex++
) {
if (nonTableData.length >= limit) break;
const container = containers[containerIndex];
const { nonTableFields } = containerFields[containerIndex];
if (Object.keys(nonTableFields).length > 0) {
const record: ExtractedListData = {};
for (const [label, { selector, attribute }] of Object.entries(
nonTableFields
)) {
const relativeSelector = selector.split(/(?:>>|:>>)/).slice(-1)[0];
const element = this.queryElement(container, relativeSelector);
if (element) {
const value = this.extractValue(element, attribute);
if (value !== null && value !== "") {
record[label] = value;
} else {
console.warn(
`❌ No value for ${label} in container ${containerIndex + 1}`
);
record[label] = "";
}
} else {
console.warn(
`❌ Element not found for ${label} with selector:`,
selector
);
console.warn(` ⚠️ Empty value for "${label}"`);
record[label] = "";
}
}
if (Object.values(record).some((value) => value !== "")) {
nonTableData.push(record);
} else {
console.warn(` ❌ Element not found for "${label}"`);
record[label] = "";
}
}
}
// Combine and limit results
const extractedData = [...tableData, ...nonTableData].slice(0, limit);
// Step 5: Add record if it has any non-empty values
if (Object.values(record).some((value) => value !== "")) {
extractedData.push(record);
} else {
console.warn(
` ⚠️ Skipping empty record for container ${containerIndex + 1}`
);
}
}
return extractedData;
} catch (error) {
console.error("Error in client-side extractListData:", error);
console.error("💥 Error in client-side extractListData:", error);
return [];
}
};
/**
 * Rewrites a field XPath so that it targets one specific list container.
 *
 * The list selector, minus its first `//`, is treated as a pattern. When the
 * child selector contains that pattern, the first occurrence of
 * `//<pattern>` in the child selector is replaced by the indexed form
 * `(<listSelector>)[<containerIndex>]`. Otherwise a fallback is built by
 * appending the child selector (its first `//` turned into `/`) to the
 * indexed list selector.
 *
 * Note: if the pattern occurs in the child selector but never directly after
 * `//`, nothing is replaced and the child selector is returned unchanged.
 *
 * @param childSelector - XPath of the field inside the list.
 * @param listSelector - XPath matching all list containers.
 * @param containerIndex - Position of the wanted container, used verbatim as
 *   the XPath positional predicate.
 * @returns The rewritten XPath.
 */
private createIndexedXPath(
  childSelector: string,
  listSelector: string,
  containerIndex: number
): string {
  const listPattern = listSelector.replace("//", "");
  const indexedListSelector = `(${listSelector})[${containerIndex}]`;

  // Check if the child selector contains the list selector pattern
  if (childSelector.includes(listPattern)) {
    // A function replacer inserts the text verbatim. A plain string
    // replacement would interpret `$&`, `$'`, `` $` `` and `$$` sequences
    // that can legitimately occur inside XPath string literals.
    return childSelector.replace(
      `//${listPattern}`,
      () => indexedListSelector
    );
  }

  // If pattern doesn't match, create a more generic indexed selector
  console.warn(` ⚠️ Pattern doesn't match, using fallback approach`);
  return `${indexedListSelector}${childSelector.replace("//", "/")}`;
}
/**
 * Evaluates an XPath expression against a whole document and returns the
 * first matching node.
 *
 * @param document - Document used both as evaluator and as context node.
 * @param xpath - XPath expression to evaluate.
 * @returns The first match in document order, or `null` when nothing matches
 *   (a warning is logged) or evaluation throws (an error is logged).
 */
private evaluateXPathSingle = (
  document: Document,
  xpath: string
): Element | null => {
  try {
    const firstMatch = document.evaluate(
      xpath,
      document,
      null,
      XPathResult.FIRST_ORDERED_NODE_TYPE,
      null
    ).singleNodeValue as Element | null;

    if (firstMatch) {
      return firstMatch;
    }

    console.warn(`❌ XPath found no element for: ${xpath}`);
    return firstMatch;
  } catch (error) {
    console.error("❌ XPath evaluation failed:", xpath, error);
    return null;
  }
};
}
export const clientListExtractor = new ClientListExtractor();

File diff suppressed because it is too large Load Diff