Merge pull request #235 from getmaxun/rect-improve

feat: rect dom depth information
This commit is contained in:
Karishma Shukla
2024-12-09 21:18:09 +05:30
committed by GitHub
4 changed files with 140 additions and 61 deletions

View File

@@ -534,6 +534,8 @@ export default class Interpreter extends EventEmitter {
case 'clickNext':
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// console.log("Page results:", pageResults);
// Filter out already scraped items
const newResults = pageResults.filter(item => {
const uniqueKey = JSON.stringify(item);
@@ -541,9 +543,9 @@ export default class Interpreter extends EventEmitter {
scrapedItems.add(uniqueKey); // Mark as scraped
return true;
});
allResults = allResults.concat(newResults);
if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit);
}
@@ -553,7 +555,7 @@ export default class Interpreter extends EventEmitter {
return allResults; // No more pages to scrape
}
await Promise.all([
nextButton.click(),
nextButton.dispatchEvent('click'),
page.waitForNavigation({ waitUntil: 'networkidle' })
]);

View File

@@ -36,6 +36,7 @@
"fortawesome": "^0.0.1-security",
"google-auth-library": "^9.14.1",
"googleapis": "^144.0.0",
"idcac-playwright": "^0.1.3",
"ioredis": "^5.4.1",
"joi": "^17.6.0",
"jsonwebtoken": "^9.0.2",

View File

@@ -15,6 +15,7 @@ import { InterpreterSettings, RemoteBrowserOptions } from "../../types";
import { WorkflowGenerator } from "../../workflow-management/classes/Generator";
import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter";
import { getDecryptedProxyConfig } from '../../routes/proxy';
import { getInjectableScript } from 'idcac-playwright';
chromium.use(stealthPlugin());
@@ -168,6 +169,7 @@ export class RemoteBrowser {
this.currentPage.on('framenavigated', (frame) => {
if (frame === this.currentPage?.mainFrame()) {
this.currentPage.evaluate(getInjectableScript())
this.socket.emit('urlChanged', this.currentPage.url());
}
});
@@ -370,11 +372,12 @@ export class RemoteBrowser {
await this.stopScreencast();
this.currentPage = page;
// this.currentPage.on('framenavigated', (frame) => {
// if (frame === this.currentPage?.mainFrame()) {
// this.socket.emit('urlChanged', this.currentPage.url());
// }
// });
this.currentPage.on('framenavigated', (frame) => {
if (frame === this.currentPage?.mainFrame()) {
this.currentPage.evaluate(getInjectableScript());
this.socket.emit('urlChanged', this.currentPage.url());
}
});
//await this.currentPage.setViewportSize({ height: 400, width: 900 })
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
@@ -404,6 +407,7 @@ export class RemoteBrowser {
if (this.currentPage) {
this.currentPage.on('framenavigated', (frame) => {
if (frame === this.currentPage?.mainFrame()) {
this.currentPage.evaluate(getInjectableScript());
this.socket.emit('urlChanged', this.currentPage.url());
}
});

View File

@@ -20,49 +20,6 @@ type Workflow = WorkflowFile["workflow"];
* @category WorkflowManagement-Selectors
* @returns {Promise<Rectangle|undefined|null>}
*/
export const getRect = async (page: Page, coordinates: Coordinates) => {
  try {
    // The callback runs inside the browser page; only the plain { x, y } point is passed in.
    const rect = await page.evaluate(
      async ({ x, y }) => {
        const hit = document.elementFromPoint(x, y) as HTMLElement;
        if (!hit) {
          // Nothing is rendered at this point.
          return undefined;
        }

        // Match the logic in recorder.ts for link clicks: a click on a direct
        // child of an anchor is attributed to the anchor itself.
        const parent = hit.parentElement;
        const target = parent?.tagName === 'A' ? parent : hit;

        const box = target?.getBoundingClientRect();
        if (!box) {
          return undefined;
        }

        // Copy the fields into a plain object so the result can be serialized
        // back out of the page context.
        return {
          x: box.x,
          y: box.y,
          width: box.width,
          height: box.height,
          top: box.top,
          right: box.right,
          bottom: box.bottom,
          left: box.left,
        };
      },
      { x: coordinates.x, y: coordinates.y },
    );
    return rect;
  } catch (error) {
    // Failures are logged and swallowed; the caller receives undefined.
    const failure = error as Error;
    logger.log('error', `Error while retrieving selector: ${failure.message}`);
    logger.log('error', `Stack: ${failure.stack}`);
  }
};
/**
* Checks the basic info about an element and returns a {@link BaseActionInfo} object.
* If the element is not found, returns undefined.
* @param page The page instance.
* @param coordinates Coordinates of an element.
* @category WorkflowManagement-Selectors
* @returns {Promise<BaseActionInfo|undefined>}
*/
export const getElementInformation = async (
page: Page,
coordinates: Coordinates
@@ -70,10 +27,38 @@ export const getElementInformation = async (
try {
const elementInfo = await page.evaluate(
async ({ x, y }) => {
const el = document.elementFromPoint(x, y) as HTMLElement;
if (el) {
const { parentElement } = el;
const element = parentElement?.tagName === 'A' ? parentElement : el;
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
if (originalEl) {
let element = originalEl;
if (originalEl.tagName === 'A') {
element = originalEl;
} else if (originalEl.parentElement?.tagName === 'A') {
element = originalEl.parentElement;
} else {
// Generic parent finding logic based on visual containment
while (element.parentElement) {
const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();
// Check if parent visually contains the child
const fullyContained =
parentRect.left <= childRect.left &&
parentRect.right >= childRect.right &&
parentRect.top <= childRect.top &&
parentRect.bottom >= childRect.bottom;
// Additional checks for more comprehensive containment
const significantOverlap =
(childRect.width * childRect.height) /
(parentRect.width * parentRect.height) > 0.5;
if (fullyContained && significantOverlap) {
element = element.parentElement;
} else {
break;
}
} }
let info: {
tagName: string;
@@ -98,7 +83,7 @@ export const getElementInformation = async (
);
}
// Gather specific information based on the tag
// Existing tag-specific logic
if (element?.tagName === 'A') {
info.url = (element as HTMLAnchorElement).href;
info.innerText = element.innerText ?? '';
@@ -112,7 +97,6 @@ export const getElementInformation = async (
info.innerHTML = element.innerHTML;
info.outerHTML = element.outerHTML;
return info;
}
return null;
@@ -127,6 +111,67 @@ export const getElementInformation = async (
}
};
/**
 * Returns the bounding rectangle of the element that represents the given point on the page.
 *
 * The element under the point is resolved inside the page as follows:
 *  - if it is an anchor, the anchor itself is used;
 *  - if its direct parent is an anchor, that anchor is used;
 *  - otherwise the DOM is walked upwards for as long as the parent's box fully encloses the
 *    current element's box and the current element covers more than half of the parent's area.
 *
 * @param page The page instance.
 * @param coordinates Coordinates of the point to inspect.
 * @category WorkflowManagement-Selectors
 * @returns The rectangle of the resolved element as a plain object, or undefined when there is
 * no element at the point or the evaluation fails (the failure is logged, not rethrown).
 */
export const getRect = async (page: Page, coordinates: Coordinates) => {
  try {
    // The callback is serialized and executed in the page, so it must stay self-contained.
    const rect = await page.evaluate(
      ({ x, y }) => {
        const originalEl = document.elementFromPoint(x, y) as HTMLElement | null;
        if (!originalEl) {
          return undefined;
        }

        let element: HTMLElement = originalEl;
        if (originalEl.tagName !== 'A') {
          const directParent = originalEl.parentElement;
          if (directParent?.tagName === 'A') {
            // A click on a direct child of a link is attributed to the link.
            element = directParent;
          } else {
            // Climb while the parent is a tight visual wrapper around the current element.
            while (element.parentElement) {
              const parentRect = element.parentElement.getBoundingClientRect();
              const childRect = element.getBoundingClientRect();

              const fullyContained =
                parentRect.left <= childRect.left &&
                parentRect.right >= childRect.right &&
                parentRect.top <= childRect.top &&
                parentRect.bottom >= childRect.bottom;

              // A zero-area parent yields NaN here, which compares false and ends the climb.
              const significantOverlap =
                (childRect.width * childRect.height) /
                (parentRect.width * parentRect.height) > 0.5;

              if (!(fullyContained && significantOverlap)) {
                break;
              }
              element = element.parentElement;
            }
          }
        }

        // Copy into a plain object so the result survives serialization out of the page.
        const rectangle = element.getBoundingClientRect();
        return {
          x: rectangle.x,
          y: rectangle.y,
          width: rectangle.width,
          height: rectangle.height,
          top: rectangle.top,
          right: rectangle.right,
          bottom: rectangle.bottom,
          left: rectangle.left,
        };
      },
      { x: coordinates.x, y: coordinates.y },
    );
    return rect;
  } catch (error) {
    // Normalize the caught value first: a null/undefined rejection must not make the handler throw.
    const failure = error instanceof Error ? error : new Error(String(error));
    logger.log('error', `Error while retrieving rectangle: ${failure.message}`);
    logger.log('error', `Stack: ${failure.stack}`);
    return undefined;
  }
};
/**
* Returns the best and unique css {@link Selectors} for the element on the page.
@@ -742,7 +787,6 @@ interface SelectorResult {
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise<SelectorResult> => {
try {
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase();
@@ -774,8 +818,37 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
return path.join(' > ');
}
const element = document.elementFromPoint(x, y) as HTMLElement | null;
if (!element) return null;
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
if (!originalEl) return null;
let element = originalEl;
if (originalEl.tagName === 'A') {
element = originalEl;
} else if (originalEl.parentElement?.tagName === 'A') {
element = originalEl.parentElement;
} else {
while (element.parentElement) {
const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();
const fullyContained =
parentRect.left <= childRect.left &&
parentRect.right >= childRect.right &&
parentRect.top <= childRect.top &&
parentRect.bottom >= childRect.bottom;
const significantOverlap =
(childRect.width * childRect.height) /
(parentRect.width * parentRect.height) > 0.5;
if (fullyContained && significantOverlap) {
element = element.parentElement;
} else {
break;
}
}
}
const generalSelector = getSelectorPath(element);
return {
@@ -790,7 +863,6 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
}
};
export const getChildSelectors = async (page: Page, parentSelector: string): Promise<string[]> => {
try {
const childSelectors = await page.evaluate((parentSelector: string) => {