Merge pull request #235 from getmaxun/rect-improve

feat: rect dom depth information
This commit is contained in:
Karishma Shukla
2024-12-09 21:18:09 +05:30
committed by GitHub
4 changed files with 140 additions and 61 deletions

View File

@@ -534,6 +534,8 @@ export default class Interpreter extends EventEmitter {
case 'clickNext': case 'clickNext':
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config); const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
// console.log("Page results:", pageResults);
// Filter out already scraped items // Filter out already scraped items
const newResults = pageResults.filter(item => { const newResults = pageResults.filter(item => {
const uniqueKey = JSON.stringify(item); const uniqueKey = JSON.stringify(item);
@@ -541,9 +543,9 @@ export default class Interpreter extends EventEmitter {
scrapedItems.add(uniqueKey); // Mark as scraped scrapedItems.add(uniqueKey); // Mark as scraped
return true; return true;
}); });
allResults = allResults.concat(newResults); allResults = allResults.concat(newResults);
if (config.limit && allResults.length >= config.limit) { if (config.limit && allResults.length >= config.limit) {
return allResults.slice(0, config.limit); return allResults.slice(0, config.limit);
} }
@@ -553,7 +555,7 @@ export default class Interpreter extends EventEmitter {
return allResults; // No more pages to scrape return allResults; // No more pages to scrape
} }
await Promise.all([ await Promise.all([
nextButton.click(), nextButton.dispatchEvent('click'),
page.waitForNavigation({ waitUntil: 'networkidle' }) page.waitForNavigation({ waitUntil: 'networkidle' })
]); ]);

View File

@@ -36,6 +36,7 @@
"fortawesome": "^0.0.1-security", "fortawesome": "^0.0.1-security",
"google-auth-library": "^9.14.1", "google-auth-library": "^9.14.1",
"googleapis": "^144.0.0", "googleapis": "^144.0.0",
"idcac-playwright": "^0.1.3",
"ioredis": "^5.4.1", "ioredis": "^5.4.1",
"joi": "^17.6.0", "joi": "^17.6.0",
"jsonwebtoken": "^9.0.2", "jsonwebtoken": "^9.0.2",

View File

@@ -15,6 +15,7 @@ import { InterpreterSettings, RemoteBrowserOptions } from "../../types";
import { WorkflowGenerator } from "../../workflow-management/classes/Generator"; import { WorkflowGenerator } from "../../workflow-management/classes/Generator";
import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter"; import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter";
import { getDecryptedProxyConfig } from '../../routes/proxy'; import { getDecryptedProxyConfig } from '../../routes/proxy';
import { getInjectableScript } from 'idcac-playwright';
chromium.use(stealthPlugin()); chromium.use(stealthPlugin());
@@ -168,6 +169,7 @@ export class RemoteBrowser {
this.currentPage.on('framenavigated', (frame) => { this.currentPage.on('framenavigated', (frame) => {
if (frame === this.currentPage?.mainFrame()) { if (frame === this.currentPage?.mainFrame()) {
this.currentPage.evaluate(getInjectableScript())
this.socket.emit('urlChanged', this.currentPage.url()); this.socket.emit('urlChanged', this.currentPage.url());
} }
}); });
@@ -370,11 +372,12 @@ export class RemoteBrowser {
await this.stopScreencast(); await this.stopScreencast();
this.currentPage = page; this.currentPage = page;
// this.currentPage.on('framenavigated', (frame) => { this.currentPage.on('framenavigated', (frame) => {
// if (frame === this.currentPage?.mainFrame()) { if (frame === this.currentPage?.mainFrame()) {
// this.socket.emit('urlChanged', this.currentPage.url()); this.currentPage.evaluate(getInjectableScript());
// } this.socket.emit('urlChanged', this.currentPage.url());
// }); }
});
//await this.currentPage.setViewportSize({ height: 400, width: 900 }) //await this.currentPage.setViewportSize({ height: 400, width: 900 })
this.client = await this.currentPage.context().newCDPSession(this.currentPage); this.client = await this.currentPage.context().newCDPSession(this.currentPage);
@@ -404,6 +407,7 @@ export class RemoteBrowser {
if (this.currentPage) { if (this.currentPage) {
this.currentPage.on('framenavigated', (frame) => { this.currentPage.on('framenavigated', (frame) => {
if (frame === this.currentPage?.mainFrame()) { if (frame === this.currentPage?.mainFrame()) {
this.currentPage.evaluate(getInjectableScript());
this.socket.emit('urlChanged', this.currentPage.url()); this.socket.emit('urlChanged', this.currentPage.url());
} }
}); });

View File

@@ -20,49 +20,6 @@ type Workflow = WorkflowFile["workflow"];
* @category WorkflowManagement-Selectors * @category WorkflowManagement-Selectors
* @returns {Promise<Rectangle|undefined|null>} * @returns {Promise<Rectangle|undefined|null>}
*/ */
export const getRect = async (page: Page, coordinates: Coordinates) => {
  try {
    // The callback runs in the browser context, so it can only see the
    // serialised point handed over as the second argument.
    return await page.evaluate(
      async (point) => {
        const hit = document.elementFromPoint(point.x, point.y) as HTMLElement;
        if (!hit) {
          return undefined;
        }

        // Mirror recorder.ts: when the hit element sits directly inside a
        // link, the link itself is the element of interest.
        let target: HTMLElement = hit;
        if (hit.parentElement?.tagName === 'A') {
          target = hit.parentElement;
        }

        const box = target?.getBoundingClientRect();
        if (!box) {
          return undefined;
        }

        // Copy the geometry into a plain object so it survives the trip
        // back out of the page.
        return {
          x: box.x,
          y: box.y,
          width: box.width,
          height: box.height,
          top: box.top,
          right: box.right,
          bottom: box.bottom,
          left: box.left,
        };
      },
      { x: coordinates.x, y: coordinates.y },
    );
  } catch (error) {
    // Failures are logged and the function resolves to undefined.
    const failure = error as Error;
    logger.log('error', `Error while retrieving selector: ${failure.message}`);
    logger.log('error', `Stack: ${failure.stack}`);
  }
}
/**
* Checks the basic info about an element and returns a {@link BaseActionInfo} object.
* If the element is not found, returns undefined.
* @param page The page instance.
* @param coordinates Coordinates of an element.
* @category WorkflowManagement-Selectors
* @returns {Promise<BaseActionInfo|undefined>}
*/
export const getElementInformation = async ( export const getElementInformation = async (
page: Page, page: Page,
coordinates: Coordinates coordinates: Coordinates
@@ -70,10 +27,38 @@ export const getElementInformation = async (
try { try {
const elementInfo = await page.evaluate( const elementInfo = await page.evaluate(
async ({ x, y }) => { async ({ x, y }) => {
const el = document.elementFromPoint(x, y) as HTMLElement; const originalEl = document.elementFromPoint(x, y) as HTMLElement;
if (el) { if (originalEl) {
const { parentElement } = el; let element = originalEl;
const element = parentElement?.tagName === 'A' ? parentElement : el;
if (originalEl.tagName === 'A') {
element = originalEl;
} else if (originalEl.parentElement?.tagName === 'A') {
element = originalEl.parentElement;
} else {
// Generic parent finding logic based on visual containment
while (element.parentElement) {
const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();
// Check if parent visually contains the child
const fullyContained =
parentRect.left <= childRect.left &&
parentRect.right >= childRect.right &&
parentRect.top <= childRect.top &&
parentRect.bottom >= childRect.bottom;
// Additional checks for more comprehensive containment
const significantOverlap =
(childRect.width * childRect.height) /
(parentRect.width * parentRect.height) > 0.5;
if (fullyContained && significantOverlap) {
element = element.parentElement;
} else {
break;
}
} }
let info: { let info: {
tagName: string; tagName: string;
@@ -98,7 +83,7 @@ export const getElementInformation = async (
); );
} }
// Gather specific information based on the tag // Existing tag-specific logic
if (element?.tagName === 'A') { if (element?.tagName === 'A') {
info.url = (element as HTMLAnchorElement).href; info.url = (element as HTMLAnchorElement).href;
info.innerText = element.innerText ?? ''; info.innerText = element.innerText ?? '';
@@ -112,7 +97,6 @@ export const getElementInformation = async (
info.innerHTML = element.innerHTML; info.innerHTML = element.innerHTML;
info.outerHTML = element.outerHTML; info.outerHTML = element.outerHTML;
return info; return info;
} }
return null; return null;
@@ -127,6 +111,67 @@ export const getElementInformation = async (
} }
}; };
/**
 * Resolves the bounding rectangle of the element found at the given
 * coordinates of the page.
 *
 * An anchor under the point (or an anchor that is the direct parent of the
 * hit element) is used as-is. Otherwise the hit element is widened to its
 * ancestors for as long as each ancestor fully encloses the current element
 * and the current element covers more than half of that ancestor's area.
 *
 * @param page The page instance.
 * @param coordinates Coordinates of an element.
 * @returns A plain object with the rectangle geometry, or undefined when no
 * element is found at the point or when the evaluation throws (the error is
 * logged).
 */
export const getRect = async (page: Page, coordinates: Coordinates) => {
  try {
    // The callback runs in the browser context, so it can only see the
    // serialised point handed over as the second argument.
    return await page.evaluate(
      async (point) => {
        const hit = document.elementFromPoint(point.x, point.y) as HTMLElement;
        if (!hit) {
          return undefined;
        }

        let target = hit;
        if (hit.tagName !== 'A') {
          if (hit.parentElement?.tagName === 'A') {
            // A direct link parent wins over any containment heuristics.
            target = hit.parentElement;
          } else {
            // Climb while the ancestor visually wraps the current element.
            for (
              let ancestor = target.parentElement;
              ancestor;
              ancestor = target.parentElement
            ) {
              const outer = ancestor.getBoundingClientRect();
              const inner = target.getBoundingClientRect();

              const enclosed =
                outer.left <= inner.left &&
                outer.right >= inner.right &&
                outer.top <= inner.top &&
                outer.bottom >= inner.bottom;

              // Share of the ancestor's area taken up by the current element.
              const coverage =
                (inner.width * inner.height) / (outer.width * outer.height);

              if (!(enclosed && coverage > 0.5)) {
                break;
              }
              target = ancestor;
            }
          }
        }

        const box = target?.getBoundingClientRect();
        if (!box) {
          return undefined;
        }

        // Copy the geometry into a plain object so it survives the trip
        // back out of the page.
        return {
          x: box.x,
          y: box.y,
          width: box.width,
          height: box.height,
          top: box.top,
          right: box.right,
          bottom: box.bottom,
          left: box.left,
        };
      },
      { x: coordinates.x, y: coordinates.y },
    );
  } catch (error) {
    // Failures are logged and the function resolves to undefined.
    const failure = error as Error;
    logger.log('error', `Error while retrieving selector: ${failure.message}`);
    logger.log('error', `Stack: ${failure.stack}`);
  }
}
/** /**
* Returns the best and unique css {@link Selectors} for the element on the page. * Returns the best and unique css {@link Selectors} for the element on the page.
@@ -742,7 +787,6 @@ interface SelectorResult {
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise<SelectorResult> => { export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise<SelectorResult> => {
try { try {
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => { const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
function getNonUniqueSelector(element: HTMLElement): string { function getNonUniqueSelector(element: HTMLElement): string {
let selector = element.tagName.toLowerCase(); let selector = element.tagName.toLowerCase();
@@ -774,8 +818,37 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
return path.join(' > '); return path.join(' > ');
} }
const element = document.elementFromPoint(x, y) as HTMLElement | null; const originalEl = document.elementFromPoint(x, y) as HTMLElement;
if (!element) return null; if (!originalEl) return null;
let element = originalEl;
if (originalEl.tagName === 'A') {
element = originalEl;
} else if (originalEl.parentElement?.tagName === 'A') {
element = originalEl.parentElement;
} else {
while (element.parentElement) {
const parentRect = element.parentElement.getBoundingClientRect();
const childRect = element.getBoundingClientRect();
const fullyContained =
parentRect.left <= childRect.left &&
parentRect.right >= childRect.right &&
parentRect.top <= childRect.top &&
parentRect.bottom >= childRect.bottom;
const significantOverlap =
(childRect.width * childRect.height) /
(parentRect.width * parentRect.height) > 0.5;
if (fullyContained && significantOverlap) {
element = element.parentElement;
} else {
break;
}
}
}
const generalSelector = getSelectorPath(element); const generalSelector = getSelectorPath(element);
return { return {
@@ -790,7 +863,6 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
} }
}; };
export const getChildSelectors = async (page: Page, parentSelector: string): Promise<string[]> => { export const getChildSelectors = async (page: Page, parentSelector: string): Promise<string[]> => {
try { try {
const childSelectors = await page.evaluate((parentSelector: string) => { const childSelectors = await page.evaluate((parentSelector: string) => {