Merge pull request #235 from getmaxun/rect-improve
feat: rect dom depth information
This commit is contained in:
@@ -534,6 +534,8 @@ export default class Interpreter extends EventEmitter {
|
|||||||
case 'clickNext':
|
case 'clickNext':
|
||||||
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
||||||
|
|
||||||
|
// console.log("Page results:", pageResults);
|
||||||
|
|
||||||
// Filter out already scraped items
|
// Filter out already scraped items
|
||||||
const newResults = pageResults.filter(item => {
|
const newResults = pageResults.filter(item => {
|
||||||
const uniqueKey = JSON.stringify(item);
|
const uniqueKey = JSON.stringify(item);
|
||||||
@@ -541,9 +543,9 @@ export default class Interpreter extends EventEmitter {
|
|||||||
scrapedItems.add(uniqueKey); // Mark as scraped
|
scrapedItems.add(uniqueKey); // Mark as scraped
|
||||||
return true;
|
return true;
|
||||||
});
|
});
|
||||||
|
|
||||||
allResults = allResults.concat(newResults);
|
allResults = allResults.concat(newResults);
|
||||||
|
|
||||||
if (config.limit && allResults.length >= config.limit) {
|
if (config.limit && allResults.length >= config.limit) {
|
||||||
return allResults.slice(0, config.limit);
|
return allResults.slice(0, config.limit);
|
||||||
}
|
}
|
||||||
@@ -553,7 +555,7 @@ export default class Interpreter extends EventEmitter {
|
|||||||
return allResults; // No more pages to scrape
|
return allResults; // No more pages to scrape
|
||||||
}
|
}
|
||||||
await Promise.all([
|
await Promise.all([
|
||||||
nextButton.click(),
|
nextButton.dispatchEvent('click'),
|
||||||
page.waitForNavigation({ waitUntil: 'networkidle' })
|
page.waitForNavigation({ waitUntil: 'networkidle' })
|
||||||
]);
|
]);
|
||||||
|
|
||||||
|
|||||||
@@ -36,6 +36,7 @@
|
|||||||
"fortawesome": "^0.0.1-security",
|
"fortawesome": "^0.0.1-security",
|
||||||
"google-auth-library": "^9.14.1",
|
"google-auth-library": "^9.14.1",
|
||||||
"googleapis": "^144.0.0",
|
"googleapis": "^144.0.0",
|
||||||
|
"idcac-playwright": "^0.1.3",
|
||||||
"ioredis": "^5.4.1",
|
"ioredis": "^5.4.1",
|
||||||
"joi": "^17.6.0",
|
"joi": "^17.6.0",
|
||||||
"jsonwebtoken": "^9.0.2",
|
"jsonwebtoken": "^9.0.2",
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ import { InterpreterSettings, RemoteBrowserOptions } from "../../types";
|
|||||||
import { WorkflowGenerator } from "../../workflow-management/classes/Generator";
|
import { WorkflowGenerator } from "../../workflow-management/classes/Generator";
|
||||||
import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter";
|
import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter";
|
||||||
import { getDecryptedProxyConfig } from '../../routes/proxy';
|
import { getDecryptedProxyConfig } from '../../routes/proxy';
|
||||||
|
import { getInjectableScript } from 'idcac-playwright';
|
||||||
chromium.use(stealthPlugin());
|
chromium.use(stealthPlugin());
|
||||||
|
|
||||||
|
|
||||||
@@ -168,6 +169,7 @@ export class RemoteBrowser {
|
|||||||
|
|
||||||
this.currentPage.on('framenavigated', (frame) => {
|
this.currentPage.on('framenavigated', (frame) => {
|
||||||
if (frame === this.currentPage?.mainFrame()) {
|
if (frame === this.currentPage?.mainFrame()) {
|
||||||
|
this.currentPage.evaluate(getInjectableScript())
|
||||||
this.socket.emit('urlChanged', this.currentPage.url());
|
this.socket.emit('urlChanged', this.currentPage.url());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
@@ -370,11 +372,12 @@ export class RemoteBrowser {
|
|||||||
await this.stopScreencast();
|
await this.stopScreencast();
|
||||||
this.currentPage = page;
|
this.currentPage = page;
|
||||||
|
|
||||||
// this.currentPage.on('framenavigated', (frame) => {
|
this.currentPage.on('framenavigated', (frame) => {
|
||||||
// if (frame === this.currentPage?.mainFrame()) {
|
if (frame === this.currentPage?.mainFrame()) {
|
||||||
// this.socket.emit('urlChanged', this.currentPage.url());
|
this.currentPage.evaluate(getInjectableScript());
|
||||||
// }
|
this.socket.emit('urlChanged', this.currentPage.url());
|
||||||
// });
|
}
|
||||||
|
});
|
||||||
|
|
||||||
//await this.currentPage.setViewportSize({ height: 400, width: 900 })
|
//await this.currentPage.setViewportSize({ height: 400, width: 900 })
|
||||||
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
|
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
|
||||||
@@ -404,6 +407,7 @@ export class RemoteBrowser {
|
|||||||
if (this.currentPage) {
|
if (this.currentPage) {
|
||||||
this.currentPage.on('framenavigated', (frame) => {
|
this.currentPage.on('framenavigated', (frame) => {
|
||||||
if (frame === this.currentPage?.mainFrame()) {
|
if (frame === this.currentPage?.mainFrame()) {
|
||||||
|
this.currentPage.evaluate(getInjectableScript());
|
||||||
this.socket.emit('urlChanged', this.currentPage.url());
|
this.socket.emit('urlChanged', this.currentPage.url());
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -20,49 +20,6 @@ type Workflow = WorkflowFile["workflow"];
|
|||||||
* @category WorkflowManagement-Selectors
|
* @category WorkflowManagement-Selectors
|
||||||
* @returns {Promise<Rectangle|undefined|null>}
|
* @returns {Promise<Rectangle|undefined|null>}
|
||||||
*/
|
*/
|
||||||
export const getRect = async (page: Page, coordinates: Coordinates) => {
|
|
||||||
try {
|
|
||||||
const rect = await page.evaluate(
|
|
||||||
async ({ x, y }) => {
|
|
||||||
const el = document.elementFromPoint(x, y) as HTMLElement;
|
|
||||||
if (el) {
|
|
||||||
const { parentElement } = el;
|
|
||||||
// Match the logic in recorder.ts for link clicks
|
|
||||||
const element = parentElement?.tagName === 'A' ? parentElement : el;
|
|
||||||
const rectangle = element?.getBoundingClientRect();
|
|
||||||
// @ts-ignore
|
|
||||||
if (rectangle) {
|
|
||||||
return {
|
|
||||||
x: rectangle.x,
|
|
||||||
y: rectangle.y,
|
|
||||||
width: rectangle.width,
|
|
||||||
height: rectangle.height,
|
|
||||||
top: rectangle.top,
|
|
||||||
right: rectangle.right,
|
|
||||||
bottom: rectangle.bottom,
|
|
||||||
left: rectangle.left,
|
|
||||||
};
|
|
||||||
}
|
|
||||||
}
|
|
||||||
},
|
|
||||||
{ x: coordinates.x, y: coordinates.y },
|
|
||||||
);
|
|
||||||
return rect;
|
|
||||||
} catch (error) {
|
|
||||||
const { message, stack } = error as Error;
|
|
||||||
logger.log('error', `Error while retrieving selector: ${message}`);
|
|
||||||
logger.log('error', `Stack: ${stack}`);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/**
|
|
||||||
* Checks the basic info about an element and returns a {@link BaseActionInfo} object.
|
|
||||||
* If the element is not found, returns undefined.
|
|
||||||
* @param page The page instance.
|
|
||||||
* @param coordinates Coordinates of an element.
|
|
||||||
* @category WorkflowManagement-Selectors
|
|
||||||
* @returns {Promise<BaseActionInfo|undefined>}
|
|
||||||
*/
|
|
||||||
export const getElementInformation = async (
|
export const getElementInformation = async (
|
||||||
page: Page,
|
page: Page,
|
||||||
coordinates: Coordinates
|
coordinates: Coordinates
|
||||||
@@ -70,10 +27,38 @@ export const getElementInformation = async (
|
|||||||
try {
|
try {
|
||||||
const elementInfo = await page.evaluate(
|
const elementInfo = await page.evaluate(
|
||||||
async ({ x, y }) => {
|
async ({ x, y }) => {
|
||||||
const el = document.elementFromPoint(x, y) as HTMLElement;
|
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
|
||||||
if (el) {
|
if (originalEl) {
|
||||||
const { parentElement } = el;
|
let element = originalEl;
|
||||||
const element = parentElement?.tagName === 'A' ? parentElement : el;
|
|
||||||
|
if (originalEl.tagName === 'A') {
|
||||||
|
element = originalEl;
|
||||||
|
} else if (originalEl.parentElement?.tagName === 'A') {
|
||||||
|
element = originalEl.parentElement;
|
||||||
|
} else {
|
||||||
|
// Generic parent finding logic based on visual containment
|
||||||
|
while (element.parentElement) {
|
||||||
|
const parentRect = element.parentElement.getBoundingClientRect();
|
||||||
|
const childRect = element.getBoundingClientRect();
|
||||||
|
|
||||||
|
// Check if parent visually contains the child
|
||||||
|
const fullyContained =
|
||||||
|
parentRect.left <= childRect.left &&
|
||||||
|
parentRect.right >= childRect.right &&
|
||||||
|
parentRect.top <= childRect.top &&
|
||||||
|
parentRect.bottom >= childRect.bottom;
|
||||||
|
|
||||||
|
// Additional checks for more comprehensive containment
|
||||||
|
const significantOverlap =
|
||||||
|
(childRect.width * childRect.height) /
|
||||||
|
(parentRect.width * parentRect.height) > 0.5;
|
||||||
|
|
||||||
|
if (fullyContained && significantOverlap) {
|
||||||
|
element = element.parentElement;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
} }
|
||||||
|
|
||||||
let info: {
|
let info: {
|
||||||
tagName: string;
|
tagName: string;
|
||||||
@@ -98,7 +83,7 @@ export const getElementInformation = async (
|
|||||||
);
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
// Gather specific information based on the tag
|
// Existing tag-specific logic
|
||||||
if (element?.tagName === 'A') {
|
if (element?.tagName === 'A') {
|
||||||
info.url = (element as HTMLAnchorElement).href;
|
info.url = (element as HTMLAnchorElement).href;
|
||||||
info.innerText = element.innerText ?? '';
|
info.innerText = element.innerText ?? '';
|
||||||
@@ -112,7 +97,6 @@ export const getElementInformation = async (
|
|||||||
|
|
||||||
info.innerHTML = element.innerHTML;
|
info.innerHTML = element.innerHTML;
|
||||||
info.outerHTML = element.outerHTML;
|
info.outerHTML = element.outerHTML;
|
||||||
|
|
||||||
return info;
|
return info;
|
||||||
}
|
}
|
||||||
return null;
|
return null;
|
||||||
@@ -127,6 +111,67 @@ export const getElementInformation = async (
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
export const getRect = async (page: Page, coordinates: Coordinates) => {
|
||||||
|
try {
|
||||||
|
const rect = await page.evaluate(
|
||||||
|
async ({ x, y }) => {
|
||||||
|
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
|
||||||
|
if (originalEl) {
|
||||||
|
let element = originalEl;
|
||||||
|
|
||||||
|
if (originalEl.tagName === 'A') {
|
||||||
|
element = originalEl;
|
||||||
|
} else if (originalEl.parentElement?.tagName === 'A') {
|
||||||
|
element = originalEl.parentElement;
|
||||||
|
} else {
|
||||||
|
while (element.parentElement) {
|
||||||
|
const parentRect = element.parentElement.getBoundingClientRect();
|
||||||
|
const childRect = element.getBoundingClientRect();
|
||||||
|
|
||||||
|
const fullyContained =
|
||||||
|
parentRect.left <= childRect.left &&
|
||||||
|
parentRect.right >= childRect.right &&
|
||||||
|
parentRect.top <= childRect.top &&
|
||||||
|
parentRect.bottom >= childRect.bottom;
|
||||||
|
|
||||||
|
const significantOverlap =
|
||||||
|
(childRect.width * childRect.height) /
|
||||||
|
(parentRect.width * parentRect.height) > 0.5;
|
||||||
|
|
||||||
|
if (fullyContained && significantOverlap) {
|
||||||
|
element = element.parentElement;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
|
||||||
|
//element = element?.parentElement?.tagName === 'A' ? element?.parentElement : element;
|
||||||
|
const rectangle = element?.getBoundingClientRect();
|
||||||
|
|
||||||
|
if (rectangle) {
|
||||||
|
return {
|
||||||
|
x: rectangle.x,
|
||||||
|
y: rectangle.y,
|
||||||
|
width: rectangle.width,
|
||||||
|
height: rectangle.height,
|
||||||
|
top: rectangle.top,
|
||||||
|
right: rectangle.right,
|
||||||
|
bottom: rectangle.bottom,
|
||||||
|
left: rectangle.left,
|
||||||
|
};
|
||||||
|
}
|
||||||
|
}
|
||||||
|
},
|
||||||
|
{ x: coordinates.x, y: coordinates.y },
|
||||||
|
);
|
||||||
|
return rect;
|
||||||
|
} catch (error) {
|
||||||
|
const { message, stack } = error as Error;
|
||||||
|
logger.log('error', `Error while retrieving selector: ${message}`);
|
||||||
|
logger.log('error', `Stack: ${stack}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Returns the best and unique css {@link Selectors} for the element on the page.
|
* Returns the best and unique css {@link Selectors} for the element on the page.
|
||||||
@@ -742,7 +787,6 @@ interface SelectorResult {
|
|||||||
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise<SelectorResult> => {
|
export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates): Promise<SelectorResult> => {
|
||||||
try {
|
try {
|
||||||
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
|
const selectors = await page.evaluate(({ x, y }: { x: number, y: number }) => {
|
||||||
|
|
||||||
function getNonUniqueSelector(element: HTMLElement): string {
|
function getNonUniqueSelector(element: HTMLElement): string {
|
||||||
let selector = element.tagName.toLowerCase();
|
let selector = element.tagName.toLowerCase();
|
||||||
|
|
||||||
@@ -774,8 +818,37 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
|||||||
return path.join(' > ');
|
return path.join(' > ');
|
||||||
}
|
}
|
||||||
|
|
||||||
const element = document.elementFromPoint(x, y) as HTMLElement | null;
|
const originalEl = document.elementFromPoint(x, y) as HTMLElement;
|
||||||
if (!element) return null;
|
if (!originalEl) return null;
|
||||||
|
|
||||||
|
let element = originalEl;
|
||||||
|
|
||||||
|
if (originalEl.tagName === 'A') {
|
||||||
|
element = originalEl;
|
||||||
|
} else if (originalEl.parentElement?.tagName === 'A') {
|
||||||
|
element = originalEl.parentElement;
|
||||||
|
} else {
|
||||||
|
while (element.parentElement) {
|
||||||
|
const parentRect = element.parentElement.getBoundingClientRect();
|
||||||
|
const childRect = element.getBoundingClientRect();
|
||||||
|
|
||||||
|
const fullyContained =
|
||||||
|
parentRect.left <= childRect.left &&
|
||||||
|
parentRect.right >= childRect.right &&
|
||||||
|
parentRect.top <= childRect.top &&
|
||||||
|
parentRect.bottom >= childRect.bottom;
|
||||||
|
|
||||||
|
const significantOverlap =
|
||||||
|
(childRect.width * childRect.height) /
|
||||||
|
(parentRect.width * parentRect.height) > 0.5;
|
||||||
|
|
||||||
|
if (fullyContained && significantOverlap) {
|
||||||
|
element = element.parentElement;
|
||||||
|
} else {
|
||||||
|
break;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
const generalSelector = getSelectorPath(element);
|
const generalSelector = getSelectorPath(element);
|
||||||
return {
|
return {
|
||||||
@@ -790,7 +863,6 @@ export const getNonUniqueSelectors = async (page: Page, coordinates: Coordinates
|
|||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|
||||||
export const getChildSelectors = async (page: Page, parentSelector: string): Promise<string[]> => {
|
export const getChildSelectors = async (page: Page, parentSelector: string): Promise<string[]> => {
|
||||||
try {
|
try {
|
||||||
const childSelectors = await page.evaluate((parentSelector: string) => {
|
const childSelectors = await page.evaluate((parentSelector: string) => {
|
||||||
|
|||||||
Reference in New Issue
Block a user