Files
parcer/server/src/browser-management/classes/RemoteBrowser.ts

518 lines
20 KiB
TypeScript
Raw Normal View History

2024-06-01 10:55:04 +05:30
import {
Page,
Browser,
CDPSession,
BrowserContext,
} from 'playwright';
import { Socket } from "socket.io";
2024-11-22 23:04:03 +05:30
import { chromium } from 'playwright-extra';
import stealthPlugin from 'puppeteer-extra-plugin-stealth';
2024-09-21 18:51:11 +05:30
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
2024-07-20 05:57:15 +05:30
import fetch from 'cross-fetch';
2024-06-01 10:55:04 +05:30
import logger from '../../logger';
import { InterpreterSettings, RemoteBrowserOptions } from "../../types";
import { WorkflowGenerator } from "../../workflow-management/classes/Generator";
import { WorkflowInterpreter } from "../../workflow-management/classes/Interpreter";
import { getDecryptedProxyConfig } from '../../routes/proxy';
2024-12-09 17:57:28 +05:30
import { getInjectableScript } from 'idcac-playwright';
2024-11-22 23:04:03 +05:30
// Registers the stealth plugin on the shared playwright-extra `chromium` launcher at module load,
// i.e. before any RemoteBrowser instance calls `chromium.launch`.
chromium.use(stealthPlugin());
2024-07-20 06:01:35 +05:30
2024-06-01 10:55:04 +05:30
/**
* This class represents a remote browser instance.
* It exposes a variety of interactions with Playwright's browser instance.
* Every remote browser holds an instance of the generator and interpreter classes,
* which are used to generate and interpret workflows.
* @category BrowserManagement
*/
export class RemoteBrowser {
/**
* Playwright's [browser](https://playwright.dev/docs/api/class-browser) instance.
* Stays `null` until `initialize` has launched the browser.
* @private
*/
private browser: Browser | null = null;
/**
* Playwright's [BrowserContext](https://playwright.dev/docs/api/class-browsercontext)
* created by `initialize`; the first page is opened inside it.
* @private
*/
private context: BrowserContext | null = null;
2024-06-01 10:55:04 +05:30
/**
* The Playwright's [CDPSession](https://playwright.dev/docs/api/class-cdpsession) instance,
* used to talk raw Chrome Devtools Protocol.
* A new session is assigned whenever the current page is replaced or switched.
* @private
*/
2024-06-01 11:05:45 +05:30
private client: CDPSession | null | undefined = null;
2024-06-01 10:55:04 +05:30
/**
* Socket.io socket instance enabling communication with the client (frontend) side.
* Can be swapped later through `updateSocket`.
* @private
*/
2024-06-01 11:05:45 +05:30
private socket: Socket;
2024-06-01 10:55:04 +05:30
/**
* The Playwright's [Page](https://playwright.dev/docs/api/class-page) instance
* as current interactive remote browser's page.
* @private
*/
2024-06-01 11:05:45 +05:30
private currentPage: Page | null | undefined = null;
2024-06-01 10:55:04 +05:30
/**
* Interpreter settings for any started interpretation.
* Replaced wholesale by the payload of the `settings` socket event.
* @private
*/
private interpreterSettings: InterpreterSettings = {
debug: false,
maxConcurrency: 1,
maxRepeats: 1,
};
/**
* URL most recently sent to the client with the `urlChanged` event by the
* main-frame navigation listener; used to skip emitting an equivalent URL twice.
* `null` until the first emission.
* @private
*/
private lastEmittedUrl: string | null = null;
2024-06-01 10:55:04 +05:30
/**
* {@link WorkflowGenerator} instance specific to the remote browser.
*/
public generator: WorkflowGenerator;
/**
* {@link WorkflowInterpreter} instance specific to the remote browser.
*/
public interpreter: WorkflowInterpreter;
/**
 * Builds the remote browser wrapper around an already connected socket.
 * The socket is stored on the instance and handed to a freshly created
 * {@link WorkflowInterpreter} and {@link WorkflowGenerator}. No browser is
 * launched here; that happens in `initialize`.
 * @param socket socket.io socket instance used to communicate with the client side
 * @constructor
 */
public constructor(socket: Socket) {
  this.socket = socket;
  // Interpreter first, generator second: same construction order as before.
  this.interpreter = new WorkflowInterpreter(this.socket);
  this.generator = new WorkflowGenerator(this.socket);
}
/**
 * Produces a canonical spelling of a URL so that two URLs differing only in
 * trailing slashes compare equal. Input that cannot be parsed as a URL is
 * returned untouched.
 * @param url the URL to canonicalize
 * @returns the canonical URL string, or the original input when parsing fails
 */
private normalizeUrl(url: string): string {
  try {
    const target = new URL(url);
    // Strip every trailing slash; an empty result means the root path.
    const trimmedPath = target.pathname.replace(/\/+$/, '');
    target.pathname = trimmedPath === '' ? '/' : trimmedPath;
    // Keep the scheme in lower case.
    target.protocol = target.protocol.toLowerCase();
    return target.href;
  } catch {
    return url;
  }
}
/**
 * Decides whether a URL differs enough from the last emitted one to be sent
 * to the client. Anything is emitted while no URL has been recorded yet;
 * afterwards the normalized forms of both URLs are compared.
 * @param newUrl the URL the page has just navigated to
 * @returns `true` when the URL should be emitted
 */
private shouldEmitUrlChange(newUrl: string): boolean {
  const previousUrl = this.lastEmittedUrl;
  return !previousUrl || this.normalizeUrl(newUrl) !== this.normalizeUrl(previousUrl);
}
/**
 * Pages that already carry the listeners installed by `setupPageEventListeners`.
 * A WeakSet is used so that closed pages can be garbage collected.
 */
private readonly pagesWithListeners = new WeakSet<Page>();
/**
 * Attaches the navigation and load listeners to a page, at most once per page.
 *
 * - `framenavigated`: for main-frame navigations, emits `urlChanged` through the
 *   socket when the URL differs from the last emitted one.
 * - `load`: waits (up to 5 s) for the network to become idle and then evaluates
 *   the script returned by `getInjectableScript()` in the page. This is a single
 *   attempt; a failure is logged as a warning and otherwise ignored.
 *
 * Calling this again for the same page is a no-op, which matters because
 * switching tabs calls it for pages that were set up earlier.
 * @param page the page to attach the listeners to
 */
private async setupPageEventListeners(page: Page) {
  if (this.pagesWithListeners.has(page)) {
    return;
  }
  this.pagesWithListeners.add(page);

  page.on('framenavigated', (frame) => {
    // Only navigations of the top-level frame change the URL shown to the user.
    if (frame !== page.mainFrame()) {
      return;
    }
    const currentUrl = page.url();
    if (this.shouldEmitUrlChange(currentUrl)) {
      this.lastEmittedUrl = currentUrl;
      this.socket.emit('urlChanged', currentUrl);
    }
  });

  page.on('load', async () => {
    try {
      await page.waitForLoadState('networkidle', { timeout: 5000 });
      await page.evaluate(getInjectableScript());
      logger.log('debug', 'Script injection succeeded');
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logger.log('warn', `Script injection attempt failed: ${message}`);
    }
  });
}
2024-12-14 22:30:50 +05:30
/**
 * Picks one entry, uniformly at random, from a fixed pool of desktop
 * user-agent strings.
 *
 * Every entry reports `Windows NT 10.0`: browsers on Windows 11 keep sending
 * that token, so an `NT 11.0` value would not match any real browser.
 * @returns a user-agent string for the browser context
 */
private getUserAgent() {
  const userAgents = [
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.140 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:117.0) Gecko/20100101 Firefox/117.0',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.1938.81 Safari/537.36 Edg/116.0.1938.81',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/116.0.5845.96 Safari/537.36 OPR/101.0.4843.25',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/117.0.5938.62 Safari/537.36',
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:118.0) Gecko/20100101 Firefox/118.0',
  ];
  const pick = Math.floor(Math.random() * userAgents.length);
  return userAgents[pick];
}
/**
 * An asynchronous constructor for asynchronously initialized properties.
 * Must be called right after creating an instance of RemoteBrowser class.
 *
 * Steps: launch headless Chromium, look up the user's proxy configuration,
 * create the browser context (with a randomly picked user agent and an init
 * script that makes `navigator.webdriver` report `false`), open the first
 * page, attach the page listeners and open the CDP session for that page.
 *
 * If any step after the launch fails, the launched browser is closed and the
 * instance fields are reset before the error is rethrown, so a failed
 * initialization does not leave a browser process behind.
 * @param userId id of the user whose stored proxy configuration is looked up
 * @returns {Promise<void>}
 */
public initialize = async (userId: string): Promise<void> => {
  const browser = (await chromium.launch({
    headless: true,
    args: [
      "--disable-blink-features=AutomationControlled",
      "--disable-web-security",
      "--disable-features=IsolateOrigins,site-per-process",
      "--disable-site-isolation-trials",
      "--disable-extensions"
    ],
  })) as Browser;
  this.browser = browser;

  try {
    const proxyConfig = await getDecryptedProxyConfig(userId);

    const contextOptions: any = {
      viewport: { height: 400, width: 900 },
      // recordVideo: { dir: 'videos/' }
      // Force reduced motion to prevent animation issues
      reducedMotion: 'reduce',
      // Force JavaScript to be enabled
      javaScriptEnabled: true,
      // NOTE(review): `timeout` is not among the documented newContext options - confirm it has any effect
      timeout: 50000,
      // Do not emulate a forced-colors mode
      forcedColors: 'none',
      isMobile: false,
      hasTouch: false,
      userAgent: this.getUserAgent(),
    };

    if (proxyConfig.proxy_url) {
      // Credentials are only forwarded when both the username and the password are present.
      const hasCredentials = Boolean(proxyConfig.proxy_username && proxyConfig.proxy_password);
      contextOptions.proxy = {
        server: proxyConfig.proxy_url,
        username: hasCredentials ? proxyConfig.proxy_username : undefined,
        password: hasCredentials ? proxyConfig.proxy_password : undefined,
      };
    }

    const context = await browser.newContext(contextOptions);
    this.context = context;

    // Wraps the native `navigator.webdriver` getter in a Proxy that still calls
    // the original getter but always reports `false`.
    await context.addInitScript(
      `const defaultGetter = Object.getOwnPropertyDescriptor(
Navigator.prototype,
"webdriver"
).get;
defaultGetter.apply(navigator);
defaultGetter.toString();
Object.defineProperty(Navigator.prototype, "webdriver", {
set: undefined,
enumerable: true,
configurable: true,
get: new Proxy(defaultGetter, {
apply: (target, thisArg, args) => {
Reflect.apply(target, thisArg, args);
return false;
},
}),
});
const patchedGetter = Object.getOwnPropertyDescriptor(
Navigator.prototype,
"webdriver"
).get;
patchedGetter.apply(navigator);
patchedGetter.toString();`
    );

    const page = await context.newPage();
    this.currentPage = page;
    await this.setupPageEventListeners(page);

    // NOTE(review): blocking is enabled and then disabled again right after the CDP
    // session is created, so the ad blocker ends up inactive. Order kept as found - confirm intent.
    const blocker = await PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']);
    await blocker.enableBlockingInPage(page);
    this.client = await page.context().newCDPSession(page);
    await blocker.disableBlockingInPage(page);
  } catch (error: unknown) {
    // Roll back: without this the launched Chromium process would stay alive with no owner.
    this.client = null;
    this.currentPage = null;
    this.context = null;
    this.browser = null;
    try {
      await browser.close();
    } catch (closeError: unknown) {
      const message = closeError instanceof Error ? closeError.message : String(closeError);
      logger.log('warn', `Closing the browser after a failed initialization failed: ${message}`);
    }
    throw error;
  }
};
2024-06-01 10:57:32 +05:30
/**
 * Registers all socket event listeners needed for the recording editor session:
 * `rerender`, `settings`, `changeTab`, `addTab`, `closeTab` and `setViewportSize`.
 * Should be called only once per socket, after the full initialization of the remote browser.
 *
 * Asynchronous handlers are wrapped so that a rejection is logged instead of
 * surfacing as an unhandled promise rejection.
 *
 * `setViewportSize` resizes every page of the current page's browser context.
 * Sizes that are not positive integers are ignored with a warning.
 * @returns void
 */
public registerEditorEvents = (): void => {
  // Socket handlers run detached from any caller, so nobody would observe their rejections.
  const guarded = <TArgs extends unknown[]>(
    event: string,
    handler: (...args: TArgs) => Promise<void>,
  ) => async (...args: TArgs): Promise<void> => {
    try {
      await handler(...args);
    } catch (error: unknown) {
      const message = error instanceof Error ? error.message : String(error);
      logger.log('error', `Handling of the '${event}' socket event failed: ${message}`);
    }
  };

  this.socket.on('rerender', guarded('rerender', () => this.makeAndEmitScreenshot()));

  this.socket.on('settings', (settings) => this.interpreterSettings = settings);

  this.socket.on('changeTab', guarded('changeTab', (tabIndex: number) => this.changeTab(tabIndex)));

  this.socket.on('addTab', guarded('addTab', async () => {
    await this.currentPage?.context().newPage();
    // The freshly opened page is the last one in the context's page list.
    const lastTabIndex = this.currentPage ? this.currentPage.context().pages().length - 1 : 0;
    await this.changeTab(lastTabIndex);
  }));

  this.socket.on('closeTab', guarded('closeTab', async (tabInfo: { index: number, isCurrent: boolean }) => {
    const page = this.currentPage?.context().pages()[tabInfo.index];
    if (!page) {
      logger.log('error', `${tabInfo.index} index out of range of pages`)
      return;
    }
    if (tabInfo.isCurrent) {
      // Move away from the tab before closing it: prefer the next tab, fall back to the previous one.
      const hasNextTab = Boolean(this.currentPage?.context().pages()[tabInfo.index + 1]);
      await this.changeTab(hasNextTab ? tabInfo.index + 1 : tabInfo.index - 1);
    }
    await page.close();
    logger.log(
      'debug',
      `${tabInfo.index} page was closed, new length of pages: ${this.currentPage?.context().pages().length}`
    )
  }));

  this.socket.on('setViewportSize', guarded('setViewportSize', async (data: { width: number, height: number }) => {
    const { width, height } = data;
    logger.log('debug', `Received viewport size: width=${width}, height=${height}`);

    // The payload comes from the client; only positive integer sizes are applied.
    if (!Number.isInteger(width) || !Number.isInteger(height) || width <= 0 || height <= 0) {
      logger.log('warn', `Ignoring invalid viewport size: width=${width}, height=${height}`);
      return;
    }

    const activeContext = this.currentPage?.context();
    if (!activeContext) {
      logger.log('warn', 'Viewport size not updated, no page is open');
      return;
    }

    // A context's viewport cannot be changed after creation, so each of its pages is
    // resized instead of creating a new (unused) context on every event.
    await Promise.all(activeContext.pages().map((page) => page.setViewportSize({ width, height })));
    logger.log('debug', `Viewport size updated to width=${width}, height=${height} for the entire browser context`);
  }));
}
2024-06-01 10:58:23 +05:30
/**
 * Subscribes the remote browser for a screencast session
 * on [CDP](https://chromedevtools.github.io/devtools-protocol/) level,
 * where a screenshot is sent through the socket
 * every time the browser's active page updates.
 *
 * The CDP session is captured once. Each frame is acknowledged, after a
 * 100 ms delay, on the same session that delivered it, even if the active
 * session has been replaced in the meantime. Frames from a session that is
 * no longer the active one are not forwarded to the client.
 * @returns {Promise<void>}
 */
public subscribeToScreencast = async (): Promise<void> => {
  await this.startScreencast();
  const client = this.client;
  if (!client) {
    logger.log('warn', 'client is not initialized');
    return;
  }
  client.on('Page.screencastFrame', ({ data: base64, sessionId }) => {
    // A session replaced by a tab switch must not paint over the new tab's frames.
    if (this.client !== client) {
      return;
    }
    this.emitScreenshot(base64);
    // The delayed acknowledgement paces how fast the browser delivers the next frame.
    setTimeout(async () => {
      try {
        await client.send('Page.screencastFrameAck', { sessionId });
      } catch (e: unknown) {
        logger.log('error', `Screencast error: ${e}`);
      }
    }, 100);
  });
};
2024-06-01 10:59:15 +05:30
/**
 * Terminates the screencast session and closes the remote browser.
 * If an interpretation was running it will be stopped.
 *
 * The browser is closed even when stopping the interpretation or the
 * screencast throws; the original error is still propagated to the caller.
 * @returns {Promise<void>}
 */
public switchOff = async (): Promise<void> => {
  const browser = this.browser;
  try {
    await this.interpreter.stopInterpretation();
    if (browser) {
      await this.stopScreencast();
    }
  } finally {
    if (browser) {
      // Runs on the failure path too, otherwise the browser process would outlive this instance.
      await browser.close();
    } else {
      logger.log('error', 'Browser wasn\'t initialized');
      logger.log('error', 'Switching off the browser failed');
    }
  }
};
2024-07-09 22:58:18 +05:30
/**
 * Makes and emits a single screenshot to the client side.
 *
 * The screenshot is taken as JPEG (quality 75, the same settings as the
 * screencast) because `emitScreenshot` labels every payload as `image/jpeg`.
 * Failures are logged and not rethrown.
 * @returns {Promise<void>}
 */
public makeAndEmitScreenshot = async (): Promise<void> => {
  try {
    const screenshot = await this.currentPage?.screenshot({ type: 'jpeg', quality: 75 });
    if (screenshot) {
      this.emitScreenshot(screenshot.toString('base64'));
    }
  } catch (e: unknown) {
    const message = e instanceof Error ? e.message : String(e);
    logger.log('error', `Screenshot error: ${message}`);
  }
};
2024-06-01 10:59:49 +05:30
2024-06-01 11:00:28 +05:30
/**
 * Swaps the socket this remote browser talks through.
 * The new socket is stored, the editor event listeners are registered on it,
 * and it is passed on to the generator and the interpreter.
 * @param socket socket.io socket instance used to communicate with the client side
 * @returns void
 */
public updateSocket = (socket: Socket): void => {
  const { generator, interpreter } = this;
  this.socket = socket;
  // Listeners are registered on the socket stored just above.
  this.registerEditorEvents();
  generator?.updateSocket(socket);
  interpreter?.updateSocket(socket);
};
2024-06-01 11:03:01 +05:30
/**
 * Starts the interpretation of the currently generated workflow.
 *
 * A fresh page is created for the run and resized to 900x400 before the
 * interpretation starts. The interpreter parameters are rebuilt from the
 * generator's parameter list: values already present in the settings are kept,
 * all others default to an empty string.
 * @returns {Promise<void>}
 */
public interpretCurrentRecording = async (): Promise<void> => {
  logger.log('debug', 'Starting interpretation in the editor');
  if (!this.generator) {
    logger.log('error', 'Generator is not initialized');
    return;
  }

  const workflow = this.generator.AddGeneratedFlags(this.generator.getWorkflowFile());
  await this.initializeNewPage();
  if (!this.currentPage) {
    logger.log('error', 'Could not get a new page, returned undefined');
    return;
  }

  // Awaited so the viewport is in place before the first workflow action runs and
  // so a failure surfaces here instead of as an unhandled rejection.
  await this.currentPage.setViewportSize({ height: 400, width: 900 });

  const params = this.generator.getParams();
  if (params) {
    this.interpreterSettings.params = params.reduce((acc, param) => {
      if (this.interpreterSettings.params && Object.keys(this.interpreterSettings.params).includes(param)) {
        return { ...acc, [param]: this.interpreterSettings.params[param] };
      }
      return { ...acc, [param]: '' };
    }, {});
  }

  logger.log('debug', `Starting interpretation with settings: ${JSON.stringify(this.interpreterSettings, null, 2)}`);
  await this.interpreter.interpretRecordingInEditor(
    workflow, this.currentPage,
    // Lets the interpreter report a page it switched to, keeping `currentPage` current.
    (newPage: Page) => this.currentPage = newPage,
    this.interpreterSettings
  );
  // clear the active index from generator
  this.generator.clearLastIndex();
};
/**
 * Aborts the running workflow interpretation and then replaces the current
 * page with a newly initialized one.
 * @returns {Promise<void>}
 */
public stopCurrentInterpretation = async (): Promise<void> => {
  // The interpreter is stopped first; only afterwards is the page replaced.
  await this.interpreter.stopInterpretation();
  await this.initializeNewPage();
};
/**
 * Accessor for the page the remote browser is currently showing.
 * @returns {Page | null | undefined} the active page, or a nullish value when none is set
 */
public getCurrentPage = (): Page | null | undefined => this.currentPage;
/**
 * Changes the active page to the page instance on the given index
 * available in pages array on the {@link BrowserContext}.
 * Automatically stops the screencast session on the previous page and starts the new one.
 *
 * The CDP session of the previous page is detached once the new session exists,
 * so sessions do not pile up with every tab switch. The URL of the newly active
 * page is recorded as the last emitted URL, so that later navigations are
 * compared against what the client was actually told.
 * @param tabIndex index of the page in the pages array on the {@link BrowserContext}
 * @returns {Promise<void>}
 */
private changeTab = async (tabIndex: number): Promise<void> => {
  const page = this.currentPage?.context().pages()[tabIndex];
  if (!page) {
    logger.log('error', `${tabIndex} index out of range of pages`)
    return;
  }

  await this.stopScreencast();
  const previousClient = this.client;
  this.currentPage = page;
  await this.setupPageEventListeners(page);

  //await this.currentPage.setViewportSize({ height: 400, width: 900 })
  this.client = await page.context().newCDPSession(page);

  if (previousClient) {
    try {
      await previousClient.detach();
    } catch (e: unknown) {
      // The session may already be gone, e.g. when its page was closed.
      logger.log('debug', `Detaching the previous CDP session failed: ${e}`);
    }
  }

  const url = page.url();
  this.lastEmittedUrl = url;
  this.socket.emit('urlChanged', url);
  await this.makeAndEmitScreenshot();
  await this.subscribeToScreencast();
};
2024-06-01 11:05:13 +05:30
/**
 * Internal method for a new page initialization. Subscribes this page to the screencast.
 *
 * Stops the running screencast, opens a new page on the browser, closes the
 * previous page and makes the new one current, then attaches the page
 * listeners, opens a CDP session for it and restarts the screencast.
 * @param options optional page options to be used when creating a new page
 * @returns {Promise<void>}
 */
private initializeNewPage = async (options?: Parameters<Browser['newPage']>[0]): Promise<void> => {
  await this.stopScreencast();
  // NOTE(review): per the Playwright docs `browser.newPage()` opens the page in a new browser
  // context, so the proxy, user agent and init script configured in `initialize` are presumably
  // not applied to it - confirm whether that clean state is intended.
  const newPage = await this.browser?.newPage(options);
  // NOTE(review): this overrides only the request header; the UA string differs from the one
  // chosen for the context and ends in 'Safari/537.3' - verify it is intended.
  await newPage?.setExtraHTTPHeaders({
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/58.0.3029.110 Safari/537.3'
  });
  await this.currentPage?.close();
  this.currentPage = newPage;
  if (!this.currentPage) {
    logger.log('error', 'Could not get a new page, returned undefined');
    return;
  }
  await this.setupPageEventListeners(this.currentPage);
  this.client = await this.currentPage.context().newCDPSession(this.currentPage);
  await this.subscribeToScreencast();
};
2024-06-01 11:02:21 +05:30
/**
 * Asks the current CDP session to start screencasting the page as JPEG
 * frames with quality 75. Only a warning is logged when no CDP session exists.
 * @returns {Promise<void>}
 */
private startScreencast = async (): Promise<void> => {
  const session = this.client;
  if (session) {
    await session.send('Page.startScreencast', { format: 'jpeg', quality: 75 });
    logger.log('info', `Browser started with screencasting a page.`);
  } else {
    logger.log('warn', 'client is not initialized');
  }
};
/**
 * Asks the current CDP session to stop the screencast.
 * Two errors are logged, and nothing is sent, when no CDP session exists.
 * @returns {Promise<void>}
 */
private stopScreencast = async (): Promise<void> => {
  const session = this.client;
  if (!session) {
    logger.log('error', 'client is not initialized');
    logger.log('error', 'Screencast stop failed');
    return;
  }
  await session.send('Page.stopScreencast');
  logger.log('info', `Browser stopped with screencasting.`);
};
/**
 * Sends one frame of the active page to the client over the socket.
 * The payload is prefixed with a JPEG data-URL header and emitted as the
 * `screencast` event.
 * @param payload the image data; callers in this class pass a base64 string
 * @returns void
 */
private emitScreenshot = (payload: any): void => {
  const dataUrl = `data:image/jpeg;base64,${payload}`;
  this.socket.emit('screencast', dataUrl);
  logger.log('debug', `Screenshot emitted`);
};
2024-06-01 10:55:04 +05:30
}