Merge pull request #229 from getmaxun/login-scrape
feat: extract behind login
This commit is contained in:
@@ -121,6 +121,53 @@ export default class Interpreter extends EventEmitter {
|
||||
}
|
||||
}
|
||||
|
||||
// private getSelectors(workflow: Workflow, actionId: number): string[] {
|
||||
// const selectors: string[] = [];
|
||||
|
||||
// // Validate actionId
|
||||
// if (actionId <= 0) {
|
||||
// console.log("No previous selectors to collect.");
|
||||
// return selectors; // Empty array as there are no previous steps
|
||||
// }
|
||||
|
||||
// // Iterate from the start up to (but not including) actionId
|
||||
// for (let index = 0; index < actionId; index++) {
|
||||
// const currentSelectors = workflow[index]?.where?.selectors;
|
||||
// console.log(`Selectors at step ${index}:`, currentSelectors);
|
||||
|
||||
// if (currentSelectors && currentSelectors.length > 0) {
|
||||
// currentSelectors.forEach((selector) => {
|
||||
// if (!selectors.includes(selector)) {
|
||||
// selectors.push(selector); // Avoid duplicates
|
||||
// }
|
||||
// });
|
||||
// }
|
||||
// }
|
||||
|
||||
// console.log("Collected Selectors:", selectors);
|
||||
// return selectors;
|
||||
// }
|
||||
|
||||
/**
 * Collects the selectors declared by the last workflow pair that has any.
 *
 * The workflow is scanned from its final pair towards the first one, and the
 * scan stops at the first pair whose `where.selectors` is non-empty.
 *
 * @param workflow Workflow (array of where-what pairs) to inspect.
 * @returns The selectors of that pair with duplicates removed (first-seen
 * order is kept), or an empty array when no pair declares selectors. An
 * empty workflow also yields an empty array.
 */
private getSelectors(workflow: Workflow): string[] {
  for (let index = workflow.length - 1; index >= 0; index--) {
    const currentSelectors = workflow[index]?.where?.selectors;

    if (currentSelectors && currentSelectors.length > 0) {
      // A Set drops repeated selectors while preserving insertion order.
      return Array.from(new Set(currentSelectors));
    }
  }

  return [];
}
|
||||
|
||||
|
||||
/**
|
||||
* Returns the context object from given Page and the current workflow.\
|
||||
* \
|
||||
@@ -130,52 +177,63 @@ export default class Interpreter extends EventEmitter {
|
||||
* @param workflow Current **initialized** workflow (array of where-what pairs).
|
||||
* @returns {PageState} State of the current page.
|
||||
*/
|
||||
private async getState(page: Page, workflowCopy: Workflow, selectors: string[]): Promise<PageState> {
  // `workflowCopy` is the list of where-what pairs still pending; only its
  // last pair is consulted here. `selectors` is passed through to the
  // returned state unchanged - this method does not query the page to check
  // whether the targeted elements are present or actionable.
  const action = workflowCopy[workflowCopy.length - 1];

  // Start from the live page URL.
  let url: any = page.url();

  // If the last pending pair declares a URL that differs from the live one
  // (and is not the blank page), report the declared URL instead.
  // NOTE(review): when `action.where.url` is undefined this assigns
  // `undefined` to `url` - confirm that this is intended.
  if (action && action.where.url !== url && action.where.url !== "about:blank") {
    url = action.where.url;
  }

  return {
    url,
    // Cookies are always looked up for the live page URL and flattened into
    // a name -> value object.
    cookies: (await page.context().cookies([page.url()]))
      .reduce((p, cookie) => (
        {
          ...p,
          [cookie.name]: cookie.value,
        }), {}),
    selectors,
  };
}
|
||||
|
||||
@@ -365,6 +423,7 @@ export default class Interpreter extends EventEmitter {
|
||||
console.log("MERGED results:", mergedResult);
|
||||
|
||||
await this.options.serializableCallback(mergedResult);
|
||||
// await this.options.serializableCallback(scrapeResult);
|
||||
},
|
||||
|
||||
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
|
||||
@@ -546,11 +605,31 @@ export default class Interpreter extends EventEmitter {
|
||||
return allResults;
|
||||
}
|
||||
|
||||
/**
 * Finds the workflow pair that should run for the given page state.
 *
 * Pairs are tested from the end of the workflow towards the start, and the
 * first pair whose `where` clause is applicable wins. Every tested pair is
 * traced to the console together with the page state and the match result.
 *
 * @param workflow Workflow (array of where-what pairs) to search.
 * @param pageState State of the current page to match against.
 * @param usedActions Ids of the actions that have already been carried out.
 * @returns Index of the matching pair, or -1 when no pair is applicable
 * (the same "not found" value `Array.prototype.findIndex` uses).
 */
private getMatchingActionId(workflow: Workflow, pageState: PageState, usedActions: string[]): number {
  for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
    const step = workflow[actionId];
    const isApplicable = this.applicable(step.where, pageState, usedActions);

    console.log("-------------------------------------------------------------");
    console.log(`Where:`, step.where);
    console.log(`Page state:`, pageState);
    console.log(`Match result: ${isApplicable}`);
    console.log("-------------------------------------------------------------");

    if (isApplicable) {
      return actionId;
    }
  }

  // Explicit "no match" result so the method never returns `undefined`.
  return -1;
}
|
||||
|
||||
private async runLoop(p: Page, workflow: Workflow) {
|
||||
const workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow));
|
||||
|
||||
// apply ad-blocker to the current page
|
||||
await this.applyAdBlocker(p);
|
||||
const usedActions: string[] = [];
|
||||
let selectors: string[] = [];
|
||||
let lastAction = null;
|
||||
let actionId = -1
|
||||
let repeatCount = 0;
|
||||
|
||||
/**
|
||||
@@ -559,7 +638,7 @@ export default class Interpreter extends EventEmitter {
|
||||
* e.g. via `enqueueLinks`.
|
||||
*/
|
||||
p.on('popup', (popup) => {
|
||||
this.concurrency.addJob(() => this.runLoop(popup, workflow));
|
||||
this.concurrency.addJob(() => this.runLoop(popup, workflowCopy));
|
||||
});
|
||||
|
||||
/* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
|
||||
@@ -578,8 +657,11 @@ export default class Interpreter extends EventEmitter {
|
||||
}
|
||||
|
||||
let pageState = {};
|
||||
let getStateTest = "Hello";
|
||||
try {
|
||||
pageState = await this.getState(p, workflow);
|
||||
pageState = await this.getState(p, workflowCopy, selectors);
|
||||
selectors = [];
|
||||
console.log("Empty selectors:", selectors)
|
||||
} catch (e: any) {
|
||||
this.log('The browser has been closed.');
|
||||
return;
|
||||
@@ -589,16 +671,22 @@ export default class Interpreter extends EventEmitter {
|
||||
this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
|
||||
}
|
||||
|
||||
const actionId = workflow.findIndex((step) => {
|
||||
const isApplicable = this.applicable(step.where, pageState, usedActions);
|
||||
console.log(`Where:`, step.where);
|
||||
console.log(`Page state:`, pageState);
|
||||
console.log(`Match result: ${isApplicable}`);
|
||||
return isApplicable;
|
||||
});
|
||||
// const actionId = workflow.findIndex((step) => {
|
||||
// const isApplicable = this.applicable(step.where, pageState, usedActions);
|
||||
// console.log("-------------------------------------------------------------");
|
||||
// console.log(`Where:`, step.where);
|
||||
// console.log(`Page state:`, pageState);
|
||||
// console.log(`Match result: ${isApplicable}`);
|
||||
// console.log("-------------------------------------------------------------");
|
||||
// return isApplicable;
|
||||
// });
|
||||
|
||||
const action = workflow[actionId];
|
||||
actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);
|
||||
|
||||
const action = workflowCopy[actionId];
|
||||
|
||||
console.log("MATCHED ACTION:", action);
|
||||
console.log("MATCHED ACTION ID:", actionId);
|
||||
this.log(`Matched ${JSON.stringify(action?.where)}`, Level.LOG);
|
||||
|
||||
if (action) { // action is matched
|
||||
@@ -607,14 +695,28 @@ export default class Interpreter extends EventEmitter {
|
||||
}
|
||||
|
||||
repeatCount = action === lastAction ? repeatCount + 1 : 0;
|
||||
if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) {
|
||||
|
||||
console.log("REPEAT COUNT", repeatCount);
|
||||
if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
|
||||
return;
|
||||
}
|
||||
lastAction = action;
|
||||
|
||||
try {
|
||||
console.log("Carrying out:", action.what);
|
||||
await this.carryOutSteps(p, action.what);
|
||||
usedActions.push(action.id ?? 'undefined');
|
||||
|
||||
workflowCopy.splice(actionId, 1);
|
||||
console.log(`Action with ID ${action.id} removed from the workflow copy.`);
|
||||
|
||||
// const newSelectors = this.getPreviousSelectors(workflow, actionId);
|
||||
const newSelectors = this.getSelectors(workflowCopy);
|
||||
newSelectors.forEach(selector => {
|
||||
if (!selectors.includes(selector)) {
|
||||
selectors.push(selector);
|
||||
}
|
||||
});
|
||||
} catch (e) {
|
||||
this.log(<Error>e, Level.ERROR);
|
||||
}
|
||||
|
||||
@@ -15,6 +15,8 @@ import { io, Socket } from "socket.io-client";
|
||||
import { BinaryOutputService } from "../storage/mino";
|
||||
import { AuthenticatedRequest } from "../routes/record"
|
||||
import {capture} from "../utils/analytics";
|
||||
import { Page } from "playwright";
|
||||
import { WorkflowFile } from "maxun-core";
|
||||
chromium.use(stealthPlugin());
|
||||
|
||||
const formatRecording = (recordingData: any) => {
|
||||
@@ -533,6 +535,17 @@ function resetRecordingState(browserId: string, id: string) {
|
||||
id = '';
|
||||
}
|
||||
|
||||
/**
 * Returns a deep copy of the given workflow file in which every where-what
 * pair starts with a generated `flag` action.
 *
 * The copy is made with a JSON round-trip, so the input is never mutated.
 * Values JSON cannot represent (functions, `undefined`, `RegExp` instances)
 * do not survive the clone.
 *
 * @param workflow Workflow file to copy.
 * @returns The copy, with `{ action: 'flag', args: ['generated'] }` prepended
 * to the `what` list of each pair.
 */
function AddGeneratedFlags(workflow: WorkflowFile): WorkflowFile {
  const copy: WorkflowFile = JSON.parse(JSON.stringify(workflow));

  for (const pair of copy.workflow) {
    // A fresh object per pair, so the prepended steps never share references.
    pair.what.unshift({
      action: 'flag',
      args: ['generated'],
    });
  }

  return copy;
}
|
||||
|
||||
async function executeRun(id: string) {
|
||||
try {
|
||||
const run = await Run.findOne({ where: { runId: id } });
|
||||
@@ -560,13 +573,14 @@ async function executeRun(id: string) {
|
||||
throw new Error('Could not access browser');
|
||||
}
|
||||
|
||||
const currentPage = await browser.getCurrentPage();
|
||||
let currentPage = await browser.getCurrentPage();
|
||||
if (!currentPage) {
|
||||
throw new Error('Could not create a new page');
|
||||
}
|
||||
|
||||
const workflow = AddGeneratedFlags(recording.recording);
|
||||
const interpretationInfo = await browser.interpreter.InterpretRecording(
|
||||
recording.recording, currentPage, plainRun.interpreterSettings
|
||||
workflow, currentPage, (newPage: Page) => currentPage = newPage, plainRun.interpreterSettings
|
||||
);
|
||||
|
||||
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
|
||||
|
||||
@@ -370,11 +370,11 @@ export class RemoteBrowser {
|
||||
await this.stopScreencast();
|
||||
this.currentPage = page;
|
||||
|
||||
this.currentPage.on('framenavigated', (frame) => {
|
||||
if (frame === this.currentPage?.mainFrame()) {
|
||||
this.socket.emit('urlChanged', this.currentPage.url());
|
||||
}
|
||||
});
|
||||
// this.currentPage.on('framenavigated', (frame) => {
|
||||
// if (frame === this.currentPage?.mainFrame()) {
|
||||
// this.socket.emit('urlChanged', this.currentPage.url());
|
||||
// }
|
||||
// });
|
||||
|
||||
//await this.currentPage.setViewportSize({ height: 400, width: 900 })
|
||||
this.client = await this.currentPage.context().newCDPSession(this.currentPage);
|
||||
|
||||
@@ -18,6 +18,8 @@ import { AuthenticatedRequest } from './record';
|
||||
import { computeNextRun } from '../utils/schedule';
|
||||
import { capture } from "../utils/analytics";
|
||||
import { tryCatch } from 'bullmq';
|
||||
import { WorkflowFile } from 'maxun-core';
|
||||
import { Page } from 'playwright';
|
||||
chromium.use(stealthPlugin());
|
||||
|
||||
export const router = Router();
|
||||
@@ -422,6 +424,17 @@ router.get('/runs/run/:id', requireSignIn, async (req, res) => {
|
||||
}
|
||||
});
|
||||
|
||||
/**
 * Produces a deep copy of a workflow file whose where-what pairs each begin
 * with a generated `flag` action.
 *
 * Cloning goes through `JSON.stringify` / `JSON.parse`, which leaves the
 * input untouched. Anything JSON cannot encode (functions, `undefined`,
 * `RegExp` instances) is not carried over to the copy.
 *
 * @param workflow Workflow file to copy.
 * @returns The copy, with `{ action: 'flag', args: ['generated'] }` inserted
 * at the front of the `what` list of each pair.
 */
function AddGeneratedFlags(workflow: WorkflowFile): WorkflowFile {
  const copy: WorkflowFile = JSON.parse(JSON.stringify(workflow));

  for (const pair of copy.workflow) {
    // Build a new flag step for each pair so no object is shared between them.
    pair.what.unshift({
      action: 'flag',
      args: ['generated'],
    });
  }

  return copy;
}
|
||||
|
||||
/**
|
||||
* PUT endpoint for finishing a run and saving it to the storage.
|
||||
*/
|
||||
@@ -443,10 +456,11 @@ router.post('/runs/run/:id', requireSignIn, async (req: AuthenticatedRequest, re
|
||||
|
||||
// interpret the run in active browser
|
||||
const browser = browserPool.getRemoteBrowser(plainRun.browserId);
|
||||
const currentPage = browser?.getCurrentPage();
|
||||
let currentPage = browser?.getCurrentPage();
|
||||
if (browser && currentPage) {
|
||||
const workflow = AddGeneratedFlags(recording.recording);
|
||||
const interpretationInfo = await browser.interpreter.InterpretRecording(
|
||||
recording.recording, currentPage, plainRun.interpreterSettings);
|
||||
workflow, currentPage, (newPage: Page) => currentPage = newPage, plainRun.interpreterSettings);
|
||||
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
|
||||
const uploadedBinaryOutput = await binaryOutputService.uploadAndStoreBinaryOutput(run, interpretationInfo.binaryOutput);
|
||||
await destroyRemoteBrowser(plainRun.browserId);
|
||||
|
||||
@@ -244,7 +244,12 @@ export class WorkflowInterpreter {
|
||||
* @param page The page instance used to interact with the browser.
|
||||
* @param settings The settings to use for the interpretation.
|
||||
*/
|
||||
public InterpretRecording = async (workflow: WorkflowFile, page: Page, settings: InterpreterSettings) => {
|
||||
public InterpretRecording = async (
|
||||
workflow: WorkflowFile,
|
||||
page: Page,
|
||||
updatePageOnPause: (page: Page) => void,
|
||||
settings: InterpreterSettings
|
||||
) => {
|
||||
const params = settings.params ? settings.params : null;
|
||||
delete settings.params;
|
||||
|
||||
@@ -262,7 +267,7 @@ export class WorkflowInterpreter {
|
||||
this.socket.emit('debugMessage', msg)
|
||||
},
|
||||
},
|
||||
serializableCallback: (data: string) => {
|
||||
serializableCallback: (data: any) => {
|
||||
this.serializableData.push(data);
|
||||
this.socket.emit('serializableCallback', data);
|
||||
},
|
||||
@@ -275,6 +280,23 @@ export class WorkflowInterpreter {
|
||||
const interpreter = new Interpreter(decryptedWorkflow, options);
|
||||
this.interpreter = interpreter;
|
||||
|
||||
/*
 * Handles `flag` events emitted by the interpreter. The listener receives
 * the page the flag was raised on and a `resume` callback.
 *
 * - If the active step has a breakpoint set, the client is notified and the
 *   interpretation is marked as paused.
 * - While paused, `resume` is stored (not called), the caller is handed the
 *   page through `updatePageOnPause`, and a log line is sent to the client.
 * - Otherwise `resume` is called immediately.
 */
interpreter.on('flag', async (page, resume) => {
  if (this.activeId !== null && this.breakpoints[this.activeId]) {
    logger.log('debug', `breakpoint hit id: ${this.activeId}`);
    this.socket.emit('breakpointHit');
    this.interpretationIsPaused = true;
  }

  if (!this.interpretationIsPaused) {
    // Nothing is holding the run back - continue straight away.
    resume();
    return;
  }

  // Keep the callback so the run can be continued later.
  this.interpretationResume = resume;
  logger.log('debug', `Paused inside of flag: ${page.url()}`);
  updatePageOnPause(page);
  this.socket.emit('log', '----- The interpretation has been paused -----', false);
});
|
||||
|
||||
const status = await interpreter.run(page, params);
|
||||
|
||||
const lastArray = this.serializableData.length > 1
|
||||
|
||||
@@ -11,6 +11,8 @@ import Run from "../../models/Run";
|
||||
import { getDecryptedProxyConfig } from "../../routes/proxy";
|
||||
import { BinaryOutputService } from "../../storage/mino";
|
||||
import { capture } from "../../utils/analytics";
|
||||
import { WorkflowFile } from "maxun-core";
|
||||
import { Page } from "playwright";
|
||||
chromium.use(stealthPlugin());
|
||||
|
||||
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
@@ -79,6 +81,17 @@ async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||
}
|
||||
}
|
||||
|
||||
/**
 * Clones a workflow file and puts a generated `flag` action at the start of
 * every where-what pair of the clone.
 *
 * The clone is a JSON round-trip of the input, so the original object is
 * left unmodified. Data that JSON cannot express (functions, `undefined`,
 * `RegExp` instances) is lost in the process.
 *
 * @param workflow Workflow file to copy.
 * @returns The clone, with `{ action: 'flag', args: ['generated'] }` as the
 * first entry of the `what` list of each pair.
 */
function AddGeneratedFlags(workflow: WorkflowFile): WorkflowFile {
  const copy: WorkflowFile = JSON.parse(JSON.stringify(workflow));

  for (const pair of copy.workflow) {
    // Each pair receives its own flag object; nothing is shared across pairs.
    pair.what.unshift({
      action: 'flag',
      args: ['generated'],
    });
  }

  return copy;
}
|
||||
|
||||
async function executeRun(id: string) {
|
||||
try {
|
||||
const run = await Run.findOne({ where: { runId: id } });
|
||||
@@ -106,13 +119,15 @@ async function executeRun(id: string) {
|
||||
throw new Error('Could not access browser');
|
||||
}
|
||||
|
||||
const currentPage = await browser.getCurrentPage();
|
||||
let currentPage = await browser.getCurrentPage();
|
||||
if (!currentPage) {
|
||||
throw new Error('Could not create a new page');
|
||||
}
|
||||
|
||||
const workflow = AddGeneratedFlags(recording.recording);
|
||||
const interpretationInfo = await browser.interpreter.InterpretRecording(
|
||||
recording.recording, currentPage, plainRun.interpreterSettings);
|
||||
workflow, currentPage, (newPage: Page) => currentPage = newPage, plainRun.interpreterSettings
|
||||
);
|
||||
|
||||
const binaryOutputService = new BinaryOutputService('maxun-run-screenshots');
|
||||
const uploadedBinaryOutput = await binaryOutputService.uploadAndStoreBinaryOutput(run, interpretationInfo.binaryOutput);
|
||||
|
||||
Reference in New Issue
Block a user