2024-07-31 20:40:14 +05:30
|
|
|
/* eslint-disable no-await-in-loop, no-restricted-syntax */
|
2024-10-27 18:16:48 +05:30
|
|
|
import { Page, PageScreenshotOptions } from 'playwright';
|
2024-08-21 05:25:54 +05:30
|
|
|
import { PlaywrightBlocker } from '@cliqz/adblocker-playwright';
|
2024-08-21 05:06:34 +05:30
|
|
|
import fetch from 'cross-fetch';
|
2024-07-31 20:40:14 +05:30
|
|
|
import path from 'path';
|
|
|
|
|
|
|
|
|
|
import { EventEmitter } from 'events';
|
|
|
|
|
import {
|
|
|
|
|
Where, What, PageState, Workflow, WorkflowFile,
|
|
|
|
|
ParamType, SelectorArray, CustomFunctions,
|
|
|
|
|
} from './types/workflow';
|
|
|
|
|
|
|
|
|
|
import { operators, meta } from './types/logic';
|
|
|
|
|
import { arrayToObject } from './utils/utils';
|
|
|
|
|
import Concurrency from './utils/concurrency';
|
|
|
|
|
import Preprocessor from './preprocessor';
|
|
|
|
|
import log, { Level } from './utils/logger';
|
|
|
|
|
|
2024-12-03 17:51:06 +05:30
|
|
|
/**
 * Extending the Window interface for custom scraping functions.
 *
 * These helpers are NOT part of the DOM — they are injected into the page
 * by `ensureScriptsLoaded` (from `browserSide/scraper.js`) and invoked via
 * `page.evaluate` from the interpreter's actions.
 */
declare global {
  interface Window {
    // Scrapes the page (optionally scoped to `selector`) into row objects.
    scrape: (selector: string | null) => Record<string, string>[];
    // Scrapes one record according to a per-field selector/tag/attribute schema.
    scrapeSchema: (
      schema: Record<string, { selector: string; tag: string; attribute: string }>
    ) => Record<string, any>;
    // Scrapes a list of records; `pagination` handling happens on the Node side
    // (see `handlePagination`), the browser side only extracts currently visible items.
    scrapeList: (config: { listSelector: string; fields: any; limit?: number; pagination: any }) => Record<string, any>[];
    // Auto-detects list items under `listSelector`, returning selector + text pairs.
    scrapeListAuto: (listSelector: string) => { selector: string; innerText: string }[];
    // Scrolls the viewport down/up by `pages` viewport heights (default presumably 1 —
    // defined in the injected script; verify there).
    scrollDown: (pages?: number) => void;
    scrollUp: (pages?: number) => void;
  }
}
|
|
|
|
|
|
|
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
/**
 * Defines optional interpreter options (passed in constructor)
 */
interface InterpreterOptions {
  // Maximum number of consecutive repeats of the same matched action
  // before the run loop bails out (guards against infinite loops).
  maxRepeats: number;
  // Upper bound on concurrently interpreted pages (popups, enqueued links).
  maxConcurrency: number;
  // Receives JSON-serializable scraper output (scrape/scrapeSchema/scrapeList results).
  serializableCallback: (output: any) => (void | Promise<void>);
  // Receives binary output (e.g. screenshot buffers) together with its MIME type.
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
  // When true, the run loop logs the full page state on every iteration.
  debug: boolean;
  // Optional hooks mirroring interpreter activity to an external debugger UI.
  debugChannel: Partial<{
    activeId: Function,
    debugMessage: Function,
  }>
}
|
|
|
|
|
|
|
|
|
|
/**
 * Class for running the Smart Workflows.
 */
export default class Interpreter extends EventEmitter {
  // Raw workflow (array of where-what pairs) as passed to the constructor.
  private workflow: Workflow;

  // Workflow with `{$param: ...}` placeholders resolved; set by `run()`.
  private initializedWorkflow: Workflow | null;

  // Effective options: defaults merged with constructor overrides.
  private options: InterpreterOptions;

  // Job queue bounding the number of concurrently interpreted pages.
  private concurrency: Concurrency;

  // Non-null while a workflow is running; invoking it requests a stop.
  private stopper: Function | null = null;

  // Logger; wrapped in the constructor when a debugMessage channel is set.
  private log: typeof log;

  // Shared ad-blocker, initialized asynchronously in the constructor
  // (may still be null when the first page is processed).
  private blocker: PlaywrightBlocker | null = null;

  // Accumulates scrapeSchema results across actions within a run.
  private cumulativeResults: Record<string, any>[] = [];
|
|
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
/**
 * Validates the given workflow, merges options with defaults, wires the
 * optional debug channel into the logger and kicks off asynchronous
 * ad-blocker initialization.
 * @param workflow Workflow file to interpret (validated via Preprocessor).
 * @param options Optional overrides for {@link InterpreterOptions}.
 * @throws Whatever `Preprocessor.validateWorkflow` returns, if the workflow is invalid.
 */
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
  super();
  this.workflow = workflow.workflow;
  this.initializedWorkflow = null;
  this.options = {
    maxRepeats: 5,
    maxConcurrency: 5,
    serializableCallback: (data) => {
      log(JSON.stringify(data), Level.WARN);
    },
    binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); },
    debug: false,
    debugChannel: {},
    ...options,
  };
  this.concurrency = new Concurrency(this.options.maxConcurrency);
  this.log = (...args) => log(...args);

  const error = Preprocessor.validateWorkflow(workflow);
  if (error) {
    throw (error);
  }

  // Mirror every non-LOG message into the external debug channel,
  // while still forwarding everything to the base logger.
  if (this.options.debugChannel?.debugMessage) {
    const oldLog = this.log;
    // @ts-ignore
    this.log = (...args: Parameters<typeof oldLog>) => {
      if (args[1] !== Level.LOG) {
        this.options.debugChannel.debugMessage!(typeof args[0] === 'string' ? args[0] : args[0].message);
      }
      oldLog(...args);
    };
  }

  // Fire-and-forget: pages processed before this resolves simply run un-blocked.
  PlaywrightBlocker.fromPrebuiltAdsAndTracking(fetch).then(blocker => {
    this.blocker = blocker;
  }).catch(err => {
    // BUGFIX: previously the caught error was dropped entirely, leaving a
    // message with no cause — include it so failures are diagnosable.
    this.log(`Failed to initialize ad-blocker: ${err?.message ?? err}`, Level.ERROR);
  });
}
|
|
|
|
|
|
2024-08-21 05:14:58 +05:30
|
|
|
private async applyAdBlocker(page: Page): Promise<void> {
|
|
|
|
|
if (this.blocker) {
|
|
|
|
|
await this.blocker.enableBlockingInPage(page);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-21 05:24:41 +05:30
|
|
|
private async disableAdBlocker(page: Page): Promise<void> {
|
|
|
|
|
if (this.blocker) {
|
2024-08-21 05:24:52 +05:30
|
|
|
await this.blocker.disableBlockingInPage(page);
|
2024-08-21 05:24:41 +05:30
|
|
|
}
|
2024-08-21 05:24:52 +05:30
|
|
|
}
|
2024-08-21 05:24:41 +05:30
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
/**
|
|
|
|
|
* Returns the context object from given Page and the current workflow.\
|
|
|
|
|
* \
|
|
|
|
|
* `workflow` is used for selector extraction - function searches for used selectors to
|
|
|
|
|
* look for later in the page's context.
|
|
|
|
|
* @param page Playwright Page object
|
|
|
|
|
* @param workflow Current **initialized** workflow (array of where-what pairs).
|
|
|
|
|
* @returns {PageState} State of the current page.
|
|
|
|
|
*/
|
|
|
|
|
private async getState(page: Page, workflow: Workflow): Promise<PageState> {
|
|
|
|
|
/**
|
|
|
|
|
* All the selectors present in the current Workflow
|
|
|
|
|
*/
|
|
|
|
|
const selectors = Preprocessor.extractSelectors(workflow);
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
|
|
|
|
|
* @param selector Selector to be queried
|
|
|
|
|
* @returns True if the targetted element is actionable, false otherwise.
|
|
|
|
|
*/
|
|
|
|
|
const actionable = async (selector: string): Promise<boolean> => {
|
|
|
|
|
try {
|
|
|
|
|
const proms = [
|
|
|
|
|
page.isEnabled(selector, { timeout: 500 }),
|
|
|
|
|
page.isVisible(selector, { timeout: 500 }),
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
return await Promise.all(proms).then((bools) => bools.every((x) => x));
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// log(<Error>e, Level.ERROR);
|
|
|
|
|
return false;
|
|
|
|
|
}
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Object of selectors present in the current page.
|
|
|
|
|
*/
|
|
|
|
|
const presentSelectors: SelectorArray = await Promise.all(
|
|
|
|
|
selectors.map(async (selector) => {
|
|
|
|
|
if (await actionable(selector)) {
|
|
|
|
|
return [selector];
|
|
|
|
|
}
|
|
|
|
|
return [];
|
|
|
|
|
}),
|
|
|
|
|
).then((x) => x.flat());
|
|
|
|
|
|
|
|
|
|
return {
|
|
|
|
|
url: page.url(),
|
|
|
|
|
cookies: (await page.context().cookies([page.url()]))
|
|
|
|
|
.reduce((p, cookie) => (
|
|
|
|
|
{
|
|
|
|
|
...p,
|
|
|
|
|
[cookie.name]: cookie.value,
|
|
|
|
|
}), {}),
|
|
|
|
|
selectors: presentSelectors,
|
|
|
|
|
};
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/**
 * Tests if the given action is applicable with the given context.
 * @param where Tested *where* condition
 * @param context Current browser context.
 * @param usedActions Ids of actions already executed in this run
 * (consumed by the `$before`/`$after` meta operators).
 * @returns True if `where` is applicable in the given context, false otherwise
 */
private applicable(where: Where, context: PageState, usedActions: string[] = []): boolean {
  /**
   * Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\
   * \
   * For every key in `subset`, there must be a corresponding key with equal scalar
   * value in `superset`, or `inclusive(subset[key], superset[key])` must hold.
   * @param subset Arbitrary non-cyclic JS object (where clause)
   * @param superset Arbitrary non-cyclic JS object (browser context)
   * @returns `true` if `subset <= superset`, `false` otherwise.
   */
  const inclusive = (subset: Record<string, unknown>, superset: Record<string, unknown>)
  : boolean => (
    Object.entries(subset).every(
      ([key, value]) => {
        // Arrays are compared without order (are transformed into objects before comparison).
        const parsedValue = Array.isArray(value) ? arrayToObject(value) : value;

        const parsedSuperset: Record<string, unknown> = {};
        parsedSuperset[key] = Array.isArray(superset[key])
          ? arrayToObject(<any>superset[key])
          : superset[key];

        // Every `subset` key must exist in the `superset` and
        // have the same value (strict equality), or subset[key] <= superset[key],
        // or the subset value is a RegExp matching the superset string.
        return parsedSuperset[key]
          && (
            (parsedSuperset[key] === parsedValue)
            || ((parsedValue).constructor.name === 'RegExp' && (<RegExp>parsedValue).test(<string>parsedSuperset[key]))
            || (
              (parsedValue).constructor.name !== 'RegExp'
              && typeof parsedValue === 'object' && inclusive(<typeof subset>parsedValue, <typeof superset>parsedSuperset[key])
            )
          );
      },
    )
  );

  // Every value in the "where" object should be compliant to the current state.
  return Object.entries(where).every(
    ([key, value]) => {
      if (operators.includes(<any>key)) {
        const array = Array.isArray(value)
          ? value as Where[]
          : Object.entries(value).map((a) => Object.fromEntries([a]));
        // every condition is treated as a single context

        switch (key as keyof typeof operators) {
          case '$and' as keyof typeof operators:
            // BUGFIX: `usedActions` is now forwarded into recursive calls —
            // previously nested `$before`/`$after` conditions always saw an
            // empty action history and evaluated incorrectly.
            return array?.every((x) => this.applicable(x, context, usedActions));
          case '$or' as keyof typeof operators:
            return array?.some((x) => this.applicable(x, context, usedActions));
          case '$not' as keyof typeof operators:
            return !this.applicable(<Where>value, context, usedActions); // $not should be a unary operator
          default:
            throw new Error('Undefined logic operator.');
        }
      } else if (meta.includes(<any>key)) {
        // `value` is either an exact action id or a RegExp matched against ids.
        const testRegexString = (x: string) => {
          if (typeof value === 'string') {
            return x === value;
          }
          return (<RegExp><unknown>value).test(x);
        };

        switch (key as keyof typeof meta) {
          case '$before' as keyof typeof meta:
            // Holds while the referenced action has NOT yet run.
            return !usedActions.find(testRegexString);
          case '$after' as keyof typeof meta:
            // Holds once the referenced action HAS run.
            return !!usedActions.find(testRegexString);
          default:
            throw new Error('Undefined meta operator.');
        }
      } else {
        // Current key is a base condition (url, cookies, selectors)
        return inclusive({ [key]: value }, context);
      }
    },
  );
}
|
|
|
|
|
|
|
|
|
|
/**
|
|
|
|
|
* Given a Playwright's page object and a "declarative" list of actions, this function
|
|
|
|
|
* calls all mentioned functions on the Page object.\
|
|
|
|
|
* \
|
|
|
|
|
* Manipulates the iterator indexes (experimental feature, likely to be removed in
|
2024-09-21 22:20:04 +05:30
|
|
|
* the following versions of maxun-core)
|
2024-07-31 20:40:14 +05:30
|
|
|
* @param page Playwright Page object
|
|
|
|
|
* @param steps Array of actions.
|
|
|
|
|
*/
|
|
|
|
|
private async carryOutSteps(page: Page, steps: What[]): Promise<void> {
|
|
|
|
|
/**
|
|
|
|
|
* Defines overloaded (or added) methods/actions usable in the workflow.
|
|
|
|
|
* If a method overloads any existing method of the Page class, it accepts the same set
|
|
|
|
|
* of parameters *(but can override some!)*\
|
|
|
|
|
* \
|
|
|
|
|
* Also, following piece of code defines functions to be run in the browser's context.
|
|
|
|
|
* Beware of false linter errors - here, we know better!
|
|
|
|
|
*/
|
|
|
|
|
const wawActions: Record<CustomFunctions, (...args: any[]) => void> = {
|
|
|
|
|
screenshot: async (params: PageScreenshotOptions) => {
|
|
|
|
|
const screenshotBuffer = await page.screenshot({
|
|
|
|
|
...params, path: undefined,
|
|
|
|
|
});
|
|
|
|
|
await this.options.binaryCallback(screenshotBuffer, 'image/png');
|
|
|
|
|
},
|
|
|
|
|
enqueueLinks: async (selector: string) => {
|
|
|
|
|
const links: string[] = await page.locator(selector)
|
|
|
|
|
.evaluateAll(
|
|
|
|
|
// @ts-ignore
|
|
|
|
|
(elements) => elements.map((a) => a.href).filter((x) => x),
|
|
|
|
|
);
|
2024-10-27 18:16:48 +05:30
|
|
|
const context = page.context();
|
2024-07-31 20:40:14 +05:30
|
|
|
|
|
|
|
|
for (const link of links) {
|
|
|
|
|
// eslint-disable-next-line
|
|
|
|
|
this.concurrency.addJob(async () => {
|
|
|
|
|
try {
|
2024-10-27 18:16:48 +05:30
|
|
|
const newPage = await context.newPage();
|
2024-07-31 20:40:14 +05:30
|
|
|
await newPage.goto(link);
|
|
|
|
|
await newPage.waitForLoadState('networkidle');
|
|
|
|
|
await this.runLoop(newPage, this.initializedWorkflow!);
|
|
|
|
|
} catch (e) {
|
|
|
|
|
// `runLoop` uses soft mode, so it recovers from it's own exceptions
|
|
|
|
|
// but newPage(), goto() and waitForLoadState() don't (and will kill
|
|
|
|
|
// the interpreter by throwing).
|
|
|
|
|
this.log(<Error>e, Level.ERROR);
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
}
|
|
|
|
|
await page.close();
|
|
|
|
|
},
|
|
|
|
|
scrape: async (selector?: string) => {
|
2024-08-02 19:05:41 +05:30
|
|
|
await this.ensureScriptsLoaded(page);
|
2024-08-03 02:20:59 +05:30
|
|
|
|
2024-08-01 19:16:13 +05:30
|
|
|
const scrapeResults: Record<string, string>[] = await page.evaluate((s) => window.scrape(s ?? null), selector);
|
2024-07-31 20:40:14 +05:30
|
|
|
await this.options.serializableCallback(scrapeResults);
|
2024-08-03 02:20:59 +05:30
|
|
|
},
|
|
|
|
|
|
2024-08-04 03:50:03 +05:30
|
|
|
scrapeSchema: async (schema: Record<string, { selector: string; tag: string, attribute: string; }>) => {
|
2024-08-03 02:20:59 +05:30
|
|
|
await this.ensureScriptsLoaded(page);
|
2024-12-03 17:51:06 +05:30
|
|
|
|
2024-08-03 20:32:12 +05:30
|
|
|
const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);
|
2024-12-03 17:51:06 +05:30
|
|
|
|
2024-12-03 22:21:26 +05:30
|
|
|
const newResults = Array.isArray(scrapeResult) ? scrapeResult : [scrapeResult];
|
|
|
|
|
newResults.forEach((result) => {
|
|
|
|
|
Object.entries(result).forEach(([key, value]) => {
|
|
|
|
|
const keyExists = this.cumulativeResults.some(
|
|
|
|
|
(item) => key in item && item[key] !== undefined
|
|
|
|
|
);
|
|
|
|
|
|
|
|
|
|
if (!keyExists) {
|
|
|
|
|
this.cumulativeResults.push({ [key]: value });
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
});
|
2024-12-03 17:51:06 +05:30
|
|
|
|
|
|
|
|
const mergedResult: Record<string, string>[] = [
|
2024-12-03 22:21:26 +05:30
|
|
|
Object.fromEntries(
|
2024-12-03 17:51:06 +05:30
|
|
|
Object.entries(
|
|
|
|
|
this.cumulativeResults.reduce((acc, curr) => {
|
|
|
|
|
Object.entries(curr).forEach(([key, value]) => {
|
|
|
|
|
// If the key doesn't exist or the current value is not undefined, add/update it
|
|
|
|
|
if (value !== undefined) {
|
|
|
|
|
acc[key] = value;
|
|
|
|
|
}
|
|
|
|
|
});
|
|
|
|
|
return acc;
|
|
|
|
|
}, {})
|
|
|
|
|
)
|
|
|
|
|
)
|
|
|
|
|
];
|
|
|
|
|
|
|
|
|
|
// Log cumulative results after each action
|
|
|
|
|
console.log("CUMULATIVE results:", this.cumulativeResults);
|
|
|
|
|
console.log("MERGED results:", mergedResult);
|
|
|
|
|
|
|
|
|
|
await this.options.serializableCallback(mergedResult);
|
2024-08-03 02:20:59 +05:30
|
|
|
},
|
|
|
|
|
|
2024-08-16 18:21:34 +05:30
|
|
|
scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }) => {
|
2024-08-06 23:20:10 +05:30
|
|
|
await this.ensureScriptsLoaded(page);
|
2024-08-18 22:22:43 +05:30
|
|
|
if (!config.pagination) {
|
|
|
|
|
const scrapeResults: Record<string, any>[] = await page.evaluate((cfg) => window.scrapeList(cfg), config);
|
|
|
|
|
await this.options.serializableCallback(scrapeResults);
|
|
|
|
|
} else {
|
|
|
|
|
const scrapeResults: Record<string, any>[] = await this.handlePagination(page, config);
|
|
|
|
|
await this.options.serializableCallback(scrapeResults);
|
|
|
|
|
}
|
2024-08-06 23:20:10 +05:30
|
|
|
},
|
|
|
|
|
|
2024-08-12 06:28:08 +05:30
|
|
|
scrapeListAuto: async (config: { listSelector: string }) => {
|
|
|
|
|
await this.ensureScriptsLoaded(page);
|
2024-08-12 06:35:00 +05:30
|
|
|
|
2024-08-12 06:28:08 +05:30
|
|
|
const scrapeResults: { selector: string, innerText: string }[] = await page.evaluate((listSelector) => {
|
|
|
|
|
return window.scrapeListAuto(listSelector);
|
|
|
|
|
}, config.listSelector);
|
2024-08-12 06:35:00 +05:30
|
|
|
|
2024-08-12 06:28:08 +05:30
|
|
|
await this.options.serializableCallback(scrapeResults);
|
|
|
|
|
},
|
|
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
scroll: async (pages?: number) => {
|
|
|
|
|
await page.evaluate(async (pagesInternal) => {
|
|
|
|
|
for (let i = 1; i <= (pagesInternal ?? 1); i += 1) {
|
|
|
|
|
// @ts-ignore
|
|
|
|
|
window.scrollTo(0, window.scrollY + window.innerHeight);
|
|
|
|
|
}
|
|
|
|
|
}, pages ?? 1);
|
|
|
|
|
},
|
2024-08-12 06:35:00 +05:30
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
script: async (code: string) => {
|
|
|
|
|
const AsyncFunction: FunctionConstructor = Object.getPrototypeOf(
|
|
|
|
|
async () => { },
|
|
|
|
|
).constructor;
|
|
|
|
|
const x = new AsyncFunction('page', 'log', code);
|
|
|
|
|
await x(page, this.log);
|
|
|
|
|
},
|
2024-08-18 22:19:24 +05:30
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
flag: async () => new Promise((res) => {
|
|
|
|
|
this.emit('flag', page, res);
|
|
|
|
|
}),
|
|
|
|
|
};
|
|
|
|
|
|
|
|
|
|
for (const step of steps) {
|
2024-12-03 17:51:06 +05:30
|
|
|
this.log(`Launching ${String(step.action)}`, Level.LOG);
|
2024-07-31 20:40:14 +05:30
|
|
|
|
|
|
|
|
if (step.action in wawActions) {
|
|
|
|
|
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
|
|
|
|
|
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
|
|
|
|
|
await wawActions[step.action as CustomFunctions](...(params ?? []));
|
|
|
|
|
} else {
|
|
|
|
|
// Implements the dot notation for the "method name" in the workflow
|
2024-12-03 17:51:06 +05:30
|
|
|
const levels = String(step.action).split('.');
|
2024-07-31 20:40:14 +05:30
|
|
|
const methodName = levels[levels.length - 1];
|
|
|
|
|
|
|
|
|
|
let invokee: any = page;
|
|
|
|
|
for (const level of levels.splice(0, levels.length - 1)) {
|
|
|
|
|
invokee = invokee[level];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (!step.args || Array.isArray(step.args)) {
|
|
|
|
|
await (<any>invokee[methodName])(...(step.args ?? []));
|
|
|
|
|
} else {
|
|
|
|
|
await (<any>invokee[methodName])(step.args);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
await new Promise((res) => { setTimeout(res, 500); });
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
|
2024-08-17 23:54:00 +05:30
|
|
|
/**
 * Drives list scraping across pages according to `config.pagination.type`:
 * - 'scrollDown' / 'scrollUp': scroll until the page height stops changing,
 *   then scrape once.
 * - 'clickNext': scrape, dedupe, click the next-page button, repeat.
 * - 'clickLoadMore': click the load-more button until it disappears or the
 *   height stops growing, then scrape once.
 * - anything else: single scrape.
 * Results are truncated to `config.limit` when set.
 * @param page Playwright Page to paginate over.
 * @param config List scraping configuration (same shape as `scrapeList`).
 * @returns All scraped records, possibly limited.
 */
private async handlePagination(page: Page, config: { listSelector: string, fields: any, limit?: number, pagination: any }) {
  let allResults: Record<string, any>[] = [];
  // Last observed scroll height/offset; shared by the scroll-based strategies.
  let previousHeight = 0;
  // track unique items per page to avoid re-scraping
  let scrapedItems: Set<string> = new Set<string>();

  while (true) {
    switch (config.pagination.type) {
      case 'scrollDown':
        // Scroll to the bottom and give lazy content time to load.
        await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
        await page.waitForTimeout(2000);

        const currentHeight = await page.evaluate(() => document.body.scrollHeight);
        // Height unchanged => no more content; scrape once and finish.
        if (currentHeight === previousHeight) {
          const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
          allResults = allResults.concat(finalResults);
          return allResults;
        }

        previousHeight = currentHeight;
        break;
      case 'scrollUp':
        // Scroll to the top; when already at offset 0, scrape once and finish.
        await page.evaluate(() => window.scrollTo(0, 0));
        await page.waitForTimeout(2000);

        const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
        if (currentTopHeight === 0) {
          const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
          allResults = allResults.concat(finalResults);
          return allResults;
        }

        previousHeight = currentTopHeight;
        break;
      case 'clickNext':
        // Scrape the current page before navigating away.
        const pageResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);

        // Filter out already scraped items
        const newResults = pageResults.filter(item => {
          const uniqueKey = JSON.stringify(item);
          if (scrapedItems.has(uniqueKey)) return false; // Ignore if already scraped
          scrapedItems.add(uniqueKey); // Mark as scraped
          return true;
        });

        allResults = allResults.concat(newResults);

        if (config.limit && allResults.length >= config.limit) {
          return allResults.slice(0, config.limit);
        }

        const nextButton = await page.$(config.pagination.selector);
        if (!nextButton) {
          return allResults; // No more pages to scrape
        }
        // NOTE(review): `waitForNavigation` will reject (and propagate) if the
        // "next" click updates the list without a navigation (AJAX pagination)
        // — verify against the target sites this is used on.
        await Promise.all([
          nextButton.click(),
          page.waitForNavigation({ waitUntil: 'networkidle' })
        ]);

        await page.waitForTimeout(1000);
        break;
      case 'clickLoadMore':
        while (true) {
          const loadMoreButton = await page.$(config.pagination.selector);
          if (!loadMoreButton) {
            // No more "Load More" button, so scrape the remaining items
            const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
            allResults = allResults.concat(finalResults);
            return allResults;
          }
          // Click the 'Load More' button to load additional items
          await loadMoreButton.click();
          await page.waitForTimeout(2000); // Wait for new items to load
          // After clicking 'Load More', scroll down to load more items
          await page.evaluate(() => window.scrollTo(0, document.body.scrollHeight));
          await page.waitForTimeout(2000);
          // Check if more items are available
          const currentHeight = await page.evaluate(() => document.body.scrollHeight);
          if (currentHeight === previousHeight) {
            // No more items loaded, return the scraped results
            const finalResults = await page.evaluate((cfg) => window.scrapeList(cfg), config);
            allResults = allResults.concat(finalResults);
            return allResults;
          }
          previousHeight = currentHeight;
          if (config.limit && allResults.length >= config.limit) {
            // If limit is set and reached, return the limited results
            allResults = allResults.slice(0, config.limit);
            break;
          }
        }
        break;
      default:
        // Unknown pagination type: behave like a single non-paginated scrape.
        const results = await page.evaluate((cfg) => window.scrapeList(cfg), config);
        allResults = allResults.concat(results);
        return allResults;
    }

    // Shared limit check for strategies that `break` out of the switch.
    if (config.limit && allResults.length >= config.limit) {
      allResults = allResults.slice(0, config.limit);
      break;
    }
  }

  return allResults;
}
|
|
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
/**
 * Main interpretation loop for a single page: repeatedly snapshots the page
 * state, finds the first applicable workflow action and carries it out,
 * until no action matches, the page is closed, the same action repeats
 * `maxRepeats` times, or the interpreter is stopped.
 * @param p Playwright Page this loop drives.
 * @param workflow **Initialized** workflow (array of where-what pairs).
 */
private async runLoop(p: Page, workflow: Workflow) {
  // apply ad-blocker to the current page
  await this.applyAdBlocker(p);
  const usedActions: string[] = [];
  let lastAction = null;
  let repeatCount = 0;

  /**
   * Enables the interpreter functionality for popup windows.
   * User-requested concurrency should be entirely managed by the concurrency manager,
   * e.g. via `enqueueLinks`.
   */
  p.on('popup', (popup) => {
    this.concurrency.addJob(() => this.runLoop(popup, workflow));
  });

  /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
  while (true) {
    // Checks whether the page was closed from outside,
    // or the workflow execution has been stopped via `interpreter.stop()`
    if (p.isClosed() || !this.stopper) {
      return;
    }

    try {
      await p.waitForLoadState();
    } catch (e) {
      await p.close();
      return;
    }

    let pageState = {};
    try {
      pageState = await this.getState(p, workflow);
    } catch (e: any) {
      this.log('The browser has been closed.');
      return;
    }

    if (this.options.debug) {
      this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
    }

    // First workflow entry whose `where` matches the current page state wins.
    const actionId = workflow.findIndex((step) => {
      const isApplicable = this.applicable(step.where, pageState, usedActions);
      // BUGFIX: leftover unconditional `console.log` tracing (three lines per
      // candidate step, every iteration) replaced with the instance logger,
      // gated behind the `debug` option.
      if (this.options.debug) {
        this.log(`Condition ${JSON.stringify(step.where)} evaluated to ${isApplicable}`, Level.LOG);
      }
      return isApplicable;
    });

    const action = workflow[actionId];

    this.log(`Matched ${JSON.stringify(action?.where)}`, Level.LOG);

    if (action) { // action is matched
      if (this.options.debugChannel?.activeId) {
        this.options.debugChannel.activeId(actionId);
      }

      // Bail out if the very same action keeps matching (likely a stuck page).
      repeatCount = action === lastAction ? repeatCount + 1 : 0;
      if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) {
        return;
      }
      lastAction = action;

      try {
        await this.carryOutSteps(p, action.what);
        usedActions.push(action.id ?? 'undefined');
      } catch (e) {
        // Soft mode: a failing step is logged and the loop continues.
        this.log(<Error>e, Level.ERROR);
      }
    } else {
      //await this.disableAdBlocker(p);
      return;
    }
  }
}
|
|
|
|
|
|
2024-08-02 19:05:41 +05:30
|
|
|
/**
 * Ensures the browser-side scraping helpers (`window.scrape`,
 * `window.scrapeSchema`, `window.scrapeList`, ...) are registered on the page.
 *
 * NOTE(review): `addInitScript` only injects the script on FUTURE navigations
 * of this page — it does not add the helpers to the already-loaded document.
 * If the helpers are missing and no navigation follows, the subsequent
 * `page.evaluate(() => window.scrape...)` calls would still fail; confirm a
 * navigation always happens before the helpers are used, or consider
 * `addScriptTag` for immediate injection.
 * @param page Page that must expose the scraping helpers.
 */
private async ensureScriptsLoaded(page: Page) {
  // Probe for every helper at once; any missing one triggers registration.
  const isScriptLoaded = await page.evaluate(() => typeof window.scrape === 'function' && typeof window.scrapeSchema === 'function' && typeof window.scrapeList === 'function' && typeof window.scrapeListAuto === 'function' && typeof window.scrollDown === 'function' && typeof window.scrollUp === 'function');
  if (!isScriptLoaded) {
    await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
  }
}
|
|
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
/**
|
|
|
|
|
* Spawns a browser context and runs given workflow.
|
|
|
|
|
* \
|
|
|
|
|
* Resolves after the playback is finished.
|
|
|
|
|
* @param {Page} [page] Page to run the workflow on.
|
|
|
|
|
* @param {ParamType} params Workflow specific, set of parameters
|
|
|
|
|
* for the `{$param: nameofparam}` fields.
|
|
|
|
|
*/
|
|
|
|
|
public async run(page: Page, params?: ParamType): Promise<void> {
|
2024-10-27 18:16:48 +05:30
|
|
|
this.log('Starting the workflow.', Level.LOG);
|
|
|
|
|
const context = page.context();
|
|
|
|
|
|
|
|
|
|
// Check proxy settings from context options
|
|
|
|
|
const contextOptions = (context as any)._options;
|
|
|
|
|
const hasProxy = !!contextOptions?.proxy;
|
|
|
|
|
|
|
|
|
|
this.log(`Proxy settings: ${hasProxy ? `Proxy is configured...` : 'No proxy configured...'}`);
|
|
|
|
|
|
|
|
|
|
if (hasProxy) {
|
|
|
|
|
if (contextOptions.proxy.username) {
|
|
|
|
|
this.log(`Proxy authenticated...`);
|
|
|
|
|
}
|
|
|
|
|
}
|
2024-07-31 20:40:14 +05:30
|
|
|
if (this.stopper) {
|
|
|
|
|
throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.');
|
|
|
|
|
}
|
|
|
|
|
/**
|
|
|
|
|
* `this.workflow` with the parameters initialized.
|
|
|
|
|
*/
|
|
|
|
|
this.initializedWorkflow = Preprocessor.initWorkflow(this.workflow, params);
|
|
|
|
|
|
2024-10-27 18:16:48 +05:30
|
|
|
await this.ensureScriptsLoaded(page);
|
2024-08-02 19:05:41 +05:30
|
|
|
|
2024-07-31 20:40:14 +05:30
|
|
|
this.stopper = () => {
|
|
|
|
|
this.stopper = null;
|
|
|
|
|
};
|
|
|
|
|
|
2024-10-27 18:16:48 +05:30
|
|
|
this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow!));
|
2024-07-31 20:40:14 +05:30
|
|
|
|
|
|
|
|
await this.concurrency.waitForCompletion();
|
|
|
|
|
|
|
|
|
|
this.stopper = null;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
public async stop(): Promise<void> {
|
|
|
|
|
if (this.stopper) {
|
|
|
|
|
await this.stopper();
|
|
|
|
|
this.stopper = null;
|
|
|
|
|
} else {
|
|
|
|
|
throw new Error('Cannot stop, there is no running workflow!');
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
}
|