refactor: rename to maxun-core from mx-interpreter

This commit is contained in:
karishmas6
2024-07-31 20:40:14 +05:30
parent 30bfe50906
commit 30edc9a6ac
9 changed files with 1063 additions and 0 deletions

View File

@@ -0,0 +1,226 @@
/* eslint-disable @typescript-eslint/no-unused-vars */
const area = (element) => element.offsetHeight * element.offsetWidth;
function getBiggestElement(selector) {
const elements = Array.from(document.querySelectorAll(selector));
const biggest = elements.reduce(
(max, elem) => (
area(elem) > area(max) ? elem : max),
{ offsetHeight: 0, offsetWidth: 0 },
);
return biggest;
}
/**
* Generates structural selector (describing element by its DOM tree location).
*
* **The generated selector is not guaranteed to be unique!** (In fact, this is
* the desired behaviour in here.)
* @param {HTMLElement} element Element being described.
* @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
*/
function GetSelectorStructural(element) {
// Base conditions for the recursive approach.
if (element.tagName === 'BODY') {
return 'BODY';
}
const selector = element.tagName;
if (element.parentElement) {
return `${GetSelectorStructural(element.parentElement)} > ${selector}`;
}
return selector;
}
/**
* Heuristic method to find collections of "interesting" items on the page.
* @returns {Array<HTMLElement>} A collection of interesting DOM nodes
* (online store products, plane tickets, list items... and many more?)
*/
function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') {
const restoreScroll = (() => {
const { scrollX, scrollY } = window;
return () => {
window.scrollTo(scrollX, scrollY);
};
})();
/**
* @typedef {Array<{x: number, y: number}>} Grid
*/
/**
* Returns an array of grid-aligned {x,y} points.
* @param {number} [granularity=0.005] sets the number of generated points
* (the higher the granularity, the more points).
* @returns {Grid} Array of {x, y} objects.
*/
function getGrid(startX = 0, startY = 0, granularity = 0.005) {
const width = window.innerWidth;
const height = window.innerHeight;
const out = [];
for (let x = 0; x < width; x += 1 / granularity) {
for (let y = 0; y < height; y += 1 / granularity) {
out.push({ x: startX + x, y: startY + y });
}
}
return out;
}
let maxSelector = { selector: 'body', metric: 0 };
const updateMaximumWithPoint = (point) => {
const currentElement = document.elementFromPoint(point.x, point.y);
const selector = GetSelectorStructural(currentElement);
const elements = Array.from(document.querySelectorAll(selector))
.filter((element) => area(element) > minArea);
// If the current selector targets less than three elements,
// we consider it not interesting (would be a very underwhelming scraper)
if (elements.length < 3) {
return;
}
let metric = null;
if (metricType === 'total_area') {
metric = elements
.reduce((p, x) => p + area(x), 0);
} else if (metricType === 'size_deviation') {
// This could use a proper "statistics" approach... but meh, so far so good!
const sizes = elements
.map((element) => area(element));
metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes));
}
// console.debug(`Total ${metricType} is ${metric}.`)
if (metric > maxSelector.metric && elements.length < maxCountPerPage) {
maxSelector = { selector, metric };
}
};
for (let scroll = 0; scroll < scrolls; scroll += 1) {
window.scrollTo(0, scroll * window.innerHeight);
const grid = getGrid();
grid.forEach(updateMaximumWithPoint);
}
restoreScroll();
let out = Array.from(document.querySelectorAll(maxSelector.selector));
const different = (x, i, a) => a.findIndex((e) => e === x) === i;
// as long as we don't merge any two elements by substituing them for their parents,
// we substitute.
while (out.map((x) => x.parentElement).every(different)
&& out.forEach((x) => x.parentElement !== null)) {
out = out.map((x) => x.parentElement ?? x);
}
return out;
}
/**
* Returns a "scrape" result from the current page.
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
*/
function scrape(selector = null) {
/**
* **crudeRecords** contains uncurated rundowns of "scrapable" elements
* @type {Array<Object>}
*/
const crudeRecords = (selector
? Array.from(document.querySelectorAll(selector))
: scrapableHeuristics())
.map((record) => ({
...Array.from(record.querySelectorAll('img'))
.reduce((p, x, i) => {
let url = null;
if (x.srcset) {
const urls = x.srcset.split(', ');
[url] = urls[urls.length - 1].split(' ');
}
/**
* Contains the largest elements from `srcset` - if `srcset` is not present, contains
* URL from the `src` attribute
*
* If the `src` attribute contains a data url, imgUrl contains `undefined`.
*/
let imgUrl;
if (x.srcset) {
imgUrl = url;
} else if (x.src.indexOf('data:') === -1) {
imgUrl = x.src;
}
return ({
...p,
...(imgUrl ? { [`img_${i}`]: imgUrl } : {}),
});
}, {}),
...record.innerText.split('\n')
.reduce((p, x, i) => ({
...p,
[`record_${String(i).padStart(4, '0')}`]: x.trim(),
}), {}),
}));
return crudeRecords;
}
/**
* Given an object with named lists of elements,
* groups the elements by their distance in the DOM tree.
* @param {Object.<string, object[]>} lists The named lists of HTML elements.
* @returns {Array.<Object.<string, string>>}
*/
function scrapeSchema(lists) {
function omap(object, f, kf = (x) => x) {
return Object.fromEntries(
Object.entries(object)
.map(([k, v]) => [kf(k), f(v)]),
);
}
function ofilter(object, f) {
return Object.fromEntries(
Object.entries(object)
.filter(([k, v]) => f(k, v)),
);
}
function getSeedKey(listObj) {
const maxLength = Math.max(...Object.values(omap(listObj, (x) => x.length)));
return Object.keys(ofilter(listObj, (_, v) => v.length === maxLength))[0];
}
function getMBEs(elements) {
return elements.map((element) => {
let candidate = element;
const isUniqueChild = (e) => elements
.filter((elem) => e.parentNode?.contains(elem))
.length === 1;
while (candidate && isUniqueChild(candidate)) {
candidate = candidate.parentNode;
}
return candidate;
});
}
const seedName = getSeedKey(lists);
const MBEs = getMBEs(lists[seedName]);
return MBEs.map((mbe) => omap(
lists,
(listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText,
));
}

8
maxun-core/index.ts Normal file
View File

@@ -0,0 +1,8 @@
import Interpreter from './interpret';
export default Interpreter;
export { default as Preprocessor } from './preprocessor';
export type {
WorkflowFile, WhereWhatPair, Where, What,
} from './types/workflow';
export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';

459
maxun-core/interpret.ts Normal file
View File

@@ -0,0 +1,459 @@
/* eslint-disable no-await-in-loop, no-restricted-syntax */
import { Page, PageScreenshotOptions } from 'playwright';
import path from 'path';
import { EventEmitter } from 'events';
import {
Where, What, PageState, Workflow, WorkflowFile,
ParamType, SelectorArray, CustomFunctions,
} from './types/workflow';
import { operators, meta } from './types/logic';
import { arrayToObject } from './utils/utils';
import Concurrency from './utils/concurrency';
import Preprocessor from './preprocessor';
import log, { Level } from './utils/logger';
/**
* Defines optional intepreter options (passed in constructor)
*/
interface InterpreterOptions {
maxRepeats: number;
maxConcurrency: number;
serializableCallback: (output: any) => (void | Promise<void>);
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
debug: boolean;
debugChannel: Partial<{
activeId: Function,
debugMessage: Function,
}>
}
/**
* Class for running the Smart Workflows.
*/
export default class Interpreter extends EventEmitter {
private workflow: Workflow;
private initializedWorkflow: Workflow | null;
private options: InterpreterOptions;
private concurrency: Concurrency;
private stopper: Function | null = null;
private log: typeof log;
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
super();
this.workflow = workflow.workflow;
this.initializedWorkflow = null;
this.options = {
maxRepeats: 5,
maxConcurrency: 5,
serializableCallback: (data) => { log(JSON.stringify(data), Level.WARN); },
binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); },
debug: false,
debugChannel: {},
...options,
};
this.concurrency = new Concurrency(this.options.maxConcurrency);
this.log = (...args) => log(...args);
const error = Preprocessor.validateWorkflow(workflow);
if (error) {
throw (error);
}
if (this.options.debugChannel?.debugMessage) {
const oldLog = this.log;
// @ts-ignore
this.log = (...args: Parameters<typeof oldLog>) => {
if (args[1] !== Level.LOG) {
this.options.debugChannel.debugMessage!(typeof args[0] === 'string' ? args[0] : args[0].message);
}
oldLog(...args);
};
}
}
/**
* Returns the context object from given Page and the current workflow.\
* \
* `workflow` is used for selector extraction - function searches for used selectors to
* look for later in the page's context.
* @param page Playwright Page object
* @param workflow Current **initialized** workflow (array of where-what pairs).
* @returns {PageState} State of the current page.
*/
private async getState(page: Page, workflow: Workflow): Promise<PageState> {
/**
* All the selectors present in the current Workflow
*/
const selectors = Preprocessor.extractSelectors(workflow);
/**
* Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
* @param selector Selector to be queried
* @returns True if the targetted element is actionable, false otherwise.
*/
const actionable = async (selector: string): Promise<boolean> => {
try {
const proms = [
page.isEnabled(selector, { timeout: 500 }),
page.isVisible(selector, { timeout: 500 }),
];
return await Promise.all(proms).then((bools) => bools.every((x) => x));
} catch (e) {
// log(<Error>e, Level.ERROR);
return false;
}
};
/**
* Object of selectors present in the current page.
*/
const presentSelectors: SelectorArray = await Promise.all(
selectors.map(async (selector) => {
if (await actionable(selector)) {
return [selector];
}
return [];
}),
).then((x) => x.flat());
return {
url: page.url(),
cookies: (await page.context().cookies([page.url()]))
.reduce((p, cookie) => (
{
...p,
[cookie.name]: cookie.value,
}), {}),
selectors: presentSelectors,
};
}
/**
* Tests if the given action is applicable with the given context.
* @param where Tested *where* condition
* @param context Current browser context.
* @returns True if `where` is applicable in the given context, false otherwise
*/
private applicable(where: Where, context: PageState, usedActions: string[] = []): boolean {
/**
* Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\
* \
* For every key in `subset`, there must be a corresponding key with equal scalar
* value in `superset`, or `inclusive(subset[key], superset[key])` must hold.
* @param subset Arbitrary non-cyclic JS object (where clause)
* @param superset Arbitrary non-cyclic JS object (browser context)
* @returns `true` if `subset <= superset`, `false` otherwise.
*/
const inclusive = (subset: Record<string, unknown>, superset: Record<string, unknown>)
: boolean => (
Object.entries(subset).every(
([key, value]) => {
/**
* Arrays are compared without order (are transformed into objects before comparison).
*/
const parsedValue = Array.isArray(value) ? arrayToObject(value) : value;
const parsedSuperset: Record<string, unknown> = {};
parsedSuperset[key] = Array.isArray(superset[key])
? arrayToObject(<any>superset[key])
: superset[key];
// Every `subset` key must exist in the `superset` and
// have the same value (strict equality), or subset[key] <= superset[key]
return parsedSuperset[key]
&& (
(parsedSuperset[key] === parsedValue)
|| ((parsedValue).constructor.name === 'RegExp' && (<RegExp>parsedValue).test(<string>parsedSuperset[key]))
|| (
(parsedValue).constructor.name !== 'RegExp'
&& typeof parsedValue === 'object' && inclusive(<typeof subset>parsedValue, <typeof superset>parsedSuperset[key])
)
);
},
)
);
// Every value in the "where" object should be compliant to the current state.
return Object.entries(where).every(
([key, value]) => {
if (operators.includes(<any>key)) {
const array = Array.isArray(value)
? value as Where[]
: Object.entries(value).map((a) => Object.fromEntries([a]));
// every condition is treated as a single context
switch (key as keyof typeof operators) {
case '$and':
return array?.every((x) => this.applicable(x, context));
case '$or':
return array?.some((x) => this.applicable(x, context));
case '$not':
return !this.applicable(<Where>value, context); // $not should be a unary operator
default:
throw new Error('Undefined logic operator.');
}
} else if (meta.includes(<any>key)) {
const testRegexString = (x: string) => {
if (typeof value === 'string') {
return x === value;
}
return (<RegExp><unknown>value).test(x);
};
switch (key as keyof typeof meta) {
case '$before':
return !usedActions.find(testRegexString);
case '$after':
return !!usedActions.find(testRegexString);
default:
throw new Error('Undefined meta operator.');
}
} else {
// Current key is a base condition (url, cookies, selectors)
return inclusive({ [key]: value }, context);
}
},
);
}
/**
* Given a Playwright's page object and a "declarative" list of actions, this function
* calls all mentioned functions on the Page object.\
* \
* Manipulates the iterator indexes (experimental feature, likely to be removed in
* the following versions of waw-interpreter)
* @param page Playwright Page object
* @param steps Array of actions.
*/
private async carryOutSteps(page: Page, steps: What[]): Promise<void> {
/**
* Defines overloaded (or added) methods/actions usable in the workflow.
* If a method overloads any existing method of the Page class, it accepts the same set
* of parameters *(but can override some!)*\
* \
* Also, following piece of code defines functions to be run in the browser's context.
* Beware of false linter errors - here, we know better!
*/
const wawActions: Record<CustomFunctions, (...args: any[]) => void> = {
screenshot: async (params: PageScreenshotOptions) => {
const screenshotBuffer = await page.screenshot({
...params, path: undefined,
});
await this.options.binaryCallback(screenshotBuffer, 'image/png');
},
enqueueLinks: async (selector: string) => {
const links: string[] = await page.locator(selector)
.evaluateAll(
// @ts-ignore
(elements) => elements.map((a) => a.href).filter((x) => x),
);
const context = page.context();
for (const link of links) {
// eslint-disable-next-line
this.concurrency.addJob(async () => {
try {
const newPage = await context.newPage();
await newPage.goto(link);
await newPage.waitForLoadState('networkidle');
await this.runLoop(newPage, this.initializedWorkflow!);
} catch (e) {
// `runLoop` uses soft mode, so it recovers from it's own exceptions
// but newPage(), goto() and waitForLoadState() don't (and will kill
// the interpreter by throwing).
this.log(<Error>e, Level.ERROR);
}
});
}
await page.close();
},
scrape: async (selector?: string) => {
const scrapeResults: Record<string, string>[] = <any>await page
// eslint-disable-next-line
// @ts-ignore
.evaluate((s) => scrape(s ?? null), selector);
await this.options.serializableCallback(scrapeResults);
},
scrapeSchema: async (schema: Record<string, string>) => {
const handleLists = await Promise.all(
Object.values(schema).map((selector) => page.$$(selector)),
);
const namedHandleLists = Object.fromEntries(
Object.keys(schema).map((key, i) => [key, handleLists[i]]),
);
const scrapeResult = await page.evaluate((n) => scrapeSchema(n), namedHandleLists);
this.options.serializableCallback(scrapeResult);
},
scroll: async (pages?: number) => {
await page.evaluate(async (pagesInternal) => {
for (let i = 1; i <= (pagesInternal ?? 1); i += 1) {
// @ts-ignore
window.scrollTo(0, window.scrollY + window.innerHeight);
}
}, pages ?? 1);
},
script: async (code: string) => {
const AsyncFunction: FunctionConstructor = Object.getPrototypeOf(
async () => { },
).constructor;
const x = new AsyncFunction('page', 'log', code);
await x(page, this.log);
},
flag: async () => new Promise((res) => {
this.emit('flag', page, res);
}),
};
for (const step of steps) {
this.log(`Launching ${step.action}`, Level.LOG);
if (step.action in wawActions) {
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
await wawActions[step.action as CustomFunctions](...(params ?? []));
} else {
// Implements the dot notation for the "method name" in the workflow
const levels = step.action.split('.');
const methodName = levels[levels.length - 1];
let invokee: any = page;
for (const level of levels.splice(0, levels.length - 1)) {
invokee = invokee[level];
}
if (!step.args || Array.isArray(step.args)) {
await (<any>invokee[methodName])(...(step.args ?? []));
} else {
await (<any>invokee[methodName])(step.args);
}
}
await new Promise((res) => { setTimeout(res, 500); });
}
}
private async runLoop(p: Page, workflow: Workflow) {
const usedActions: string[] = [];
let lastAction = null;
let repeatCount = 0;
/**
* Enables the interpreter functionality for popup windows.
* User-requested concurrency should be entirely managed by the concurrency manager,
* e.g. via `enqueueLinks`.
*/
p.on('popup', (popup) => {
this.concurrency.addJob(() => this.runLoop(popup, workflow));
});
/* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
while (true) {
// Checks whether the page was closed from outside,
// or the workflow execution has been stopped via `interpreter.stop()`
if (p.isClosed() || !this.stopper) {
return;
}
try {
await p.waitForLoadState();
} catch (e) {
await p.close();
return;
}
let pageState = {};
try {
pageState = await this.getState(p, workflow);
} catch (e: any) {
this.log('The browser has been closed.');
return;
}
if (this.options.debug) {
this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
}
const actionId = workflow.findIndex(
(step) => this.applicable(step.where, pageState, usedActions),
);
const action = workflow[actionId];
this.log(`Matched ${JSON.stringify(action?.where)}`, Level.LOG);
if (action) { // action is matched
if (this.options.debugChannel?.activeId) {
this.options.debugChannel.activeId(actionId);
}
repeatCount = action === lastAction ? repeatCount + 1 : 0;
if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) {
return;
}
lastAction = action;
try {
await this.carryOutSteps(p, action.what);
usedActions.push(action.id ?? 'undefined');
} catch (e) {
this.log(<Error>e, Level.ERROR);
}
} else {
return;
}
}
}
/**
* Spawns a browser context and runs given workflow.
* \
* Resolves after the playback is finished.
* @param {Page} [page] Page to run the workflow on.
* @param {ParamType} params Workflow specific, set of parameters
* for the `{$param: nameofparam}` fields.
*/
public async run(page: Page, params?: ParamType): Promise<void> {
if (this.stopper) {
throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.');
}
/**
* `this.workflow` with the parameters initialized.
*/
this.initializedWorkflow = Preprocessor.initWorkflow(this.workflow, params);
// @ts-ignore
if (await page.evaluate(() => !<any>window.scrape)) {
page.context().addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
}
this.stopper = () => {
this.stopper = null;
};
this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow!));
await this.concurrency.waitForCompletion();
this.stopper = null;
}
public async stop(): Promise<void> {
if (this.stopper) {
await this.stopper();
this.stopper = null;
} else {
throw new Error('Cannot stop, there is no running workflow!');
}
}
}

179
maxun-core/preprocessor.ts Normal file
View File

@@ -0,0 +1,179 @@
import Joi from 'joi';
import {
Workflow, WorkflowFile, ParamType, SelectorArray, Where,
} from './types/workflow';
import { operators } from './types/logic';
/**
* Class for static processing the workflow files/objects.
*/
export default class Preprocessor {
static validateWorkflow(workflow: WorkflowFile): any {
const regex = Joi.object({
$regex: Joi.string().required(),
});
const whereSchema = Joi.object({
url: [Joi.string().uri(), regex],
selectors: Joi.array().items(Joi.string()),
cookies: Joi.object({}).pattern(Joi.string(), Joi.string()),
$after: [Joi.string(), regex],
$before: [Joi.string(), regex],
$and: Joi.array().items(Joi.link('#whereSchema')),
$or: Joi.array().items(Joi.link('#whereSchema')),
$not: Joi.link('#whereSchema'),
}).id('whereSchema');
const schema = Joi.object({
meta: Joi.object({
name: Joi.string(),
desc: Joi.string(),
}),
workflow: Joi.array().items(
Joi.object({
id: Joi.string(),
where: whereSchema.required(),
what: Joi.array().items({
action: Joi.string().required(),
args: Joi.array().items(Joi.any()),
}).required(),
}),
).required(),
});
const { error } = schema.validate(workflow);
return error;
}
/**
* Extracts parameter names from the workflow.
* @param {WorkflowFile} workflow The given workflow
* @returns {String[]} List of parameters' names.
*/
static getParams(workflow: WorkflowFile): string[] {
const getParamsRecurse = (object: any): string[] => {
if (typeof object === 'object') {
// Recursion base case
if (object.$param) {
return [object.$param];
}
// Recursion general case
return Object.values(object)
.reduce((p: string[], v: any): string[] => [...p, ...getParamsRecurse(v)], []);
}
return [];
};
return getParamsRecurse(workflow.workflow);
}
/**
* List all the selectors used in the given workflow (only literal "selector"
* field in WHERE clauses so far)
*/
// TODO : add recursive selector search (also in click/fill etc. events?)
static extractSelectors(workflow: Workflow): SelectorArray {
/**
* Given a Where condition, this function extracts
* all the existing selectors from it (recursively).
*/
const selectorsFromCondition = (where: Where): SelectorArray => {
// the `selectors` field is either on the top level
let out = where.selectors ?? [];
if (!Array.isArray(out)) {
out = [out];
}
// or nested in the "operator" array
operators.forEach((op) => {
let condWhere = where[op];
if (condWhere) {
condWhere = Array.isArray(condWhere) ? condWhere : [condWhere];
(condWhere).forEach((subWhere) => {
out = [...out, ...selectorsFromCondition(subWhere)];
});
}
});
return out;
};
// Iterate through all the steps and extract the selectors from all of them.
return workflow.reduce((p: SelectorArray, step) => [
...p,
...selectorsFromCondition(step.where).filter((x) => !p.includes(x)),
], []);
}
/**
* Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects
* with the defined value.
* @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched).
*/
static initWorkflow(workflow: Workflow, params?: ParamType): Workflow {
const paramNames = this.getParams({ workflow });
if (Object.keys(params ?? {}).sort().join(',') !== paramNames.sort().join(',')) {
throw new Error(`Provided parameters do not match the workflow parameters
provided: ${Object.keys(params ?? {}).sort().join(',')},
expected: ${paramNames.sort().join(',')}
`);
}
/**
* A recursive method for initializing special `{key: value}` syntax objects in the workflow.
* @param object Workflow to initialize (or a part of it).
* @param k key to look for ($regex, $param)
* @param f function mutating the special `{}` syntax into
* its true representation (RegExp...)
* @returns Updated object
*/
const initSpecialRecurse = (
object: unknown,
k: string,
f: (value: string) => unknown,
): unknown => {
if (!object || typeof object !== 'object') {
return object;
}
const out = object;
// for every key (child) of the object
Object.keys(object!).forEach((key) => {
// if the field has only one key, which is `k`
if (Object.keys((<any>object)[key]).length === 1 && (<any>object)[key][k]) {
// process the current special tag (init param, hydrate regex...)
(<any>out)[key] = f((<any>object)[key][k]);
} else {
initSpecialRecurse((<any>object)[key], k, f);
}
});
return out;
};
// TODO: do better deep copy, this is hideous.
let workflowCopy = JSON.parse(JSON.stringify(workflow));
if (params) {
workflowCopy = initSpecialRecurse(
workflowCopy,
'$param',
(paramName) => {
if (params && params[paramName]) {
return params[paramName];
}
throw new SyntaxError(`Unspecified parameter found ${paramName}.`);
},
);
}
workflowCopy = initSpecialRecurse(
workflowCopy,
'$regex',
(regex) => new RegExp(regex),
);
return <Workflow>workflowCopy;
}
}

View File

@@ -0,0 +1,5 @@
export const unaryOperators = ['$not'] as const;
export const naryOperators = ['$and', '$or'] as const;
export const operators = [...unaryOperators, ...naryOperators] as const;
export const meta = ['$before', '$after'] as const;

View File

@@ -0,0 +1,58 @@
import { Page } from 'playwright';
import {
naryOperators, unaryOperators, operators, meta,
} from './logic';
export type Operator = typeof operators[number];
export type UnaryOperator = typeof unaryOperators[number];
export type NAryOperator = typeof naryOperators[number];
export type Meta = typeof meta[number];
export type SelectorArray = string[];
type RegexableString = string | { '$regex': string };
type BaseConditions = {
'url': RegexableString,
'cookies': Record<string, RegexableString>,
'selectors': SelectorArray, // (CSS/Playwright) selectors use their own logic, there is no reason (and several technical difficulties) to allow regular expression notation
} & Record<Meta, RegexableString>;
export type Where =
Partial<{ [key in NAryOperator]: Where[] }> & // either a logic operator (arity N)
Partial<{ [key in UnaryOperator]: Where }> & // or an unary operator
Partial<BaseConditions>; // or one of the base conditions
type MethodNames<T> = {
[K in keyof T]: T[K] extends Function ? K : never;
}[keyof T];
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag';
export type What = {
action: MethodNames<Page> | CustomFunctions,
args?: any[]
};
export type PageState = Partial<BaseConditions>;
export type ParamType = Record<string, any>;
export type MetaData = {
name?: string,
desc?: string,
};
export interface WhereWhatPair {
id?: string
where: Where
what: What[]
}
export type Workflow = WhereWhatPair[];
export type WorkflowFile = {
meta?: MetaData,
workflow: Workflow
};

View File

@@ -0,0 +1,85 @@
/**
* Concurrency class for running concurrent tasks while managing a limited amount of resources.
*/
export default class Concurrency {
/**
* Maximum number of workers running in parallel. If set to `null`, there is no limit.
*/
maxConcurrency: number = 1;
/**
* Number of currently active workers.
*/
activeWorkers: number = 0;
/**
* Queue of jobs waiting to be completed.
*/
private jobQueue: Function[] = [];
/**
* "Resolve" callbacks of the waitForCompletion() promises.
*/
private waiting: Function[] = [];
/**
* Constructs a new instance of concurrency manager.
* @param {number} maxConcurrency Maximum number of workers running in parallel.
*/
constructor(maxConcurrency: number) {
this.maxConcurrency = maxConcurrency;
}
/**
* Takes a waiting job out of the queue and runs it.
*/
private runNextJob(): void {
const job = this.jobQueue.pop();
if (job) {
// console.debug("Running a job...");
job().then(() => {
// console.debug("Job finished, running the next waiting job...");
this.runNextJob();
});
} else {
// console.debug("No waiting job found!");
this.activeWorkers -= 1;
if (this.activeWorkers === 0) {
// console.debug("This concurrency manager is idle!");
this.waiting.forEach((x) => x());
}
}
}
/**
* Pass a job (a time-demanding async function) to the concurrency manager. \
* The time of the job's execution depends on the concurrency manager itself
* (given a generous enough `maxConcurrency` value, it might be immediate,
* but this is not guaranteed).
* @param worker Async function to be executed (job to be processed).
*/
addJob(job: () => Promise<any>): void {
// console.debug("Adding a worker!");
this.jobQueue.push(job);
if (!this.maxConcurrency || this.activeWorkers < this.maxConcurrency) {
this.runNextJob();
this.activeWorkers += 1;
} else {
// console.debug("No capacity to run a worker now, waiting!");
}
}
/**
* Waits until there is no running nor waiting job. \
* If the concurrency manager is idle at the time of calling this function,
* it waits until at least one job is compeleted (can be "presubscribed").
* @returns Promise, resolved after there is no running/waiting worker.
*/
waitForCompletion(): Promise<void> {
return new Promise((res) => {
this.waiting.push(res);
});
}
}

View File

@@ -0,0 +1,30 @@
/*
* Logger class for more detailed and comprehensible logs (with colors and timestamps)
*/
export enum Level {
DATE = 36,
LOG = 0,
WARN = 93,
ERROR = 31,
DEBUG = 95,
RESET = 0,
}
export default function logger(
message: string | Error,
level: (Level.LOG | Level.WARN | Level.ERROR | Level.DEBUG) = Level.LOG,
) {
let m = message;
if (message.constructor.name.includes('Error') && typeof message !== 'string') {
m = <Error><unknown>(message).message;
}
process.stdout.write(`\x1b[${Level.DATE}m[${(new Date()).toLocaleString()}]\x1b[0m `);
process.stdout.write(`\x1b[${level}m`);
if (level === Level.ERROR || level === Level.WARN) {
process.stderr.write(<string>m);
} else {
process.stdout.write(<string>m);
}
process.stdout.write(`\x1b[${Level.RESET}m\n`);
}

13
maxun-core/utils/utils.ts Normal file
View File

@@ -0,0 +1,13 @@
/**
* ESLint rule in case there is only one util function
* (it still does not represent the "utils" file)
*/
/* eslint-disable import/prefer-default-export */
/**
* Converts an array of scalars to an object with **items** of the array **for keys**.
*/
export function arrayToObject(array : any[]) {
return array.reduce((p, x) => ({ ...p, [x]: [] }), {});
}