diff --git a/maxun-core/browserSide/scraper.js b/maxun-core/browserSide/scraper.js new file mode 100644 index 00000000..c411f642 --- /dev/null +++ b/maxun-core/browserSide/scraper.js @@ -0,0 +1,226 @@ +/* eslint-disable @typescript-eslint/no-unused-vars */ + +const area = (element) => element.offsetHeight * element.offsetWidth; + +function getBiggestElement(selector) { + const elements = Array.from(document.querySelectorAll(selector)); + const biggest = elements.reduce( + (max, elem) => ( + area(elem) > area(max) ? elem : max), + { offsetHeight: 0, offsetWidth: 0 }, + ); + return biggest; +} + +/** + * Generates structural selector (describing element by its DOM tree location). + * + * **The generated selector is not guaranteed to be unique!** (In fact, this is + * the desired behaviour in here.) + * @param {HTMLElement} element Element being described. + * @returns {string} CSS-compliant selector describing the element's location in the DOM tree. + */ +function GetSelectorStructural(element) { + // Base conditions for the recursive approach. + if (element.tagName === 'BODY') { + return 'BODY'; + } + const selector = element.tagName; + if (element.parentElement) { + return `${GetSelectorStructural(element.parentElement)} > ${selector}`; + } + + return selector; +} + +/** + * Heuristic method to find collections of "interesting" items on the page. + * @returns {Array} A collection of interesting DOM nodes + * (online store products, plane tickets, list items... and many more?) + */ +function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') { + const restoreScroll = (() => { + const { scrollX, scrollY } = window; + return () => { + window.scrollTo(scrollX, scrollY); + }; + })(); + + /** +* @typedef {Array<{x: number, y: number}>} Grid +*/ + + /** + * Returns an array of grid-aligned {x,y} points. + * @param {number} [granularity=0.005] sets the number of generated points + * (the higher the granularity, the more points). + * @returns {Grid} Array of {x, y} objects. + */ + function getGrid(startX = 0, startY = 0, granularity = 0.005) { + const width = window.innerWidth; + const height = window.innerHeight; + + const out = []; + for (let x = 0; x < width; x += 1 / granularity) { + for (let y = 0; y < height; y += 1 / granularity) { + out.push({ x: startX + x, y: startY + y }); + } + } + return out; + } + + let maxSelector = { selector: 'body', metric: 0 }; + + const updateMaximumWithPoint = (point) => { + const currentElement = document.elementFromPoint(point.x, point.y); + const selector = GetSelectorStructural(currentElement); + + const elements = Array.from(document.querySelectorAll(selector)) + .filter((element) => area(element) > minArea); + + // If the current selector targets less than three elements, + // we consider it not interesting (would be a very underwhelming scraper) + if (elements.length < 3) { + return; + } + + let metric = null; + + if (metricType === 'total_area') { + metric = elements + .reduce((p, x) => p + area(x), 0); + } else if (metricType === 'size_deviation') { + // This could use a proper "statistics" approach... but meh, so far so good! + const sizes = elements + .map((element) => area(element)); + + metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes)); + } + + // console.debug(`Total ${metricType} is ${metric}.`) + if (metric > maxSelector.metric && elements.length < maxCountPerPage) { + maxSelector = { selector, metric }; + } + }; + + for (let scroll = 0; scroll < scrolls; scroll += 1) { + window.scrollTo(0, scroll * window.innerHeight); + + const grid = getGrid(); + + grid.forEach(updateMaximumWithPoint); + } + + restoreScroll(); + + let out = Array.from(document.querySelectorAll(maxSelector.selector)); + + const different = (x, i, a) => a.findIndex((e) => e === x) === i; + // as long as we don't merge any two elements by substituing them for their parents, + // we substitute. + while (out.map((x) => x.parentElement).every(different) + && out.forEach((x) => x.parentElement !== null)) { + out = out.map((x) => x.parentElement ?? x); + } + + return out; +} + +/** + * Returns a "scrape" result from the current page. + * @returns {Array} *Curated* array of scraped information (with sparse rows removed) + */ +function scrape(selector = null) { + /** + * **crudeRecords** contains uncurated rundowns of "scrapable" elements + * @type {Array} + */ + const crudeRecords = (selector + ? Array.from(document.querySelectorAll(selector)) + : scrapableHeuristics()) + .map((record) => ({ + ...Array.from(record.querySelectorAll('img')) + .reduce((p, x, i) => { + let url = null; + if (x.srcset) { + const urls = x.srcset.split(', '); + [url] = urls[urls.length - 1].split(' '); + } + + /** + * Contains the largest elements from `srcset` - if `srcset` is not present, contains + * URL from the `src` attribute + * + * If the `src` attribute contains a data url, imgUrl contains `undefined`. + */ + let imgUrl; + if (x.srcset) { + imgUrl = url; + } else if (x.src.indexOf('data:') === -1) { + imgUrl = x.src; + } + + return ({ + ...p, + ...(imgUrl ? { [`img_${i}`]: imgUrl } : {}), + }); + }, {}), + ...record.innerText.split('\n') + .reduce((p, x, i) => ({ + ...p, + [`record_${String(i).padStart(4, '0')}`]: x.trim(), + }), {}), + })); + + return crudeRecords; +} + +/** + * Given an object with named lists of elements, + * groups the elements by their distance in the DOM tree. + * @param {Object.} lists The named lists of HTML elements. + * @returns {Array.>} + */ +function scrapeSchema(lists) { + function omap(object, f, kf = (x) => x) { + return Object.fromEntries( + Object.entries(object) + .map(([k, v]) => [kf(k), f(v)]), + ); + } + + function ofilter(object, f) { + return Object.fromEntries( + Object.entries(object) + .filter(([k, v]) => f(k, v)), + ); + } + + function getSeedKey(listObj) { + const maxLength = Math.max(...Object.values(omap(listObj, (x) => x.length))); + return Object.keys(ofilter(listObj, (_, v) => v.length === maxLength))[0]; + } + + function getMBEs(elements) { + return elements.map((element) => { + let candidate = element; + const isUniqueChild = (e) => elements + .filter((elem) => e.parentNode?.contains(elem)) + .length === 1; + + while (candidate && isUniqueChild(candidate)) { + candidate = candidate.parentNode; + } + + return candidate; + }); + } + + const seedName = getSeedKey(lists); + const MBEs = getMBEs(lists[seedName]); + + return MBEs.map((mbe) => omap( + lists, + (listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText, + )); +} \ No newline at end of file diff --git a/maxun-core/index.ts b/maxun-core/index.ts new file mode 100644 index 00000000..571f5781 --- /dev/null +++ b/maxun-core/index.ts @@ -0,0 +1,8 @@ +import Interpreter from './interpret'; + +export default Interpreter; +export { default as Preprocessor } from './preprocessor'; +export type { + WorkflowFile, WhereWhatPair, Where, What, +} from './types/workflow'; +export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic'; \ No newline at end of file diff --git a/maxun-core/interpret.ts b/maxun-core/interpret.ts new file mode 100644 index 00000000..9ac32b0e --- /dev/null +++ b/maxun-core/interpret.ts @@ -0,0 +1,459 @@ +/* eslint-disable no-await-in-loop, no-restricted-syntax */ +import { Page, PageScreenshotOptions } from 'playwright'; +import path from 'path'; + +import { EventEmitter } from 'events'; +import { + Where, What, PageState, Workflow, WorkflowFile, + ParamType, SelectorArray, CustomFunctions, +} from './types/workflow'; + +import { operators, meta } from './types/logic'; +import { arrayToObject } from './utils/utils'; +import Concurrency from './utils/concurrency'; +import Preprocessor from './preprocessor'; +import log, { Level } from './utils/logger'; + +/** + * Defines optional intepreter options (passed in constructor) + */ +interface InterpreterOptions { + maxRepeats: number; + maxConcurrency: number; + serializableCallback: (output: any) => (void | Promise); + binaryCallback: (output: any, mimeType: string) => (void | Promise); + debug: boolean; + debugChannel: Partial<{ + activeId: Function, + debugMessage: Function, + }> +} + +/** + * Class for running the Smart Workflows. + */ +export default class Interpreter extends EventEmitter { + private workflow: Workflow; + + private initializedWorkflow: Workflow | null; + + private options: InterpreterOptions; + + private concurrency: Concurrency; + + private stopper: Function | null = null; + + private log: typeof log; + + constructor(workflow: WorkflowFile, options?: Partial) { + super(); + this.workflow = workflow.workflow; + this.initializedWorkflow = null; + this.options = { + maxRepeats: 5, + maxConcurrency: 5, + serializableCallback: (data) => { log(JSON.stringify(data), Level.WARN); }, + binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); }, + debug: false, + debugChannel: {}, + ...options, + }; + this.concurrency = new Concurrency(this.options.maxConcurrency); + this.log = (...args) => log(...args); + + const error = Preprocessor.validateWorkflow(workflow); + if (error) { + throw (error); + } + + if (this.options.debugChannel?.debugMessage) { + const oldLog = this.log; + // @ts-ignore + this.log = (...args: Parameters) => { + if (args[1] !== Level.LOG) { + this.options.debugChannel.debugMessage!(typeof args[0] === 'string' ? args[0] : args[0].message); + } + oldLog(...args); + }; + } + } + + /** + * Returns the context object from given Page and the current workflow.\ + * \ + * `workflow` is used for selector extraction - function searches for used selectors to + * look for later in the page's context. + * @param page Playwright Page object + * @param workflow Current **initialized** workflow (array of where-what pairs). + * @returns {PageState} State of the current page. + */ + private async getState(page: Page, workflow: Workflow): Promise { + /** + * All the selectors present in the current Workflow + */ + const selectors = Preprocessor.extractSelectors(workflow); + + /** + * Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability). + * @param selector Selector to be queried + * @returns True if the targetted element is actionable, false otherwise. + */ + const actionable = async (selector: string): Promise => { + try { + const proms = [ + page.isEnabled(selector, { timeout: 500 }), + page.isVisible(selector, { timeout: 500 }), + ]; + + return await Promise.all(proms).then((bools) => bools.every((x) => x)); + } catch (e) { + // log(e, Level.ERROR); + return false; + } + }; + + /** + * Object of selectors present in the current page. + */ + const presentSelectors: SelectorArray = await Promise.all( + selectors.map(async (selector) => { + if (await actionable(selector)) { + return [selector]; + } + return []; + }), + ).then((x) => x.flat()); + + return { + url: page.url(), + cookies: (await page.context().cookies([page.url()])) + .reduce((p, cookie) => ( + { + ...p, + [cookie.name]: cookie.value, + }), {}), + selectors: presentSelectors, + }; + } + + /** + * Tests if the given action is applicable with the given context. + * @param where Tested *where* condition + * @param context Current browser context. + * @returns True if `where` is applicable in the given context, false otherwise + */ + private applicable(where: Where, context: PageState, usedActions: string[] = []): boolean { + /** + * Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\ + * \ + * For every key in `subset`, there must be a corresponding key with equal scalar + * value in `superset`, or `inclusive(subset[key], superset[key])` must hold. + * @param subset Arbitrary non-cyclic JS object (where clause) + * @param superset Arbitrary non-cyclic JS object (browser context) + * @returns `true` if `subset <= superset`, `false` otherwise. + */ + const inclusive = (subset: Record, superset: Record) + : boolean => ( + Object.entries(subset).every( + ([key, value]) => { + /** + * Arrays are compared without order (are transformed into objects before comparison). + */ + const parsedValue = Array.isArray(value) ? arrayToObject(value) : value; + + const parsedSuperset: Record = {}; + parsedSuperset[key] = Array.isArray(superset[key]) + ? arrayToObject(superset[key]) + : superset[key]; + + // Every `subset` key must exist in the `superset` and + // have the same value (strict equality), or subset[key] <= superset[key] + return parsedSuperset[key] + && ( + (parsedSuperset[key] === parsedValue) + || ((parsedValue).constructor.name === 'RegExp' && (parsedValue).test(parsedSuperset[key])) + || ( + (parsedValue).constructor.name !== 'RegExp' + && typeof parsedValue === 'object' && inclusive(parsedValue, parsedSuperset[key]) + ) + ); + }, + ) + ); + + // Every value in the "where" object should be compliant to the current state. + return Object.entries(where).every( + ([key, value]) => { + if (operators.includes(key)) { + const array = Array.isArray(value) + ? value as Where[] + : Object.entries(value).map((a) => Object.fromEntries([a])); + // every condition is treated as a single context + + switch (key as keyof typeof operators) { + case '$and': + return array?.every((x) => this.applicable(x, context)); + case '$or': + return array?.some((x) => this.applicable(x, context)); + case '$not': + return !this.applicable(value, context); // $not should be a unary operator + default: + throw new Error('Undefined logic operator.'); + } + } else if (meta.includes(key)) { + const testRegexString = (x: string) => { + if (typeof value === 'string') { + return x === value; + } + + return (value).test(x); + }; + + switch (key as keyof typeof meta) { + case '$before': + return !usedActions.find(testRegexString); + case '$after': + return !!usedActions.find(testRegexString); + default: + throw new Error('Undefined meta operator.'); + } + } else { + // Current key is a base condition (url, cookies, selectors) + return inclusive({ [key]: value }, context); + } + }, + ); + } + + /** + * Given a Playwright's page object and a "declarative" list of actions, this function + * calls all mentioned functions on the Page object.\ + * \ + * Manipulates the iterator indexes (experimental feature, likely to be removed in + * the following versions of waw-interpreter) + * @param page Playwright Page object + * @param steps Array of actions. + */ + private async carryOutSteps(page: Page, steps: What[]): Promise { + /** + * Defines overloaded (or added) methods/actions usable in the workflow. + * If a method overloads any existing method of the Page class, it accepts the same set + * of parameters *(but can override some!)*\ + * \ + * Also, following piece of code defines functions to be run in the browser's context. + * Beware of false linter errors - here, we know better! + */ + const wawActions: Record void> = { + screenshot: async (params: PageScreenshotOptions) => { + const screenshotBuffer = await page.screenshot({ + ...params, path: undefined, + }); + await this.options.binaryCallback(screenshotBuffer, 'image/png'); + }, + enqueueLinks: async (selector: string) => { + const links: string[] = await page.locator(selector) + .evaluateAll( + // @ts-ignore + (elements) => elements.map((a) => a.href).filter((x) => x), + ); + const context = page.context(); + + for (const link of links) { + // eslint-disable-next-line + this.concurrency.addJob(async () => { + try { + const newPage = await context.newPage(); + await newPage.goto(link); + await newPage.waitForLoadState('networkidle'); + await this.runLoop(newPage, this.initializedWorkflow!); + } catch (e) { + // `runLoop` uses soft mode, so it recovers from it's own exceptions + // but newPage(), goto() and waitForLoadState() don't (and will kill + // the interpreter by throwing). + this.log(e, Level.ERROR); + } + }); + } + await page.close(); + }, + scrape: async (selector?: string) => { + const scrapeResults: Record[] = await page + // eslint-disable-next-line + // @ts-ignore + .evaluate((s) => scrape(s ?? null), selector); + await this.options.serializableCallback(scrapeResults); + }, + scrapeSchema: async (schema: Record) => { + const handleLists = await Promise.all( + Object.values(schema).map((selector) => page.$$(selector)), + ); + + const namedHandleLists = Object.fromEntries( + Object.keys(schema).map((key, i) => [key, handleLists[i]]), + ); + + const scrapeResult = await page.evaluate((n) => scrapeSchema(n), namedHandleLists); + + this.options.serializableCallback(scrapeResult); + }, + scroll: async (pages?: number) => { + await page.evaluate(async (pagesInternal) => { + for (let i = 1; i <= (pagesInternal ?? 1); i += 1) { + // @ts-ignore + window.scrollTo(0, window.scrollY + window.innerHeight); + } + }, pages ?? 1); + }, + script: async (code: string) => { + const AsyncFunction: FunctionConstructor = Object.getPrototypeOf( + async () => { }, + ).constructor; + const x = new AsyncFunction('page', 'log', code); + await x(page, this.log); + }, + flag: async () => new Promise((res) => { + this.emit('flag', page, res); + }), + }; + + for (const step of steps) { + this.log(`Launching ${step.action}`, Level.LOG); + + if (step.action in wawActions) { + // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not) + const params = !step.args || Array.isArray(step.args) ? step.args : [step.args]; + await wawActions[step.action as CustomFunctions](...(params ?? [])); + } else { + // Implements the dot notation for the "method name" in the workflow + const levels = step.action.split('.'); + const methodName = levels[levels.length - 1]; + + let invokee: any = page; + for (const level of levels.splice(0, levels.length - 1)) { + invokee = invokee[level]; + } + + if (!step.args || Array.isArray(step.args)) { + await (invokee[methodName])(...(step.args ?? [])); + } else { + await (invokee[methodName])(step.args); + } + } + + await new Promise((res) => { setTimeout(res, 500); }); + } + } + + private async runLoop(p: Page, workflow: Workflow) { + const usedActions: string[] = []; + let lastAction = null; + let repeatCount = 0; + + /** + * Enables the interpreter functionality for popup windows. + * User-requested concurrency should be entirely managed by the concurrency manager, + * e.g. via `enqueueLinks`. + */ + p.on('popup', (popup) => { + this.concurrency.addJob(() => this.runLoop(popup, workflow)); + }); + + /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */ + while (true) { + // Checks whether the page was closed from outside, + // or the workflow execution has been stopped via `interpreter.stop()` + if (p.isClosed() || !this.stopper) { + return; + } + + try { + await p.waitForLoadState(); + } catch (e) { + await p.close(); + return; + } + + let pageState = {}; + try { + pageState = await this.getState(p, workflow); + } catch (e: any) { + this.log('The browser has been closed.'); + return; + } + + if (this.options.debug) { + this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN); + } + const actionId = workflow.findIndex( + (step) => this.applicable(step.where, pageState, usedActions), + ); + + const action = workflow[actionId]; + + this.log(`Matched ${JSON.stringify(action?.where)}`, Level.LOG); + + if (action) { // action is matched + if (this.options.debugChannel?.activeId) { + this.options.debugChannel.activeId(actionId); + } + + repeatCount = action === lastAction ? repeatCount + 1 : 0; + if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) { + return; + } + lastAction = action; + + try { + await this.carryOutSteps(p, action.what); + usedActions.push(action.id ?? 'undefined'); + } catch (e) { + this.log(e, Level.ERROR); + } + } else { + return; + } + } + } + + /** + * Spawns a browser context and runs given workflow. + * \ + * Resolves after the playback is finished. + * @param {Page} [page] Page to run the workflow on. + * @param {ParamType} params Workflow specific, set of parameters + * for the `{$param: nameofparam}` fields. + */ + public async run(page: Page, params?: ParamType): Promise { + if (this.stopper) { + throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.'); + } + /** + * `this.workflow` with the parameters initialized. + */ + this.initializedWorkflow = Preprocessor.initWorkflow(this.workflow, params); + + // @ts-ignore + if (await page.evaluate(() => !window.scrape)) { + page.context().addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') }); + } + + this.stopper = () => { + this.stopper = null; + }; + + this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow!)); + + await this.concurrency.waitForCompletion(); + + this.stopper = null; + } + + public async stop(): Promise { + if (this.stopper) { + await this.stopper(); + this.stopper = null; + } else { + throw new Error('Cannot stop, there is no running workflow!'); + } + } +} \ No newline at end of file diff --git a/maxun-core/preprocessor.ts b/maxun-core/preprocessor.ts new file mode 100644 index 00000000..9ad15c2a --- /dev/null +++ b/maxun-core/preprocessor.ts @@ -0,0 +1,179 @@ +import Joi from 'joi'; +import { + Workflow, WorkflowFile, ParamType, SelectorArray, Where, +} from './types/workflow'; +import { operators } from './types/logic'; + +/** +* Class for static processing the workflow files/objects. +*/ +export default class Preprocessor { + static validateWorkflow(workflow: WorkflowFile): any { + const regex = Joi.object({ + $regex: Joi.string().required(), + }); + + const whereSchema = Joi.object({ + url: [Joi.string().uri(), regex], + selectors: Joi.array().items(Joi.string()), + cookies: Joi.object({}).pattern(Joi.string(), Joi.string()), + $after: [Joi.string(), regex], + $before: [Joi.string(), regex], + $and: Joi.array().items(Joi.link('#whereSchema')), + $or: Joi.array().items(Joi.link('#whereSchema')), + $not: Joi.link('#whereSchema'), + }).id('whereSchema'); + + const schema = Joi.object({ + meta: Joi.object({ + name: Joi.string(), + desc: Joi.string(), + }), + workflow: Joi.array().items( + Joi.object({ + id: Joi.string(), + where: whereSchema.required(), + what: Joi.array().items({ + action: Joi.string().required(), + args: Joi.array().items(Joi.any()), + }).required(), + }), + ).required(), + }); + + const { error } = schema.validate(workflow); + + return error; + } + +/** +* Extracts parameter names from the workflow. +* @param {WorkflowFile} workflow The given workflow +* @returns {String[]} List of parameters' names. +*/ + static getParams(workflow: WorkflowFile): string[] { + const getParamsRecurse = (object: any): string[] => { + if (typeof object === 'object') { + // Recursion base case + if (object.$param) { + return [object.$param]; + } + + // Recursion general case + return Object.values(object) + .reduce((p: string[], v: any): string[] => [...p, ...getParamsRecurse(v)], []); + } + return []; + }; + + return getParamsRecurse(workflow.workflow); + } + +/** +* List all the selectors used in the given workflow (only literal "selector" +* field in WHERE clauses so far) +*/ + // TODO : add recursive selector search (also in click/fill etc. events?) + static extractSelectors(workflow: Workflow): SelectorArray { + /** +* Given a Where condition, this function extracts +* all the existing selectors from it (recursively). +*/ + const selectorsFromCondition = (where: Where): SelectorArray => { + // the `selectors` field is either on the top level + let out = where.selectors ?? []; + if (!Array.isArray(out)) { + out = [out]; + } + + // or nested in the "operator" array + operators.forEach((op) => { + let condWhere = where[op]; + if (condWhere) { + condWhere = Array.isArray(condWhere) ? condWhere : [condWhere]; + (condWhere).forEach((subWhere) => { + out = [...out, ...selectorsFromCondition(subWhere)]; + }); + } + }); + + return out; + }; + + // Iterate through all the steps and extract the selectors from all of them. + return workflow.reduce((p: SelectorArray, step) => [ + ...p, + ...selectorsFromCondition(step.where).filter((x) => !p.includes(x)), + ], []); + } + +/** +* Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects +* with the defined value. +* @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched). +*/ + static initWorkflow(workflow: Workflow, params?: ParamType): Workflow { + const paramNames = this.getParams({ workflow }); + + if (Object.keys(params ?? {}).sort().join(',') !== paramNames.sort().join(',')) { + throw new Error(`Provided parameters do not match the workflow parameters + provided: ${Object.keys(params ?? {}).sort().join(',')}, + expected: ${paramNames.sort().join(',')} + `); + } + /** + * A recursive method for initializing special `{key: value}` syntax objects in the workflow. + * @param object Workflow to initialize (or a part of it). + * @param k key to look for ($regex, $param) + * @param f function mutating the special `{}` syntax into + * its true representation (RegExp...) + * @returns Updated object + */ + const initSpecialRecurse = ( + object: unknown, + k: string, + f: (value: string) => unknown, + ): unknown => { + if (!object || typeof object !== 'object') { + return object; + } + + const out = object; + // for every key (child) of the object + Object.keys(object!).forEach((key) => { + // if the field has only one key, which is `k` + if (Object.keys((object)[key]).length === 1 && (object)[key][k]) { + // process the current special tag (init param, hydrate regex...) + (out)[key] = f((object)[key][k]); + } else { + initSpecialRecurse((object)[key], k, f); + } + }); + return out; + }; + + // TODO: do better deep copy, this is hideous. + let workflowCopy = JSON.parse(JSON.stringify(workflow)); + + if (params) { + workflowCopy = initSpecialRecurse( + workflowCopy, + '$param', + (paramName) => { + if (params && params[paramName]) { + return params[paramName]; + } + throw new SyntaxError(`Unspecified parameter found ${paramName}.`); + }, + ); + } + + workflowCopy = initSpecialRecurse( + workflowCopy, + '$regex', + (regex) => new RegExp(regex), + ); + + return workflowCopy; + } +} \ No newline at end of file diff --git a/maxun-core/types/logic.ts b/maxun-core/types/logic.ts new file mode 100644 index 00000000..5d06abbe --- /dev/null +++ b/maxun-core/types/logic.ts @@ -0,0 +1,5 @@ +export const unaryOperators = ['$not'] as const; +export const naryOperators = ['$and', '$or'] as const; + +export const operators = [...unaryOperators, ...naryOperators] as const; +export const meta = ['$before', '$after'] as const; \ No newline at end of file diff --git a/maxun-core/types/workflow.ts b/maxun-core/types/workflow.ts new file mode 100644 index 00000000..36c6d14d --- /dev/null +++ b/maxun-core/types/workflow.ts @@ -0,0 +1,58 @@ +import { Page } from 'playwright'; +import { + naryOperators, unaryOperators, operators, meta, +} from './logic'; + +export type Operator = typeof operators[number]; +export type UnaryOperator = typeof unaryOperators[number]; +export type NAryOperator = typeof naryOperators[number]; + +export type Meta = typeof meta[number]; + +export type SelectorArray = string[]; + +type RegexableString = string | { '$regex': string }; + +type BaseConditions = { + 'url': RegexableString, + 'cookies': Record, + 'selectors': SelectorArray, // (CSS/Playwright) selectors use their own logic, there is no reason (and several technical difficulties) to allow regular expression notation +} & Record; + +export type Where = + Partial<{ [key in NAryOperator]: Where[] }> & // either a logic operator (arity N) + Partial<{ [key in UnaryOperator]: Where }> & // or an unary operator + Partial; // or one of the base conditions + +type MethodNames = { + [K in keyof T]: T[K] extends Function ? K : never; +}[keyof T]; + +export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag'; + +export type What = { + action: MethodNames | CustomFunctions, + args?: any[] +}; + +export type PageState = Partial; + +export type ParamType = Record; + +export type MetaData = { + name?: string, + desc?: string, +}; + +export interface WhereWhatPair { + id?: string + where: Where + what: What[] +} + +export type Workflow = WhereWhatPair[]; + +export type WorkflowFile = { + meta?: MetaData, + workflow: Workflow +}; \ No newline at end of file diff --git a/maxun-core/utils/concurrency.ts b/maxun-core/utils/concurrency.ts new file mode 100644 index 00000000..eec7eb33 --- /dev/null +++ b/maxun-core/utils/concurrency.ts @@ -0,0 +1,85 @@ +/** + * Concurrency class for running concurrent tasks while managing a limited amount of resources. + */ +export default class Concurrency { + /** + * Maximum number of workers running in parallel. If set to `null`, there is no limit. + */ + maxConcurrency: number = 1; + + /** + * Number of currently active workers. + */ + activeWorkers: number = 0; + + /** + * Queue of jobs waiting to be completed. + */ + private jobQueue: Function[] = []; + + /** + * "Resolve" callbacks of the waitForCompletion() promises. + */ + private waiting: Function[] = []; + + /** + * Constructs a new instance of concurrency manager. + * @param {number} maxConcurrency Maximum number of workers running in parallel. + */ + constructor(maxConcurrency: number) { + this.maxConcurrency = maxConcurrency; + } + + /** + * Takes a waiting job out of the queue and runs it. + */ + private runNextJob(): void { + const job = this.jobQueue.pop(); + + if (job) { + // console.debug("Running a job..."); + job().then(() => { + // console.debug("Job finished, running the next waiting job..."); + this.runNextJob(); + }); + } else { + // console.debug("No waiting job found!"); + this.activeWorkers -= 1; + if (this.activeWorkers === 0) { + // console.debug("This concurrency manager is idle!"); + this.waiting.forEach((x) => x()); + } + } + } + + /** + * Pass a job (a time-demanding async function) to the concurrency manager. \ + * The time of the job's execution depends on the concurrency manager itself + * (given a generous enough `maxConcurrency` value, it might be immediate, + * but this is not guaranteed). + * @param worker Async function to be executed (job to be processed). + */ + addJob(job: () => Promise): void { + // console.debug("Adding a worker!"); + this.jobQueue.push(job); + + if (!this.maxConcurrency || this.activeWorkers < this.maxConcurrency) { + this.runNextJob(); + this.activeWorkers += 1; + } else { + // console.debug("No capacity to run a worker now, waiting!"); + } + } + + /** + * Waits until there is no running nor waiting job. \ + * If the concurrency manager is idle at the time of calling this function, + * it waits until at least one job is compeleted (can be "presubscribed"). + * @returns Promise, resolved after there is no running/waiting worker. + */ + waitForCompletion(): Promise { + return new Promise((res) => { + this.waiting.push(res); + }); + } +} \ No newline at end of file diff --git a/maxun-core/utils/logger.ts b/maxun-core/utils/logger.ts new file mode 100644 index 00000000..e57421aa --- /dev/null +++ b/maxun-core/utils/logger.ts @@ -0,0 +1,30 @@ +/* +* Logger class for more detailed and comprehensible logs (with colors and timestamps) +*/ + +export enum Level { + DATE = 36, + LOG = 0, + WARN = 93, + ERROR = 31, + DEBUG = 95, + RESET = 0, + } + + export default function logger( + message: string | Error, + level: (Level.LOG | Level.WARN | Level.ERROR | Level.DEBUG) = Level.LOG, + ) { + let m = message; + if (message.constructor.name.includes('Error') && typeof message !== 'string') { + m = (message).message; + } + process.stdout.write(`\x1b[${Level.DATE}m[${(new Date()).toLocaleString()}]\x1b[0m `); + process.stdout.write(`\x1b[${level}m`); + if (level === Level.ERROR || level === Level.WARN) { + process.stderr.write(m); + } else { + process.stdout.write(m); + } + process.stdout.write(`\x1b[${Level.RESET}m\n`); + } \ No newline at end of file diff --git a/maxun-core/utils/utils.ts b/maxun-core/utils/utils.ts new file mode 100644 index 00000000..48883dcf --- /dev/null +++ b/maxun-core/utils/utils.ts @@ -0,0 +1,13 @@ +/** + * ESLint rule in case there is only one util function + * (it still does not represent the "utils" file) +*/ + +/* eslint-disable import/prefer-default-export */ + +/** + * Converts an array of scalars to an object with **items** of the array **for keys**. + */ +export function arrayToObject(array : any[]) { + return array.reduce((p, x) => ({ ...p, [x]: [] }), {}); + } \ No newline at end of file