diff --git a/config-overrides.js b/config-overrides.js deleted file mode 100644 index cb2e9960..00000000 --- a/config-overrides.js +++ /dev/null @@ -1,15 +0,0 @@ -/* config-overrides.js */ - -module.exports = function override(config, env) { - //do stuff with the webpack config... - return { - ...config, - resolve: { - ...config.resolve, - fallback: { - ...config.resolve.fallback, - path: false, - }, - }, - } -} diff --git a/mx-interpreter/browserSide/scraper.js b/mx-interpreter/browserSide/scraper.js deleted file mode 100644 index c411f642..00000000 --- a/mx-interpreter/browserSide/scraper.js +++ /dev/null @@ -1,226 +0,0 @@ -/* eslint-disable @typescript-eslint/no-unused-vars */ - -const area = (element) => element.offsetHeight * element.offsetWidth; - -function getBiggestElement(selector) { - const elements = Array.from(document.querySelectorAll(selector)); - const biggest = elements.reduce( - (max, elem) => ( - area(elem) > area(max) ? elem : max), - { offsetHeight: 0, offsetWidth: 0 }, - ); - return biggest; -} - -/** - * Generates structural selector (describing element by its DOM tree location). - * - * **The generated selector is not guaranteed to be unique!** (In fact, this is - * the desired behaviour in here.) - * @param {HTMLElement} element Element being described. - * @returns {string} CSS-compliant selector describing the element's location in the DOM tree. - */ -function GetSelectorStructural(element) { - // Base conditions for the recursive approach. - if (element.tagName === 'BODY') { - return 'BODY'; - } - const selector = element.tagName; - if (element.parentElement) { - return `${GetSelectorStructural(element.parentElement)} > ${selector}`; - } - - return selector; -} - -/** - * Heuristic method to find collections of "interesting" items on the page. - * @returns {Array} A collection of interesting DOM nodes - * (online store products, plane tickets, list items... and many more?) - */ -function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') { - const restoreScroll = (() => { - const { scrollX, scrollY } = window; - return () => { - window.scrollTo(scrollX, scrollY); - }; - })(); - - /** -* @typedef {Array<{x: number, y: number}>} Grid -*/ - - /** - * Returns an array of grid-aligned {x,y} points. - * @param {number} [granularity=0.005] sets the number of generated points - * (the higher the granularity, the more points). - * @returns {Grid} Array of {x, y} objects. - */ - function getGrid(startX = 0, startY = 0, granularity = 0.005) { - const width = window.innerWidth; - const height = window.innerHeight; - - const out = []; - for (let x = 0; x < width; x += 1 / granularity) { - for (let y = 0; y < height; y += 1 / granularity) { - out.push({ x: startX + x, y: startY + y }); - } - } - return out; - } - - let maxSelector = { selector: 'body', metric: 0 }; - - const updateMaximumWithPoint = (point) => { - const currentElement = document.elementFromPoint(point.x, point.y); - const selector = GetSelectorStructural(currentElement); - - const elements = Array.from(document.querySelectorAll(selector)) - .filter((element) => area(element) > minArea); - - // If the current selector targets less than three elements, - // we consider it not interesting (would be a very underwhelming scraper) - if (elements.length < 3) { - return; - } - - let metric = null; - - if (metricType === 'total_area') { - metric = elements - .reduce((p, x) => p + area(x), 0); - } else if (metricType === 'size_deviation') { - // This could use a proper "statistics" approach... but meh, so far so good! - const sizes = elements - .map((element) => area(element)); - - metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes)); - } - - // console.debug(`Total ${metricType} is ${metric}.`) - if (metric > maxSelector.metric && elements.length < maxCountPerPage) { - maxSelector = { selector, metric }; - } - }; - - for (let scroll = 0; scroll < scrolls; scroll += 1) { - window.scrollTo(0, scroll * window.innerHeight); - - const grid = getGrid(); - - grid.forEach(updateMaximumWithPoint); - } - - restoreScroll(); - - let out = Array.from(document.querySelectorAll(maxSelector.selector)); - - const different = (x, i, a) => a.findIndex((e) => e === x) === i; - // as long as we don't merge any two elements by substituing them for their parents, - // we substitute. - while (out.map((x) => x.parentElement).every(different) - && out.forEach((x) => x.parentElement !== null)) { - out = out.map((x) => x.parentElement ?? x); - } - - return out; -} - -/** - * Returns a "scrape" result from the current page. - * @returns {Array} *Curated* array of scraped information (with sparse rows removed) - */ -function scrape(selector = null) { - /** - * **crudeRecords** contains uncurated rundowns of "scrapable" elements - * @type {Array} - */ - const crudeRecords = (selector - ? Array.from(document.querySelectorAll(selector)) - : scrapableHeuristics()) - .map((record) => ({ - ...Array.from(record.querySelectorAll('img')) - .reduce((p, x, i) => { - let url = null; - if (x.srcset) { - const urls = x.srcset.split(', '); - [url] = urls[urls.length - 1].split(' '); - } - - /** - * Contains the largest elements from `srcset` - if `srcset` is not present, contains - * URL from the `src` attribute - * - * If the `src` attribute contains a data url, imgUrl contains `undefined`. - */ - let imgUrl; - if (x.srcset) { - imgUrl = url; - } else if (x.src.indexOf('data:') === -1) { - imgUrl = x.src; - } - - return ({ - ...p, - ...(imgUrl ? { [`img_${i}`]: imgUrl } : {}), - }); - }, {}), - ...record.innerText.split('\n') - .reduce((p, x, i) => ({ - ...p, - [`record_${String(i).padStart(4, '0')}`]: x.trim(), - }), {}), - })); - - return crudeRecords; -} - -/** - * Given an object with named lists of elements, - * groups the elements by their distance in the DOM tree. - * @param {Object.} lists The named lists of HTML elements. - * @returns {Array.>} - */ -function scrapeSchema(lists) { - function omap(object, f, kf = (x) => x) { - return Object.fromEntries( - Object.entries(object) - .map(([k, v]) => [kf(k), f(v)]), - ); - } - - function ofilter(object, f) { - return Object.fromEntries( - Object.entries(object) - .filter(([k, v]) => f(k, v)), - ); - } - - function getSeedKey(listObj) { - const maxLength = Math.max(...Object.values(omap(listObj, (x) => x.length))); - return Object.keys(ofilter(listObj, (_, v) => v.length === maxLength))[0]; - } - - function getMBEs(elements) { - return elements.map((element) => { - let candidate = element; - const isUniqueChild = (e) => elements - .filter((elem) => e.parentNode?.contains(elem)) - .length === 1; - - while (candidate && isUniqueChild(candidate)) { - candidate = candidate.parentNode; - } - - return candidate; - }); - } - - const seedName = getSeedKey(lists); - const MBEs = getMBEs(lists[seedName]); - - return MBEs.map((mbe) => omap( - lists, - (listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText, - )); -} \ No newline at end of file diff --git a/mx-interpreter/index.ts b/mx-interpreter/index.ts deleted file mode 100644 index 571f5781..00000000 --- a/mx-interpreter/index.ts +++ /dev/null @@ -1,8 +0,0 @@ -import Interpreter from './interpret'; - -export default Interpreter; -export { default as Preprocessor } from './preprocessor'; -export type { - WorkflowFile, WhereWhatPair, Where, What, -} from './types/workflow'; -export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic'; \ No newline at end of file diff --git a/mx-interpreter/interpret.ts b/mx-interpreter/interpret.ts deleted file mode 100644 index 9ac32b0e..00000000 --- a/mx-interpreter/interpret.ts +++ /dev/null @@ -1,459 +0,0 @@ -/* eslint-disable no-await-in-loop, no-restricted-syntax */ -import { Page, PageScreenshotOptions } from 'playwright'; -import path from 'path'; - -import { EventEmitter } from 'events'; -import { - Where, What, PageState, Workflow, WorkflowFile, - ParamType, SelectorArray, CustomFunctions, -} from './types/workflow'; - -import { operators, meta } from './types/logic'; -import { arrayToObject } from './utils/utils'; -import Concurrency from './utils/concurrency'; -import Preprocessor from './preprocessor'; -import log, { Level } from './utils/logger'; - -/** - * Defines optional intepreter options (passed in constructor) - */ -interface InterpreterOptions { - maxRepeats: number; - maxConcurrency: number; - serializableCallback: (output: any) => (void | Promise); - binaryCallback: (output: any, mimeType: string) => (void | Promise); - debug: boolean; - debugChannel: Partial<{ - activeId: Function, - debugMessage: Function, - }> -} - -/** - * Class for running the Smart Workflows. - */ -export default class Interpreter extends EventEmitter { - private workflow: Workflow; - - private initializedWorkflow: Workflow | null; - - private options: InterpreterOptions; - - private concurrency: Concurrency; - - private stopper: Function | null = null; - - private log: typeof log; - - constructor(workflow: WorkflowFile, options?: Partial) { - super(); - this.workflow = workflow.workflow; - this.initializedWorkflow = null; - this.options = { - maxRepeats: 5, - maxConcurrency: 5, - serializableCallback: (data) => { log(JSON.stringify(data), Level.WARN); }, - binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); }, - debug: false, - debugChannel: {}, - ...options, - }; - this.concurrency = new Concurrency(this.options.maxConcurrency); - this.log = (...args) => log(...args); - - const error = Preprocessor.validateWorkflow(workflow); - if (error) { - throw (error); - } - - if (this.options.debugChannel?.debugMessage) { - const oldLog = this.log; - // @ts-ignore - this.log = (...args: Parameters) => { - if (args[1] !== Level.LOG) { - this.options.debugChannel.debugMessage!(typeof args[0] === 'string' ? args[0] : args[0].message); - } - oldLog(...args); - }; - } - } - - /** - * Returns the context object from given Page and the current workflow.\ - * \ - * `workflow` is used for selector extraction - function searches for used selectors to - * look for later in the page's context. - * @param page Playwright Page object - * @param workflow Current **initialized** workflow (array of where-what pairs). - * @returns {PageState} State of the current page. - */ - private async getState(page: Page, workflow: Workflow): Promise { - /** - * All the selectors present in the current Workflow - */ - const selectors = Preprocessor.extractSelectors(workflow); - - /** - * Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability). - * @param selector Selector to be queried - * @returns True if the targetted element is actionable, false otherwise. - */ - const actionable = async (selector: string): Promise => { - try { - const proms = [ - page.isEnabled(selector, { timeout: 500 }), - page.isVisible(selector, { timeout: 500 }), - ]; - - return await Promise.all(proms).then((bools) => bools.every((x) => x)); - } catch (e) { - // log(e, Level.ERROR); - return false; - } - }; - - /** - * Object of selectors present in the current page. - */ - const presentSelectors: SelectorArray = await Promise.all( - selectors.map(async (selector) => { - if (await actionable(selector)) { - return [selector]; - } - return []; - }), - ).then((x) => x.flat()); - - return { - url: page.url(), - cookies: (await page.context().cookies([page.url()])) - .reduce((p, cookie) => ( - { - ...p, - [cookie.name]: cookie.value, - }), {}), - selectors: presentSelectors, - }; - } - - /** - * Tests if the given action is applicable with the given context. - * @param where Tested *where* condition - * @param context Current browser context. - * @returns True if `where` is applicable in the given context, false otherwise - */ - private applicable(where: Where, context: PageState, usedActions: string[] = []): boolean { - /** - * Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\ - * \ - * For every key in `subset`, there must be a corresponding key with equal scalar - * value in `superset`, or `inclusive(subset[key], superset[key])` must hold. - * @param subset Arbitrary non-cyclic JS object (where clause) - * @param superset Arbitrary non-cyclic JS object (browser context) - * @returns `true` if `subset <= superset`, `false` otherwise. - */ - const inclusive = (subset: Record, superset: Record) - : boolean => ( - Object.entries(subset).every( - ([key, value]) => { - /** - * Arrays are compared without order (are transformed into objects before comparison). - */ - const parsedValue = Array.isArray(value) ? arrayToObject(value) : value; - - const parsedSuperset: Record = {}; - parsedSuperset[key] = Array.isArray(superset[key]) - ? arrayToObject(superset[key]) - : superset[key]; - - // Every `subset` key must exist in the `superset` and - // have the same value (strict equality), or subset[key] <= superset[key] - return parsedSuperset[key] - && ( - (parsedSuperset[key] === parsedValue) - || ((parsedValue).constructor.name === 'RegExp' && (parsedValue).test(parsedSuperset[key])) - || ( - (parsedValue).constructor.name !== 'RegExp' - && typeof parsedValue === 'object' && inclusive(parsedValue, parsedSuperset[key]) - ) - ); - }, - ) - ); - - // Every value in the "where" object should be compliant to the current state. - return Object.entries(where).every( - ([key, value]) => { - if (operators.includes(key)) { - const array = Array.isArray(value) - ? value as Where[] - : Object.entries(value).map((a) => Object.fromEntries([a])); - // every condition is treated as a single context - - switch (key as keyof typeof operators) { - case '$and': - return array?.every((x) => this.applicable(x, context)); - case '$or': - return array?.some((x) => this.applicable(x, context)); - case '$not': - return !this.applicable(value, context); // $not should be a unary operator - default: - throw new Error('Undefined logic operator.'); - } - } else if (meta.includes(key)) { - const testRegexString = (x: string) => { - if (typeof value === 'string') { - return x === value; - } - - return (value).test(x); - }; - - switch (key as keyof typeof meta) { - case '$before': - return !usedActions.find(testRegexString); - case '$after': - return !!usedActions.find(testRegexString); - default: - throw new Error('Undefined meta operator.'); - } - } else { - // Current key is a base condition (url, cookies, selectors) - return inclusive({ [key]: value }, context); - } - }, - ); - } - - /** - * Given a Playwright's page object and a "declarative" list of actions, this function - * calls all mentioned functions on the Page object.\ - * \ - * Manipulates the iterator indexes (experimental feature, likely to be removed in - * the following versions of waw-interpreter) - * @param page Playwright Page object - * @param steps Array of actions. - */ - private async carryOutSteps(page: Page, steps: What[]): Promise { - /** - * Defines overloaded (or added) methods/actions usable in the workflow. - * If a method overloads any existing method of the Page class, it accepts the same set - * of parameters *(but can override some!)*\ - * \ - * Also, following piece of code defines functions to be run in the browser's context. - * Beware of false linter errors - here, we know better! - */ - const wawActions: Record void> = { - screenshot: async (params: PageScreenshotOptions) => { - const screenshotBuffer = await page.screenshot({ - ...params, path: undefined, - }); - await this.options.binaryCallback(screenshotBuffer, 'image/png'); - }, - enqueueLinks: async (selector: string) => { - const links: string[] = await page.locator(selector) - .evaluateAll( - // @ts-ignore - (elements) => elements.map((a) => a.href).filter((x) => x), - ); - const context = page.context(); - - for (const link of links) { - // eslint-disable-next-line - this.concurrency.addJob(async () => { - try { - const newPage = await context.newPage(); - await newPage.goto(link); - await newPage.waitForLoadState('networkidle'); - await this.runLoop(newPage, this.initializedWorkflow!); - } catch (e) { - // `runLoop` uses soft mode, so it recovers from it's own exceptions - // but newPage(), goto() and waitForLoadState() don't (and will kill - // the interpreter by throwing). - this.log(e, Level.ERROR); - } - }); - } - await page.close(); - }, - scrape: async (selector?: string) => { - const scrapeResults: Record[] = await page - // eslint-disable-next-line - // @ts-ignore - .evaluate((s) => scrape(s ?? null), selector); - await this.options.serializableCallback(scrapeResults); - }, - scrapeSchema: async (schema: Record) => { - const handleLists = await Promise.all( - Object.values(schema).map((selector) => page.$$(selector)), - ); - - const namedHandleLists = Object.fromEntries( - Object.keys(schema).map((key, i) => [key, handleLists[i]]), - ); - - const scrapeResult = await page.evaluate((n) => scrapeSchema(n), namedHandleLists); - - this.options.serializableCallback(scrapeResult); - }, - scroll: async (pages?: number) => { - await page.evaluate(async (pagesInternal) => { - for (let i = 1; i <= (pagesInternal ?? 1); i += 1) { - // @ts-ignore - window.scrollTo(0, window.scrollY + window.innerHeight); - } - }, pages ?? 1); - }, - script: async (code: string) => { - const AsyncFunction: FunctionConstructor = Object.getPrototypeOf( - async () => { }, - ).constructor; - const x = new AsyncFunction('page', 'log', code); - await x(page, this.log); - }, - flag: async () => new Promise((res) => { - this.emit('flag', page, res); - }), - }; - - for (const step of steps) { - this.log(`Launching ${step.action}`, Level.LOG); - - if (step.action in wawActions) { - // "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not) - const params = !step.args || Array.isArray(step.args) ? step.args : [step.args]; - await wawActions[step.action as CustomFunctions](...(params ?? [])); - } else { - // Implements the dot notation for the "method name" in the workflow - const levels = step.action.split('.'); - const methodName = levels[levels.length - 1]; - - let invokee: any = page; - for (const level of levels.splice(0, levels.length - 1)) { - invokee = invokee[level]; - } - - if (!step.args || Array.isArray(step.args)) { - await (invokee[methodName])(...(step.args ?? [])); - } else { - await (invokee[methodName])(step.args); - } - } - - await new Promise((res) => { setTimeout(res, 500); }); - } - } - - private async runLoop(p: Page, workflow: Workflow) { - const usedActions: string[] = []; - let lastAction = null; - let repeatCount = 0; - - /** - * Enables the interpreter functionality for popup windows. - * User-requested concurrency should be entirely managed by the concurrency manager, - * e.g. via `enqueueLinks`. - */ - p.on('popup', (popup) => { - this.concurrency.addJob(() => this.runLoop(popup, workflow)); - }); - - /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */ - while (true) { - // Checks whether the page was closed from outside, - // or the workflow execution has been stopped via `interpreter.stop()` - if (p.isClosed() || !this.stopper) { - return; - } - - try { - await p.waitForLoadState(); - } catch (e) { - await p.close(); - return; - } - - let pageState = {}; - try { - pageState = await this.getState(p, workflow); - } catch (e: any) { - this.log('The browser has been closed.'); - return; - } - - if (this.options.debug) { - this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN); - } - const actionId = workflow.findIndex( - (step) => this.applicable(step.where, pageState, usedActions), - ); - - const action = workflow[actionId]; - - this.log(`Matched ${JSON.stringify(action?.where)}`, Level.LOG); - - if (action) { // action is matched - if (this.options.debugChannel?.activeId) { - this.options.debugChannel.activeId(actionId); - } - - repeatCount = action === lastAction ? repeatCount + 1 : 0; - if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) { - return; - } - lastAction = action; - - try { - await this.carryOutSteps(p, action.what); - usedActions.push(action.id ?? 'undefined'); - } catch (e) { - this.log(e, Level.ERROR); - } - } else { - return; - } - } - } - - /** - * Spawns a browser context and runs given workflow. - * \ - * Resolves after the playback is finished. - * @param {Page} [page] Page to run the workflow on. - * @param {ParamType} params Workflow specific, set of parameters - * for the `{$param: nameofparam}` fields. - */ - public async run(page: Page, params?: ParamType): Promise { - if (this.stopper) { - throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.'); - } - /** - * `this.workflow` with the parameters initialized. - */ - this.initializedWorkflow = Preprocessor.initWorkflow(this.workflow, params); - - // @ts-ignore - if (await page.evaluate(() => !window.scrape)) { - page.context().addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') }); - } - - this.stopper = () => { - this.stopper = null; - }; - - this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow!)); - - await this.concurrency.waitForCompletion(); - - this.stopper = null; - } - - public async stop(): Promise { - if (this.stopper) { - await this.stopper(); - this.stopper = null; - } else { - throw new Error('Cannot stop, there is no running workflow!'); - } - } -} \ No newline at end of file diff --git a/mx-interpreter/preprocessor.ts b/mx-interpreter/preprocessor.ts deleted file mode 100644 index 9ad15c2a..00000000 --- a/mx-interpreter/preprocessor.ts +++ /dev/null @@ -1,179 +0,0 @@ -import Joi from 'joi'; -import { - Workflow, WorkflowFile, ParamType, SelectorArray, Where, -} from './types/workflow'; -import { operators } from './types/logic'; - -/** -* Class for static processing the workflow files/objects. -*/ -export default class Preprocessor { - static validateWorkflow(workflow: WorkflowFile): any { - const regex = Joi.object({ - $regex: Joi.string().required(), - }); - - const whereSchema = Joi.object({ - url: [Joi.string().uri(), regex], - selectors: Joi.array().items(Joi.string()), - cookies: Joi.object({}).pattern(Joi.string(), Joi.string()), - $after: [Joi.string(), regex], - $before: [Joi.string(), regex], - $and: Joi.array().items(Joi.link('#whereSchema')), - $or: Joi.array().items(Joi.link('#whereSchema')), - $not: Joi.link('#whereSchema'), - }).id('whereSchema'); - - const schema = Joi.object({ - meta: Joi.object({ - name: Joi.string(), - desc: Joi.string(), - }), - workflow: Joi.array().items( - Joi.object({ - id: Joi.string(), - where: whereSchema.required(), - what: Joi.array().items({ - action: Joi.string().required(), - args: Joi.array().items(Joi.any()), - }).required(), - }), - ).required(), - }); - - const { error } = schema.validate(workflow); - - return error; - } - -/** -* Extracts parameter names from the workflow. -* @param {WorkflowFile} workflow The given workflow -* @returns {String[]} List of parameters' names. -*/ - static getParams(workflow: WorkflowFile): string[] { - const getParamsRecurse = (object: any): string[] => { - if (typeof object === 'object') { - // Recursion base case - if (object.$param) { - return [object.$param]; - } - - // Recursion general case - return Object.values(object) - .reduce((p: string[], v: any): string[] => [...p, ...getParamsRecurse(v)], []); - } - return []; - }; - - return getParamsRecurse(workflow.workflow); - } - -/** -* List all the selectors used in the given workflow (only literal "selector" -* field in WHERE clauses so far) -*/ - // TODO : add recursive selector search (also in click/fill etc. events?) - static extractSelectors(workflow: Workflow): SelectorArray { - /** -* Given a Where condition, this function extracts -* all the existing selectors from it (recursively). -*/ - const selectorsFromCondition = (where: Where): SelectorArray => { - // the `selectors` field is either on the top level - let out = where.selectors ?? []; - if (!Array.isArray(out)) { - out = [out]; - } - - // or nested in the "operator" array - operators.forEach((op) => { - let condWhere = where[op]; - if (condWhere) { - condWhere = Array.isArray(condWhere) ? condWhere : [condWhere]; - (condWhere).forEach((subWhere) => { - out = [...out, ...selectorsFromCondition(subWhere)]; - }); - } - }); - - return out; - }; - - // Iterate through all the steps and extract the selectors from all of them. - return workflow.reduce((p: SelectorArray, step) => [ - ...p, - ...selectorsFromCondition(step.where).filter((x) => !p.includes(x)), - ], []); - } - -/** -* Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects -* with the defined value. -* @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched). -*/ - static initWorkflow(workflow: Workflow, params?: ParamType): Workflow { - const paramNames = this.getParams({ workflow }); - - if (Object.keys(params ?? {}).sort().join(',') !== paramNames.sort().join(',')) { - throw new Error(`Provided parameters do not match the workflow parameters - provided: ${Object.keys(params ?? {}).sort().join(',')}, - expected: ${paramNames.sort().join(',')} - `); - } - /** - * A recursive method for initializing special `{key: value}` syntax objects in the workflow. - * @param object Workflow to initialize (or a part of it). - * @param k key to look for ($regex, $param) - * @param f function mutating the special `{}` syntax into - * its true representation (RegExp...) - * @returns Updated object - */ - const initSpecialRecurse = ( - object: unknown, - k: string, - f: (value: string) => unknown, - ): unknown => { - if (!object || typeof object !== 'object') { - return object; - } - - const out = object; - // for every key (child) of the object - Object.keys(object!).forEach((key) => { - // if the field has only one key, which is `k` - if (Object.keys((object)[key]).length === 1 && (object)[key][k]) { - // process the current special tag (init param, hydrate regex...) - (out)[key] = f((object)[key][k]); - } else { - initSpecialRecurse((object)[key], k, f); - } - }); - return out; - }; - - // TODO: do better deep copy, this is hideous. - let workflowCopy = JSON.parse(JSON.stringify(workflow)); - - if (params) { - workflowCopy = initSpecialRecurse( - workflowCopy, - '$param', - (paramName) => { - if (params && params[paramName]) { - return params[paramName]; - } - throw new SyntaxError(`Unspecified parameter found ${paramName}.`); - }, - ); - } - - workflowCopy = initSpecialRecurse( - workflowCopy, - '$regex', - (regex) => new RegExp(regex), - ); - - return workflowCopy; - } -} \ No newline at end of file diff --git a/mx-interpreter/types/logic.ts b/mx-interpreter/types/logic.ts deleted file mode 100644 index 5d06abbe..00000000 --- a/mx-interpreter/types/logic.ts +++ /dev/null @@ -1,5 +0,0 @@ -export const unaryOperators = ['$not'] as const; -export const naryOperators = ['$and', '$or'] as const; - -export const operators = [...unaryOperators, ...naryOperators] as const; -export const meta = ['$before', '$after'] as const; \ No newline at end of file diff --git a/mx-interpreter/types/workflow.ts b/mx-interpreter/types/workflow.ts deleted file mode 100644 index 36c6d14d..00000000 --- a/mx-interpreter/types/workflow.ts +++ /dev/null @@ -1,58 +0,0 @@ -import { Page } from 'playwright'; -import { - naryOperators, unaryOperators, operators, meta, -} from './logic'; - -export type Operator = typeof operators[number]; -export type UnaryOperator = typeof unaryOperators[number]; -export type NAryOperator = typeof naryOperators[number]; - -export type Meta = typeof meta[number]; - -export type SelectorArray = string[]; - -type RegexableString = string | { '$regex': string }; - -type BaseConditions = { - 'url': RegexableString, - 'cookies': Record, - 'selectors': SelectorArray, // (CSS/Playwright) selectors use their own logic, there is no reason (and several technical difficulties) to allow regular expression notation -} & Record; - -export type Where = - Partial<{ [key in NAryOperator]: Where[] }> & // either a logic operator (arity N) - Partial<{ [key in UnaryOperator]: Where }> & // or an unary operator - Partial; // or one of the base conditions - -type MethodNames = { - [K in keyof T]: T[K] extends Function ? K : never; -}[keyof T]; - -export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag'; - -export type What = { - action: MethodNames | CustomFunctions, - args?: any[] -}; - -export type PageState = Partial; - -export type ParamType = Record; - -export type MetaData = { - name?: string, - desc?: string, -}; - -export interface WhereWhatPair { - id?: string - where: Where - what: What[] -} - -export type Workflow = WhereWhatPair[]; - -export type WorkflowFile = { - meta?: MetaData, - workflow: Workflow -}; \ No newline at end of file diff --git a/mx-interpreter/utils/concurrency.ts b/mx-interpreter/utils/concurrency.ts deleted file mode 100644 index eec7eb33..00000000 --- a/mx-interpreter/utils/concurrency.ts +++ /dev/null @@ -1,85 +0,0 @@ -/** - * Concurrency class for running concurrent tasks while managing a limited amount of resources. - */ -export default class Concurrency { - /** - * Maximum number of workers running in parallel. If set to `null`, there is no limit. - */ - maxConcurrency: number = 1; - - /** - * Number of currently active workers. - */ - activeWorkers: number = 0; - - /** - * Queue of jobs waiting to be completed. - */ - private jobQueue: Function[] = []; - - /** - * "Resolve" callbacks of the waitForCompletion() promises. - */ - private waiting: Function[] = []; - - /** - * Constructs a new instance of concurrency manager. - * @param {number} maxConcurrency Maximum number of workers running in parallel. - */ - constructor(maxConcurrency: number) { - this.maxConcurrency = maxConcurrency; - } - - /** - * Takes a waiting job out of the queue and runs it. - */ - private runNextJob(): void { - const job = this.jobQueue.pop(); - - if (job) { - // console.debug("Running a job..."); - job().then(() => { - // console.debug("Job finished, running the next waiting job..."); - this.runNextJob(); - }); - } else { - // console.debug("No waiting job found!"); - this.activeWorkers -= 1; - if (this.activeWorkers === 0) { - // console.debug("This concurrency manager is idle!"); - this.waiting.forEach((x) => x()); - } - } - } - - /** - * Pass a job (a time-demanding async function) to the concurrency manager. \ - * The time of the job's execution depends on the concurrency manager itself - * (given a generous enough `maxConcurrency` value, it might be immediate, - * but this is not guaranteed). - * @param worker Async function to be executed (job to be processed). - */ - addJob(job: () => Promise): void { - // console.debug("Adding a worker!"); - this.jobQueue.push(job); - - if (!this.maxConcurrency || this.activeWorkers < this.maxConcurrency) { - this.runNextJob(); - this.activeWorkers += 1; - } else { - // console.debug("No capacity to run a worker now, waiting!"); - } - } - - /** - * Waits until there is no running nor waiting job. \ - * If the concurrency manager is idle at the time of calling this function, - * it waits until at least one job is compeleted (can be "presubscribed"). - * @returns Promise, resolved after there is no running/waiting worker. - */ - waitForCompletion(): Promise { - return new Promise((res) => { - this.waiting.push(res); - }); - } -} \ No newline at end of file diff --git a/mx-interpreter/utils/logger.ts b/mx-interpreter/utils/logger.ts deleted file mode 100644 index e57421aa..00000000 --- a/mx-interpreter/utils/logger.ts +++ /dev/null @@ -1,30 +0,0 @@ -/* -* Logger class for more detailed and comprehensible logs (with colors and timestamps) -*/ - -export enum Level { - DATE = 36, - LOG = 0, - WARN = 93, - ERROR = 31, - DEBUG = 95, - RESET = 0, - } - - export default function logger( - message: string | Error, - level: (Level.LOG | Level.WARN | Level.ERROR | Level.DEBUG) = Level.LOG, - ) { - let m = message; - if (message.constructor.name.includes('Error') && typeof message !== 'string') { - m = (message).message; - } - process.stdout.write(`\x1b[${Level.DATE}m[${(new Date()).toLocaleString()}]\x1b[0m `); - process.stdout.write(`\x1b[${level}m`); - if (level === Level.ERROR || level === Level.WARN) { - process.stderr.write(m); - } else { - process.stdout.write(m); - } - process.stdout.write(`\x1b[${Level.RESET}m\n`); - } \ No newline at end of file diff --git a/mx-interpreter/utils/utils.ts b/mx-interpreter/utils/utils.ts deleted file mode 100644 index 48883dcf..00000000 --- a/mx-interpreter/utils/utils.ts +++ /dev/null @@ -1,13 +0,0 @@ -/** - * ESLint rule in case there is only one util function - * (it still does not represent the "utils" file) -*/ - -/* eslint-disable import/prefer-default-export */ - -/** - * Converts an array of scalars to an object with **items** of the array **for keys**. - */ -export function arrayToObject(array : any[]) { - return array.reduce((p, x) => ({ ...p, [x]: [] }), {}); - } \ No newline at end of file diff --git a/server/Dockerfile b/server/Dockerfile deleted file mode 100644 index f42c2d21..00000000 --- a/server/Dockerfile +++ /dev/null @@ -1,19 +0,0 @@ -FROM node:18-alpine - -# Set working directory -WORKDIR /app - -# Copy package.json and package-lock.json -COPY backend/package*.json ./ - -# Install dependencies -RUN npm install --production - -# Copy the backend code -COPY backend/ ./ - -# Expose the API port (8080 for the backend) -EXPOSE 8080 - -# Start the backend server -CMD ["npm", "start"] diff --git a/src/Dockerfile b/src/Dockerfile deleted file mode 100644 index 27aed44a..00000000 --- a/src/Dockerfile +++ /dev/null @@ -1,32 +0,0 @@ -# Stage 1: Build the React frontend -FROM node:18-alpine AS build - -# Set working directory -WORKDIR /app - -# Copy package.json and package-lock.json -COPY frontend/package*.json ./ - -# Install dependencies -RUN npm install - -# Copy the entire frontend project -COPY frontend/ ./ - -# Build the frontend -RUN npm run build - -# Stage 2: Serve the static files with nginx -FROM nginx:stable-alpine - -# Copy build output from the previous stage -COPY --from=build /app/build /usr/share/nginx/html - -# Copy nginx configuration (if needed) -COPY frontend/nginx.conf /etc/nginx/conf.d/default.conf - -# Expose the port NGINX will run on (3000 for React frontend) -EXPOSE 3000 - -# Start NGINX -CMD ["nginx", "-g", "daemon off;"]