refactor: rename to maxun-core from mx-interpreter
This commit is contained in:
226
maxun-core/browserSide/scraper.js
Normal file
226
maxun-core/browserSide/scraper.js
Normal file
@@ -0,0 +1,226 @@
|
||||
/* eslint-disable @typescript-eslint/no-unused-vars */
|
||||
|
||||
const area = (element) => element.offsetHeight * element.offsetWidth;
|
||||
|
||||
function getBiggestElement(selector) {
|
||||
const elements = Array.from(document.querySelectorAll(selector));
|
||||
const biggest = elements.reduce(
|
||||
(max, elem) => (
|
||||
area(elem) > area(max) ? elem : max),
|
||||
{ offsetHeight: 0, offsetWidth: 0 },
|
||||
);
|
||||
return biggest;
|
||||
}
|
||||
|
||||
/**
|
||||
* Generates structural selector (describing element by its DOM tree location).
|
||||
*
|
||||
* **The generated selector is not guaranteed to be unique!** (In fact, this is
|
||||
* the desired behaviour in here.)
|
||||
* @param {HTMLElement} element Element being described.
|
||||
* @returns {string} CSS-compliant selector describing the element's location in the DOM tree.
|
||||
*/
|
||||
function GetSelectorStructural(element) {
|
||||
// Base conditions for the recursive approach.
|
||||
if (element.tagName === 'BODY') {
|
||||
return 'BODY';
|
||||
}
|
||||
const selector = element.tagName;
|
||||
if (element.parentElement) {
|
||||
return `${GetSelectorStructural(element.parentElement)} > ${selector}`;
|
||||
}
|
||||
|
||||
return selector;
|
||||
}
|
||||
|
||||
/**
|
||||
* Heuristic method to find collections of "interesting" items on the page.
|
||||
* @returns {Array<HTMLElement>} A collection of interesting DOM nodes
|
||||
* (online store products, plane tickets, list items... and many more?)
|
||||
*/
|
||||
function scrapableHeuristics(maxCountPerPage = 50, minArea = 20000, scrolls = 3, metricType = 'size_deviation') {
|
||||
const restoreScroll = (() => {
|
||||
const { scrollX, scrollY } = window;
|
||||
return () => {
|
||||
window.scrollTo(scrollX, scrollY);
|
||||
};
|
||||
})();
|
||||
|
||||
/**
|
||||
* @typedef {Array<{x: number, y: number}>} Grid
|
||||
*/
|
||||
|
||||
/**
|
||||
* Returns an array of grid-aligned {x,y} points.
|
||||
* @param {number} [granularity=0.005] sets the number of generated points
|
||||
* (the higher the granularity, the more points).
|
||||
* @returns {Grid} Array of {x, y} objects.
|
||||
*/
|
||||
function getGrid(startX = 0, startY = 0, granularity = 0.005) {
|
||||
const width = window.innerWidth;
|
||||
const height = window.innerHeight;
|
||||
|
||||
const out = [];
|
||||
for (let x = 0; x < width; x += 1 / granularity) {
|
||||
for (let y = 0; y < height; y += 1 / granularity) {
|
||||
out.push({ x: startX + x, y: startY + y });
|
||||
}
|
||||
}
|
||||
return out;
|
||||
}
|
||||
|
||||
let maxSelector = { selector: 'body', metric: 0 };
|
||||
|
||||
const updateMaximumWithPoint = (point) => {
|
||||
const currentElement = document.elementFromPoint(point.x, point.y);
|
||||
const selector = GetSelectorStructural(currentElement);
|
||||
|
||||
const elements = Array.from(document.querySelectorAll(selector))
|
||||
.filter((element) => area(element) > minArea);
|
||||
|
||||
// If the current selector targets less than three elements,
|
||||
// we consider it not interesting (would be a very underwhelming scraper)
|
||||
if (elements.length < 3) {
|
||||
return;
|
||||
}
|
||||
|
||||
let metric = null;
|
||||
|
||||
if (metricType === 'total_area') {
|
||||
metric = elements
|
||||
.reduce((p, x) => p + area(x), 0);
|
||||
} else if (metricType === 'size_deviation') {
|
||||
// This could use a proper "statistics" approach... but meh, so far so good!
|
||||
const sizes = elements
|
||||
.map((element) => area(element));
|
||||
|
||||
metric = (1 - (Math.max(...sizes) - Math.min(...sizes)) / Math.max(...sizes));
|
||||
}
|
||||
|
||||
// console.debug(`Total ${metricType} is ${metric}.`)
|
||||
if (metric > maxSelector.metric && elements.length < maxCountPerPage) {
|
||||
maxSelector = { selector, metric };
|
||||
}
|
||||
};
|
||||
|
||||
for (let scroll = 0; scroll < scrolls; scroll += 1) {
|
||||
window.scrollTo(0, scroll * window.innerHeight);
|
||||
|
||||
const grid = getGrid();
|
||||
|
||||
grid.forEach(updateMaximumWithPoint);
|
||||
}
|
||||
|
||||
restoreScroll();
|
||||
|
||||
let out = Array.from(document.querySelectorAll(maxSelector.selector));
|
||||
|
||||
const different = (x, i, a) => a.findIndex((e) => e === x) === i;
|
||||
// as long as we don't merge any two elements by substituing them for their parents,
|
||||
// we substitute.
|
||||
while (out.map((x) => x.parentElement).every(different)
|
||||
&& out.forEach((x) => x.parentElement !== null)) {
|
||||
out = out.map((x) => x.parentElement ?? x);
|
||||
}
|
||||
|
||||
return out;
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns a "scrape" result from the current page.
|
||||
* @returns {Array<Object>} *Curated* array of scraped information (with sparse rows removed)
|
||||
*/
|
||||
function scrape(selector = null) {
|
||||
/**
|
||||
* **crudeRecords** contains uncurated rundowns of "scrapable" elements
|
||||
* @type {Array<Object>}
|
||||
*/
|
||||
const crudeRecords = (selector
|
||||
? Array.from(document.querySelectorAll(selector))
|
||||
: scrapableHeuristics())
|
||||
.map((record) => ({
|
||||
...Array.from(record.querySelectorAll('img'))
|
||||
.reduce((p, x, i) => {
|
||||
let url = null;
|
||||
if (x.srcset) {
|
||||
const urls = x.srcset.split(', ');
|
||||
[url] = urls[urls.length - 1].split(' ');
|
||||
}
|
||||
|
||||
/**
|
||||
* Contains the largest elements from `srcset` - if `srcset` is not present, contains
|
||||
* URL from the `src` attribute
|
||||
*
|
||||
* If the `src` attribute contains a data url, imgUrl contains `undefined`.
|
||||
*/
|
||||
let imgUrl;
|
||||
if (x.srcset) {
|
||||
imgUrl = url;
|
||||
} else if (x.src.indexOf('data:') === -1) {
|
||||
imgUrl = x.src;
|
||||
}
|
||||
|
||||
return ({
|
||||
...p,
|
||||
...(imgUrl ? { [`img_${i}`]: imgUrl } : {}),
|
||||
});
|
||||
}, {}),
|
||||
...record.innerText.split('\n')
|
||||
.reduce((p, x, i) => ({
|
||||
...p,
|
||||
[`record_${String(i).padStart(4, '0')}`]: x.trim(),
|
||||
}), {}),
|
||||
}));
|
||||
|
||||
return crudeRecords;
|
||||
}
|
||||
|
||||
/**
|
||||
* Given an object with named lists of elements,
|
||||
* groups the elements by their distance in the DOM tree.
|
||||
* @param {Object.<string, object[]>} lists The named lists of HTML elements.
|
||||
* @returns {Array.<Object.<string, string>>}
|
||||
*/
|
||||
function scrapeSchema(lists) {
|
||||
function omap(object, f, kf = (x) => x) {
|
||||
return Object.fromEntries(
|
||||
Object.entries(object)
|
||||
.map(([k, v]) => [kf(k), f(v)]),
|
||||
);
|
||||
}
|
||||
|
||||
function ofilter(object, f) {
|
||||
return Object.fromEntries(
|
||||
Object.entries(object)
|
||||
.filter(([k, v]) => f(k, v)),
|
||||
);
|
||||
}
|
||||
|
||||
function getSeedKey(listObj) {
|
||||
const maxLength = Math.max(...Object.values(omap(listObj, (x) => x.length)));
|
||||
return Object.keys(ofilter(listObj, (_, v) => v.length === maxLength))[0];
|
||||
}
|
||||
|
||||
function getMBEs(elements) {
|
||||
return elements.map((element) => {
|
||||
let candidate = element;
|
||||
const isUniqueChild = (e) => elements
|
||||
.filter((elem) => e.parentNode?.contains(elem))
|
||||
.length === 1;
|
||||
|
||||
while (candidate && isUniqueChild(candidate)) {
|
||||
candidate = candidate.parentNode;
|
||||
}
|
||||
|
||||
return candidate;
|
||||
});
|
||||
}
|
||||
|
||||
const seedName = getSeedKey(lists);
|
||||
const MBEs = getMBEs(lists[seedName]);
|
||||
|
||||
return MBEs.map((mbe) => omap(
|
||||
lists,
|
||||
(listOfElements) => listOfElements.find((elem) => mbe.contains(elem))?.innerText,
|
||||
));
|
||||
}
|
||||
8
maxun-core/index.ts
Normal file
8
maxun-core/index.ts
Normal file
@@ -0,0 +1,8 @@
|
||||
import Interpreter from './interpret';
|
||||
|
||||
export default Interpreter;
|
||||
export { default as Preprocessor } from './preprocessor';
|
||||
export type {
|
||||
WorkflowFile, WhereWhatPair, Where, What,
|
||||
} from './types/workflow';
|
||||
export { unaryOperators, naryOperators, meta as metaOperators } from './types/logic';
|
||||
459
maxun-core/interpret.ts
Normal file
459
maxun-core/interpret.ts
Normal file
@@ -0,0 +1,459 @@
|
||||
/* eslint-disable no-await-in-loop, no-restricted-syntax */
|
||||
import { Page, PageScreenshotOptions } from 'playwright';
|
||||
import path from 'path';
|
||||
|
||||
import { EventEmitter } from 'events';
|
||||
import {
|
||||
Where, What, PageState, Workflow, WorkflowFile,
|
||||
ParamType, SelectorArray, CustomFunctions,
|
||||
} from './types/workflow';
|
||||
|
||||
import { operators, meta } from './types/logic';
|
||||
import { arrayToObject } from './utils/utils';
|
||||
import Concurrency from './utils/concurrency';
|
||||
import Preprocessor from './preprocessor';
|
||||
import log, { Level } from './utils/logger';
|
||||
|
||||
/**
|
||||
* Defines optional intepreter options (passed in constructor)
|
||||
*/
|
||||
interface InterpreterOptions {
|
||||
maxRepeats: number;
|
||||
maxConcurrency: number;
|
||||
serializableCallback: (output: any) => (void | Promise<void>);
|
||||
binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
|
||||
debug: boolean;
|
||||
debugChannel: Partial<{
|
||||
activeId: Function,
|
||||
debugMessage: Function,
|
||||
}>
|
||||
}
|
||||
|
||||
/**
|
||||
* Class for running the Smart Workflows.
|
||||
*/
|
||||
export default class Interpreter extends EventEmitter {
|
||||
private workflow: Workflow;
|
||||
|
||||
private initializedWorkflow: Workflow | null;
|
||||
|
||||
private options: InterpreterOptions;
|
||||
|
||||
private concurrency: Concurrency;
|
||||
|
||||
private stopper: Function | null = null;
|
||||
|
||||
private log: typeof log;
|
||||
|
||||
constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
|
||||
super();
|
||||
this.workflow = workflow.workflow;
|
||||
this.initializedWorkflow = null;
|
||||
this.options = {
|
||||
maxRepeats: 5,
|
||||
maxConcurrency: 5,
|
||||
serializableCallback: (data) => { log(JSON.stringify(data), Level.WARN); },
|
||||
binaryCallback: () => { log('Received binary data, thrashing them.', Level.WARN); },
|
||||
debug: false,
|
||||
debugChannel: {},
|
||||
...options,
|
||||
};
|
||||
this.concurrency = new Concurrency(this.options.maxConcurrency);
|
||||
this.log = (...args) => log(...args);
|
||||
|
||||
const error = Preprocessor.validateWorkflow(workflow);
|
||||
if (error) {
|
||||
throw (error);
|
||||
}
|
||||
|
||||
if (this.options.debugChannel?.debugMessage) {
|
||||
const oldLog = this.log;
|
||||
// @ts-ignore
|
||||
this.log = (...args: Parameters<typeof oldLog>) => {
|
||||
if (args[1] !== Level.LOG) {
|
||||
this.options.debugChannel.debugMessage!(typeof args[0] === 'string' ? args[0] : args[0].message);
|
||||
}
|
||||
oldLog(...args);
|
||||
};
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Returns the context object from given Page and the current workflow.\
|
||||
* \
|
||||
* `workflow` is used for selector extraction - function searches for used selectors to
|
||||
* look for later in the page's context.
|
||||
* @param page Playwright Page object
|
||||
* @param workflow Current **initialized** workflow (array of where-what pairs).
|
||||
* @returns {PageState} State of the current page.
|
||||
*/
|
||||
private async getState(page: Page, workflow: Workflow): Promise<PageState> {
|
||||
/**
|
||||
* All the selectors present in the current Workflow
|
||||
*/
|
||||
const selectors = Preprocessor.extractSelectors(workflow);
|
||||
|
||||
/**
|
||||
* Determines whether the element targetted by the selector is [actionable](https://playwright.dev/docs/actionability).
|
||||
* @param selector Selector to be queried
|
||||
* @returns True if the targetted element is actionable, false otherwise.
|
||||
*/
|
||||
const actionable = async (selector: string): Promise<boolean> => {
|
||||
try {
|
||||
const proms = [
|
||||
page.isEnabled(selector, { timeout: 500 }),
|
||||
page.isVisible(selector, { timeout: 500 }),
|
||||
];
|
||||
|
||||
return await Promise.all(proms).then((bools) => bools.every((x) => x));
|
||||
} catch (e) {
|
||||
// log(<Error>e, Level.ERROR);
|
||||
return false;
|
||||
}
|
||||
};
|
||||
|
||||
/**
|
||||
* Object of selectors present in the current page.
|
||||
*/
|
||||
const presentSelectors: SelectorArray = await Promise.all(
|
||||
selectors.map(async (selector) => {
|
||||
if (await actionable(selector)) {
|
||||
return [selector];
|
||||
}
|
||||
return [];
|
||||
}),
|
||||
).then((x) => x.flat());
|
||||
|
||||
return {
|
||||
url: page.url(),
|
||||
cookies: (await page.context().cookies([page.url()]))
|
||||
.reduce((p, cookie) => (
|
||||
{
|
||||
...p,
|
||||
[cookie.name]: cookie.value,
|
||||
}), {}),
|
||||
selectors: presentSelectors,
|
||||
};
|
||||
}
|
||||
|
||||
/**
|
||||
* Tests if the given action is applicable with the given context.
|
||||
* @param where Tested *where* condition
|
||||
* @param context Current browser context.
|
||||
* @returns True if `where` is applicable in the given context, false otherwise
|
||||
*/
|
||||
private applicable(where: Where, context: PageState, usedActions: string[] = []): boolean {
|
||||
/**
|
||||
* Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\
|
||||
* \
|
||||
* For every key in `subset`, there must be a corresponding key with equal scalar
|
||||
* value in `superset`, or `inclusive(subset[key], superset[key])` must hold.
|
||||
* @param subset Arbitrary non-cyclic JS object (where clause)
|
||||
* @param superset Arbitrary non-cyclic JS object (browser context)
|
||||
* @returns `true` if `subset <= superset`, `false` otherwise.
|
||||
*/
|
||||
const inclusive = (subset: Record<string, unknown>, superset: Record<string, unknown>)
|
||||
: boolean => (
|
||||
Object.entries(subset).every(
|
||||
([key, value]) => {
|
||||
/**
|
||||
* Arrays are compared without order (are transformed into objects before comparison).
|
||||
*/
|
||||
const parsedValue = Array.isArray(value) ? arrayToObject(value) : value;
|
||||
|
||||
const parsedSuperset: Record<string, unknown> = {};
|
||||
parsedSuperset[key] = Array.isArray(superset[key])
|
||||
? arrayToObject(<any>superset[key])
|
||||
: superset[key];
|
||||
|
||||
// Every `subset` key must exist in the `superset` and
|
||||
// have the same value (strict equality), or subset[key] <= superset[key]
|
||||
return parsedSuperset[key]
|
||||
&& (
|
||||
(parsedSuperset[key] === parsedValue)
|
||||
|| ((parsedValue).constructor.name === 'RegExp' && (<RegExp>parsedValue).test(<string>parsedSuperset[key]))
|
||||
|| (
|
||||
(parsedValue).constructor.name !== 'RegExp'
|
||||
&& typeof parsedValue === 'object' && inclusive(<typeof subset>parsedValue, <typeof superset>parsedSuperset[key])
|
||||
)
|
||||
);
|
||||
},
|
||||
)
|
||||
);
|
||||
|
||||
// Every value in the "where" object should be compliant to the current state.
|
||||
return Object.entries(where).every(
|
||||
([key, value]) => {
|
||||
if (operators.includes(<any>key)) {
|
||||
const array = Array.isArray(value)
|
||||
? value as Where[]
|
||||
: Object.entries(value).map((a) => Object.fromEntries([a]));
|
||||
// every condition is treated as a single context
|
||||
|
||||
switch (key as keyof typeof operators) {
|
||||
case '$and':
|
||||
return array?.every((x) => this.applicable(x, context));
|
||||
case '$or':
|
||||
return array?.some((x) => this.applicable(x, context));
|
||||
case '$not':
|
||||
return !this.applicable(<Where>value, context); // $not should be a unary operator
|
||||
default:
|
||||
throw new Error('Undefined logic operator.');
|
||||
}
|
||||
} else if (meta.includes(<any>key)) {
|
||||
const testRegexString = (x: string) => {
|
||||
if (typeof value === 'string') {
|
||||
return x === value;
|
||||
}
|
||||
|
||||
return (<RegExp><unknown>value).test(x);
|
||||
};
|
||||
|
||||
switch (key as keyof typeof meta) {
|
||||
case '$before':
|
||||
return !usedActions.find(testRegexString);
|
||||
case '$after':
|
||||
return !!usedActions.find(testRegexString);
|
||||
default:
|
||||
throw new Error('Undefined meta operator.');
|
||||
}
|
||||
} else {
|
||||
// Current key is a base condition (url, cookies, selectors)
|
||||
return inclusive({ [key]: value }, context);
|
||||
}
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
/**
|
||||
* Given a Playwright's page object and a "declarative" list of actions, this function
|
||||
* calls all mentioned functions on the Page object.\
|
||||
* \
|
||||
* Manipulates the iterator indexes (experimental feature, likely to be removed in
|
||||
* the following versions of waw-interpreter)
|
||||
* @param page Playwright Page object
|
||||
* @param steps Array of actions.
|
||||
*/
|
||||
private async carryOutSteps(page: Page, steps: What[]): Promise<void> {
|
||||
/**
|
||||
* Defines overloaded (or added) methods/actions usable in the workflow.
|
||||
* If a method overloads any existing method of the Page class, it accepts the same set
|
||||
* of parameters *(but can override some!)*\
|
||||
* \
|
||||
* Also, following piece of code defines functions to be run in the browser's context.
|
||||
* Beware of false linter errors - here, we know better!
|
||||
*/
|
||||
const wawActions: Record<CustomFunctions, (...args: any[]) => void> = {
|
||||
screenshot: async (params: PageScreenshotOptions) => {
|
||||
const screenshotBuffer = await page.screenshot({
|
||||
...params, path: undefined,
|
||||
});
|
||||
await this.options.binaryCallback(screenshotBuffer, 'image/png');
|
||||
},
|
||||
enqueueLinks: async (selector: string) => {
|
||||
const links: string[] = await page.locator(selector)
|
||||
.evaluateAll(
|
||||
// @ts-ignore
|
||||
(elements) => elements.map((a) => a.href).filter((x) => x),
|
||||
);
|
||||
const context = page.context();
|
||||
|
||||
for (const link of links) {
|
||||
// eslint-disable-next-line
|
||||
this.concurrency.addJob(async () => {
|
||||
try {
|
||||
const newPage = await context.newPage();
|
||||
await newPage.goto(link);
|
||||
await newPage.waitForLoadState('networkidle');
|
||||
await this.runLoop(newPage, this.initializedWorkflow!);
|
||||
} catch (e) {
|
||||
// `runLoop` uses soft mode, so it recovers from it's own exceptions
|
||||
// but newPage(), goto() and waitForLoadState() don't (and will kill
|
||||
// the interpreter by throwing).
|
||||
this.log(<Error>e, Level.ERROR);
|
||||
}
|
||||
});
|
||||
}
|
||||
await page.close();
|
||||
},
|
||||
scrape: async (selector?: string) => {
|
||||
const scrapeResults: Record<string, string>[] = <any>await page
|
||||
// eslint-disable-next-line
|
||||
// @ts-ignore
|
||||
.evaluate((s) => scrape(s ?? null), selector);
|
||||
await this.options.serializableCallback(scrapeResults);
|
||||
},
|
||||
scrapeSchema: async (schema: Record<string, string>) => {
|
||||
const handleLists = await Promise.all(
|
||||
Object.values(schema).map((selector) => page.$$(selector)),
|
||||
);
|
||||
|
||||
const namedHandleLists = Object.fromEntries(
|
||||
Object.keys(schema).map((key, i) => [key, handleLists[i]]),
|
||||
);
|
||||
|
||||
const scrapeResult = await page.evaluate((n) => scrapeSchema(n), namedHandleLists);
|
||||
|
||||
this.options.serializableCallback(scrapeResult);
|
||||
},
|
||||
scroll: async (pages?: number) => {
|
||||
await page.evaluate(async (pagesInternal) => {
|
||||
for (let i = 1; i <= (pagesInternal ?? 1); i += 1) {
|
||||
// @ts-ignore
|
||||
window.scrollTo(0, window.scrollY + window.innerHeight);
|
||||
}
|
||||
}, pages ?? 1);
|
||||
},
|
||||
script: async (code: string) => {
|
||||
const AsyncFunction: FunctionConstructor = Object.getPrototypeOf(
|
||||
async () => { },
|
||||
).constructor;
|
||||
const x = new AsyncFunction('page', 'log', code);
|
||||
await x(page, this.log);
|
||||
},
|
||||
flag: async () => new Promise((res) => {
|
||||
this.emit('flag', page, res);
|
||||
}),
|
||||
};
|
||||
|
||||
for (const step of steps) {
|
||||
this.log(`Launching ${step.action}`, Level.LOG);
|
||||
|
||||
if (step.action in wawActions) {
|
||||
// "Arrayifying" here should not be needed (TS + syntax checker - only arrays; but why not)
|
||||
const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
|
||||
await wawActions[step.action as CustomFunctions](...(params ?? []));
|
||||
} else {
|
||||
// Implements the dot notation for the "method name" in the workflow
|
||||
const levels = step.action.split('.');
|
||||
const methodName = levels[levels.length - 1];
|
||||
|
||||
let invokee: any = page;
|
||||
for (const level of levels.splice(0, levels.length - 1)) {
|
||||
invokee = invokee[level];
|
||||
}
|
||||
|
||||
if (!step.args || Array.isArray(step.args)) {
|
||||
await (<any>invokee[methodName])(...(step.args ?? []));
|
||||
} else {
|
||||
await (<any>invokee[methodName])(step.args);
|
||||
}
|
||||
}
|
||||
|
||||
await new Promise((res) => { setTimeout(res, 500); });
|
||||
}
|
||||
}
|
||||
|
||||
private async runLoop(p: Page, workflow: Workflow) {
|
||||
const usedActions: string[] = [];
|
||||
let lastAction = null;
|
||||
let repeatCount = 0;
|
||||
|
||||
/**
|
||||
* Enables the interpreter functionality for popup windows.
|
||||
* User-requested concurrency should be entirely managed by the concurrency manager,
|
||||
* e.g. via `enqueueLinks`.
|
||||
*/
|
||||
p.on('popup', (popup) => {
|
||||
this.concurrency.addJob(() => this.runLoop(popup, workflow));
|
||||
});
|
||||
|
||||
/* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
|
||||
while (true) {
|
||||
// Checks whether the page was closed from outside,
|
||||
// or the workflow execution has been stopped via `interpreter.stop()`
|
||||
if (p.isClosed() || !this.stopper) {
|
||||
return;
|
||||
}
|
||||
|
||||
try {
|
||||
await p.waitForLoadState();
|
||||
} catch (e) {
|
||||
await p.close();
|
||||
return;
|
||||
}
|
||||
|
||||
let pageState = {};
|
||||
try {
|
||||
pageState = await this.getState(p, workflow);
|
||||
} catch (e: any) {
|
||||
this.log('The browser has been closed.');
|
||||
return;
|
||||
}
|
||||
|
||||
if (this.options.debug) {
|
||||
this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
|
||||
}
|
||||
const actionId = workflow.findIndex(
|
||||
(step) => this.applicable(step.where, pageState, usedActions),
|
||||
);
|
||||
|
||||
const action = workflow[actionId];
|
||||
|
||||
this.log(`Matched ${JSON.stringify(action?.where)}`, Level.LOG);
|
||||
|
||||
if (action) { // action is matched
|
||||
if (this.options.debugChannel?.activeId) {
|
||||
this.options.debugChannel.activeId(actionId);
|
||||
}
|
||||
|
||||
repeatCount = action === lastAction ? repeatCount + 1 : 0;
|
||||
if (this.options.maxRepeats && repeatCount >= this.options.maxRepeats) {
|
||||
return;
|
||||
}
|
||||
lastAction = action;
|
||||
|
||||
try {
|
||||
await this.carryOutSteps(p, action.what);
|
||||
usedActions.push(action.id ?? 'undefined');
|
||||
} catch (e) {
|
||||
this.log(<Error>e, Level.ERROR);
|
||||
}
|
||||
} else {
|
||||
return;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Spawns a browser context and runs given workflow.
|
||||
* \
|
||||
* Resolves after the playback is finished.
|
||||
* @param {Page} [page] Page to run the workflow on.
|
||||
* @param {ParamType} params Workflow specific, set of parameters
|
||||
* for the `{$param: nameofparam}` fields.
|
||||
*/
|
||||
public async run(page: Page, params?: ParamType): Promise<void> {
|
||||
if (this.stopper) {
|
||||
throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.');
|
||||
}
|
||||
/**
|
||||
* `this.workflow` with the parameters initialized.
|
||||
*/
|
||||
this.initializedWorkflow = Preprocessor.initWorkflow(this.workflow, params);
|
||||
|
||||
// @ts-ignore
|
||||
if (await page.evaluate(() => !<any>window.scrape)) {
|
||||
page.context().addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
|
||||
}
|
||||
|
||||
this.stopper = () => {
|
||||
this.stopper = null;
|
||||
};
|
||||
|
||||
this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow!));
|
||||
|
||||
await this.concurrency.waitForCompletion();
|
||||
|
||||
this.stopper = null;
|
||||
}
|
||||
|
||||
public async stop(): Promise<void> {
|
||||
if (this.stopper) {
|
||||
await this.stopper();
|
||||
this.stopper = null;
|
||||
} else {
|
||||
throw new Error('Cannot stop, there is no running workflow!');
|
||||
}
|
||||
}
|
||||
}
|
||||
179
maxun-core/preprocessor.ts
Normal file
179
maxun-core/preprocessor.ts
Normal file
@@ -0,0 +1,179 @@
|
||||
import Joi from 'joi';
|
||||
import {
|
||||
Workflow, WorkflowFile, ParamType, SelectorArray, Where,
|
||||
} from './types/workflow';
|
||||
import { operators } from './types/logic';
|
||||
|
||||
/**
|
||||
* Class for static processing the workflow files/objects.
|
||||
*/
|
||||
export default class Preprocessor {
|
||||
static validateWorkflow(workflow: WorkflowFile): any {
|
||||
const regex = Joi.object({
|
||||
$regex: Joi.string().required(),
|
||||
});
|
||||
|
||||
const whereSchema = Joi.object({
|
||||
url: [Joi.string().uri(), regex],
|
||||
selectors: Joi.array().items(Joi.string()),
|
||||
cookies: Joi.object({}).pattern(Joi.string(), Joi.string()),
|
||||
$after: [Joi.string(), regex],
|
||||
$before: [Joi.string(), regex],
|
||||
$and: Joi.array().items(Joi.link('#whereSchema')),
|
||||
$or: Joi.array().items(Joi.link('#whereSchema')),
|
||||
$not: Joi.link('#whereSchema'),
|
||||
}).id('whereSchema');
|
||||
|
||||
const schema = Joi.object({
|
||||
meta: Joi.object({
|
||||
name: Joi.string(),
|
||||
desc: Joi.string(),
|
||||
}),
|
||||
workflow: Joi.array().items(
|
||||
Joi.object({
|
||||
id: Joi.string(),
|
||||
where: whereSchema.required(),
|
||||
what: Joi.array().items({
|
||||
action: Joi.string().required(),
|
||||
args: Joi.array().items(Joi.any()),
|
||||
}).required(),
|
||||
}),
|
||||
).required(),
|
||||
});
|
||||
|
||||
const { error } = schema.validate(workflow);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
/**
|
||||
* Extracts parameter names from the workflow.
|
||||
* @param {WorkflowFile} workflow The given workflow
|
||||
* @returns {String[]} List of parameters' names.
|
||||
*/
|
||||
static getParams(workflow: WorkflowFile): string[] {
|
||||
const getParamsRecurse = (object: any): string[] => {
|
||||
if (typeof object === 'object') {
|
||||
// Recursion base case
|
||||
if (object.$param) {
|
||||
return [object.$param];
|
||||
}
|
||||
|
||||
// Recursion general case
|
||||
return Object.values(object)
|
||||
.reduce((p: string[], v: any): string[] => [...p, ...getParamsRecurse(v)], []);
|
||||
}
|
||||
return [];
|
||||
};
|
||||
|
||||
return getParamsRecurse(workflow.workflow);
|
||||
}
|
||||
|
||||
/**
|
||||
* List all the selectors used in the given workflow (only literal "selector"
|
||||
* field in WHERE clauses so far)
|
||||
*/
|
||||
// TODO : add recursive selector search (also in click/fill etc. events?)
|
||||
static extractSelectors(workflow: Workflow): SelectorArray {
|
||||
/**
|
||||
* Given a Where condition, this function extracts
|
||||
* all the existing selectors from it (recursively).
|
||||
*/
|
||||
const selectorsFromCondition = (where: Where): SelectorArray => {
|
||||
// the `selectors` field is either on the top level
|
||||
let out = where.selectors ?? [];
|
||||
if (!Array.isArray(out)) {
|
||||
out = [out];
|
||||
}
|
||||
|
||||
// or nested in the "operator" array
|
||||
operators.forEach((op) => {
|
||||
let condWhere = where[op];
|
||||
if (condWhere) {
|
||||
condWhere = Array.isArray(condWhere) ? condWhere : [condWhere];
|
||||
(condWhere).forEach((subWhere) => {
|
||||
out = [...out, ...selectorsFromCondition(subWhere)];
|
||||
});
|
||||
}
|
||||
});
|
||||
|
||||
return out;
|
||||
};
|
||||
|
||||
// Iterate through all the steps and extract the selectors from all of them.
|
||||
return workflow.reduce((p: SelectorArray, step) => [
|
||||
...p,
|
||||
...selectorsFromCondition(step.where).filter((x) => !p.includes(x)),
|
||||
], []);
|
||||
}
|
||||
|
||||
/**
|
||||
* Recursively crawl `object` and initializes params - replaces the `{$param : paramName}` objects
|
||||
* with the defined value.
|
||||
* @returns {Workflow} Copy of the given workflow, modified (the initial workflow is left untouched).
|
||||
*/
|
||||
static initWorkflow(workflow: Workflow, params?: ParamType): Workflow {
|
||||
const paramNames = this.getParams({ workflow });
|
||||
|
||||
if (Object.keys(params ?? {}).sort().join(',') !== paramNames.sort().join(',')) {
|
||||
throw new Error(`Provided parameters do not match the workflow parameters
|
||||
provided: ${Object.keys(params ?? {}).sort().join(',')},
|
||||
expected: ${paramNames.sort().join(',')}
|
||||
`);
|
||||
}
|
||||
/**
|
||||
* A recursive method for initializing special `{key: value}` syntax objects in the workflow.
|
||||
* @param object Workflow to initialize (or a part of it).
|
||||
* @param k key to look for ($regex, $param)
|
||||
* @param f function mutating the special `{}` syntax into
|
||||
* its true representation (RegExp...)
|
||||
* @returns Updated object
|
||||
*/
|
||||
const initSpecialRecurse = (
|
||||
object: unknown,
|
||||
k: string,
|
||||
f: (value: string) => unknown,
|
||||
): unknown => {
|
||||
if (!object || typeof object !== 'object') {
|
||||
return object;
|
||||
}
|
||||
|
||||
const out = object;
|
||||
// for every key (child) of the object
|
||||
Object.keys(object!).forEach((key) => {
|
||||
// if the field has only one key, which is `k`
|
||||
if (Object.keys((<any>object)[key]).length === 1 && (<any>object)[key][k]) {
|
||||
// process the current special tag (init param, hydrate regex...)
|
||||
(<any>out)[key] = f((<any>object)[key][k]);
|
||||
} else {
|
||||
initSpecialRecurse((<any>object)[key], k, f);
|
||||
}
|
||||
});
|
||||
return out;
|
||||
};
|
||||
|
||||
// TODO: do better deep copy, this is hideous.
|
||||
let workflowCopy = JSON.parse(JSON.stringify(workflow));
|
||||
|
||||
if (params) {
|
||||
workflowCopy = initSpecialRecurse(
|
||||
workflowCopy,
|
||||
'$param',
|
||||
(paramName) => {
|
||||
if (params && params[paramName]) {
|
||||
return params[paramName];
|
||||
}
|
||||
throw new SyntaxError(`Unspecified parameter found ${paramName}.`);
|
||||
},
|
||||
);
|
||||
}
|
||||
|
||||
workflowCopy = initSpecialRecurse(
|
||||
workflowCopy,
|
||||
'$regex',
|
||||
(regex) => new RegExp(regex),
|
||||
);
|
||||
|
||||
return <Workflow>workflowCopy;
|
||||
}
|
||||
}
|
||||
5
maxun-core/types/logic.ts
Normal file
5
maxun-core/types/logic.ts
Normal file
@@ -0,0 +1,5 @@
|
||||
export const unaryOperators = ['$not'] as const;
|
||||
export const naryOperators = ['$and', '$or'] as const;
|
||||
|
||||
export const operators = [...unaryOperators, ...naryOperators] as const;
|
||||
export const meta = ['$before', '$after'] as const;
|
||||
58
maxun-core/types/workflow.ts
Normal file
58
maxun-core/types/workflow.ts
Normal file
@@ -0,0 +1,58 @@
|
||||
import { Page } from 'playwright';
|
||||
import {
|
||||
naryOperators, unaryOperators, operators, meta,
|
||||
} from './logic';
|
||||
|
||||
export type Operator = typeof operators[number];
|
||||
export type UnaryOperator = typeof unaryOperators[number];
|
||||
export type NAryOperator = typeof naryOperators[number];
|
||||
|
||||
export type Meta = typeof meta[number];
|
||||
|
||||
export type SelectorArray = string[];
|
||||
|
||||
type RegexableString = string | { '$regex': string };
|
||||
|
||||
type BaseConditions = {
|
||||
'url': RegexableString,
|
||||
'cookies': Record<string, RegexableString>,
|
||||
'selectors': SelectorArray, // (CSS/Playwright) selectors use their own logic, there is no reason (and several technical difficulties) to allow regular expression notation
|
||||
} & Record<Meta, RegexableString>;
|
||||
|
||||
export type Where =
|
||||
Partial<{ [key in NAryOperator]: Where[] }> & // either a logic operator (arity N)
|
||||
Partial<{ [key in UnaryOperator]: Where }> & // or an unary operator
|
||||
Partial<BaseConditions>; // or one of the base conditions
|
||||
|
||||
type MethodNames<T> = {
|
||||
[K in keyof T]: T[K] extends Function ? K : never;
|
||||
}[keyof T];
|
||||
|
||||
export type CustomFunctions = 'scrape' | 'scrapeSchema' | 'scroll' | 'screenshot' | 'script' | 'enqueueLinks' | 'flag';
|
||||
|
||||
export type What = {
|
||||
action: MethodNames<Page> | CustomFunctions,
|
||||
args?: any[]
|
||||
};
|
||||
|
||||
export type PageState = Partial<BaseConditions>;
|
||||
|
||||
export type ParamType = Record<string, any>;
|
||||
|
||||
export type MetaData = {
|
||||
name?: string,
|
||||
desc?: string,
|
||||
};
|
||||
|
||||
export interface WhereWhatPair {
|
||||
id?: string
|
||||
where: Where
|
||||
what: What[]
|
||||
}
|
||||
|
||||
export type Workflow = WhereWhatPair[];
|
||||
|
||||
export type WorkflowFile = {
|
||||
meta?: MetaData,
|
||||
workflow: Workflow
|
||||
};
|
||||
85
maxun-core/utils/concurrency.ts
Normal file
85
maxun-core/utils/concurrency.ts
Normal file
@@ -0,0 +1,85 @@
|
||||
/**
|
||||
* Concurrency class for running concurrent tasks while managing a limited amount of resources.
|
||||
*/
|
||||
export default class Concurrency {
|
||||
/**
|
||||
* Maximum number of workers running in parallel. If set to `null`, there is no limit.
|
||||
*/
|
||||
maxConcurrency: number = 1;
|
||||
|
||||
/**
|
||||
* Number of currently active workers.
|
||||
*/
|
||||
activeWorkers: number = 0;
|
||||
|
||||
/**
|
||||
* Queue of jobs waiting to be completed.
|
||||
*/
|
||||
private jobQueue: Function[] = [];
|
||||
|
||||
/**
|
||||
* "Resolve" callbacks of the waitForCompletion() promises.
|
||||
*/
|
||||
private waiting: Function[] = [];
|
||||
|
||||
/**
|
||||
* Constructs a new instance of concurrency manager.
|
||||
* @param {number} maxConcurrency Maximum number of workers running in parallel.
|
||||
*/
|
||||
constructor(maxConcurrency: number) {
|
||||
this.maxConcurrency = maxConcurrency;
|
||||
}
|
||||
|
||||
/**
|
||||
* Takes a waiting job out of the queue and runs it.
|
||||
*/
|
||||
private runNextJob(): void {
|
||||
const job = this.jobQueue.pop();
|
||||
|
||||
if (job) {
|
||||
// console.debug("Running a job...");
|
||||
job().then(() => {
|
||||
// console.debug("Job finished, running the next waiting job...");
|
||||
this.runNextJob();
|
||||
});
|
||||
} else {
|
||||
// console.debug("No waiting job found!");
|
||||
this.activeWorkers -= 1;
|
||||
if (this.activeWorkers === 0) {
|
||||
// console.debug("This concurrency manager is idle!");
|
||||
this.waiting.forEach((x) => x());
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Pass a job (a time-demanding async function) to the concurrency manager. \
|
||||
* The time of the job's execution depends on the concurrency manager itself
|
||||
* (given a generous enough `maxConcurrency` value, it might be immediate,
|
||||
* but this is not guaranteed).
|
||||
* @param worker Async function to be executed (job to be processed).
|
||||
*/
|
||||
addJob(job: () => Promise<any>): void {
|
||||
// console.debug("Adding a worker!");
|
||||
this.jobQueue.push(job);
|
||||
|
||||
if (!this.maxConcurrency || this.activeWorkers < this.maxConcurrency) {
|
||||
this.runNextJob();
|
||||
this.activeWorkers += 1;
|
||||
} else {
|
||||
// console.debug("No capacity to run a worker now, waiting!");
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Waits until there is no running nor waiting job. \
|
||||
* If the concurrency manager is idle at the time of calling this function,
|
||||
* it waits until at least one job is compeleted (can be "presubscribed").
|
||||
* @returns Promise, resolved after there is no running/waiting worker.
|
||||
*/
|
||||
waitForCompletion(): Promise<void> {
|
||||
return new Promise((res) => {
|
||||
this.waiting.push(res);
|
||||
});
|
||||
}
|
||||
}
|
||||
30
maxun-core/utils/logger.ts
Normal file
30
maxun-core/utils/logger.ts
Normal file
@@ -0,0 +1,30 @@
|
||||
/*
|
||||
* Logger class for more detailed and comprehensible logs (with colors and timestamps)
|
||||
*/
|
||||
|
||||
export enum Level {
|
||||
DATE = 36,
|
||||
LOG = 0,
|
||||
WARN = 93,
|
||||
ERROR = 31,
|
||||
DEBUG = 95,
|
||||
RESET = 0,
|
||||
}
|
||||
|
||||
export default function logger(
|
||||
message: string | Error,
|
||||
level: (Level.LOG | Level.WARN | Level.ERROR | Level.DEBUG) = Level.LOG,
|
||||
) {
|
||||
let m = message;
|
||||
if (message.constructor.name.includes('Error') && typeof message !== 'string') {
|
||||
m = <Error><unknown>(message).message;
|
||||
}
|
||||
process.stdout.write(`\x1b[${Level.DATE}m[${(new Date()).toLocaleString()}]\x1b[0m `);
|
||||
process.stdout.write(`\x1b[${level}m`);
|
||||
if (level === Level.ERROR || level === Level.WARN) {
|
||||
process.stderr.write(<string>m);
|
||||
} else {
|
||||
process.stdout.write(<string>m);
|
||||
}
|
||||
process.stdout.write(`\x1b[${Level.RESET}m\n`);
|
||||
}
|
||||
13
maxun-core/utils/utils.ts
Normal file
13
maxun-core/utils/utils.ts
Normal file
@@ -0,0 +1,13 @@
|
||||
/**
|
||||
* ESLint rule in case there is only one util function
|
||||
* (it still does not represent the "utils" file)
|
||||
*/
|
||||
|
||||
/* eslint-disable import/prefer-default-export */
|
||||
|
||||
/**
|
||||
* Converts an array of scalars to an object with **items** of the array **for keys**.
|
||||
*/
|
||||
export function arrayToObject(array : any[]) {
|
||||
return array.reduce((p, x) => ({ ...p, [x]: [] }), {});
|
||||
}
|
||||
Reference in New Issue
Block a user