/* eslint-disable no-await-in-loop, no-restricted-syntax */
import { ElementHandle, Page, PageScreenshotOptions } from 'playwright-core';
import fetch from 'cross-fetch';
import path from 'path';

import { EventEmitter } from 'events';
import {
  Where, What, PageState, Workflow, WorkflowFile,
  ParamType, SelectorArray, CustomFunctions,
} from './types/workflow';

import { operators, meta } from './types/logic';
import { arrayToObject } from './utils/utils';
import Concurrency from './utils/concurrency';
import Preprocessor from './preprocessor';
import log, { Level } from './utils/logger';
/**
 * Extending the Window interface for custom scraping functions.
 */
declare global {
  interface Window {
    scrape: (selector: string | null) => Record<string, string>[];
    scrapeSchema: (
      schema: Record<string, { selector: string; tag: string; attribute: string }>
    ) => Record<string, any>;
    scrapeList: (config: { listSelector: string; fields: any; limit?: number; pagination: any }) => Record<string, any>[];
    scrapeListAuto: (listSelector: string) => { selector: string; innerText: string }[];
    scrollDown: (pages?: number) => void;
    scrollUp: (pages?: number) => void;
  }
}
/**
 * Defines optional interpreter options (passed in constructor)
 */
interface InterpreterOptions {
  mode?: string;
  maxRepeats: number;
  maxConcurrency: number;
  serializableCallback: (output: any) => (void | Promise<void>);
  binaryCallback: (output: any, mimeType: string) => (void | Promise<void>);
  debug: boolean;
  debugChannel: Partial<{
    activeId: (id: number) => void,
    debugMessage: (msg: string) => void,
    setActionType: (type: string) => void,
    incrementScrapeListIndex: () => void,
    progressUpdate: (current: number, total: number, percentage: number) => void,
  }>
}
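
// Example (hypothetical callbacks) of constructing an interpreter with these
// options; `workflowFile` is assumed to be an already-parsed WorkflowFile and
// `saveToDisk` a placeholder for whatever binary sink the caller provides:
//
//   const interpreter = new Interpreter(workflowFile, {
//     maxConcurrency: 2,
//     serializableCallback: (data) => console.log(JSON.stringify(data)),
//     binaryCallback: (file, mimeType) => saveToDisk(file, mimeType),
//   });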

/**
 * Class for running the Smart Workflows.
 */
export default class Interpreter extends EventEmitter {
  private workflow: Workflow;

  private initializedWorkflow: Workflow | null;

  private options: InterpreterOptions;

  private concurrency: Concurrency;

  private stopper: Function | null = null;

  private isAborted: boolean = false;

  private log: typeof log;

  // private blocker: PlaywrightBlocker | null = null;

  private cumulativeResults: Record<string, any>[] = [];

  private namedResults: Record<string, Record<string, any>> = {};

  private screenshotCounter: number = 0;

  private serializableDataByType: Record<string, Record<string, any>> = {
    scrapeList: {},
    scrapeSchema: {},
    crawl: {},
    search: {}
  };

  private scrapeListCounter: number = 0;

  private totalActions: number = 0;

  private executedActions: number = 0;

  constructor(workflow: WorkflowFile, options?: Partial<InterpreterOptions>) {
    super();
    this.workflow = workflow.workflow;
    this.initializedWorkflow = null;
    this.options = {
      maxRepeats: 5,
      maxConcurrency: 5,
      serializableCallback: (data) => {
        log(JSON.stringify(data), Level.WARN);
      },
      binaryCallback: () => { log('Received binary data, discarding it.', Level.WARN); },
      debug: false,
      debugChannel: {},
      ...options,
    };
    this.concurrency = new Concurrency(this.options.maxConcurrency);
    this.log = (...args) => log(...args);

    const error = Preprocessor.validateWorkflow(workflow);
    if (error) {
      throw (error);
    }

    if (this.options.debugChannel?.debugMessage) {
      const oldLog = this.log;
      // @ts-ignore
      this.log = (...args: Parameters<typeof oldLog>) => {
        if (args[1] !== Level.LOG) {
          this.options.debugChannel.debugMessage!(typeof args[0] === 'string' ? args[0] : args[0].message);
        }
        oldLog(...args);
      };
    }

    // PlaywrightBlocker.fromLists(fetch, ['https://easylist.to/easylist/easylist.txt']).then(blocker => {
    //   this.blocker = blocker;
    // }).catch(err => {
    //   this.log(`Failed to initialize ad-blocker: ${err.message}`, Level.ERROR);
    //   // Continue without ad-blocker rather than crashing
    //   this.blocker = null;
    // })
  }

  /**
   * Sets the abort flag to immediately stop all operations
   */
  public abort(): void {
    this.isAborted = true;
  }

  /**
   * Returns the current abort status
   */
  public getIsAborted(): boolean {
    return this.isAborted;
  }

  // private getSelectors(workflow: Workflow, actionId: number): string[] {
  //   const selectors: string[] = [];

  //   // Validate actionId
  //   if (actionId <= 0) {
  //     console.log("No previous selectors to collect.");
  //     return selectors; // Empty array as there are no previous steps
  //   }

  //   // Iterate from the start up to (but not including) actionId
  //   for (let index = 0; index < actionId; index++) {
  //     const currentSelectors = workflow[index]?.where?.selectors;
  //     console.log(`Selectors at step ${index}:`, currentSelectors);

  //     if (currentSelectors && currentSelectors.length > 0) {
  //       currentSelectors.forEach((selector) => {
  //         if (!selectors.includes(selector)) {
  //           selectors.push(selector); // Avoid duplicates
  //         }
  //       });
  //     }
  //   }

  //   console.log("Collected Selectors:", selectors);
  //   return selectors;
  // }

  private getSelectors(workflow: Workflow): string[] {
    const selectorsSet = new Set<string>();

    if (workflow.length === 0) {
      return [];
    }

    for (let index = workflow.length - 1; index >= 0; index--) {
      const currentSelectors = workflow[index]?.where?.selectors;

      if (currentSelectors && currentSelectors.length > 0) {
        currentSelectors.forEach((selector) => selectorsSet.add(selector));
        return Array.from(selectorsSet);
      }
    }

    return [];
  }
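
  // Walking backwards means only the most recent step that declares selectors
  // wins. Illustrative input/output (hypothetical workflow):
  //   [{ where: { selectors: ['#a'] } }, { where: { selectors: ['#b', '#c'] } }]
  // yields ['#b', '#c']; the earlier '#a' is never reached.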

  /**
   * Returns the context object from the given Page and the current workflow.\
   * \
   * `workflow` is used for selector extraction - the function searches for used selectors to
   * look for later in the page's context.
   * @param page Playwright Page object
   * @param workflowCopy Current **initialized** workflow (array of where-what pairs).
   * @param selectors Selectors extracted from the workflow to test against the page.
   * @returns {PageState} State of the current page.
   */
  private async getState(page: Page, workflowCopy: Workflow, selectors: string[]): Promise<PageState> {
    /**
     * All the selectors present in the current Workflow
     */
    // const selectors = Preprocessor.extractSelectors(workflow);
    // console.log("Current selectors:", selectors);

    /**
     * Determines whether the element targeted by the selector is [actionable](https://playwright.dev/docs/actionability).
     * @param selector Selector to be queried
     * @returns True if the targeted element is actionable, false otherwise.
     */
    // const actionable = async (selector: string): Promise<boolean> => {
    //   try {
    //     const proms = [
    //       page.isEnabled(selector, { timeout: 10000 }),
    //       page.isVisible(selector, { timeout: 10000 }),
    //     ];

    //     return await Promise.all(proms).then((bools) => bools.every((x) => x));
    //   } catch (e) {
    //     // log(<Error>e, Level.ERROR);
    //     return false;
    //   }
    // };

    /**
     * Object of selectors present in the current page.
     */
    // const presentSelectors: SelectorArray = await Promise.all(
    //   selectors.map(async (selector) => {
    //     if (await actionable(selector)) {
    //       return [selector];
    //     }
    //     return [];
    //   }),
    // ).then((x) => x.flat());

    const presentSelectors: SelectorArray = await Promise.all(
      selectors.map(async (selector) => {
        try {
          await page.waitForSelector(selector, { state: 'attached' });
          return [selector];
        } catch (e) {
          return [];
        }
      }),
    ).then((x) => x.flat());

    const action = workflowCopy[workflowCopy.length - 1];

    // console.log("Next action:", action)

    let url: any = page.url();

    if (action && action.where.url !== url && action.where.url !== "about:blank") {
      url = action.where.url;
    }

    return {
      url,
      cookies: (await page.context().cookies([page.url()]))
        .reduce((p, cookie) => (
          {
            ...p,
            [cookie.name]: cookie.value,
          }), {}),
      selectors: presentSelectors,
    };
  }
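
  // Shape of the PageState produced above (illustrative values):
  //   {
  //     url: 'https://example.com/login',
  //     cookies: { sessionid: 'abc123' },
  //     selectors: ['#username', '#password'],
  //   }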

  /**
   * Tests if the given action is applicable with the given context.
   * @param where Tested *where* condition
   * @param context Current browser context.
   * @returns True if `where` is applicable in the given context, false otherwise
   */
  private applicable(where: Where, context: PageState, usedActions: string[] = []): boolean {
    /**
     * Given two arbitrary objects, determines whether `subset` is a subset of `superset`.\
     * \
     * For every key in `subset`, there must be a corresponding key with equal scalar
     * value in `superset`, or `inclusive(subset[key], superset[key])` must hold.
     * @param subset Arbitrary non-cyclic JS object (where clause)
     * @param superset Arbitrary non-cyclic JS object (browser context)
     * @returns `true` if `subset <= superset`, `false` otherwise.
     */
    const inclusive = (subset: Record<string, unknown>, superset: Record<string, unknown>)
      : boolean => (
      Object.entries(subset).every(
        ([key, value]) => {
          /**
           * Arrays are compared without order (are transformed into objects before comparison).
           */
          const parsedValue = Array.isArray(value) ? arrayToObject(value) : value;

          const parsedSuperset: Record<string, unknown> = {};
          parsedSuperset[key] = Array.isArray(superset[key])
            ? arrayToObject(<any>superset[key])
            : superset[key];

          if ((key === 'url' || key === 'selectors') &&
            Array.isArray(value) && Array.isArray(superset[key]) &&
            value.length === 0 && (superset[key] as any[]).length === 0) {
            return true;
          }

          if (key === 'selectors' && Array.isArray(value) && Array.isArray(superset[key])) {
            return value.some(selector =>
              (superset[key] as any[]).includes(selector)
            );
          }

          // Every `subset` key must exist in the `superset` and
          // have the same value (strict equality), or subset[key] <= superset[key]
          return parsedSuperset[key]
            && (
              (parsedSuperset[key] === parsedValue)
              || ((parsedValue).constructor.name === 'RegExp' && (<RegExp>parsedValue).test(<string>parsedSuperset[key]))
              || (
                (parsedValue).constructor.name !== 'RegExp'
                && typeof parsedValue === 'object' && inclusive(<typeof subset>parsedValue, <typeof superset>parsedSuperset[key])
              )
            );
        },
      )
    );

    // Every value in the "where" object should be compliant with the current state.
    return Object.entries(where).every(
      ([key, value]) => {
        if (operators.includes(<any>key)) {
          const array = Array.isArray(value)
            ? value as Where[]
            : Object.entries(value).map((a) => Object.fromEntries([a]));
          // every condition is treated as a single context

          switch (key as keyof typeof operators) {
            case '$and' as keyof typeof operators:
              return array?.every((x) => this.applicable(x, context));
            case '$or' as keyof typeof operators:
              return array?.some((x) => this.applicable(x, context));
            case '$not' as keyof typeof operators:
              return !this.applicable(<Where>value, context); // $not should be a unary operator
            default:
              throw new Error('Undefined logic operator.');
          }
        } else if (meta.includes(<any>key)) {
          const testRegexString = (x: string) => {
            if (typeof value === 'string') {
              return x === value;
            }

            return (<RegExp><unknown>value).test(x);
          };

          switch (key as keyof typeof meta) {
            case '$before' as keyof typeof meta:
              return !usedActions.find(testRegexString);
            case '$after' as keyof typeof meta:
              return !!usedActions.find(testRegexString);
            default:
              throw new Error('Undefined meta operator.');
          }
        } else {
          // Current key is a base condition (url, cookies, selectors)
          return inclusive({ [key]: value }, context);
        }
      },
    );
  }
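
  // A `where` clause combines base conditions with logic and meta operators.
  // Hypothetical example that `applicable` would accept:
  //   {
  //     $and: [
  //       { url: 'https://example.com/login' },
  //       { $not: { selectors: ['#logged-in-banner'] } },
  //     ],
  //     $before: 'submitLogin',
  //   }
  // matches while the login page is shown, the logged-in banner is absent,
  // and no action named 'submitLogin' has run yet.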

  /**
   * Given a Playwright Page object and a "declarative" list of actions, this function
   * calls all mentioned functions on the Page object.\
   * \
   * Manipulates the iterator indexes (experimental feature, likely to be removed in
   * the following versions of maxun-core)
   * @param page Playwright Page object
   * @param steps Array of actions.
   */
  private async carryOutSteps(page: Page, steps: What[]): Promise<void> {
    if (this.isAborted) {
      this.log('Workflow aborted, stopping execution', Level.WARN);
      return;
    }

    /**
     * Defines overloaded (or added) methods/actions usable in the workflow.
     * If a method overloads any existing method of the Page class, it accepts the same set
     * of parameters *(but can override some!)*\
     * \
     * Also, the following piece of code defines functions to be run in the browser's context.
     * Beware of false linter errors - here, we know better!
     */
    const wawActions: Record<CustomFunctions, (...args: any[]) => void> = {
      screenshot: async (
        params: PageScreenshotOptions,
        nameOverride?: string
      ) => {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType("screenshot");
        }

        const screenshotBuffer = await page.screenshot({
          ...params,
          path: undefined,
        });

        const explicitName = (typeof nameOverride === 'string' && nameOverride.trim().length > 0) ? nameOverride.trim() : null;
        let screenshotName: string;

        if (explicitName) {
          screenshotName = explicitName;
        } else {
          this.screenshotCounter += 1;
          screenshotName = `Screenshot ${this.screenshotCounter}`;
        }

        await this.options.binaryCallback(
          {
            name: screenshotName,
            data: screenshotBuffer,
            mimeType: "image/png",
          },
          "image/png"
        );
      },
      enqueueLinks: async (selector: string) => {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('enqueueLinks');
        }

        const links: string[] = await page.locator(selector)
          .evaluateAll(
            // @ts-ignore
            (elements) => elements.map((a) => a.href).filter((x) => x),
          );
        const context = page.context();

        for (const link of links) {
          // eslint-disable-next-line
          this.concurrency.addJob(async () => {
            let newPage = null;
            try {
              newPage = await context.newPage();
              await newPage.goto(link);
              await newPage.waitForLoadState('networkidle');
              await this.runLoop(newPage, this.initializedWorkflow!);
            } catch (e) {
              // `runLoop` uses soft mode, so it recovers from its own exceptions,
              // but newPage(), goto() and waitForLoadState() don't (and will kill
              // the interpreter by throwing).
              this.log(<Error>e, Level.ERROR);
            } finally {
              if (newPage && !newPage.isClosed()) {
                try {
                  await newPage.close();
                } catch (closeError) {
                  this.log('Failed to close enqueued page', Level.WARN);
                }
              }
            }
          });
        }
        await page.close();
      },
      scrape: async (selector?: string) => {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('scrape');
        }

        await this.ensureScriptsLoaded(page);

        const scrapeResults: Record<string, string>[] = await page.evaluate((s) => window.scrape(s ?? null), selector);
        await this.options.serializableCallback(scrapeResults);
      },

      scrapeSchema: async (schema: Record<string, { selector: string; tag: string; attribute: string; shadow: string }>, actionName: string = "") => {
        if (this.isAborted) {
          this.log('Workflow aborted, stopping scrapeSchema', Level.WARN);
          return;
        }

        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('scrapeSchema');
        }

        if (this.options.mode && this.options.mode === 'editor') {
          await this.options.serializableCallback({});
          return;
        }

        await this.ensureScriptsLoaded(page);

        const scrapeResult = await page.evaluate((schemaObj) => window.scrapeSchema(schemaObj), schema);

        if (!this.cumulativeResults || !Array.isArray(this.cumulativeResults)) {
          this.cumulativeResults = [];
        }

        const resultToProcess = Array.isArray(scrapeResult) ? scrapeResult[0] : scrapeResult;

        if (this.cumulativeResults.length === 0) {
          const newRow: Record<string, any> = {};
          Object.entries(resultToProcess).forEach(([key, value]) => {
            if (value !== undefined) {
              newRow[key] = value;
            }
          });
          this.cumulativeResults.push(newRow);
        } else {
          const lastRow = this.cumulativeResults[this.cumulativeResults.length - 1];
          const newResultKeys = Object.keys(resultToProcess).filter(key => resultToProcess[key] !== undefined);
          const hasRepeatedKeys = newResultKeys.some(key => lastRow.hasOwnProperty(key));

          if (hasRepeatedKeys) {
            // A key we already filled appeared again, so this is a new record.
            const newRow: Record<string, any> = {};
            Object.entries(resultToProcess).forEach(([key, value]) => {
              if (value !== undefined) {
                newRow[key] = value;
              }
            });
            this.cumulativeResults.push(newRow);
          } else {
            // Only new keys: merge them into the row under construction.
            Object.entries(resultToProcess).forEach(([key, value]) => {
              if (value !== undefined) {
                lastRow[key] = value;
              }
            });
          }
        }

        const actionType = "scrapeSchema";
        const name = actionName || "Texts";

        if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
        this.namedResults[actionType][name] = this.cumulativeResults;

        if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
        if (!this.serializableDataByType[actionType][name]) {
          this.serializableDataByType[actionType][name] = [];
        }

        this.serializableDataByType[actionType][name] = [...this.cumulativeResults];

        await this.options.serializableCallback({
          scrapeList: this.serializableDataByType.scrapeList,
          scrapeSchema: this.serializableDataByType.scrapeSchema,
          crawl: this.serializableDataByType.crawl || {},
          search: this.serializableDataByType.search || {}
        });
      },
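
      // Hypothetical schema argument for scrapeSchema: each output column maps
      // to a selector plus the tag/attribute to read from the matched element:
      //   {
      //     title: { selector: 'h1.product', tag: 'H1', attribute: 'innerText', shadow: '' },
      //     price: { selector: '.price', tag: 'SPAN', attribute: 'innerText', shadow: '' },
      //   }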

      scrapeList: async (config: { listSelector: string, fields: any, limit?: number, pagination: any }, actionName: string = "") => {
        if (this.isAborted) {
          this.log('Workflow aborted, stopping scrapeList', Level.WARN);
          return;
        }

        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('scrapeList');
        }

        if (this.options.mode && this.options.mode === 'editor') {
          await this.options.serializableCallback({});
          return;
        }

        try {
          await this.ensureScriptsLoaded(page);

          if (this.options.debugChannel?.incrementScrapeListIndex) {
            this.options.debugChannel.incrementScrapeListIndex();
          }

          let scrapeResults = [];
          let paginationUsed = false;

          if (!config.pagination) {
            scrapeResults = await page.evaluate((cfg) => {
              try {
                return window.scrapeList(cfg);
              } catch (error) {
                console.warn('ScrapeList evaluation failed:', error.message);
                return [];
              }
            }, config);
          } else {
            paginationUsed = true;
            scrapeResults = await this.handlePagination(page, config, actionName);
          }

          if (!Array.isArray(scrapeResults)) {
            scrapeResults = [];
          }

          console.log(`ScrapeList completed with ${scrapeResults.length} results`);

          if (!paginationUsed) {
            const actionType = "scrapeList";
            let name = actionName || "";

            if (!name || name.trim() === "") {
              this.scrapeListCounter++;
              name = `List ${this.scrapeListCounter}`;
            }

            if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
            if (!this.serializableDataByType[actionType][name]) {
              this.serializableDataByType[actionType][name] = [];
            }

            this.serializableDataByType[actionType][name].push(...scrapeResults);

            await this.options.serializableCallback({
              scrapeList: this.serializableDataByType.scrapeList,
              scrapeSchema: this.serializableDataByType.scrapeSchema
            });
          }
        } catch (error) {
          console.error('ScrapeList action failed completely:', error.message);

          const actionType = "scrapeList";
          let name = actionName || "";

          if (!name || name.trim() === "") {
            this.scrapeListCounter++;
            name = `List ${this.scrapeListCounter}`;
          }

          if (!this.namedResults[actionType]) this.namedResults[actionType] = {};
          this.namedResults[actionType][name] = [];

          if (!this.serializableDataByType[actionType]) this.serializableDataByType[actionType] = {};
          this.serializableDataByType[actionType][name] = [];

          await this.options.serializableCallback({
            scrapeList: this.serializableDataByType.scrapeList,
            scrapeSchema: this.serializableDataByType.scrapeSchema
          });
        }
      },

      scrapeListAuto: async (config: { listSelector: string }) => {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('scrapeListAuto');
        }

        await this.ensureScriptsLoaded(page);

        const scrapeResults: { selector: string, innerText: string }[] = await page.evaluate((listSelector) => {
          return window.scrapeListAuto(listSelector);
        }, config.listSelector);

        await this.options.serializableCallback(scrapeResults);
      },

      scroll: async (pages?: number) => {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('scroll');
        }

        await page.evaluate(async (pagesInternal) => {
          for (let i = 1; i <= (pagesInternal ?? 1); i += 1) {
            // @ts-ignore
            window.scrollTo(0, window.scrollY + window.innerHeight);
          }
        }, pages ?? 1);
      },

      script: async (code: string) => {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('script');
        }

        try {
          // Grab the AsyncFunction constructor so the user-supplied code may await.
          const AsyncFunction: FunctionConstructor = Object.getPrototypeOf(
            async () => { },
          ).constructor;
          const x = new AsyncFunction('page', 'log', code);
          await x(page, this.log);
        } catch (error) {
          this.log(`Script execution failed: ${error.message}`, Level.ERROR);
          throw new Error(`Script execution error: ${error.message}`);
        }
      },
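
      // A `script` step receives the Playwright page and the interpreter's
      // logger as `page` and `log`. Hypothetical workflow step:
      //   { action: 'script', args: ['const t = await page.title(); log(t);'] }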

      crawl: async (crawlConfig: {
        mode: 'domain' | 'subdomain' | 'path';
        limit: number;
        maxDepth: number;
        includePaths: string[];
        excludePaths: string[];
        useSitemap: boolean;
        followLinks: boolean;
        respectRobots: boolean;
      }) => {
        if (this.isAborted) {
          this.log('Workflow aborted, stopping crawl', Level.WARN);
          return;
        }

        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('crawl');
        }

        this.log('Starting crawl operation', Level.LOG);

        try {
          const currentUrl = page.url();
          this.log(`Current page URL: ${currentUrl}`, Level.LOG);

          if (!currentUrl || currentUrl === 'about:blank' || currentUrl === '') {
            this.log('Page not yet navigated, waiting for navigation...', Level.WARN);
            await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});
          }

          const baseUrl = page.url();
          this.log(`Using base URL for crawl: ${baseUrl}`, Level.LOG);

          const parsedBase = new URL(baseUrl);
          const baseDomain = parsedBase.hostname;

          interface RobotRules {
            disallowedPaths: string[];
            allowedPaths: string[];
            crawlDelay: number | null;
          }

          let robotRules: RobotRules = {
            disallowedPaths: [],
            allowedPaths: [],
            crawlDelay: null
          };

          if (crawlConfig.respectRobots) {
            this.log('Fetching robots.txt...', Level.LOG);
            try {
              const robotsUrl = `${parsedBase.protocol}//${parsedBase.host}/robots.txt`;

              const robotsContent = await page.evaluate((url) => {
                return new Promise<string>((resolve) => {
                  const xhr = new XMLHttpRequest();
                  xhr.open('GET', url, true);
                  xhr.onload = function() {
                    if (xhr.status === 200) {
                      resolve(xhr.responseText);
                    } else {
                      resolve('');
                    }
                  };
                  xhr.onerror = function() {
                    resolve('');
                  };
                  xhr.send();
                });
              }, robotsUrl);

              if (robotsContent) {
                const lines = robotsContent.split('\n');
                let isRelevantUserAgent = false;
                let foundSpecificUserAgent = false;

                for (const line of lines) {
                  const trimmedLine = line.trim().toLowerCase();

                  if (trimmedLine.startsWith('#') || trimmedLine === '') {
                    continue;
                  }

                  const colonIndex = line.indexOf(':');
                  if (colonIndex === -1) continue;

                  const directive = line.substring(0, colonIndex).trim().toLowerCase();
                  const value = line.substring(colonIndex + 1).trim();

                  if (directive === 'user-agent') {
                    const agent = value.toLowerCase();
                    if (agent === '*' && !foundSpecificUserAgent) {
                      isRelevantUserAgent = true;
                    } else if (agent.includes('bot') || agent.includes('crawler') || agent.includes('spider')) {
                      isRelevantUserAgent = true;
                      foundSpecificUserAgent = true;
                    } else {
                      if (!foundSpecificUserAgent) {
                        isRelevantUserAgent = false;
                      }
                    }
                  } else if (isRelevantUserAgent) {
                    if (directive === 'disallow' && value) {
                      robotRules.disallowedPaths.push(value);
                    } else if (directive === 'allow' && value) {
                      robotRules.allowedPaths.push(value);
                    } else if (directive === 'crawl-delay' && value) {
                      const delay = parseFloat(value);
                      if (!isNaN(delay) && delay > 0) {
                        robotRules.crawlDelay = delay * 1000;
                      }
                    }
                  }
                }

                this.log(`Robots.txt parsed: ${robotRules.disallowedPaths.length} disallowed paths, ${robotRules.allowedPaths.length} allowed paths, crawl-delay: ${robotRules.crawlDelay || 'none'}`, Level.LOG);
              } else {
                this.log('No robots.txt found or not accessible, proceeding without restrictions', Level.WARN);
              }
            } catch (error) {
              this.log(`Failed to fetch robots.txt: ${error.message}, proceeding without restrictions`, Level.WARN);
            }
          }

          const isUrlAllowedByRobots = (url: string): boolean => {
            if (!crawlConfig.respectRobots) return true;

            try {
              const urlObj = new URL(url);
              const pathname = urlObj.pathname;

              for (const allowedPath of robotRules.allowedPaths) {
                if (allowedPath === pathname || pathname.startsWith(allowedPath)) {
                  return true;
                }
                if (allowedPath.includes('*')) {
                  const regex = new RegExp('^' + allowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
                  if (regex.test(pathname)) {
                    return true;
                  }
                }
              }

              for (const disallowedPath of robotRules.disallowedPaths) {
                if (disallowedPath === '/') {
                  return false;
                }
                if (pathname.startsWith(disallowedPath)) {
                  return false;
                }
                if (disallowedPath.includes('*')) {
                  const regex = new RegExp('^' + disallowedPath.replace(/\*/g, '.*').replace(/\?/g, '.') + '$');
                  if (regex.test(pathname)) {
                    return false;
                  }
                }
                if (disallowedPath.endsWith('$')) {
                  const pattern = disallowedPath.slice(0, -1);
                  if (pathname === pattern || pathname.endsWith(pattern)) {
                    return false;
                  }
                }
              }

              return true;
            } catch (error) {
              return true;
            }
          };
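
          // Rule-matching sketch (hypothetical robots.txt): with
          //   Disallow: /admin/*   and   Allow: /admin/public
          // '/admin/public/page' passes (the allow prefix is checked first),
          // while '/admin/secret' is rejected by the wildcard regex built above.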

          const isUrlAllowedByConfig = (url: string): boolean => {
            try {
              const urlObj = new URL(url);

              if (crawlConfig.mode === 'domain') {
                if (urlObj.hostname !== baseDomain) return false;
              } else if (crawlConfig.mode === 'subdomain') {
                if (!urlObj.hostname.endsWith(baseDomain) && urlObj.hostname !== baseDomain) return false;
              } else if (crawlConfig.mode === 'path') {
                if (urlObj.hostname !== baseDomain || !urlObj.pathname.startsWith(parsedBase.pathname)) return false;
              }

              if (crawlConfig.includePaths && crawlConfig.includePaths.length > 0) {
                const matches = crawlConfig.includePaths.some(pattern => {
                  try {
                    const regex = new RegExp(pattern);
                    return regex.test(url);
                  } catch {
                    return url.includes(pattern);
                  }
                });
                if (!matches) return false;
              }

              if (crawlConfig.excludePaths && crawlConfig.excludePaths.length > 0) {
                const matches = crawlConfig.excludePaths.some(pattern => {
                  try {
                    const regex = new RegExp(pattern);
                    return regex.test(url);
                  } catch {
                    return url.includes(pattern);
                  }
                });
                if (matches) return false;
              }

              return true;
            } catch (error) {
              return false;
            }
          };

          const normalizeUrl = (url: string): string => {
            return url.replace(/#.*$/, '').replace(/\/$/, '');
          };
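
          // normalizeUrl strips fragments and trailing slashes so the visited
          // set treats equivalent URLs as one entry, e.g. (illustrative):
          //   'https://example.com/docs/#intro' -> 'https://example.com/docs'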

          const extractLinksFromPage = async (): Promise<string[]> => {
            try {
              await page.waitForLoadState('load', { timeout: 15000 }).catch(() => {});
              await page.waitForLoadState('networkidle', { timeout: 10000 }).catch(() => {});
              await new Promise(resolve => setTimeout(resolve, 1000));

              const pageLinks = await page.evaluate(() => {
                const links: string[] = [];
                const allAnchors = document.querySelectorAll('a');

                for (let i = 0; i < allAnchors.length; i++) {
                  const anchor = allAnchors[i] as HTMLAnchorElement;
                  const fullHref = anchor.href;

                  if (fullHref && (fullHref.startsWith('http://') || fullHref.startsWith('https://'))) {
                    links.push(fullHref);
                  }
                }

                return links;
              });

              return pageLinks;
            } catch (error) {
              this.log(`Link extraction failed: ${error.message}`, Level.WARN);
              return [];
            }
          };

          const scrapePageContent = async (url: string) => {
            const pageData = await page.evaluate(() => {
              const getMeta = (name: string) => {
                const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
                return meta?.getAttribute('content') || '';
              };

              const getAllMeta = () => {
                const metadata: Record<string, string> = {};
                const metaTags = document.querySelectorAll('meta');
                metaTags.forEach(tag => {
                  const name = tag.getAttribute('name') || tag.getAttribute('property');
                  const content = tag.getAttribute('content');
                  if (name && content) {
                    metadata[name] = content;
                  }
                });
                return metadata;
              };

              const title = document.title || '';
              const bodyText = document.body?.innerText || '';

              const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
              elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));

              const html = document.documentElement.outerHTML;
              const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
              const allMetadata = getAllMeta();

              return {
                title,
                description: getMeta('description'),
                text: bodyText,
                html: html,
                links: links,
                wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
                metadata: {
                  ...allMetadata,
                  title,
                  language: document.documentElement.lang || '',
                  favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '',
                  statusCode: 200
                }
              };
            });

            return {
              metadata: {
                ...pageData.metadata,
                url: url,
                sourceURL: url
              } as Record<string, any>,
              html: pageData.html,
              text: pageData.text,
              links: pageData.links,
              wordCount: pageData.wordCount,
              scrapedAt: new Date().toISOString()
            };
          };

          const visitedUrls = new Set<string>();
          const crawlResults: any[] = [];

          interface CrawlQueueItem {
            url: string;
            depth: number;
          }

          const crawlQueue: CrawlQueueItem[] = [];

          const normalizedBaseUrl = normalizeUrl(baseUrl);
          visitedUrls.add(normalizedBaseUrl);
          crawlQueue.push({ url: baseUrl, depth: 0 });

          this.log(`Starting breadth-first crawl with maxDepth: ${crawlConfig.maxDepth}, limit: ${crawlConfig.limit}`, Level.LOG);

          if (crawlConfig.useSitemap) {
            this.log('Fetching sitemap URLs...', Level.LOG);
            try {
              const sitemapUrl = `${parsedBase.protocol}//${parsedBase.host}/sitemap.xml`;

              const sitemapUrls = await page.evaluate((url) => {
                return new Promise<string[]>((resolve) => {
                  const xhr = new XMLHttpRequest();
                  xhr.open('GET', url, true);
                  xhr.onload = function() {
                    if (xhr.status === 200) {
                      const text = xhr.responseText;
                      const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
                      const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
                      resolve(urls);
                    } else {
                      resolve([]);
                    }
                  };
                  xhr.onerror = function() {
                    resolve([]);
                  };
                  xhr.send();
                });
              }, sitemapUrl);

              if (sitemapUrls.length > 0) {
                const nestedSitemaps = sitemapUrls.filter(url =>
                  url.endsWith('/sitemap') || url.endsWith('sitemap.xml') || url.includes('/sitemap/')
                );
                const regularUrls = sitemapUrls.filter(url =>
                  !url.endsWith('/sitemap') && !url.endsWith('sitemap.xml') && !url.includes('/sitemap/')
                );

                for (const sitemapPageUrl of regularUrls) {
                  const normalized = normalizeUrl(sitemapPageUrl);
                  if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(sitemapPageUrl) && isUrlAllowedByRobots(sitemapPageUrl)) {
                    visitedUrls.add(normalized);
                    crawlQueue.push({ url: sitemapPageUrl, depth: 1 });
                  }
                }

                this.log(`Found ${regularUrls.length} regular URLs from main sitemap`, Level.LOG);

                for (const nestedUrl of nestedSitemaps.slice(0, 10)) {
                  try {
                    this.log(`Fetching nested sitemap: ${nestedUrl}`, Level.LOG);
                    const nestedUrls = await page.evaluate((url) => {
                      return new Promise<string[]>((resolve) => {
                        const xhr = new XMLHttpRequest();
                        xhr.open('GET', url, true);
                        xhr.onload = function() {
                          if (xhr.status === 200) {
                            const text = xhr.responseText;
                            const locMatches = text.match(/<loc>(.*?)<\/loc>/g) || [];
                            const urls = locMatches.map(match => match.replace(/<\/?loc>/g, ''));
                            resolve(urls);
                          } else {
                            resolve([]);
                          }
                        };
                        xhr.onerror = function() {
                          resolve([]);
                        };
                        xhr.send();
                      });
                    }, nestedUrl);

                    for (const nestedPageUrl of nestedUrls) {
                      const normalized = normalizeUrl(nestedPageUrl);
                      if (!visitedUrls.has(normalized) && isUrlAllowedByConfig(nestedPageUrl) && isUrlAllowedByRobots(nestedPageUrl)) {
                        visitedUrls.add(normalized);
                        crawlQueue.push({ url: nestedPageUrl, depth: 1 });
                      }
                    }

                    this.log(`Found ${nestedUrls.length} URLs from nested sitemap ${nestedUrl}`, Level.LOG);
                  } catch (error) {
                    this.log(`Failed to fetch nested sitemap ${nestedUrl}: ${error.message}`, Level.WARN);
                  }
                }

                this.log(`Total URLs queued from sitemaps: ${crawlQueue.length - 1}`, Level.LOG);
              } else {
                this.log('No URLs found in sitemap or sitemap not available', Level.WARN);
              }
            } catch (error) {
              this.log(`Sitemap fetch failed: ${error.message}`, Level.WARN);
            }
          }

          let processedCount = 0;

          while (crawlQueue.length > 0 && crawlResults.length < crawlConfig.limit) {
            if (this.isAborted) {
              this.log('Workflow aborted during crawl', Level.WARN);
              break;
            }

            const { url, depth } = crawlQueue.shift()!;
            processedCount++;

            this.log(`[${crawlResults.length + 1}/${crawlConfig.limit}] Crawling (depth ${depth}): ${url}`, Level.LOG);

            try {
              if (robotRules.crawlDelay && crawlResults.length > 0) {
                this.log(`Applying crawl delay: ${robotRules.crawlDelay}ms`, Level.LOG);
                await new Promise(resolve => setTimeout(resolve, robotRules.crawlDelay!));
              }

              await page.goto(url, {
                waitUntil: 'domcontentloaded',
                timeout: 30000
              }).catch((err) => {
                throw new Error(`Navigation failed: ${err.message}`);
              });

              await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});

              const pageResult = await scrapePageContent(url);
              pageResult.metadata.depth = depth;
              crawlResults.push(pageResult);

              this.log(`✓ Scraped ${url} (${pageResult.wordCount} words, depth ${depth})`, Level.LOG);

              if (crawlConfig.followLinks && depth < crawlConfig.maxDepth) {
                const newLinks = await extractLinksFromPage();
                let addedCount = 0;

                for (const link of newLinks) {
                  const normalized = normalizeUrl(link);

                  if (!visitedUrls.has(normalized) &&
                    isUrlAllowedByConfig(link) &&
                    isUrlAllowedByRobots(link)) {
                    visitedUrls.add(normalized);
                    crawlQueue.push({ url: link, depth: depth + 1 });
                    addedCount++;
                  }
                }

                if (addedCount > 0) {
                  this.log(`Added ${addedCount} new URLs to queue at depth ${depth + 1}`, Level.LOG);
                }
              }

            } catch (error) {
              this.log(`Failed to crawl ${url}: ${error.message}`, Level.WARN);
              crawlResults.push({
                metadata: {
                  url: url,
                  sourceURL: url,
                  depth: depth
                },
                error: error.message,
                scrapedAt: new Date().toISOString()
              });
            }
          }

          this.log(`Crawl completed: ${crawlResults.length} pages scraped (${processedCount} URLs processed, ${visitedUrls.size} URLs discovered)`, Level.LOG);

          const actionType = "crawl";
          const actionName = "Crawl Results";

          if (!this.serializableDataByType[actionType]) {
            this.serializableDataByType[actionType] = {};
          }
          if (!this.serializableDataByType[actionType][actionName]) {
            this.serializableDataByType[actionType][actionName] = [];
          }

          this.serializableDataByType[actionType][actionName] = crawlResults;

          await this.options.serializableCallback({
            scrapeList: this.serializableDataByType.scrapeList || {},
            scrapeSchema: this.serializableDataByType.scrapeSchema || {},
            crawl: this.serializableDataByType.crawl || {},
            search: this.serializableDataByType.search || {}
          });

        } catch (error) {
          this.log(`Crawl action failed: ${error.message}`, Level.ERROR);
          throw new Error(`Crawl execution error: ${error.message}`);
        }
      },
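
      // Hypothetical crawl step configuration, staying on one domain and
      // honouring robots.txt:
      //   {
      //     action: 'crawl',
      //     args: [{ mode: 'domain', limit: 50, maxDepth: 2, includePaths: [],
      //              excludePaths: ['/login'], useSitemap: true,
      //              followLinks: true, respectRobots: true }],
      //   }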

      search: async (searchConfig: {
        query: string;
        limit: number;
        provider?: 'duckduckgo';
        filters?: {
          timeRange?: 'day' | 'week' | 'month' | 'year';
        };
        mode: 'discover' | 'scrape';
      }) => {
        if (this.isAborted) {
          this.log('Workflow aborted, stopping search', Level.WARN);
          return;
        }

        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('search');
        }

        searchConfig.provider = 'duckduckgo';

        this.log(`Performing DuckDuckGo search for: ${searchConfig.query}`, Level.LOG);

        try {
          let searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(searchConfig.query)}`;

          if (searchConfig.filters?.timeRange) {
            const timeMap: Record<string, string> = {
              'day': 'd',
              'week': 'w',
              'month': 'm',
              'year': 'y'
            };
            searchUrl += `&df=${timeMap[searchConfig.filters.timeRange]}`;
          }

          const initialDelay = 500 + Math.random() * 1000;
          await new Promise(resolve => setTimeout(resolve, initialDelay));

          await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });

          await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
            this.log('Load state timeout, continuing anyway', Level.WARN);
          });

          const pageLoadDelay = 2000 + Math.random() * 1500;
          await new Promise(resolve => setTimeout(resolve, pageLoadDelay));

          let searchResults: any[] = [];
          let retryCount = 0;
          const maxRetries = 2;

          while (searchResults.length === 0 && retryCount <= maxRetries) {
            if (retryCount > 0) {
              this.log(`Retry attempt ${retryCount}/${maxRetries} for DuckDuckGo search...`, Level.LOG);
              const retryDelay = 1000 * Math.pow(2, retryCount) + Math.random() * 1000;
              await new Promise(resolve => setTimeout(resolve, retryDelay));
            }

            this.log('Attempting to extract DuckDuckGo search results...', Level.LOG);

            await page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
              this.log('DuckDuckGo results not found on initial wait', Level.WARN);
            });

            let currentResultCount = 0;
            const maxLoadAttempts = Math.ceil(searchConfig.limit / 10) * 2;
            let loadAttempts = 0;
            let noNewResultsCount = 0;

            while (currentResultCount < searchConfig.limit && loadAttempts < maxLoadAttempts && noNewResultsCount < 3) {
              const previousCount = currentResultCount;

              currentResultCount = await page.evaluate(() => {
                const selectors = [
                  '[data-testid="result"]',
                  'article[data-testid="result"]',
                  'li[data-layout="organic"]',
                  '.result',
                  'article[data-testid]'
                ];

                for (const selector of selectors) {
                  const elements = document.querySelectorAll(selector);
                  if (elements.length > 0) {
                    return elements.length;
                  }
                }
                return 0;
              });

              if (currentResultCount >= searchConfig.limit) {
                this.log(`Reached desired result count: ${currentResultCount}`, Level.LOG);
                break;
              }

              if (currentResultCount === previousCount) {
                noNewResultsCount++;
                this.log(`No new results after load more (attempt ${noNewResultsCount}/3)`, Level.WARN);
                if (noNewResultsCount >= 3) break;
              } else {
                noNewResultsCount = 0;
                this.log(`Current results count: ${currentResultCount}/${searchConfig.limit}`, Level.LOG);
              }

              await page.evaluate(() => {
                window.scrollTo(0, document.body.scrollHeight);
              });

              await new Promise(resolve => setTimeout(resolve, 800));

              const loadMoreClicked = await page.evaluate(() => {
                const selectors = [
                  '#more-results',
                  'button:has-text("More results")',
                  'button:has-text("more results")',
                  'button[id*="more"]',
                  'button:has-text("Load more")'
                ];

                for (const selector of selectors) {
                  try {
                    const button = document.querySelector(selector) as HTMLButtonElement;
                    if (button && button.offsetParent !== null) {
                      button.click();
                      console.log(`Clicked load more button with selector: ${selector}`);
                      return true;
                    }
                  } catch (e) {
                    continue;
                  }
                }
                return false;
              });

              if (loadMoreClicked) {
                this.log('Clicked "More results" button', Level.LOG);
                await new Promise(resolve => setTimeout(resolve, 1500 + Math.random() * 1000));
              } else {
                this.log('No "More results" button found, results may be limited', Level.WARN);
                break;
              }

              loadAttempts++;
            }

            this.log(`Finished pagination. Total results available: ${currentResultCount}`, Level.LOG);

            searchResults = await page.evaluate((limit: number) => {
              const results: any[] = [];

              const cleanDescription = (text: string): string => {
                if (!text) return '';
                let cleaned = text.replace(/^\d+\s+(second|minute|hour|day|week|month|year)s?\s+ago\s*/i, '');
                cleaned = cleaned.replace(/^[A-Z][a-z]{2}\s+\d{1,2},?\s+\d{4}\s*[—\-]\s*/i, '');
                cleaned = cleaned.replace(/^\d{4}-\d{2}-\d{2}\s*[—\-]\s*/i, '');
                cleaned = cleaned.trim().replace(/\s+/g, ' ');
                return cleaned;
              };

              const selectors = [
                '[data-testid="result"]',
                'article[data-testid="result"]',
                'li[data-layout="organic"]',
                '.result',
                'article[data-testid]'
              ];
              let allElements: Element[] = [];

              for (const selector of selectors) {
                const elements = Array.from(document.querySelectorAll(selector));
                if (elements.length > 0) {
                  console.log(`Found ${elements.length} DDG elements with: ${selector}`);
                  allElements = elements;
                  break;
                }
              }

              for (let i = 0; i < Math.min(allElements.length, limit); i++) {
                const element = allElements[i];

                const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');

                let linkEl = titleEl?.querySelector('a[href]') as HTMLAnchorElement;
                if (!linkEl) {
                  linkEl = element.querySelector('a[href]') as HTMLAnchorElement;
                }

                if (!linkEl || !linkEl.href) continue;

                let actualUrl = linkEl.href;

                if (actualUrl.includes('uddg=')) {
                  try {
                    const urlParams = new URLSearchParams(actualUrl.split('?')[1]);
                    const uddgUrl = urlParams.get('uddg');
                    if (uddgUrl) {
                      actualUrl = decodeURIComponent(uddgUrl);
                    }
                  } catch (e) {
                    console.log('Failed to parse uddg parameter:', e);
                  }
                }

                if (actualUrl.includes('duckduckgo.com')) {
                  console.log(`Skipping DDG internal URL: ${actualUrl}`);
                  continue;
                }

                const descEl = element.querySelector('[data-result="snippet"], .result__snippet, [data-testid="result-snippet"]');

                if (titleEl && titleEl.textContent && actualUrl) {
                  const rawDescription = (descEl?.textContent || '').trim();
                  const cleanedDescription = cleanDescription(rawDescription);

                  results.push({
                    url: actualUrl,
                    title: titleEl.textContent.trim(),
                    description: cleanedDescription,
                    position: results.length + 1
                  });
                }
              }

              console.log(`Extracted ${results.length} DuckDuckGo search results`);
              return results;
            }, searchConfig.limit);

            if (searchResults.length === 0) {
              this.log(`No DuckDuckGo results found (attempt ${retryCount + 1}/${maxRetries + 1})`, Level.WARN);
              retryCount++;
            } else {
              this.log(`Successfully extracted ${searchResults.length} results`, Level.LOG);
              break;
            }
          }

          this.log(`Search found ${searchResults.length} results`, Level.LOG);

          if (searchConfig.mode === 'discover') {
            const actionType = "search";
            const actionName = "Search Results";

            if (!this.serializableDataByType[actionType]) {
              this.serializableDataByType[actionType] = {};
            }
            if (!this.serializableDataByType[actionType][actionName]) {
              this.serializableDataByType[actionType][actionName] = {};
            }

            const searchData = {
              query: searchConfig.query,
              provider: searchConfig.provider,
              filters: searchConfig.filters || {},
              resultsCount: searchResults.length,
              results: searchResults,
              searchedAt: new Date().toISOString()
            };

            this.serializableDataByType[actionType][actionName] = searchData;

            await this.options.serializableCallback({
              scrapeList: this.serializableDataByType.scrapeList || {},
              scrapeSchema: this.serializableDataByType.scrapeSchema || {},
              crawl: this.serializableDataByType.crawl || {},
              search: this.serializableDataByType.search || {}
            });

            this.log(`Search completed in discover mode with ${searchResults.length} results`, Level.LOG);
            return;
          }

          this.log(`Starting to scrape content from ${searchResults.length} search results...`, Level.LOG);
          const scrapedResults = [];

          for (let i = 0; i < searchResults.length; i++) {
            const result = searchResults[i];
            try {
              this.log(`[${i + 1}/${searchResults.length}] Scraping: ${result.url}`, Level.LOG);

              await page.goto(result.url, {
                waitUntil: 'domcontentloaded',
                timeout: 30000
              }).catch(() => {
                this.log(`Failed to navigate to ${result.url}, skipping...`, Level.WARN);
              });

              await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {});

              const pageData = await page.evaluate(() => {
                const getMeta = (name: string) => {
                  const meta = document.querySelector(`meta[name="${name}"], meta[property="${name}"]`);
                  return meta?.getAttribute('content') || '';
                };

                const getAllMeta = () => {
                  const metadata: Record<string, string> = {};
                  const metaTags = document.querySelectorAll('meta');
                  metaTags.forEach(tag => {
                    const name = tag.getAttribute('name') || tag.getAttribute('property');
                    const content = tag.getAttribute('content');
                    if (name && content) {
                      metadata[name] = content;
                    }
                  });
                  return metadata;
                };

                const title = document.title || '';
                const bodyText = document.body?.innerText || '';

                const elementsWithMxId = document.querySelectorAll('[data-mx-id]');
                elementsWithMxId.forEach(el => el.removeAttribute('data-mx-id'));

                const html = document.documentElement.outerHTML;
                const links = Array.from(document.querySelectorAll('a')).map(a => a.href);
                const allMetadata = getAllMeta();

                return {
                  title,
                  description: getMeta('description'),
                  text: bodyText,
                  html: html,
                  links: links,
                  wordCount: bodyText.split(/\s+/).filter(w => w.length > 0).length,
                  metadata: {
                    ...allMetadata,
                    title,
                    language: document.documentElement.lang || '',
                    favicon: (document.querySelector('link[rel="icon"], link[rel="shortcut icon"]') as HTMLLinkElement)?.href || '',
                    statusCode: 200
                  }
                };
              });

              scrapedResults.push({
                searchResult: {
                  query: searchConfig.query,
                  position: result.position,
                  searchTitle: result.title,
                  searchDescription: result.description,
                },
                metadata: {
                  ...pageData.metadata,
                  url: result.url,
                  sourceURL: result.url
                },
                html: pageData.html,
                text: pageData.text,
                links: pageData.links,
                wordCount: pageData.wordCount,
                scrapedAt: new Date().toISOString()
              });

              this.log(`✓ Scraped ${result.url} (${pageData.wordCount} words)`, Level.LOG);

            } catch (error) {
              this.log(`Failed to scrape ${result.url}: ${error.message}`, Level.WARN);
              scrapedResults.push({
                searchResult: {
                  query: searchConfig.query,
                  position: result.position,
                  searchTitle: result.title,
                  searchDescription: result.description,
                },
                url: result.url,
                error: error.message,
                scrapedAt: new Date().toISOString()
              });
            }
          }

          this.log(`Successfully scraped ${scrapedResults.length} search results`, Level.LOG);

          const actionType = "search";
          const actionName = "Search Results";

          if (!this.serializableDataByType[actionType]) {
            this.serializableDataByType[actionType] = {};
          }
          if (!this.serializableDataByType[actionType][actionName]) {
            this.serializableDataByType[actionType][actionName] = {};
          }

          const searchData = {
            query: searchConfig.query,
            provider: searchConfig.provider,
            filters: searchConfig.filters || {},
            mode: searchConfig.mode,
            resultsCount: scrapedResults.length,
            results: scrapedResults,
            searchedAt: new Date().toISOString()
          };

          this.serializableDataByType[actionType][actionName] = searchData;

          await this.options.serializableCallback({
            scrapeList: this.serializableDataByType.scrapeList || {},
            scrapeSchema: this.serializableDataByType.scrapeSchema || {},
            crawl: this.serializableDataByType.crawl || {},
            search: this.serializableDataByType.search || {}
          });

        } catch (error) {
          this.log(`Search action failed: ${error.message}`, Level.ERROR);
          throw new Error(`Search execution error: ${error.message}`);
        }
      },
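
      // Hypothetical search step: discover mode returns only URL/title/snippet
      // triples, while scrape mode additionally visits and scrapes each hit:
      //   { action: 'search', args: [{ query: 'site reliability', limit: 10,
      //                                mode: 'discover' }] }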

      flag: async () => new Promise((res) => {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType('flag');
        }

        this.emit('flag', page, res);
      }),
    };

    const executeAction = async (invokee: any, methodName: string, args: any) => {
      console.log("Executing action:", methodName, args);

      if (methodName === 'press' || methodName === 'type') {
        // Extract only the first two arguments for these methods
        const limitedArgs = Array.isArray(args) ? args.slice(0, 2) : [args];
        await (<any>invokee[methodName])(...limitedArgs);
        return;
      }

      if (!args || Array.isArray(args)) {
        await (<any>invokee[methodName])(...(args ?? []));
      } else {
        await (<any>invokee[methodName])(args);
      }
    };
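
    // Dot notation in `step.action` walks the Page object before invoking:
    // e.g. the step { action: 'keyboard.press', args: ['Enter'] } resolves
    // `invokee` to `page.keyboard` and calls `page.keyboard.press('Enter')`.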

    for (const step of steps) {
      if (this.isAborted) {
        this.log('Workflow aborted during step execution', Level.WARN);
        return;
      }

      this.log(`Launching ${String(step.action)}`, Level.LOG);

      let stepName: string | null = null;
      try {
        const debug = this.options.debugChannel;
        if (debug?.setActionType) {
          debug.setActionType(String(step.action));
        }

        stepName = (step as any)?.name || String(step.action);

        if (debug && typeof (debug as any).setActionName === "function") {
          (debug as any).setActionName(stepName);
        }
      } catch (err) {
        this.log(`Failed to set action name/type: ${(err as Error).message}`, Level.WARN);
      }

      if (step.action in wawActions) {
        // "Arrayifying" here should not be needed (TS and the syntax checker allow
        // only arrays), but it is cheap insurance against malformed workflows.
        const params = !step.args || Array.isArray(step.args) ? step.args : [step.args];
        if (step.action === 'screenshot') {
          await (wawActions.screenshot as any)(...(params ?? []), stepName ?? undefined);
        } else if (step.action === 'scrapeList' || step.action === 'scrapeSchema') {
          const actionName = (step as any).name || "";
          await wawActions[step.action as CustomFunctions](...(params ?? []), actionName);
        } else {
          await wawActions[step.action as CustomFunctions](...(params ?? []));
        }
      } else {
        if (this.options.debugChannel?.setActionType) {
          this.options.debugChannel.setActionType(String(step.action));
        }

        // Implements the dot notation for the "method name" in the workflow
        const levels = String(step.action).split('.');
        const methodName = levels[levels.length - 1];

        let invokee: any = page;
        for (const level of levels.splice(0, levels.length - 1)) {
          invokee = invokee[level];
        }

        if (methodName === 'waitForLoadState') {
          try {
            let args = step.args;

            if (Array.isArray(args) && args.length === 1) {
              args = [args[0], { timeout: 30000 }];
            } else if (!Array.isArray(args)) {
              args = [args, { timeout: 30000 }];
            }
            // Pass the timeout-augmented args, not the raw step.args.
            await executeAction(invokee, methodName, args);
          } catch (error) {
            await executeAction(invokee, methodName, 'domcontentloaded');
          }
        } else if (methodName === 'click') {
          try {
            await executeAction(invokee, methodName, step.args);
          } catch (error) {
            try {
              await executeAction(invokee, methodName, [step.args[0], { force: true }]);
            } catch (error) {
              this.log(`Click action failed: ${error.message}`, Level.WARN);
              continue;
            }
          }
        } else {
          try {
            await executeAction(invokee, methodName, step.args);
          } catch (error) {
            this.log(`Action ${methodName} failed: ${error.message}`, Level.ERROR);
            // Continue with next action instead of crashing
            continue;
          }
        }
      }

      await new Promise((res) => { setTimeout(res, 500); });
    }
  }

  private async handlePagination(page: Page, config: {
    listSelector: string,
    fields: any,
    limit?: number,
    pagination: any
  }, providedActionName: string = "") {
    if (this.isAborted) {
      this.log('Workflow aborted, stopping pagination', Level.WARN);
      return [];
    }

    const actionType = "scrapeList";
    let actionName = providedActionName || "";
    if (!actionName || actionName.trim() === "") {
      this.scrapeListCounter++;
      actionName = `List ${this.scrapeListCounter}`;
    }

    if (!this.serializableDataByType[actionType]) {
      this.serializableDataByType[actionType] = {};
    }
    if (!this.serializableDataByType[actionType][actionName]) {
      this.serializableDataByType[actionType][actionName] = [];
    }

    let allResults: Record<string, any>[] = [];
    let previousHeight = 0;
    let scrapedItems: Set<string> = new Set<string>();
    let visitedUrls: Set<string> = new Set<string>();
    const MAX_RETRIES = 3;
    const RETRY_DELAY = 1000;
    const MAX_UNCHANGED_RESULTS = 5;

    const debugLog = (message: string, ...args: any[]) => {
      console.log(`[Page ${visitedUrls.size}] [URL: ${page.url()}] ${message}`, ...args);
    };

    const scrapeCurrentPage = async () => {
      if (this.isAborted) {
        debugLog("Workflow aborted, stopping scrapeCurrentPage");
        return;
      }

      const evaluationPromise = page.evaluate((cfg) => window.scrapeList(cfg), config);
      const timeoutPromise = new Promise<any[]>((_, reject) =>
        setTimeout(() => reject(new Error('Page evaluation timeout')), 10000)
      );

      let results;
      try {
        results = await Promise.race([evaluationPromise, timeoutPromise]);
      } catch (error) {
        debugLog(`Page evaluation failed: ${error.message}`);
        return;
      }
      const newResults = results.filter(item => {
        const uniqueKey = JSON.stringify(item);
        if (scrapedItems.has(uniqueKey)) return false;
        scrapedItems.add(uniqueKey);
        return true;
      });
      allResults = allResults.concat(newResults);
      debugLog("Results collected:", allResults.length);

      this.serializableDataByType[actionType][actionName] = [...allResults];
      await this.options.serializableCallback({
        scrapeList: this.serializableDataByType.scrapeList,
        scrapeSchema: this.serializableDataByType.scrapeSchema,
        crawl: this.serializableDataByType.crawl || {},
        search: this.serializableDataByType.search || {}
      });
    };
|
|
|
|
const checkLimit = () => {
|
|
if (config.limit && allResults.length >= config.limit) {
|
|
allResults = allResults.slice(0, config.limit);
|
|
return true;
|
|
}
|
|
return false;
|
|
};
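
    // The detection below is heuristic. Examples (illustrative):
    // '//ul/li[contains(@class, "item")]' is treated as XPath, while
    // 'ul > li.item' falls through to the CSS branch.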
    // Helper function to detect if a selector is XPath
    const isXPathSelector = (selector: string): boolean => {
      return selector.startsWith('//') ||
        selector.startsWith('/') ||
        selector.startsWith('./') ||
        selector.includes('contains(@') ||
        selector.includes('[count(') ||
        selector.includes('@class=') ||
        selector.includes('@id=') ||
        selector.includes(' and ') ||
        selector.includes(' or ');
    };

    // Helper function to wait for a selector (CSS or XPath)
    const waitForSelectorUniversal = async (selector: string, options: any = {}): Promise<ElementHandle | null> => {
      try {
        if (isXPathSelector(selector)) {
          // Use an XPath locator
          const locator = page.locator(`xpath=${selector}`);
          await locator.waitFor({
            state: 'attached',
            timeout: options.timeout || 10000
          });
          return await locator.elementHandle();
        }
        // Use a CSS selector
        return await page.waitForSelector(selector, {
          state: 'attached',
          timeout: options.timeout || 10000
        });
      } catch (error) {
        return null;
      }
    };
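
    // Both branches wait for the element to be attached and resolve to an
    // ElementHandle, or null on timeout, so callers can fall back to the
    // next candidate selector.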
    // Enhanced button finder with a retry mechanism
    const findWorkingButton = async (selectors: string[]): Promise<{
      button: ElementHandle | null,
      workingSelector: string | null,
      updatedSelectors: string[]
    }> => {
      const startTime = Date.now();
      const MAX_BUTTON_SEARCH_TIME = 15000;
      let updatedSelectors = [...selectors];

      for (let i = 0; i < selectors.length; i++) {
        if (Date.now() - startTime > MAX_BUTTON_SEARCH_TIME) {
          debugLog(`Button search timeout reached (${MAX_BUTTON_SEARCH_TIME}ms), aborting`);
          break;
        }
        const selector = selectors[i];
        let retryCount = 0;
        let selectorSuccess = false;

        while (retryCount < MAX_RETRIES && !selectorSuccess) {
          try {
            const button = await waitForSelectorUniversal(selector, { timeout: 2000 });

            if (button) {
              debugLog('Found working selector:', selector);
              return {
                button,
                workingSelector: selector,
                updatedSelectors
              };
            }

            retryCount++;
            debugLog(`Selector "${selector}" not found: attempt ${retryCount}/${MAX_RETRIES}`);

            if (retryCount < MAX_RETRIES) {
              await page.waitForTimeout(RETRY_DELAY);
            } else {
              debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
              updatedSelectors = updatedSelectors.filter(s => s !== selector);
              selectorSuccess = true;
            }
          } catch (error) {
            retryCount++;
            debugLog(`Selector "${selector}" error: attempt ${retryCount}/${MAX_RETRIES} - ${error.message}`);

            if (retryCount < MAX_RETRIES) {
              await page.waitForTimeout(RETRY_DELAY);
            } else {
              debugLog(`Removing failed selector "${selector}" after ${MAX_RETRIES} attempts`);
              updatedSelectors = updatedSelectors.filter(s => s !== selector);
              selectorSuccess = true;
            }
          }
        }
      }

      return {
        button: null,
        workingSelector: null,
        updatedSelectors
      };
    };

    const retryOperation = async (operation: () => Promise<boolean>, retryCount = 0): Promise<boolean> => {
      try {
        return await operation();
      } catch (error) {
        if (retryCount < MAX_RETRIES) {
          debugLog(`Retrying operation. Attempt ${retryCount + 1} of ${MAX_RETRIES}`);
          await page.waitForTimeout(RETRY_DELAY);
          return retryOperation(operation, retryCount + 1);
        }
        debugLog(`Operation failed after ${MAX_RETRIES} retries`);
        return false;
      }
    };
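
    // A typical pagination config coming in from the workflow might look like
    // (illustrative values only):
    //   { type: 'clickNext', selector: 'a.next, li.pagination-next > a' }
    // Comma-separated selectors are tried in order and pruned by
    // findWorkingButton as they fail.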
    let availableSelectors = config.pagination.selector.split(',');
    let unchangedResultCounter = 0;

    try {
      while (true) {
        if (this.isAborted) {
          this.log('Workflow aborted during pagination loop', Level.WARN);
          return allResults;
        }

        switch (config.pagination.type) {
          case 'scrollDown': {
            const previousResultCount = allResults.length;

            await scrapeCurrentPage();

            if (checkLimit()) {
              return allResults;
            }

            await page.evaluate(() => {
              const scrollHeight = Math.max(
                document.body.scrollHeight,
                document.documentElement.scrollHeight
              );

              window.scrollTo(0, scrollHeight);
            });
            await page.waitForTimeout(2000);

            const currentHeight = await page.evaluate(() => {
              return Math.max(
                document.body.scrollHeight,
                document.documentElement.scrollHeight
              );
            });
            const currentResultCount = allResults.length;

            if (currentResultCount === previousResultCount) {
              unchangedResultCounter++;
              if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
                return allResults;
              }
            } else {
              unchangedResultCounter = 0;
            }

            if (currentHeight === previousHeight) {
              return allResults;
            }

            previousHeight = currentHeight;
            break;
          }

          case 'scrollUp': {
            const previousResultCount = allResults.length;

            await scrapeCurrentPage();

            if (checkLimit()) {
              return allResults;
            }

            await page.evaluate(() => window.scrollTo(0, 0));
            await page.waitForTimeout(2000);

            const currentTopHeight = await page.evaluate(() => document.documentElement.scrollTop);
            const currentResultCount = allResults.length;

            if (currentResultCount === previousResultCount) {
              unchangedResultCounter++;
              if (unchangedResultCounter >= MAX_UNCHANGED_RESULTS) {
                return allResults;
              }
            } else {
              unchangedResultCounter = 0;
            }

            if (currentTopHeight === 0) {
              return allResults;
            }

            previousHeight = currentTopHeight;
            break;
          }

          case 'clickNext': {
            const currentUrl = page.url();
            visitedUrls.add(currentUrl);

            await scrapeCurrentPage();
            if (checkLimit()) return allResults;

            const { button, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);

            availableSelectors = updatedSelectors;

            if (!button || !workingSelector) {
              // Final retry for navigation when no selectors work
              const success = await retryOperation(async () => {
                try {
                  await page.evaluate(() => window.history.forward());
                  const newUrl = page.url();
                  return !visitedUrls.has(newUrl);
                } catch {
                  return false;
                }
              });

              if (!success) return allResults;
              break;
            }

            let retryCount = 0;
            let paginationSuccess = false;

            // Capture a basic content signature before the click, with XPath support
            const captureContentSignature = async () => {
              return await page.evaluate((listSelector) => {
                const isXPath = (selector: string) => {
                  return selector.startsWith('//') || selector.startsWith('./') || selector.includes('::');
                };

                let items: NodeListOf<Element> | Element[] = [];

                if (isXPath(listSelector)) {
                  try {
                    // Use XPath to find elements
                    const xpathResult = document.evaluate(
                      listSelector,
                      document,
                      null,
                      XPathResult.ORDERED_NODE_SNAPSHOT_TYPE,
                      null
                    );

                    items = [];
                    for (let i = 0; i < xpathResult.snapshotLength; i++) {
                      const node = xpathResult.snapshotItem(i);
                      if (node && node.nodeType === Node.ELEMENT_NODE) {
                        items.push(node as Element);
                      }
                    }
                  } catch (xpathError) {
                    console.warn('XPath evaluation failed, trying CSS selector as fallback:', xpathError);
                    // Fall back to a CSS selector
                    try {
                      items = document.querySelectorAll(listSelector);
                    } catch (cssError) {
                      console.warn('CSS selector fallback also failed:', cssError);
                      items = [];
                    }
                  }
                } else {
                  try {
                    // Use a CSS selector
                    items = document.querySelectorAll(listSelector);
                  } catch (cssError) {
                    console.warn('CSS selector failed:', cssError);
                    items = [];
                  }
                }

                return {
                  url: window.location.href,
                  itemCount: items.length,
                  firstItems: Array.from(items).slice(0, 3).map(el => el.textContent || '').join('|')
                };
              }, config.listSelector);
            };

            const beforeSignature = await captureContentSignature();
            debugLog(`Before click: ${beforeSignature.itemCount} items`);

            while (retryCount < MAX_RETRIES && !paginationSuccess) {
              try {
                try {
                  await Promise.all([
                    page.waitForNavigation({
                      waitUntil: 'networkidle',
                      timeout: 15000
                    }),
                    page.locator(workingSelector).first().click()
                  ]);
                  debugLog("Navigation successful after regular click");
                  await page.waitForTimeout(2000);
                  paginationSuccess = true;
                } catch (navError) {
                  debugLog("Regular click with navigation failed, trying dispatch event with navigation");
                  try {
                    await Promise.all([
                      page.waitForNavigation({
                        waitUntil: 'networkidle',
                        timeout: 15000
                      }),
                      page.locator(workingSelector).first().dispatchEvent('click')
                    ]);
                    debugLog("Navigation successful after dispatch event");
                    await page.waitForTimeout(2000);
                    paginationSuccess = true;
                  } catch (dispatchNavError) {
                    // Last resort: click without waiting for a navigation at all.
                    try {
                      await page.locator(workingSelector).first().click();
                      await page.waitForTimeout(2000);
                    } catch (clickError) {
                      await page.locator(workingSelector).first().dispatchEvent('click');
                      await page.waitForTimeout(2000);
                    }
                  }
                }

                await page.waitForLoadState('networkidle', { timeout: 5000 }).catch(() => {});

                if (!paginationSuccess) {
                  const newUrl = page.url();
                  const afterSignature = await captureContentSignature();

                  if (newUrl !== currentUrl) {
                    debugLog(`URL changed to ${newUrl}`);
                    visitedUrls.add(newUrl);
                    paginationSuccess = true;
                  } else if (afterSignature.firstItems !== beforeSignature.firstItems) {
                    debugLog("Content changed without URL change");
                    paginationSuccess = true;
                  } else if (afterSignature.itemCount !== beforeSignature.itemCount) {
                    debugLog(`Item count changed from ${beforeSignature.itemCount} to ${afterSignature.itemCount}`);
                    paginationSuccess = true;
                  }
                }
              } catch (error) {
                debugLog(`Pagination attempt ${retryCount + 1} failed: ${error.message}`);
              }

              if (!paginationSuccess) {
                retryCount++;
                if (retryCount < MAX_RETRIES) {
                  debugLog(`Retrying pagination - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
                  await page.waitForTimeout(RETRY_DELAY);
                }
              }
            }

            if (!paginationSuccess) {
              debugLog(`Pagination failed after ${MAX_RETRIES} attempts`);
              return allResults;
            }

            break;
          }

          case 'clickLoadMore': {
            await scrapeCurrentPage();
            if (checkLimit()) return allResults;

            let loadMoreCounter = 0;
            let previousResultCount = allResults.length;
            let noNewItemsCounter = 0;
            const MAX_NO_NEW_ITEMS = 5;

            while (true) {
              if (this.isAborted) {
                this.log('Workflow aborted during pagination loop', Level.WARN);
                return allResults;
              }

              // Find a working button with the retry mechanism
              const { button: loadMoreButton, workingSelector, updatedSelectors } = await findWorkingButton(availableSelectors);

              availableSelectors = updatedSelectors;

              if (!workingSelector || !loadMoreButton) {
                debugLog('No working Load More selector found after retries');
                return allResults;
              }

              // Retry mechanism for clicking the button
              let retryCount = 0;
              let clickSuccess = false;

              while (retryCount < MAX_RETRIES && !clickSuccess) {
                try {
                  try {
                    await loadMoreButton.click();
                    clickSuccess = true;
                  } catch (error) {
                    debugLog(`Regular click failed on attempt ${retryCount + 1}. Trying dispatchEvent`);

                    // If the regular click fails, try dispatchEvent
                    try {
                      await loadMoreButton.dispatchEvent('click');
                      clickSuccess = true;
                    } catch (dispatchError) {
                      debugLog(`DispatchEvent failed on attempt ${retryCount + 1}.`);
                      throw dispatchError; // Propagate the error to trigger a retry
                    }
                  }

                  if (clickSuccess) {
                    await page.waitForTimeout(1000);
                    loadMoreCounter++;
                    debugLog(`Successfully clicked Load More button (${loadMoreCounter} times)`);
                  }
                } catch (error) {
                  debugLog(`Click attempt ${retryCount + 1} failed completely.`);
                  retryCount++;

                  if (retryCount < MAX_RETRIES) {
                    debugLog(`Retrying click - attempt ${retryCount + 1} of ${MAX_RETRIES}`);
                    await page.waitForTimeout(RETRY_DELAY);
                  }
                }
              }

              if (!clickSuccess) {
                debugLog(`Load More clicking failed after ${MAX_RETRIES} attempts`);
                return allResults;
              }

              // Wait for content to load and check the scroll height
              await page.waitForTimeout(2000);
              await page.evaluate(() => {
                const scrollHeight = Math.max(
                  document.body.scrollHeight,
                  document.documentElement.scrollHeight
                );

                window.scrollTo(0, scrollHeight);
              });
              await page.waitForTimeout(2000);

              const currentHeight = await page.evaluate(() => {
                return Math.max(
                  document.body.scrollHeight,
                  document.documentElement.scrollHeight
                );
              });
              const heightChanged = currentHeight !== previousHeight;
              previousHeight = currentHeight;

              await scrapeCurrentPage();

              const currentResultCount = allResults.length;
              const newItemsAdded = currentResultCount > previousResultCount;

              if (!newItemsAdded) {
                noNewItemsCounter++;
                debugLog(`No new items added after click (${noNewItemsCounter}/${MAX_NO_NEW_ITEMS})`);

                if (noNewItemsCounter >= MAX_NO_NEW_ITEMS) {
                  debugLog(`Stopping after ${MAX_NO_NEW_ITEMS} clicks with no new items`);
                  return allResults;
                }
              } else {
                noNewItemsCounter = 0;
                previousResultCount = currentResultCount;
              }

              if (checkLimit()) return allResults;

              if (!heightChanged) {
                debugLog('No more items loaded after Load More');
                return allResults;
              }
            }
          }

          default: {
            await scrapeCurrentPage();
            return allResults;
          }
        }

        if (checkLimit()) break;
      }
    } catch (error) {
      debugLog(`Fatal error: ${error.message}`);
      return allResults;
    }

    return allResults;
  }
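
  /**
   * Walks the workflow from the last step backwards and returns the index of
   * the first step whose `where` clause matches the given page state, or
   * undefined when nothing matches. The matching predicate itself lives in
   * `applicable`.
   */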
  private getMatchingActionId(workflow: Workflow, pageState: PageState, usedActions: string[]) {
    for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
      const step = workflow[actionId];
      const isApplicable = this.applicable(step.where, pageState, usedActions);
      console.log("-------------------------------------------------------------");
      console.log(`Where:`, step.where);
      console.log(`Page state:`, pageState);
      console.log(`Match result: ${isApplicable}`);
      console.log("-------------------------------------------------------------");

      if (isApplicable) {
        return actionId;
      }
    }
  }
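
  /**
   * Removes selectors that use the shadow-DOM piercing combinator ('>>')
   * from every step's `where.selectors`.
   */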
  private removeShadowSelectors(workflow: Workflow) {
    for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
      const step = workflow[actionId];

      // Check if the step has a where clause with selectors
      if (step.where && Array.isArray(step.where.selectors)) {
        // Filter out selectors that contain ">>"
        step.where.selectors = step.where.selectors.filter(selector => !selector.includes('>>'));
      }
    }

    return workflow;
  }
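
  /**
   * Removes selectors containing either ':>>' or '>>' from every step's
   * `where.selectors`. (The '>>' check alone already covers ':>>'; the
   * explicit pair is kept for readability.)
   */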
  private removeSpecialSelectors(workflow: Workflow) {
    for (let actionId = workflow.length - 1; actionId >= 0; actionId--) {
      const step = workflow[actionId];

      if (step.where && Array.isArray(step.where.selectors)) {
        // Filter out a selector if it has EITHER ":>>" OR ">>"
        step.where.selectors = step.where.selectors.filter(selector =>
          !(selector.includes(':>>') || selector.includes('>>'))
        );
      }
    }

    return workflow;
  }
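
  /**
   * Core execution loop: repeatedly takes the last remaining step of the
   * workflow copy, runs its `what` actions via `carryOutSteps`, removes the
   * step, and reports progress, until the copy is empty, the page closes,
   * the interpreter is stopped, or a circuit breaker trips.
   */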
  private async runLoop(p: Page, workflow: Workflow) {
    if (this.isAborted) {
      this.log('Workflow aborted in runLoop', Level.WARN);
      return;
    }

    let workflowCopy: Workflow = JSON.parse(JSON.stringify(workflow));

    workflowCopy = this.removeSpecialSelectors(workflowCopy);

    const usedActions: string[] = [];
    let selectors: string[] = [];
    let lastAction = null;
    let actionId = -1;
    let repeatCount = 0;

    /**
     * Enables the interpreter functionality for popup windows.
     * User-requested concurrency should be entirely managed by the concurrency manager,
     * e.g. via `enqueueLinks`.
     */
    const popupHandler = (popup: Page) => {
      this.concurrency.addJob(() => this.runLoop(popup, workflowCopy));
    };
    p.on('popup', popupHandler);

    /* eslint no-constant-condition: ["warn", { "checkLoops": false }] */
    let loopIterations = 0;
    const MAX_LOOP_ITERATIONS = 1000; // Circuit breaker

    // Cleanup function to remove the popup listener
    const cleanup = () => {
      try {
        if (!p.isClosed()) {
          p.removeListener('popup', popupHandler);
        }
      } catch (cleanupError) {
        // The page may already be gone; nothing left to clean up.
      }
    };

    while (true) {
      if (this.isAborted) {
        this.log('Workflow aborted during step execution', Level.WARN);
        cleanup();
        return;
      }

      // Circuit breaker to prevent infinite loops
      if (++loopIterations > MAX_LOOP_ITERATIONS) {
        this.log('Maximum loop iterations reached, terminating to prevent infinite loop', Level.ERROR);
        cleanup();
        return;
      }

      // Checks whether the page was closed from outside,
      // or the workflow execution has been stopped via `interpreter.stop()`
      if (p.isClosed() || !this.stopper) {
        cleanup();
        return;
      }

      try {
        await p.waitForLoadState();
      } catch (e) {
        cleanup();
        await p.close();
        return;
      }

      if (workflowCopy.length === 0) {
        this.log('All actions completed. Workflow finished.', Level.LOG);
        cleanup();
        return;
      }

      // State-based matching is currently disabled in favor of sequential
      // execution; the original logic is kept below for reference.
      // let pageState = {};
      // try {
      //   // Check if the page is still valid before accessing its state
      //   if (p.isClosed()) {
      //     this.log('Page was closed during execution', Level.WARN);
      //     return;
      //   }

      //   pageState = await this.getState(p, workflowCopy, selectors);
      //   selectors = [];
      //   console.log("Empty selectors:", selectors)
      // } catch (e: any) {
      //   this.log(`Failed to get page state: ${e.message}`, Level.ERROR);
      //   // If state access fails, attempt graceful recovery
      //   if (p.isClosed()) {
      //     this.log('Browser has been closed, terminating workflow', Level.WARN);
      //     return;
      //   }
      //   // For other errors, continue with an empty state to avoid complete failure
      //   pageState = { url: p.url(), selectors: [], cookies: {} };
      // }

      // if (this.options.debug) {
      //   this.log(`Current state is: \n${JSON.stringify(pageState, null, 2)}`, Level.WARN);
      // }

      // const actionId = workflow.findIndex((step) => {
      //   const isApplicable = this.applicable(step.where, pageState, usedActions);
      //   console.log("-------------------------------------------------------------");
      //   console.log(`Where:`, step.where);
      //   console.log(`Page state:`, pageState);
      //   console.log(`Match result: ${isApplicable}`);
      //   console.log("-------------------------------------------------------------");
      //   return isApplicable;
      // });

      // actionId = this.getMatchingActionId(workflowCopy, pageState, usedActions);

      const actionId = workflowCopy.length - 1;
      const action = workflowCopy[actionId];

      console.log("MATCHED ACTION:", action);
      console.log("MATCHED ACTION ID:", actionId);
      this.log(`Matched ${JSON.stringify(action?.where)}`, Level.LOG);

      if (action) { // an action is matched
        if (this.options.debugChannel?.activeId) {
          this.options.debugChannel.activeId(actionId);
        }

        repeatCount = action === lastAction ? repeatCount + 1 : 0;

        console.log("REPEAT COUNT", repeatCount);
        if (this.options.maxRepeats && repeatCount > this.options.maxRepeats) {
          return;
        }
        lastAction = action;

        if (this.isAborted) {
          this.log('Workflow aborted before action execution', Level.WARN);
          return;
        }

        try {
          console.log("Carrying out:", action.what);
          await this.carryOutSteps(p, action.what);
          usedActions.push(action.id ?? 'undefined');

          workflowCopy.splice(actionId, 1);
          console.log(`Action with ID ${action.id} removed from the workflow copy.`);

          this.executedActions++;
          const percentage = Math.round((this.executedActions / this.totalActions) * 100);

          if (this.options.debugChannel?.progressUpdate) {
            this.options.debugChannel.progressUpdate(
              this.executedActions,
              this.totalActions,
              percentage
            );
          }

          // const newSelectors = this.getPreviousSelectors(workflow, actionId);
          // const newSelectors = this.getSelectors(workflowCopy);
          // newSelectors.forEach(selector => {
          //   if (!selectors.includes(selector)) {
          //     selectors.push(selector);
          //   }
          // });

          // Reset the loop iteration counter on a successful action
          loopIterations = 0;
        } catch (e) {
          this.log(<Error>e, Level.ERROR);
          // Don't crash on individual action failures; continue with the next iteration
          continue;
        }
      } else {
        // await this.disableAdBlocker(p);
        cleanup();
        return;
      }
    }
  }
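
  /**
   * Verifies that the browser-side scraping helpers (window.scrape,
   * scrapeSchema, scrapeList, scrapeListAuto, scrollDown, scrollUp) are
   * present, and re-registers scraper.js as an init script if any are missing
   * or the check itself times out. Note that addInitScript only affects
   * documents created after the call, not the currently loaded one.
   */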
  private async ensureScriptsLoaded(page: Page) {
    try {
      const evaluationPromise = page.evaluate(() =>
        typeof window.scrape === 'function' &&
        typeof window.scrapeSchema === 'function' &&
        typeof window.scrapeList === 'function' &&
        typeof window.scrapeListAuto === 'function' &&
        typeof window.scrollDown === 'function' &&
        typeof window.scrollUp === 'function'
      );

      const timeoutPromise = new Promise<boolean>((_, reject) =>
        setTimeout(() => reject(new Error('Script check timeout')), 3000)
      );

      const isScriptLoaded = await Promise.race([
        evaluationPromise,
        timeoutPromise
      ]);

      if (!isScriptLoaded) {
        await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
      }
    } catch (error) {
      this.log(`Script check failed, adding script anyway: ${error.message}`, Level.WARN);
      try {
        await page.addInitScript({ path: path.join(__dirname, 'browserSide', 'scraper.js') });
      } catch (scriptError) {
        this.log(`Failed to add script: ${scriptError.message}`, Level.ERROR);
      }
    }
  }
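
  /*
   * Typical usage (illustrative sketch only; assumes a Playwright browser and
   * a `workflowFile` object that satisfies the WorkflowFile schema):
   *
   *   const browser = await chromium.launch();
   *   const page = await browser.newPage();
   *   const interpreter = new Interpreter(workflowFile, {
   *     serializableCallback: (data) => console.log(data),
   *   });
   *   await interpreter.run(page);
   *   await interpreter.cleanup();
   *   await browser.close();
   */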
  /**
   * Runs the given workflow on the provided page.
   * \
   * Resolves after the playback is finished.
   * @param {Page} page Page to run the workflow on.
   * @param {ParamType} [params] Workflow-specific set of parameters
   * for the `{$param: nameofparam}` fields.
   */
  public async run(page: Page, params?: ParamType): Promise<void> {
    this.log('Starting the workflow.', Level.LOG);
    const context = page.context();

    page.setDefaultNavigationTimeout(100000);

    // Check proxy settings from the context options
    const contextOptions = (context as any)._options;
    const hasProxy = !!contextOptions?.proxy;

    this.log(`Proxy settings: ${hasProxy ? 'Proxy is configured...' : 'No proxy configured...'}`);

    if (hasProxy && contextOptions.proxy.username) {
      this.log(`Proxy authenticated...`);
    }

    if (this.stopper) {
      throw new Error('This Interpreter is already running a workflow. To run another workflow, please, spawn another Interpreter.');
    }

    /**
     * `this.workflow` with the parameters initialized.
     */
    this.initializedWorkflow = Preprocessor.initWorkflow(this.workflow, params);

    this.totalActions = this.initializedWorkflow.length;
    this.executedActions = 0;

    if (this.options.debugChannel?.progressUpdate) {
      this.options.debugChannel.progressUpdate(0, this.totalActions, 0);
    }

    await this.ensureScriptsLoaded(page);

    this.stopper = () => {
      this.stopper = null;
    };

    this.concurrency.addJob(() => this.runLoop(page, this.initializedWorkflow!));

    await this.concurrency.waitForCompletion();

    this.stopper = null;
  }
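
  /**
   * Stops the currently running workflow by invoking and clearing the
   * stopper. Throws if no workflow is running.
   */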
  public async stop(): Promise<void> {
    if (this.stopper) {
      await this.stopper();
      this.stopper = null;
    } else {
      throw new Error('Cannot stop, there is no running workflow!');
    }
  }

  /**
   * Cleanup method to release resources and prevent memory leaks.
   * Call this when the interpreter is no longer needed.
   */
  public async cleanup(): Promise<void> {
    try {
      // Stop any running workflow first
      if (this.stopper) {
        try {
          await this.stop();
        } catch (error: any) {
          this.log(`Error stopping workflow during cleanup: ${error.message}`, Level.WARN);
        }
      }

      // Clear accumulated data to free memory
      this.cumulativeResults = [];
      this.namedResults = {};
      this.serializableDataByType = { scrapeList: {}, scrapeSchema: {}, crawl: {}, search: {} };

      // Reset state
      this.isAborted = false;
      this.initializedWorkflow = null;

      this.log('Interpreter cleanup completed', Level.DEBUG);
    } catch (error: any) {
      this.log(`Error during interpreter cleanup: ${error.message}`, Level.ERROR);
      throw error;
    }
  }
}