Files
parcer/server/src/api/sdk.ts

716 lines
22 KiB
TypeScript
Raw Normal View History

/**
* SDK API Routes
* Separate API endpoints specifically for Maxun SDKs
* All routes require API key authentication
*/
import { Router, Request, Response } from 'express';
import { requireAPIKey } from "../middlewares/api";
import Robot from "../models/Robot";
import Run from "../models/Run";
import { v4 as uuid } from 'uuid';
import { WorkflowFile } from "maxun-core";
import logger from "../logger";
import { capture } from "../utils/analytics";
import { handleRunRecording } from "./record";
import { WorkflowEnricher } from "../sdk/workflowEnricher";
import { cancelScheduledWorkflow, scheduleWorkflow } from '../storage/schedule';
import { computeNextRun } from "../utils/schedule";
import moment from 'moment-timezone';
/**
 * Express request as seen by the SDK route handlers: the standard `Request`
 * plus an optional `user` property, which the handlers read (e.g.
 * `req.user.id`) once the `requireAPIKey` middleware has run.
 */
interface AuthenticatedRequest extends Request {
  // NOTE(review): presumably populated by requireAPIKey — confirm in ../middlewares/api.
  user?: any;
}

/** Router on which every SDK endpoint in this file is registered. */
const router = Router();
/**
 * Create a new robot programmatically
 * POST /api/sdk/robots
 *
 * Body: a workflow file `{ meta, workflow }`; `meta.name` is mandatory.
 * `meta.type` defaults to 'extract'.
 *  - type 'scrape': no workflow pairs are stored and `meta.url` is required.
 *  - any other type: the workflow is passed through
 *    `WorkflowEnricher.enrichWorkflow`, which supplies both the stored pairs
 *    and the robot URL.
 *
 * Responses: 201 `{ data, message }` on success, 400 on validation problems,
 * 500 `{ error, message }` when anything throws.
 */
router.post("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const payload: WorkflowFile = req.body;
    const meta = payload.meta as any;

    // Structural checks on the submitted workflow file.
    if (!meta || !payload.workflow) {
      return res.status(400).json({
        error: "Invalid workflow structure. Expected { meta, workflow }"
      });
    }
    if (!meta.name) {
      return res.status(400).json({
        error: "Robot name is required in meta.name"
      });
    }

    const robotType = meta.type || 'extract';
    let pairs: any[];
    let targetUrl: string | undefined;

    if (robotType !== 'scrape') {
      // Non-scrape robots: validate/enrich the submitted workflow.
      const enrichment = await WorkflowEnricher.enrichWorkflow(payload.workflow);
      if (!enrichment.success) {
        return res.status(400).json({
          error: "Workflow validation failed",
          details: enrichment.errors
        });
      }
      pairs = enrichment.workflow!;
      targetUrl = enrichment.url;
    } else {
      // Scrape robots carry no workflow pairs, only a target URL.
      pairs = [];
      targetUrl = meta.url;
      if (!targetUrl) {
        return res.status(400).json({
          error: "URL is required for scrape robots"
        });
      }
    }

    // The row id and the public meta id are two independent UUIDs.
    const rowId = uuid();
    const publicId = uuid();

    const recordingMeta: any = {
      name: meta.name,
      id: publicId,
      createdAt: new Date().toISOString(),
      updatedAt: new Date().toISOString(),
      pairs: pairs.length,
      params: [],
      type: robotType,
      url: targetUrl,
      formats: meta.formats || [],
    };

    const created = await Robot.create({
      id: rowId,
      userId: req.user.id,
      recording_meta: recordingMeta,
      recording: { workflow: pairs }
    });

    capture("maxun-oss-robot-created", {
      robot_meta: created.recording_meta,
      recording: created.recording,
    });

    return res.status(201).json({
      data: created,
      message: "Robot created successfully"
    });
  } catch (error: any) {
    logger.error("[SDK] Error creating robot:", error);
    return res.status(500).json({
      error: "Failed to create robot",
      message: error.message
    });
  }
});
/**
 * List all robots for the authenticated user
 * GET /api/sdk/robots
 *
 * Responses: 200 `{ data: Robot[] }`, 500 `{ error, message }` on failure.
 */
router.get("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    // Scope the query to the caller. `userId` is the column the create
    // routes in this file populate from `req.user.id`; an unfiltered
    // findAll() would return every user's robots.
    const robots = await Robot.findAll({
      where: {
        userId: req.user.id
      }
    });
    return res.status(200).json({
      data: robots
    });
  } catch (error: any) {
    logger.error("[SDK] Error listing robots:", error);
    return res.status(500).json({
      error: "Failed to list robots",
      message: error.message
    });
  }
});
/**
 * Get a specific robot by ID
 * GET /api/sdk/robots/:id
 *
 * `:id` is matched against `recording_meta.id`.
 * Responses: 200 `{ data }`, 404 when no robot matches, 500 on failure.
 */
router.get("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const { id: metaId } = req.params;
    const found = await Robot.findOne({
      where: { 'recording_meta.id': metaId }
    });

    return found
      ? res.status(200).json({ data: found })
      : res.status(404).json({ error: "Robot not found" });
  } catch (error: any) {
    logger.error("[SDK] Error getting robot:", error);
    return res.status(500).json({
      error: "Failed to get robot",
      message: error.message
    });
  }
});
/**
 * Update a robot
 * PUT /api/sdk/robots/:id
 *
 * `:id` is matched against `recording_meta.id`. Every body field is optional:
 *  - `workflow`: replaces `recording.workflow`.
 *  - `meta`: shallow-merged into `recording_meta`; `id` is kept and
 *    `updatedAt` is refreshed.
 *  - `google_sheet_email`, `google_sheet_name`, `airtable_base_id`,
 *    `airtable_table_name`, `webhooks`, `proxy_url`, `proxy_username`,
 *    `proxy_password`: copied verbatim when present.
 *  - `schedule`: `null` cancels the schedule; an object
 *    `{ runEvery, runEveryUnit, timezone, startFrom?, dayOfMonth?,
 *    atTimeStart?, atTimeEnd? }` is validated, converted into a 5-field cron
 *    expression, registered with `scheduleWorkflow` and stored.
 *
 * Responses: 200 `{ data, message }`, 400 on invalid schedule input,
 * 404 when the robot does not exist, 500 on any other failure.
 */
router.put("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const robotId = req.params.id;
    // A missing body is treated as "no changes" instead of failing on property access.
    const updates = req.body ?? {};

    const robot = await Robot.findOne({
      where: {
        'recording_meta.id': robotId
      }
    });
    if (!robot) {
      return res.status(404).json({
        error: "Robot not found"
      });
    }

    // Best-effort cancellation: a failure is logged and does not abort the update.
    const cancelExistingSchedule = async (): Promise<void> => {
      try {
        await cancelScheduledWorkflow(robotId);
      } catch (cancelError) {
        logger.warn(`[SDK] Failed to cancel existing schedule for robot ${robotId}: ${cancelError}`);
      }
    };

    const updateData: any = {};

    if (updates.workflow) {
      updateData.recording = {
        workflow: updates.workflow
      };
    }

    if (updates.meta) {
      updateData.recording_meta = {
        ...robot.recording_meta,
        ...updates.meta,
        // Every lookup in this file keys on recording_meta.id, so a client
        // must not be able to overwrite it through `meta`.
        id: robot.recording_meta.id,
        updatedAt: new Date().toISOString()
      };
    }

    // Fields that are stored exactly as supplied.
    const passthroughFields = [
      'google_sheet_email',
      'google_sheet_name',
      'airtable_base_id',
      'airtable_table_name',
      'webhooks',
      'proxy_url',
      'proxy_username',
      'proxy_password',
    ];
    for (const field of passthroughFields) {
      if (updates[field] !== undefined) {
        updateData[field] = updates[field];
      }
    }

    if (updates.schedule !== undefined) {
      if (updates.schedule === null) {
        await cancelExistingSchedule();
        updateData.schedule = null;
      } else {
        const {
          runEvery,
          runEveryUnit,
          timezone,
          startFrom = 'SUNDAY',
          dayOfMonth = 1,
          atTimeStart = '00:00',
          atTimeEnd = '23:59'
        } = updates.schedule;

        if (!runEvery || !runEveryUnit || !timezone) {
          return res.status(400).json({
            error: "Missing required schedule parameters: runEvery, runEveryUnit, timezone"
          });
        }

        // runEvery is interpolated into the cron string, so it must be a
        // plain positive integer (numeric strings are accepted).
        const every = Number(runEvery);
        if (!Number.isInteger(every) || every < 1) {
          return res.status(400).json({
            error: "Invalid runEvery. Must be a positive integer"
          });
        }

        if (!moment.tz.zone(timezone)) {
          return res.status(400).json({
            error: "Invalid timezone"
          });
        }

        // Both bounds must be "HH:MM" strings with whole-number parts in range.
        const isClockTime = (hours?: number, minutes?: number): boolean =>
          hours !== undefined && minutes !== undefined &&
          Number.isInteger(hours) && Number.isInteger(minutes) &&
          hours >= 0 && hours <= 23 && minutes >= 0 && minutes <= 59;

        if (typeof atTimeStart !== 'string' || typeof atTimeEnd !== 'string') {
          return res.status(400).json({ error: 'Invalid time format. Expected HH:MM (e.g., 09:30)' });
        }
        const [startHours, startMinutes] = atTimeStart.split(':').map(Number);
        const [endHours, endMinutes] = atTimeEnd.split(':').map(Number);
        if (!isClockTime(startHours, startMinutes) || !isClockTime(endHours, endMinutes)) {
          return res.status(400).json({ error: 'Invalid time format. Expected HH:MM (e.g., 09:30)' });
        }

        const days = ['SUNDAY', 'MONDAY', 'TUESDAY', 'WEDNESDAY', 'THURSDAY', 'FRIDAY', 'SATURDAY'];
        if (!days.includes(startFrom)) {
          return res.status(400).json({ error: 'Invalid startFrom day. Must be one of: SUNDAY, MONDAY, TUESDAY, WEDNESDAY, THURSDAY, FRIDAY, SATURDAY' });
        }
        const dayIndex = days.indexOf(startFrom);

        // Cron layout: minute hour day-of-month month day-of-week.
        let cronExpression: string;
        switch (runEveryUnit) {
          case 'MINUTES':
            cronExpression = `*/${every} * * * *`;
            break;
          case 'HOURS':
            cronExpression = `${startMinutes} */${every} * * *`;
            break;
          case 'DAYS':
            cronExpression = `${startMinutes} ${startHours} */${every} * *`;
            break;
          case 'WEEKS':
            // Runs weekly on `startFrom`; `runEvery` does not appear in the expression.
            cronExpression = `${startMinutes} ${startHours} * * ${dayIndex}`;
            break;
          case 'MONTHS': {
            const monthDay = Number(dayOfMonth);
            if (!Number.isInteger(monthDay) || monthDay < 1 || monthDay > 31) {
              return res.status(400).json({
                error: "Invalid dayOfMonth. Must be an integer between 1 and 31"
              });
            }
            // Always exactly five fields. Appending the weekday index after
            // the day-of-week `*` would create a sixth field and shift the
            // meaning of every other field.
            cronExpression = `${startMinutes} ${startHours} ${monthDay} */${every} *`;
            break;
          }
          default:
            return res.status(400).json({
              error: "Invalid runEveryUnit. Must be one of: MINUTES, HOURS, DAYS, WEEKS, MONTHS"
            });
        }

        // Replace any previously registered schedule for this robot.
        await cancelExistingSchedule();
        try {
          await scheduleWorkflow(robotId, cronExpression, timezone);
        } catch (scheduleError: any) {
          logger.error(`[SDK] Failed to schedule workflow for robot ${robotId}: ${scheduleError.message}`);
          return res.status(500).json({
            error: "Failed to schedule workflow",
            message: scheduleError.message
          });
        }

        const nextRunAt = computeNextRun(cronExpression, timezone);
        // The schedule is stored with the values exactly as the client sent them.
        updateData.schedule = {
          runEvery,
          runEveryUnit,
          timezone,
          startFrom,
          dayOfMonth,
          atTimeStart,
          atTimeEnd,
          cronExpression,
          lastRunAt: undefined,
          nextRunAt: nextRunAt || undefined,
        };
        logger.info(`[SDK] Scheduled robot ${robotId} with cron: ${cronExpression} in timezone: ${timezone}`);
      }
    }

    await robot.update(updateData);
    logger.info(`[SDK] Robot updated: ${robotId}`);
    return res.status(200).json({
      data: robot,
      message: "Robot updated successfully"
    });
  } catch (error: any) {
    logger.error("[SDK] Error updating robot:", error);
    return res.status(500).json({
      error: "Failed to update robot",
      message: error.message
    });
  }
});
/**
 * Delete a robot
 * DELETE /api/sdk/robots/:id
 *
 * `:id` is matched against `recording_meta.id`. Cancels the robot's schedule
 * (best effort), deletes the Run rows whose `robotMetaId` equals the robot's
 * meta id, then deletes the robot itself.
 *
 * Responses: 200 `{ message }`, 404 when the robot does not exist,
 * 500 `{ error, message }` on failure.
 */
router.delete("/sdk/robots/:id", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const robotId = req.params.id;
    const robot = await Robot.findOne({
      where: {
        'recording_meta.id': robotId
      }
    });
    if (!robot) {
      return res.status(404).json({
        error: "Robot not found"
      });
    }

    // Schedules are registered under the robot id by the update route, so
    // cancel before deleting or the scheduled job would outlive its robot.
    // A failure here is logged and does not block the deletion.
    try {
      await cancelScheduledWorkflow(robotId);
    } catch (cancelError) {
      logger.warn(`[SDK] Failed to cancel existing schedule for robot ${robotId}: ${cancelError}`);
    }

    await Run.destroy({
      where: {
        robotMetaId: robot.recording_meta.id
      }
    });
    await robot.destroy();
    logger.info(`[SDK] Robot deleted: ${robotId}`);

    capture('maxun-oss-robot-deleted', {
      robotId: robotId,
      user_id: req.user?.id,
      deleted_at: new Date().toISOString(),
    });

    return res.status(200).json({
      message: "Robot deleted successfully"
    });
  } catch (error: any) {
    logger.error("[SDK] Error deleting robot:", error);
    return res.status(500).json({
      error: "Failed to delete robot",
      message: error.message
    });
  }
});
/**
 * Execute a robot
 * POST /api/sdk/robots/:id/execute
 *
 * Starts a run through `handleRunRecording`, then blocks until
 * `waitForRunCompletion` resolves (run status 'success') or throws.
 *
 * Responses: 200 `{ data: { runId, status, data: { textData, listData },
 * screenshots } }`; 500 `{ error, message }` when the run cannot be started,
 * fails, is aborted, times out, or anything else throws.
 */
router.post("/sdk/robots/:id/execute", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const user = req.user;
    const robotId = req.params.id;
    logger.info(`[SDK] Starting execution for robot ${robotId}`);

    const runId = await handleRunRecording(robotId, user.id.toString());
    if (!runId) {
      throw new Error('Failed to start robot execution');
    }

    // Use the helper's default polling interval. Its second parameter is the
    // delay in milliseconds, so the user id must not be passed there.
    const run = await waitForRunCompletion(runId);

    // scrapeList may arrive as { scrapeList: [...] }, as a bare array, or as
    // an object whose first value is the array; normalise to a flat array.
    let listData: any[] = [];
    if (run.serializableOutput?.scrapeList) {
      const scrapeList: any = run.serializableOutput.scrapeList;
      if (scrapeList.scrapeList && Array.isArray(scrapeList.scrapeList)) {
        listData = scrapeList.scrapeList;
      } else if (Array.isArray(scrapeList)) {
        listData = scrapeList;
      } else if (typeof scrapeList === 'object') {
        const listValues = Object.values(scrapeList);
        if (listValues.length > 0 && Array.isArray(listValues[0])) {
          listData = listValues[0] as any[];
        }
      }
    }

    return res.status(200).json({
      data: {
        runId: run.runId,
        status: run.status,
        data: {
          textData: run.serializableOutput?.scrapeSchema || {},
          listData: listData
        },
        screenshots: Object.values(run.binaryOutput || {})
      }
    });
  } catch (error: any) {
    logger.error("[SDK] Error executing robot:", error);
    return res.status(500).json({
      error: "Failed to execute robot",
      message: error.message
    });
  }
});
/**
 * Wait for run completion
 *
 * Polls the Run row identified by `runId` until its status is 'success'.
 *
 * @param runId    value matched against the Run `runId` column.
 * @param interval delay between polls in milliseconds (default 2000). Any
 *                 value that is not a finite positive number is replaced by
 *                 the default, so a mistyped argument cannot turn the loop
 *                 into a near-zero-delay poll.
 * @returns the run's `toJSON()` representation once it has succeeded.
 * @throws Error when the run is missing, has status 'failed' or 'aborted',
 *         or has not succeeded within 3 hours.
 */
async function waitForRunCompletion(runId: string, interval: number = 2000) {
  const MAX_WAIT_TIME = 180 * 60 * 1000;
  const DEFAULT_INTERVAL = 2000;

  const pollDelay =
    typeof interval === 'number' && Number.isFinite(interval) && interval > 0
      ? interval
      : DEFAULT_INTERVAL;

  const startTime = Date.now();
  while (true) {
    if (Date.now() - startTime > MAX_WAIT_TIME) {
      throw new Error('Run completion timeout after 3 hours');
    }

    const run = await Run.findOne({ where: { runId } });
    if (!run) throw new Error('Run not found');

    if (run.status === 'success') {
      return run.toJSON();
    } else if (run.status === 'failed') {
      throw new Error('Run failed');
    } else if (run.status === 'aborted') {
      throw new Error('Run was aborted');
    }

    // Any other status means the run is still in progress: wait, then poll again.
    await new Promise(resolve => setTimeout(resolve, pollDelay));
  }
}
/**
 * Get all runs for a robot
 * GET /api/sdk/robots/:id/runs
 *
 * `:id` is matched against `recording_meta.id`; runs are selected by
 * `robotMetaId` and returned newest first (`startedAt` descending).
 * Responses: 200 `{ data: Run[] }`, 404 when the robot does not exist,
 * 500 `{ error, message }` on failure.
 */
router.get("/sdk/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const { id: metaId } = req.params;

    const owner = await Robot.findOne({
      where: { 'recording_meta.id': metaId }
    });
    if (!owner) {
      return res.status(404).json({ error: "Robot not found" });
    }

    const history = await Run.findAll({
      where: { robotMetaId: owner.recording_meta.id },
      order: [['startedAt', 'DESC']]
    });

    return res.status(200).json({ data: history });
  } catch (error: any) {
    logger.error("[SDK] Error getting runs:", error);
    return res.status(500).json({
      error: "Failed to get runs",
      message: error.message
    });
  }
});
/**
 * Get a specific run
 * GET /api/sdk/robots/:id/runs/:runId
 *
 * The run must match both `:runId` and the robot's meta id, so a run that
 * belongs to a different robot yields 404.
 * Responses: 200 `{ data }`, 404 when the robot or the run is missing,
 * 500 `{ error, message }` on failure.
 */
router.get("/sdk/robots/:id/runs/:runId", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const { id: metaId, runId: requestedRunId } = req.params;

    const owner = await Robot.findOne({
      where: { 'recording_meta.id': metaId }
    });
    if (!owner) {
      return res.status(404).json({ error: "Robot not found" });
    }

    const match = await Run.findOne({
      where: {
        runId: requestedRunId,
        robotMetaId: owner.recording_meta.id
      }
    });

    return match
      ? res.status(200).json({ data: match })
      : res.status(404).json({ error: "Run not found" });
  } catch (error: any) {
    logger.error("[SDK] Error getting run:", error);
    return res.status(500).json({
      error: "Failed to get run",
      message: error.message
    });
  }
});
/**
 * Abort a running execution
 * POST /api/sdk/robots/:id/runs/:runId/abort
 *
 * Only runs whose status is 'running' or 'queued' can be aborted; the handler
 * sets the run's status to 'aborted' and returns the updated run.
 * Responses: 200 `{ message, data }`, 400 `{ error, currentStatus }` for any
 * other status, 404 when the robot or the run is missing, 500 on failure.
 */
router.post("/sdk/robots/:id/runs/:runId/abort", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    const { id: metaId, runId: targetRunId } = req.params;

    const owner = await Robot.findOne({
      where: { 'recording_meta.id': metaId }
    });
    if (!owner) {
      return res.status(404).json({ error: "Robot not found" });
    }

    const target = await Run.findOne({
      where: {
        runId: targetRunId,
        robotMetaId: owner.recording_meta.id
      }
    });
    if (!target) {
      return res.status(404).json({ error: "Run not found" });
    }

    const abortable = target.status === 'running' || target.status === 'queued';
    if (!abortable) {
      return res.status(400).json({
        error: "Run is not in a state that can be aborted",
        currentStatus: target.status
      });
    }

    await target.update({ status: 'aborted' });
    logger.info(`[SDK] Run ${targetRunId} marked for abortion`);

    return res.status(200).json({
      message: "Run abortion initiated",
      data: target
    });
  } catch (error: any) {
    logger.error("[SDK] Error aborting run:", error);
    return res.status(500).json({
      error: "Failed to abort run",
      message: error.message
    });
  }
});
/**
 * LLM-based extraction - generate workflow from natural language prompt
 * POST /api/sdk/extract/llm
 *
 * Body: `{ url, prompt, llmProvider?, llmModel?, llmApiKey?, llmBaseUrl?,
 * robotName? }`. The workflow is produced by
 * `WorkflowEnricher.generateWorkflowFromPrompt` and stored as a new robot of
 * type 'extract' owned by the calling user. When `robotName` is absent the
 * name is built from the first 50 characters of the prompt.
 *
 * Responses: 200 `{ success, data: { robotId, name, description, url,
 * workflow } }`, 400 when `url`/`prompt` is missing or generation fails,
 * 500 `{ error, message }` when anything throws.
 */
router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
  try {
    // Keep the whole user object: `user.id` below is the robot's owner id.
    const user = req.user;
    // A missing body falls through to the 400 below instead of throwing.
    const { url, prompt, llmProvider, llmModel, llmApiKey, llmBaseUrl, robotName } = req.body ?? {};

    if (!url || !prompt) {
      return res.status(400).json({
        error: "URL and prompt are required"
      });
    }

    const workflowResult = await WorkflowEnricher.generateWorkflowFromPrompt(url, prompt, {
      provider: llmProvider,
      model: llmModel,
      apiKey: llmApiKey,
      baseUrl: llmBaseUrl
    });
    if (!workflowResult.success || !workflowResult.workflow) {
      return res.status(400).json({
        error: "Failed to generate workflow from prompt",
        details: workflowResult.errors
      });
    }

    // The row id and the public meta id are two independent UUIDs; only the
    // meta id is returned to the caller.
    const robotId = uuid();
    const metaId = uuid();

    const robotMeta: any = {
      name: robotName || `LLM Extract: ${prompt.substring(0, 50)}`,
      id: metaId,
      createdAt: new Date().toISOString(),
      updatedAt: new Date().toISOString(),
      pairs: workflowResult.workflow.length,
      params: [],
      type: 'extract',
      url: workflowResult.url,
    };

    const robot = await Robot.create({
      id: robotId,
      userId: user.id,
      recording_meta: robotMeta,
      recording: {
        workflow: workflowResult.workflow
      },
    });
    logger.info(`[SDK] Persistent robot created: ${metaId} for LLM extraction`);

    capture("maxun-oss-robot-created", {
      robot_meta: robot.recording_meta,
      recording: robot.recording,
    });

    return res.status(200).json({
      success: true,
      data: {
        robotId: metaId,
        name: robotMeta.name,
        description: prompt,
        url: workflowResult.url,
        workflow: workflowResult.workflow
      }
    });
  } catch (error: any) {
    logger.error("[SDK] Error in LLM extraction:", error);
    return res.status(500).json({
      error: "Failed to perform LLM extraction",
      message: error.message
    });
  }
});
export default router;