diff --git a/server/src/api/record.ts b/server/src/api/record.ts index 29d1f261..fd7376ab 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -18,6 +18,7 @@ import { WorkflowFile } from "maxun-core"; import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { sendWebhook } from "../routes/webhook"; +import { convertPageToMarkdown } from '../markdownify/scrape'; chromium.use(stealthPlugin()); @@ -344,7 +345,8 @@ function formatRunResponse(run: any) { runByAPI: run.runByAPI, data: { textData: {}, - listData: {} + listData: {}, + markdown: '' }, screenshots: [] as any[], }; @@ -359,6 +361,10 @@ function formatRunResponse(run: any) { formattedRun.data.listData = output.scrapeList; } + if (output.markdown && Array.isArray(output.markdown)) { + formattedRun.data.markdown = output.markdown[0]?.content || ''; + } + if (run.binaryOutput) { Object.keys(run.binaryOutput).forEach(key => { if (run.binaryOutput[key]) { @@ -651,6 +657,105 @@ async function executeRun(id: string, userId: string) { }; } + if (recording.recording_meta.type === 'markdown') { + logger.log('info', `Executing markdown robot for API run ${id}`); + + await run.update({ + status: 'running', + log: 'Converting page to markdown' + }); + + try { + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + const markdown = await convertPageToMarkdown(url); + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: 'Markdown conversion completed successfully', + serializableOutput: { + markdown: [{ content: markdown }] + }, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for API run ${id}`); + + try { + const completionData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`); + } + + const webhookPayload = { + robot_id: plainRun.robotMetaId, + run_id: plainRun.runId, + robot_name: recording.recording_meta.name, + status: 'success', + started_at: plainRun.startedAt, + finished_at: new Date().toLocaleString(), + markdown: markdown, + metadata: { + browser_id: plainRun.browserId, + user_id: userId, + } + }; + + try { + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`); + } catch (webhookError: any) { + logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + return { + success: true, + interpretationInfo: run.toJSON() + }; + } catch (error: any) { + logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `Markdown conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + throw error; + } + } + plainRun.status = 'running'; browser = browserPool.getRemoteBrowser(plainRun.browserId); @@ -889,12 +994,11 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) { if (!run) throw new Error('Run not found'); if (run.status === 'success') { - return run.toJSON(); + return run; } else if (run.status === 'failed') { throw new Error('Run failed'); } - // Wait for the next polling interval await new Promise(resolve => setTimeout(resolve, interval)); } } diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts index eae9438e..5acbdf13 100644 --- a/server/src/models/Robot.ts +++ b/server/src/models/Robot.ts @@ -9,6 +9,8 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; + type?: 'traditional' | 'markdown'; + url?: string; } interface RobotWorkflow { diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index b9f41100..b2d5bdb3 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -20,6 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme import { io as serverIo } from "./server"; import { sendWebhook } from './routes/webhook'; import { BinaryOutputService } from './storage/mino'; +import { convertPageToMarkdown } from './markdownify/scrape'; if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) { throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.'); @@ -183,11 +184,103 @@ async function processRunExecution(job: Job) { try { // Find the recording const recording = await Robot.findOne({ where: { 'recording_meta.id': plainRun.robotMetaId }, raw: true }); - + if (!recording) { throw new Error(`Recording for run ${data.runId} not found`); } - + + if (recording.recording_meta.type === 'markdown') { + logger.log('info', `Executing markdown robot for run ${data.runId}`); + + await run.update({ + status: 'running', + log: 'Converting page to markdown' + }); + + try { + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + const markdown = await convertPageToMarkdown(url); + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: 'Markdown conversion completed successfully', + serializableOutput: { + markdown: [{ content: markdown }] + }, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for run ${data.runId}`); + + try { + const completionData = { + runId: data.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(browserId).emit('run-completed', completionData); + serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`); + } + + try { + const webhookPayload = { + runId: data.runId, + robotId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString(), + markdown: markdown + }; + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`); + } catch (webhookError: any) { + logger.log('warn', `Failed to send webhooks for markdown robot run ${data.runId}: ${webhookError.message}`); + } + + await destroyRemoteBrowser(browserId, data.userId); + + return { success: true }; + } catch (error: any) { + logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `Markdown conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: data.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(browserId).emit('run-completed', failureData); + serverIo.of('/queued-run').to(`user-${data.userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-failed notification for markdown robot run ${data.runId}: ${socketError.message}`); + } + + await destroyRemoteBrowser(browserId, data.userId); + + throw error; + } + } + const isRunAborted = async (): Promise => { try { const currentRun = await Run.findOne({ where: { runId: data.runId } }); diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index 89872d6a..ee23ee44 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -274,7 +274,10 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r } if (targetUrl) { + robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl }); + const updatedWorkflow = [...robot.recording.workflow]; + let foundGoto = false; for (let i = updatedWorkflow.length - 1; i >= 0; i--) { const step = updatedWorkflow[i]; @@ -289,6 +292,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r robot.set('recording', { ...robot.recording, workflow: updatedWorkflow }); robot.changed('recording', true); + foundGoto = true; i = -1; break; } @@ -331,10 +335,11 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r } }; - if (name) { + if (name || targetUrl) { updates.recording_meta = { ...robot.recording_meta, - name + ...(name && { name }), + ...(targetUrl && { url: targetUrl }) }; } @@ -432,6 +437,78 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate } }); +/** + * POST endpoint for creating a markdown robot + */ +router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => { + try { + const { url, name } = req.body; + + if (!url) { + return res.status(400).json({ error: 'The "url" field is required.' }); + } + + if (!req.user) { + return res.status(401).send({ error: 'Unauthorized' }); + } + + // Validate URL format + try { + new URL(url); + } catch (err) { + return res.status(400).json({ error: 'Invalid URL format' }); + } + + const robotName = name || `Markdown Robot - ${new URL(url).hostname}`; + const currentTimestamp = new Date().toLocaleString(); + const robotId = uuid(); + + const newRobot = await Robot.create({ + id: uuid(), + userId: req.user.id, + recording_meta: { + name: robotName, + id: robotId, + createdAt: currentTimestamp, + updatedAt: currentTimestamp, + pairs: 0, + params: [], + type: 'markdown', + url: url, + }, + recording: { workflow: [] }, + google_sheet_email: null, + google_sheet_name: null, + google_sheet_id: null, + google_access_token: null, + google_refresh_token: null, + schedule: null, + }); + + logger.log('info', `Markdown robot created with id: ${newRobot.id}`); + capture( + 'maxun-oss-markdown-robot-created', + { + robot_meta: newRobot.recording_meta, + url: url, + } + ); + + return res.status(201).json({ + message: 'Markdown robot created successfully.', + robot: newRobot, + }); + } catch (error) { + if (error instanceof Error) { + logger.log('error', `Error creating markdown robot: ${error.message}`); + return res.status(500).json({ error: error.message }); + } else { + logger.log('error', 'Unknown error creating markdown robot'); + return res.status(500).json({ error: 'An unknown error occurred.' }); + } + } +}); + /** * DELETE endpoint for deleting a recording from the storage. */ diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 899cb7f6..7c2cb408 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -15,6 +15,7 @@ import { WorkflowFile } from "maxun-core"; import { Page } from "playwright"; import { sendWebhook } from "../../routes/webhook"; import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable"; +import { convertPageToMarkdown } from "../../markdownify/scrape"; chromium.use(stealthPlugin()); async function createWorkflowAndStoreMetadata(id: string, userId: string) { @@ -207,6 +208,119 @@ async function executeRun(id: string, userId: string) { } } + if (recording.recording_meta.type === 'markdown') { + logger.log('info', `Executing markdown robot for scheduled run ${id}`); + + await run.update({ + status: 'running', + log: 'Converting page to markdown' + }); + + try { + const runStartedData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'running', + startedAt: plainRun.startedAt + }; + + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); + logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`); + } + + try { + const url = recording.recording_meta.url; + + if (!url) { + throw new Error('No URL specified for markdown robot'); + } + + const markdown = await convertPageToMarkdown(url); + + await run.update({ + status: 'success', + finishedAt: new Date().toLocaleString(), + log: 'Markdown conversion completed successfully', + serializableOutput: { + markdown: [{ content: markdown }] + }, + binaryOutput: {}, + }); + + logger.log('info', `Markdown robot execution completed for scheduled run ${id}`); + + try { + const completionData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'success', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(plainRun.browserId).emit('run-completed', completionData); + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`); + } + + const webhookPayload = { + robot_id: plainRun.robotMetaId, + run_id: plainRun.runId, + robot_name: recording.recording_meta.name, + status: 'success', + started_at: plainRun.startedAt, + finished_at: new Date().toLocaleString(), + markdown: markdown, + metadata: { + browser_id: plainRun.browserId, + user_id: userId, + } + }; + + try { + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); + logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`); + } catch (webhookError: any) { + logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + return true; + } catch (error: any) { + logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`); + + await run.update({ + status: 'failed', + finishedAt: new Date().toLocaleString(), + log: `Markdown conversion failed: ${error.message}`, + }); + + try { + const failureData = { + runId: plainRun.runId, + robotMetaId: plainRun.robotMetaId, + robotName: recording.recording_meta.name, + status: 'failed', + finishedAt: new Date().toLocaleString() + }; + + serverIo.of(plainRun.browserId).emit('run-completed', failureData); + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); + } catch (socketError: any) { + logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + } + + await destroyRemoteBrowser(plainRun.browserId, userId); + + throw error; + } + } + plainRun.status = 'running'; try { @@ -217,7 +331,7 @@ async function executeRun(id: string, userId: string) { status: 'running', startedAt: plainRun.startedAt }; - + serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); logger.log('info', `Run started notification sent for run: ${plainRun.runId} to user-${userId}`); } catch (socketError: any) { diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 70058642..4bec52d8 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -13,21 +13,48 @@ import { Card, CircularProgress, Container, - CardContent + CardContent, + Tabs, + Tab } from '@mui/material'; -import { ArrowBack, PlayCircleOutline, Article } from '@mui/icons-material'; +import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material'; import { useGlobalInfoStore } from '../../../context/globalInfo'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; +import { createMarkdownRobot } from "../../../api/storage"; import { AuthContext } from '../../../context/auth'; import { GenericModal } from '../../ui/GenericModal'; +interface TabPanelProps { + children?: React.ReactNode; + index: number; + value: number; +} + +function TabPanel(props: TabPanelProps) { + const { children, value, index, ...other } = props; + + return ( + + ); +} + const RobotCreate: React.FC = () => { const { t } = useTranslation(); const navigate = useNavigate(); - const { setBrowserId, setRecordingUrl, notify, setRecordingId } = useGlobalInfoStore(); + const { setBrowserId, setRecordingUrl, notify, setRecordingId, setRerenderRobots } = useGlobalInfoStore(); + const [tabValue, setTabValue] = useState(0); const [url, setUrl] = useState(''); + const [markdownRobotName, setMarkdownRobotName] = useState(''); const [needsLogin, setNeedsLogin] = useState(false); const [isLoading, setIsLoading] = useState(false); const [isWarningModalOpen, setWarningModalOpen] = useState(false); @@ -36,6 +63,10 @@ const RobotCreate: React.FC = () => { const { state } = React.useContext(AuthContext); const { user } = state; + const handleTabChange = (event: React.SyntheticEvent, newValue: number) => { + setTabValue(newValue); + }; + const handleStartRecording = async () => { if (!url.trim()) { @@ -146,11 +177,31 @@ const RobotCreate: React.FC = () => { - New Data Extraction Robot + Create New Robot - + + + } + iconPosition="start" + label="Data Extraction Robot" + id="robot-tab-0" + aria-controls="robot-tabpanel-0" + /> + } + iconPosition="start" + label="Markdown Robot" + id="robot-tab-1" + aria-controls="robot-tabpanel-1" + /> + + + + + {/* Logo (kept as original) */} { + + + + + + Maxun Logo + + + Create Markdown Robot + + + Convert any webpage to clean markdown format + + + + setMarkdownRobotName(e.target.value)} + label="Robot Name" + sx={{ mb: 2 }} + /> + setUrl(e.target.value)} + label="URL to convert" + /> + + + + + + diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx index b02cecde..7c45c8e8 100644 --- a/src/components/robot/pages/RobotDuplicatePage.tsx +++ b/src/components/robot/pages/RobotDuplicatePage.tsx @@ -24,12 +24,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: string; - description?: string; - usedByUsers?: number[]; - subscriptionLevel?: number; - access?: string; - sample?: any[]; + type?: 'traditional' | 'markdown'; url?: string; } diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index 80671c1f..19b9e43b 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -24,12 +24,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: string; - description?: string; - usedByUsers?: number[]; - subscriptionLevel?: number; - access?: string; - sample?: any[]; + type?: 'traditional' | 'markdown'; url?: string; } @@ -795,11 +790,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { navigate(basePath); }; - const lastPair = - robot?.recording.workflow[robot?.recording.workflow.length - 1]; - const targetUrl = lastPair?.what.find((action) => action.action === "goto") - ?.args?.[0]; - return ( { handleTargetUrlChange(e.target.value)} style={{ marginBottom: "20px" }} /> diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx index 11832935..96b7d3ec 100644 --- a/src/components/robot/pages/RobotSettingsPage.tsx +++ b/src/components/robot/pages/RobotSettingsPage.tsx @@ -16,6 +16,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; + type?: 'traditional' | 'markdown'; url?: string; } diff --git a/src/components/run/RunContent.tsx b/src/components/run/RunContent.tsx index 2cc1bb86..3a676a00 100644 --- a/src/components/run/RunContent.tsx +++ b/src/components/run/RunContent.tsx @@ -37,6 +37,7 @@ interface RunContentProps { export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler }: RunContentProps) => { const { t } = useTranslation(); const [tab, setTab] = React.useState('output'); + const [markdownContent, setMarkdownContent] = useState(''); const [schemaData, setSchemaData] = useState([]); const [schemaColumns, setSchemaColumns] = useState([]); @@ -63,6 +64,15 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe setTab(tab); }, [interpretationInProgress]); + useEffect(() => { + if (row.serializableOutput?.markdown && Array.isArray(row.serializableOutput.markdown)) { + const markdownData = row.serializableOutput.markdown[0]; + if (markdownData && markdownData.content) { + setMarkdownContent(markdownData.content); + } + } + }, [row.serializableOutput]); + useEffect(() => { if (row.status === 'running' || row.status === 'queued' || row.status === 'scheduled') { setSchemaData([]); @@ -374,6 +384,22 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe }, 100); }; + const downloadMarkdown = (content: string, filename: string) => { + const blob = new Blob([content], { type: 'text/markdown;charset=utf-8;' }); + const url = URL.createObjectURL(blob); + + const link = document.createElement("a"); + link.href = url; + link.setAttribute("download", filename); + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + + setTimeout(() => { + URL.revokeObjectURL(url); + }, 100); + }; + const renderDataTable = ( data: any[], @@ -636,11 +662,70 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0; const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0; + const hasMarkdown = markdownContent.length > 0; return ( + {hasMarkdown ? ( + + + }> + + + Markdown Output + + + + + theme.palette.mode === 'dark' ? '#1e1e1e' : '#f5f5f5' + }} + > + + {markdownContent} + + + + + + + + + + + ) : ( + // Traditional robot output + <> {row.status === 'running' || row.status === 'queued' ? ( <> @@ -939,6 +1024,8 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe )} + + )} diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx index 69969a09..a0c79622 100644 --- a/src/context/globalInfo.tsx +++ b/src/context/globalInfo.tsx @@ -27,6 +27,8 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; + type?: 'traditional' | 'markdown'; + url?: string; } interface RobotWorkflow {