diff --git a/server/src/api/record.ts b/server/src/api/record.ts index fd7376ab..cbf4f67e 100644 --- a/server/src/api/record.ts +++ b/server/src/api/record.ts @@ -18,7 +18,7 @@ import { WorkflowFile } from "maxun-core"; import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet"; import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable"; import { sendWebhook } from "../routes/webhook"; -import { convertPageToMarkdown } from '../markdownify/scrape'; +import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape'; chromium.use(stealthPlugin()); @@ -346,7 +346,8 @@ function formatRunResponse(run: any) { data: { textData: {}, listData: {}, - markdown: '' + markdown: '', + html: '' }, screenshots: [] as any[], }; @@ -365,6 +366,10 @@ function formatRunResponse(run: any) { formattedRun.data.markdown = output.markdown[0]?.content || ''; } + if (output.html && Array.isArray(output.html)) { + formattedRun.data.html = output.html[0]?.content || ''; + } + if (run.binaryOutput) { Object.keys(run.binaryOutput).forEach(key => { if (run.binaryOutput[key]) { @@ -575,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr } } -async function readyForRunHandler(browserId: string, id: string, userId: string){ +async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){ try { - const result = await executeRun(id, userId); + const result = await executeRun(id, userId, requestedFormats); if (result && result.success) { logger.log('info', `Interpretation of ${id} succeeded`); @@ -614,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) { return copy; }; -async function executeRun(id: string, userId: string) { +async function executeRun(id: string, userId: string, requestedFormats?: string[]) { let browser: any = null; try { @@ -657,12 +662,19 @@ async function executeRun(id: 
string, userId: string) { }; } - if (recording.recording_meta.type === 'markdown') { - logger.log('info', `Executing markdown robot for API run ${id}`); + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for API run ${id}`); + + let formats = recording.recording_meta.formats || ['markdown']; + + // Override if API request defines formats + if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) { + formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f)); + } await run.update({ status: 'running', - log: 'Converting page to markdown' + log: `Converting page to: ${formats.join(', ')}` }); try { @@ -672,20 +684,33 @@ async function executeRun(id: string, userId: string) { throw new Error('No URL specified for markdown robot'); } - const markdown = await convertPageToMarkdown(url); + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } await run.update({ status: 'success', finishedAt: new Date().toLocaleString(), - log: 'Markdown conversion completed successfully', - serializableOutput: { - markdown: [{ content: markdown }] - }, + log: `${formats.join(', ')} conversion completed successfully`, + serializableOutput, binaryOutput: {}, }); logger.log('info', `Markdown robot execution completed for API run ${id}`); + // Push success socket event try { const completionData = { runId: plainRun.runId, @@ -695,30 +720,45 @@ async function executeRun(id: string, userId: string) { finishedAt: new Date().toLocaleString() }; - serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); + serverIo + 
.of('/queued-run') + .to(`user-${userId}`) + .emit('run-completed', completionData); } catch (socketError: any) { - logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}` + ); } - const webhookPayload = { + // Build webhook payload + const webhookPayload: any = { robot_id: plainRun.robotMetaId, run_id: plainRun.runId, robot_name: recording.recording_meta.name, status: 'success', started_at: plainRun.startedAt, finished_at: new Date().toLocaleString(), - markdown: markdown, metadata: { browser_id: plainRun.browserId, user_id: userId, - } + }, }; + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + try { await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); - logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`); + logger.log( + 'info', + `Webhooks sent successfully for markdown robot API run ${plainRun.runId}` + ); } catch (webhookError: any) { - logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + logger.log( + 'warn', + `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); @@ -728,14 +768,18 @@ async function executeRun(id: string, userId: string) { interpretationInfo: run.toJSON() }; } catch (error: any) { - logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`); + logger.log( + 'error', + `${formats.join(', ')} conversion failed for API run ${id}: ${error.message}` + ); await run.update({ status: 'failed', finishedAt: new Date().toLocaleString(), - log: `Markdown conversion failed: ${error.message}`, + log: `${formats.join(', ')} conversion failed: 
${error.message}`, }); + // Send failure socket event try { const failureData = { runId: plainRun.runId, @@ -745,9 +789,15 @@ async function executeRun(id: string, userId: string) { finishedAt: new Date().toLocaleString() }; - serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData); + serverIo + .of('/queued-run') + .to(`user-${userId}`) + .emit('run-completed', failureData); } catch (socketError: any) { - logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); @@ -953,7 +1003,7 @@ async function executeRun(id: string, userId: string) { } } -export async function handleRunRecording(id: string, userId: string) { +export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) { try { const result = await createWorkflowAndStoreMetadata(id, userId); const { browserId, runId: newRunId } = result; @@ -967,7 +1017,7 @@ export async function handleRunRecording(id: string, userId: string) { rejectUnauthorized: false }); - socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId)); + socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats)); logger.log('info', `Running Robot: ${id}`); @@ -1018,6 +1068,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) { * type: string * required: true * description: The ID of the robot to run. + * requestBody: + * required: false + * content: + * application/json: + * schema: + * type: object + * properties: + * formats: + * type: array + * items: + * type: string + * enum: [markdown, html] + * description: Optional override formats for this run. + * example: + * formats: ["html"] * responses: * 200: * description: Robot run started successfully. 
@@ -1076,7 +1141,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
         if (!req.user) {
             return res.status(401).json({ ok: false, error: 'Unauthorized' });
         }
-        const runId = await handleRunRecording(req.params.id, req.user.id);
+
+        const requestedFormats = req.body.formats;
+
+        const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
 
         if (!runId) {
             throw new Error('Run ID is undefined');
diff --git a/server/src/markdownify/scrape.ts b/server/src/markdownify/scrape.ts
index b58265a2..935fa0cb 100644
--- a/server/src/markdownify/scrape.ts
+++ b/server/src/markdownify/scrape.ts
@@ -55,3 +55,70 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
   const markdown = await parseMarkdown(cleanedHtml, url);
   return markdown;
 }
+
+/**
+ * Fetches a webpage, strips scripts/styles/images/etc,
+ * returns clean HTML.
+ *
+ * Launches a browser, loads the page (waiting for network idle), removes
+ * non-content elements and inline "on*" handler attributes from the live
+ * DOM, then serializes the document element.
+ *
+ * @param url - Address of the page to load.
+ * @returns The outerHTML of the document element after cleanup.
+ * @throws Propagates any error from launching, navigating or evaluating;
+ *   once launched, the browser is closed before the error propagates.
+ */
+export async function convertPageToHTML(url: string): Promise<string> {
+  const browser = await chromium.launch();
+
+  try {
+    const page = await browser.newPage();
+
+    await page.goto(url, { waitUntil: "networkidle" });
+
+    // Clean the already-loaded document. page.addInitScript() would not work
+    // here: init scripts only run for documents created after they are
+    // registered, so one added after goto() never touches this page.
+    await page.evaluate(() => {
+      const selectors = [
+        "script",
+        "style",
+        "link[rel='stylesheet']",
+        "noscript",
+        "meta",
+        "svg",
+        "img",
+        "picture",
+        "source",
+        "video",
+        "audio",
+        "iframe",
+        "object",
+        "embed"
+      ];
+
+      selectors.forEach(sel => {
+        document.querySelectorAll(sel).forEach(e => e.remove());
+      });
+
+      // Remove inline event handlers (onclick, onload…)
+      const all = document.querySelectorAll("*");
+      all.forEach(el => {
+        [...el.attributes].forEach(attr => {
+          if (attr.name.startsWith("on")) {
+            el.removeAttribute(attr.name);
+          }
+        });
+      });
+    });
+
+    // Re-extract HTML after cleanup
+    return await page.evaluate(() => {
+      return document.documentElement.outerHTML;
+    });
+  } finally {
+    // Always release the browser, even when navigation or evaluation throws
+    await browser.close();
+  }
+}
diff --git a/server/src/models/Robot.ts b/server/src/models/Robot.ts index 
5acbdf13..39218de2 100644 --- a/server/src/models/Robot.ts +++ b/server/src/models/Robot.ts @@ -9,8 +9,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/server/src/pgboss-worker.ts b/server/src/pgboss-worker.ts index b2d5bdb3..66e852b8 100644 --- a/server/src/pgboss-worker.ts +++ b/server/src/pgboss-worker.ts @@ -20,7 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme import { io as serverIo } from "./server"; import { sendWebhook } from './routes/webhook'; import { BinaryOutputService } from './storage/mino'; -import { convertPageToMarkdown } from './markdownify/scrape'; +import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape'; if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) { throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.'); @@ -189,12 +189,14 @@ async function processRunExecution(job: Job) { throw new Error(`Recording for run ${data.runId} not found`); } - if (recording.recording_meta.type === 'markdown') { - logger.log('info', `Executing markdown robot for run ${data.runId}`); + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for run ${data.runId}`); + + const formats = recording.recording_meta.formats || ['markdown']; await run.update({ status: 'running', - log: 'Converting page to markdown' + log: `Converting page to ${formats.join(', ')}` }); try { @@ -204,20 +206,34 @@ async function processRunExecution(job: Job) { throw new Error('No URL specified for markdown robot'); } - const markdown = await convertPageToMarkdown(url); + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + // Markdown conversion + if 
(formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } + + // Success update await run.update({ status: 'success', finishedAt: new Date().toLocaleString(), - log: 'Markdown conversion completed successfully', - serializableOutput: { - markdown: [{ content: markdown }] - }, + log: `${formats.join(', ').toUpperCase()} conversion completed successfully`, + serializableOutput, binaryOutput: {}, }); logger.log('info', `Markdown robot execution completed for run ${data.runId}`); + // Notify sockets try { const completionData = { runId: data.runId, @@ -233,15 +249,19 @@ async function processRunExecution(job: Job) { logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`); } + // Webhooks try { - const webhookPayload = { + const webhookPayload: any = { runId: data.runId, robotId: plainRun.robotMetaId, robotName: recording.recording_meta.name, status: 'success', finishedAt: new Date().toLocaleString(), - markdown: markdown }; + + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`); } catch (webhookError: any) { @@ -251,13 +271,14 @@ async function processRunExecution(job: Job) { await destroyRemoteBrowser(browserId, data.userId); return { success: true }; + } catch (error: any) { - logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`); + logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`); await run.update({ status: 'failed', finishedAt: new 
Date().toLocaleString(), - log: `Markdown conversion failed: ${error.message}`, + log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`, }); try { diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index ee23ee44..44279e9c 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -440,9 +440,9 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate /** * POST endpoint for creating a markdown robot */ -router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => { +router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => { try { - const { url, name } = req.body; + const { url, name, formats } = req.body; if (!url) { return res.status(400).json({ error: 'The "url" field is required.' }); @@ -459,6 +459,18 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ return res.status(400).json({ error: 'Invalid URL format' }); } + // Validate format + const validFormats = ['markdown', 'html']; + + if (!Array.isArray(formats) || formats.length === 0) { + return res.status(400).json({ error: 'At least one output format must be selected.' 
}); + } + + const invalid = formats.filter(f => !validFormats.includes(f)); + if (invalid.length > 0) { + return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` }); + } + const robotName = name || `Markdown Robot - ${new URL(url).hostname}`; const currentTimestamp = new Date().toLocaleString(); const robotId = uuid(); @@ -473,8 +485,9 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ updatedAt: currentTimestamp, pairs: 0, params: [], - type: 'markdown', + type: 'scrape', url: url, + formats: formats, }, recording: { workflow: [] }, google_sheet_email: null, diff --git a/server/src/workflow-management/scheduler/index.ts b/server/src/workflow-management/scheduler/index.ts index 7c2cb408..d5ba76f4 100644 --- a/server/src/workflow-management/scheduler/index.ts +++ b/server/src/workflow-management/scheduler/index.ts @@ -15,7 +15,7 @@ import { WorkflowFile } from "maxun-core"; import { Page } from "playwright"; import { sendWebhook } from "../../routes/webhook"; import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable"; -import { convertPageToMarkdown } from "../../markdownify/scrape"; +import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape"; chromium.use(stealthPlugin()); async function createWorkflowAndStoreMetadata(id: string, userId: string) { @@ -208,12 +208,14 @@ async function executeRun(id: string, userId: string) { } } - if (recording.recording_meta.type === 'markdown') { - logger.log('info', `Executing markdown robot for scheduled run ${id}`); + if (recording.recording_meta.type === 'scrape') { + logger.log('info', `Executing scrape robot for scheduled run ${id}`); + + const formats = recording.recording_meta.formats || ['markdown']; await run.update({ status: 'running', - log: 'Converting page to markdown' + log: `Converting page to: ${formats.join(', ')}` }); try { @@ -226,9 +228,15 @@ async function executeRun(id: string, userId: string) { }; 
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData); - logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`); + logger.log( + 'info', + `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}` + ); } catch (socketError: any) { - logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}` + ); } try { @@ -238,20 +246,33 @@ async function executeRun(id: string, userId: string) { throw new Error('No URL specified for markdown robot'); } - const markdown = await convertPageToMarkdown(url); + let markdown = ''; + let html = ''; + const serializableOutput: any = {}; + + // Markdown conversion + if (formats.includes('markdown')) { + markdown = await convertPageToMarkdown(url); + serializableOutput.markdown = [{ content: markdown }]; + } + + // HTML conversion + if (formats.includes('html')) { + html = await convertPageToHTML(url); + serializableOutput.html = [{ content: html }]; + } await run.update({ status: 'success', finishedAt: new Date().toLocaleString(), - log: 'Markdown conversion completed successfully', - serializableOutput: { - markdown: [{ content: markdown }] - }, + log: `${formats.join(', ')} conversion completed successfully`, + serializableOutput, binaryOutput: {}, }); logger.log('info', `Markdown robot execution completed for scheduled run ${id}`); + // Run-completed socket notifications try { const completionData = { runId: plainRun.runId, @@ -264,40 +285,53 @@ async function executeRun(id: string, userId: string) { serverIo.of(plainRun.browserId).emit('run-completed', completionData); serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData); } catch (socketError: any) { - logger.log('warn', `Failed to send 
run-completed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}` + ); } - const webhookPayload = { + // Webhook payload + const webhookPayload: any = { robot_id: plainRun.robotMetaId, run_id: plainRun.runId, robot_name: recording.recording_meta.name, status: 'success', started_at: plainRun.startedAt, finished_at: new Date().toLocaleString(), - markdown: markdown, metadata: { browser_id: plainRun.browserId, user_id: userId, } }; + if (formats.includes('markdown')) webhookPayload.markdown = markdown; + if (formats.includes('html')) webhookPayload.html = html; + try { await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload); - logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`); + logger.log( + 'info', + `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}` + ); } catch (webhookError: any) { - logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`); + logger.log( + 'warn', + `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); return true; + } catch (error: any) { - logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`); + logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`); await run.update({ status: 'failed', finishedAt: new Date().toLocaleString(), - log: `Markdown conversion failed: ${error.message}`, + log: `${formats.join(', ')} conversion failed: ${error.message}`, }); try { @@ -312,7 +346,10 @@ async function executeRun(id: string, userId: string) { serverIo.of(plainRun.browserId).emit('run-completed', failureData); serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', 
failureData); } catch (socketError: any) { - logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`); + logger.log( + 'warn', + `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}` + ); } await destroyRemoteBrowser(plainRun.browserId, userId); diff --git a/src/components/robot/RecordingsTable.tsx b/src/components/robot/RecordingsTable.tsx index f06270ed..79319b92 100644 --- a/src/components/robot/RecordingsTable.tsx +++ b/src/components/robot/RecordingsTable.tsx @@ -110,7 +110,10 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => { case 'integrate': return ( - handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} /> + handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} + robotType={row.type} + /> ); case 'options': @@ -121,6 +124,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => { handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])} handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])} handleDelete={() => handlers.handleDelete(row.id)} + robotType={row.type} /> ); @@ -709,13 +713,22 @@ const ScheduleButton = ({ handleSchedule }: ScheduleButtonProps) => { interface IntegrateButtonProps { handleIntegrate: () => void; + robotType: string; } -const IntegrateButton = ({ handleIntegrate }: IntegrateButtonProps) => { +const IntegrateButton = ({ handleIntegrate, robotType }: IntegrateButtonProps) => { + const isDisabled = robotType === 'scrape'; + return ( - { - handleIntegrate(); - }} + @@ -742,9 +755,10 @@ interface OptionsButtonProps { handleEdit: () => void; handleDelete: () => void; handleDuplicate: () => void; + robotType: string; } -const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate }: OptionsButtonProps) => { +const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate, 
robotType }: OptionsButtonProps) => { const [anchorEl, setAnchorEl] = React.useState(null); const handleClick = (event: React.MouseEvent) => { @@ -771,34 +785,33 @@ const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicat open={Boolean(anchorEl)} onClose={handleClose} > - { handleRetrain(); handleClose(); }}> - - - - {t('recordingtable.retrain')} - + {robotType !== 'scrape' && ( + { handleRetrain(); handleClose(); }}> + + + + Retrain + + )} { handleEdit(); handleClose(); }}> - - - - {t('recordingtable.edit')} + + Edit { handleDelete(); handleClose(); }}> - - - - {t('recordingtable.delete')} + + Delete - { handleDuplicate(); handleClose(); }}> - - - - {t('recordingtable.duplicate')} - + {robotType !== 'scrape' && ( + { handleDuplicate(); handleClose(); }}> + + Duplicate + + )} + ); }; diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 88daa49b..312d7bae 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -15,12 +15,16 @@ import { Container, CardContent, Tabs, - Tab + Tab, + RadioGroup, + Radio, + FormControl, + FormLabel } from '@mui/material'; import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material'; import { useGlobalInfoStore } from '../../../context/globalInfo'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; -import { createMarkdownRobot } from "../../../api/storage"; +import { createScrapeRobot } from "../../../api/storage"; import { AuthContext } from '../../../context/auth'; import { GenericModal } from '../../ui/GenericModal'; @@ -54,11 +58,12 @@ const RobotCreate: React.FC = () => { const [tabValue, setTabValue] = useState(0); const [url, setUrl] = useState(''); - const [markdownRobotName, setMarkdownRobotName] = useState(''); + const [scrapeRobotName, setScrapeRobotName] = useState(''); const [needsLogin, setNeedsLogin] = 
useState(false); const [isLoading, setIsLoading] = useState(false); const [isWarningModalOpen, setWarningModalOpen] = useState(false); const [activeBrowserId, setActiveBrowserId] = useState(''); + const [outputFormats, setOutputFormats] = useState([]); const { state } = React.useContext(AuthContext); const { user } = state; @@ -200,7 +205,7 @@ const RobotCreate: React.FC = () => { }} > - + @@ -370,7 +375,7 @@ const RobotCreate: React.FC = () => { /> - Turn websites into LLM-ready Markdown content for AI apps. + Turn websites into LLM-ready Markdown or clean HTML content for AI apps. @@ -378,8 +383,8 @@ const RobotCreate: React.FC = () => { placeholder="Example: YC Companies Scraper" variant="outlined" fullWidth - value={markdownRobotName} - onChange={(e) => setMarkdownRobotName(e.target.value)} + value={scrapeRobotName} + onChange={(e) => setScrapeRobotName(e.target.value)} sx={{ mb: 2 }} label="Robot Name" /> @@ -390,7 +395,44 @@ const RobotCreate: React.FC = () => { value={url} onChange={(e) => setUrl(e.target.value)} label="Website URL" + sx={{ mb: 2 }} /> + + + Output Format (Select at least one) + + { + if (e.target.checked) { + setOutputFormats([...outputFormats, 'markdown']); + } else { + setOutputFormats(outputFormats.filter(f => f !== 'markdown')); + } + }} + /> + } + label="Markdown" + /> + + { + if (e.target.checked) { + setOutputFormats([...outputFormats, 'html']); + } else { + setOutputFormats(outputFormats.filter(f => f !== 'html')); + } + }} + /> + } + label="HTML" + /> + diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx index 7c45c8e8..ac602f8e 100644 --- a/src/components/robot/pages/RobotDuplicatePage.tsx +++ b/src/components/robot/pages/RobotDuplicatePage.tsx @@ -24,8 +24,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface 
RobotWorkflow { diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index 19b9e43b..53424bb2 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -24,8 +24,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx index 96b7d3ec..f0f2f6ae 100644 --- a/src/components/robot/pages/RobotSettingsPage.tsx +++ b/src/components/robot/pages/RobotSettingsPage.tsx @@ -16,8 +16,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow { diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx index a0c79622..973714b7 100644 --- a/src/context/globalInfo.tsx +++ b/src/context/globalInfo.tsx @@ -27,8 +27,9 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'traditional' | 'markdown'; + type?: 'extract' | 'scrape'; url?: string; + formats?: ('markdown' | 'html')[]; } interface RobotWorkflow {