feat: add html scrape support
This commit is contained in:
@@ -18,7 +18,7 @@ import { WorkflowFile } from "maxun-core";
|
|||||||
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
import { googleSheetUpdateTasks, processGoogleSheetUpdates } from "../workflow-management/integrations/gsheet";
|
||||||
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
import { airtableUpdateTasks, processAirtableUpdates } from "../workflow-management/integrations/airtable";
|
||||||
import { sendWebhook } from "../routes/webhook";
|
import { sendWebhook } from "../routes/webhook";
|
||||||
import { convertPageToMarkdown } from '../markdownify/scrape';
|
import { convertPageToHTML, convertPageToMarkdown } from '../markdownify/scrape';
|
||||||
|
|
||||||
chromium.use(stealthPlugin());
|
chromium.use(stealthPlugin());
|
||||||
|
|
||||||
@@ -346,7 +346,8 @@ function formatRunResponse(run: any) {
|
|||||||
data: {
|
data: {
|
||||||
textData: {},
|
textData: {},
|
||||||
listData: {},
|
listData: {},
|
||||||
markdown: ''
|
markdown: '',
|
||||||
|
html: ''
|
||||||
},
|
},
|
||||||
screenshots: [] as any[],
|
screenshots: [] as any[],
|
||||||
};
|
};
|
||||||
@@ -365,6 +366,10 @@ function formatRunResponse(run: any) {
|
|||||||
formattedRun.data.markdown = output.markdown[0]?.content || '';
|
formattedRun.data.markdown = output.markdown[0]?.content || '';
|
||||||
}
|
}
|
||||||
|
|
||||||
|
if (output.html && Array.isArray(output.html)) {
|
||||||
|
formattedRun.data.html = output.html[0]?.content || '';
|
||||||
|
}
|
||||||
|
|
||||||
if (run.binaryOutput) {
|
if (run.binaryOutput) {
|
||||||
Object.keys(run.binaryOutput).forEach(key => {
|
Object.keys(run.binaryOutput).forEach(key => {
|
||||||
if (run.binaryOutput[key]) {
|
if (run.binaryOutput[key]) {
|
||||||
@@ -575,9 +580,9 @@ async function triggerIntegrationUpdates(runId: string, robotMetaId: string): Pr
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
async function readyForRunHandler(browserId: string, id: string, userId: string){
|
async function readyForRunHandler(browserId: string, id: string, userId: string, requestedFormats?: string[]){
|
||||||
try {
|
try {
|
||||||
const result = await executeRun(id, userId);
|
const result = await executeRun(id, userId, requestedFormats);
|
||||||
|
|
||||||
if (result && result.success) {
|
if (result && result.success) {
|
||||||
logger.log('info', `Interpretation of ${id} succeeded`);
|
logger.log('info', `Interpretation of ${id} succeeded`);
|
||||||
@@ -614,7 +619,7 @@ function AddGeneratedFlags(workflow: WorkflowFile) {
|
|||||||
return copy;
|
return copy;
|
||||||
};
|
};
|
||||||
|
|
||||||
async function executeRun(id: string, userId: string) {
|
async function executeRun(id: string, userId: string, requestedFormats?: string[]) {
|
||||||
let browser: any = null;
|
let browser: any = null;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -657,12 +662,19 @@ async function executeRun(id: string, userId: string) {
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
if (recording.recording_meta.type === 'markdown') {
|
if (recording.recording_meta.type === 'scrape') {
|
||||||
logger.log('info', `Executing markdown robot for API run ${id}`);
|
logger.log('info', `Executing scrape robot for API run ${id}`);
|
||||||
|
|
||||||
|
let formats = recording.recording_meta.formats || ['markdown'];
|
||||||
|
|
||||||
|
// Override if API request defines formats
|
||||||
|
if (requestedFormats && Array.isArray(requestedFormats) && requestedFormats.length > 0) {
|
||||||
|
formats = requestedFormats.filter((f): f is 'markdown' | 'html' => ['markdown', 'html'].includes(f));
|
||||||
|
}
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'running',
|
status: 'running',
|
||||||
log: 'Converting page to markdown'
|
log: `Converting page to: ${formats.join(', ')}`
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -672,20 +684,33 @@ async function executeRun(id: string, userId: string) {
|
|||||||
throw new Error('No URL specified for markdown robot');
|
throw new Error('No URL specified for markdown robot');
|
||||||
}
|
}
|
||||||
|
|
||||||
const markdown = await convertPageToMarkdown(url);
|
let markdown = '';
|
||||||
|
let html = '';
|
||||||
|
const serializableOutput: any = {};
|
||||||
|
|
||||||
|
// Markdown conversion
|
||||||
|
if (formats.includes('markdown')) {
|
||||||
|
markdown = await convertPageToMarkdown(url);
|
||||||
|
serializableOutput.markdown = [{ content: markdown }];
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTML conversion
|
||||||
|
if (formats.includes('html')) {
|
||||||
|
html = await convertPageToHTML(url);
|
||||||
|
serializableOutput.html = [{ content: html }];
|
||||||
|
}
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'success',
|
status: 'success',
|
||||||
finishedAt: new Date().toLocaleString(),
|
finishedAt: new Date().toLocaleString(),
|
||||||
log: 'Markdown conversion completed successfully',
|
log: `${formats.join(', ')} conversion completed successfully`,
|
||||||
serializableOutput: {
|
serializableOutput,
|
||||||
markdown: [{ content: markdown }]
|
|
||||||
},
|
|
||||||
binaryOutput: {},
|
binaryOutput: {},
|
||||||
});
|
});
|
||||||
|
|
||||||
logger.log('info', `Markdown robot execution completed for API run ${id}`);
|
logger.log('info', `Markdown robot execution completed for API run ${id}`);
|
||||||
|
|
||||||
|
// Push success socket event
|
||||||
try {
|
try {
|
||||||
const completionData = {
|
const completionData = {
|
||||||
runId: plainRun.runId,
|
runId: plainRun.runId,
|
||||||
@@ -695,30 +720,45 @@ async function executeRun(id: string, userId: string) {
|
|||||||
finishedAt: new Date().toLocaleString()
|
finishedAt: new Date().toLocaleString()
|
||||||
};
|
};
|
||||||
|
|
||||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
|
serverIo
|
||||||
|
.of('/queued-run')
|
||||||
|
.to(`user-${userId}`)
|
||||||
|
.emit('run-completed', completionData);
|
||||||
} catch (socketError: any) {
|
} catch (socketError: any) {
|
||||||
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
|
logger.log(
|
||||||
|
'warn',
|
||||||
|
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const webhookPayload = {
|
// Build webhook payload
|
||||||
|
const webhookPayload: any = {
|
||||||
robot_id: plainRun.robotMetaId,
|
robot_id: plainRun.robotMetaId,
|
||||||
run_id: plainRun.runId,
|
run_id: plainRun.runId,
|
||||||
robot_name: recording.recording_meta.name,
|
robot_name: recording.recording_meta.name,
|
||||||
status: 'success',
|
status: 'success',
|
||||||
started_at: plainRun.startedAt,
|
started_at: plainRun.startedAt,
|
||||||
finished_at: new Date().toLocaleString(),
|
finished_at: new Date().toLocaleString(),
|
||||||
markdown: markdown,
|
|
||||||
metadata: {
|
metadata: {
|
||||||
browser_id: plainRun.browserId,
|
browser_id: plainRun.browserId,
|
||||||
user_id: userId,
|
user_id: userId,
|
||||||
}
|
},
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||||
|
if (formats.includes('html')) webhookPayload.html = html;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||||
logger.log('info', `Webhooks sent successfully for markdown robot API run ${plainRun.runId}`);
|
logger.log(
|
||||||
|
'info',
|
||||||
|
`Webhooks sent successfully for markdown robot API run ${plainRun.runId}`
|
||||||
|
);
|
||||||
} catch (webhookError: any) {
|
} catch (webhookError: any) {
|
||||||
logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
|
logger.log(
|
||||||
|
'warn',
|
||||||
|
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||||
@@ -728,14 +768,18 @@ async function executeRun(id: string, userId: string) {
|
|||||||
interpretationInfo: run.toJSON()
|
interpretationInfo: run.toJSON()
|
||||||
};
|
};
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
logger.log('error', `Markdown conversion failed for API run ${id}: ${error.message}`);
|
logger.log(
|
||||||
|
'error',
|
||||||
|
`${formats.join(', ')} conversion failed for API run ${id}: ${error.message}`
|
||||||
|
);
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'failed',
|
status: 'failed',
|
||||||
finishedAt: new Date().toLocaleString(),
|
finishedAt: new Date().toLocaleString(),
|
||||||
log: `Markdown conversion failed: ${error.message}`,
|
log: `${formats.join(', ')} conversion failed: ${error.message}`,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
// Send failure socket event
|
||||||
try {
|
try {
|
||||||
const failureData = {
|
const failureData = {
|
||||||
runId: plainRun.runId,
|
runId: plainRun.runId,
|
||||||
@@ -745,9 +789,15 @@ async function executeRun(id: string, userId: string) {
|
|||||||
finishedAt: new Date().toLocaleString()
|
finishedAt: new Date().toLocaleString()
|
||||||
};
|
};
|
||||||
|
|
||||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
|
serverIo
|
||||||
|
.of('/queued-run')
|
||||||
|
.to(`user-${userId}`)
|
||||||
|
.emit('run-completed', failureData);
|
||||||
} catch (socketError: any) {
|
} catch (socketError: any) {
|
||||||
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
|
logger.log(
|
||||||
|
'warn',
|
||||||
|
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||||
@@ -953,7 +1003,7 @@ async function executeRun(id: string, userId: string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
export async function handleRunRecording(id: string, userId: string) {
|
export async function handleRunRecording(id: string, userId: string, requestedFormats?: string[]) {
|
||||||
try {
|
try {
|
||||||
const result = await createWorkflowAndStoreMetadata(id, userId);
|
const result = await createWorkflowAndStoreMetadata(id, userId);
|
||||||
const { browserId, runId: newRunId } = result;
|
const { browserId, runId: newRunId } = result;
|
||||||
@@ -967,7 +1017,7 @@ export async function handleRunRecording(id: string, userId: string) {
|
|||||||
rejectUnauthorized: false
|
rejectUnauthorized: false
|
||||||
});
|
});
|
||||||
|
|
||||||
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId));
|
socket.on('ready-for-run', () => readyForRunHandler(browserId, newRunId, userId, requestedFormats));
|
||||||
|
|
||||||
logger.log('info', `Running Robot: ${id}`);
|
logger.log('info', `Running Robot: ${id}`);
|
||||||
|
|
||||||
@@ -1018,6 +1068,21 @@ async function waitForRunCompletion(runId: string, interval: number = 2000) {
|
|||||||
* type: string
|
* type: string
|
||||||
* required: true
|
* required: true
|
||||||
* description: The ID of the robot to run.
|
* description: The ID of the robot to run.
|
||||||
|
* requestBody:
|
||||||
|
* required: false
|
||||||
|
* content:
|
||||||
|
* application/json:
|
||||||
|
* schema:
|
||||||
|
* type: object
|
||||||
|
* properties:
|
||||||
|
* formats:
|
||||||
|
* type: array
|
||||||
|
* items:
|
||||||
|
* type: string
|
||||||
|
* enum: [markdown, html]
|
||||||
|
* description: Optional override formats for this run.
|
||||||
|
* example:
|
||||||
|
* formats: ["html"]
|
||||||
* responses:
|
* responses:
|
||||||
* 200:
|
* 200:
|
||||||
* description: Robot run started successfully.
|
* description: Robot run started successfully.
|
||||||
@@ -1076,7 +1141,10 @@ router.post("/robots/:id/runs", requireAPIKey, async (req: AuthenticatedRequest,
|
|||||||
if (!req.user) {
|
if (!req.user) {
|
||||||
return res.status(401).json({ ok: false, error: 'Unauthorized' });
|
return res.status(401).json({ ok: false, error: 'Unauthorized' });
|
||||||
}
|
}
|
||||||
const runId = await handleRunRecording(req.params.id, req.user.id);
|
|
||||||
|
const requestedFormats = req.body.formats;
|
||||||
|
|
||||||
|
const runId = await handleRunRecording(req.params.id, req.user.id, requestedFormats);
|
||||||
|
|
||||||
if (!runId) {
|
if (!runId) {
|
||||||
throw new Error('Run ID is undefined');
|
throw new Error('Run ID is undefined');
|
||||||
|
|||||||
@@ -55,3 +55,57 @@ export async function convertPageToMarkdown(url: string): Promise<string> {
|
|||||||
const markdown = await parseMarkdown(cleanedHtml, url);
|
const markdown = await parseMarkdown(cleanedHtml, url);
|
||||||
return markdown;
|
return markdown;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Fetches a webpage and returns its HTML with non-content elements removed.
 *
 * The page is opened in a browser started with `chromium.launch()`. Once
 * navigation finishes, scripts, styles, media, embeds and inline `on*`
 * event-handler attributes are removed from the live DOM, and the remaining
 * document is serialized.
 *
 * @param url - Address of the page to load.
 * @returns The `outerHTML` of the cleaned document element.
 * @throws Propagates any launch, navigation or evaluation error; the browser
 *         is closed before the error reaches the caller.
 */
export async function convertPageToHTML(url: string): Promise<string> {
  const browser = await chromium.launch();

  try {
    const page = await browser.newPage();

    await page.goto(url, { waitUntil: "networkidle" });

    // Clean up and serialize inside a single evaluate() call.
    // addInitScript() only registers a script for documents created by
    // *later* navigations, so calling it after goto() would leave the
    // already-loaded page untouched and return the raw HTML.
    const cleanedHtml = await page.evaluate(() => {
      const selectors = [
        "script",
        "style",
        "link[rel='stylesheet']",
        "noscript",
        "meta",
        "svg",
        "img",
        "picture",
        "source",
        "video",
        "audio",
        "iframe",
        "object",
        "embed"
      ];

      for (const sel of selectors) {
        document.querySelectorAll(sel).forEach(e => e.remove());
      }

      // Remove inline event handlers (onclick, onload…)
      document.querySelectorAll("*").forEach(el => {
        // Snapshot the attribute list first: removing while iterating the
        // live NamedNodeMap would skip entries.
        Array.from(el.attributes).forEach(attr => {
          if (attr.name.startsWith("on")) {
            el.removeAttribute(attr.name);
          }
        });
      });

      return document.documentElement.outerHTML;
    });

    return cleanedHtml;
  } finally {
    // Always release the browser, even when navigation or evaluation fails.
    await browser.close();
  }
}
|
||||||
|
|||||||
@@ -9,8 +9,9 @@ interface RobotMeta {
|
|||||||
pairs: number;
|
pairs: number;
|
||||||
updatedAt: string;
|
updatedAt: string;
|
||||||
params: any[];
|
params: any[];
|
||||||
type?: 'traditional' | 'markdown';
|
type?: 'extract' | 'scrape';
|
||||||
url?: string;
|
url?: string;
|
||||||
|
formats?: ('markdown' | 'html')[];
|
||||||
}
|
}
|
||||||
|
|
||||||
interface RobotWorkflow {
|
interface RobotWorkflow {
|
||||||
|
|||||||
@@ -20,7 +20,7 @@ import { airtableUpdateTasks, processAirtableUpdates } from './workflow-manageme
|
|||||||
import { io as serverIo } from "./server";
|
import { io as serverIo } from "./server";
|
||||||
import { sendWebhook } from './routes/webhook';
|
import { sendWebhook } from './routes/webhook';
|
||||||
import { BinaryOutputService } from './storage/mino';
|
import { BinaryOutputService } from './storage/mino';
|
||||||
import { convertPageToMarkdown } from './markdownify/scrape';
|
import { convertPageToMarkdown, convertPageToHTML } from './markdownify/scrape';
|
||||||
|
|
||||||
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
|
if (!process.env.DB_USER || !process.env.DB_PASSWORD || !process.env.DB_HOST || !process.env.DB_PORT || !process.env.DB_NAME) {
|
||||||
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
|
throw new Error('Failed to start pgboss worker: one or more required environment variables are missing.');
|
||||||
@@ -189,12 +189,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
throw new Error(`Recording for run ${data.runId} not found`);
|
throw new Error(`Recording for run ${data.runId} not found`);
|
||||||
}
|
}
|
||||||
|
|
||||||
if (recording.recording_meta.type === 'markdown') {
|
if (recording.recording_meta.type === 'scrape') {
|
||||||
logger.log('info', `Executing markdown robot for run ${data.runId}`);
|
logger.log('info', `Executing scrape robot for run ${data.runId}`);
|
||||||
|
|
||||||
|
const formats = recording.recording_meta.formats || ['markdown'];
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'running',
|
status: 'running',
|
||||||
log: 'Converting page to markdown'
|
log: `Converting page to ${formats.join(', ')}`
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -204,20 +206,34 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
throw new Error('No URL specified for markdown robot');
|
throw new Error('No URL specified for markdown robot');
|
||||||
}
|
}
|
||||||
|
|
||||||
const markdown = await convertPageToMarkdown(url);
|
let markdown = '';
|
||||||
|
let html = '';
|
||||||
|
const serializableOutput: any = {};
|
||||||
|
|
||||||
|
// Markdown conversion
|
||||||
|
if (formats.includes('markdown')) {
|
||||||
|
markdown = await convertPageToMarkdown(url);
|
||||||
|
serializableOutput.markdown = [{ content: markdown }];
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTML conversion
|
||||||
|
if (formats.includes('html')) {
|
||||||
|
html = await convertPageToHTML(url);
|
||||||
|
serializableOutput.html = [{ content: html }];
|
||||||
|
}
|
||||||
|
|
||||||
|
// Success update
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'success',
|
status: 'success',
|
||||||
finishedAt: new Date().toLocaleString(),
|
finishedAt: new Date().toLocaleString(),
|
||||||
log: 'Markdown conversion completed successfully',
|
log: `${formats.join(', ').toUpperCase()} conversion completed successfully`,
|
||||||
serializableOutput: {
|
serializableOutput,
|
||||||
markdown: [{ content: markdown }]
|
|
||||||
},
|
|
||||||
binaryOutput: {},
|
binaryOutput: {},
|
||||||
});
|
});
|
||||||
|
|
||||||
logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
|
logger.log('info', `Markdown robot execution completed for run ${data.runId}`);
|
||||||
|
|
||||||
|
// Notify sockets
|
||||||
try {
|
try {
|
||||||
const completionData = {
|
const completionData = {
|
||||||
runId: data.runId,
|
runId: data.runId,
|
||||||
@@ -233,15 +249,19 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
|
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${data.runId}: ${socketError.message}`);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Webhooks
|
||||||
try {
|
try {
|
||||||
const webhookPayload = {
|
const webhookPayload: any = {
|
||||||
runId: data.runId,
|
runId: data.runId,
|
||||||
robotId: plainRun.robotMetaId,
|
robotId: plainRun.robotMetaId,
|
||||||
robotName: recording.recording_meta.name,
|
robotName: recording.recording_meta.name,
|
||||||
status: 'success',
|
status: 'success',
|
||||||
finishedAt: new Date().toLocaleString(),
|
finishedAt: new Date().toLocaleString(),
|
||||||
markdown: markdown
|
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||||
|
if (formats.includes('html')) webhookPayload.html = html;
|
||||||
|
|
||||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||||
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
|
logger.log('info', `Webhooks sent successfully for markdown robot run ${data.runId}`);
|
||||||
} catch (webhookError: any) {
|
} catch (webhookError: any) {
|
||||||
@@ -251,13 +271,14 @@ async function processRunExecution(job: Job<ExecuteRunData>) {
|
|||||||
await destroyRemoteBrowser(browserId, data.userId);
|
await destroyRemoteBrowser(browserId, data.userId);
|
||||||
|
|
||||||
return { success: true };
|
return { success: true };
|
||||||
|
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
logger.log('error', `Markdown conversion failed for run ${data.runId}: ${error.message}`);
|
logger.log('error', `${formats.join(', ')} conversion failed for run ${data.runId}: ${error.message}`);
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'failed',
|
status: 'failed',
|
||||||
finishedAt: new Date().toLocaleString(),
|
finishedAt: new Date().toLocaleString(),
|
||||||
log: `Markdown conversion failed: ${error.message}`,
|
log: `${formats.join(', ').toUpperCase()} conversion failed: ${error.message}`,
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
|
|||||||
@@ -440,9 +440,9 @@ router.post('/recordings/:id/duplicate', requireSignIn, async (req: Authenticate
|
|||||||
/**
|
/**
|
||||||
* POST endpoint for creating a markdown robot
|
* POST endpoint for creating a scrape robot (markdown and/or HTML output)
|
||||||
*/
|
*/
|
||||||
router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
||||||
try {
|
try {
|
||||||
const { url, name } = req.body;
|
const { url, name, formats } = req.body;
|
||||||
|
|
||||||
if (!url) {
|
if (!url) {
|
||||||
return res.status(400).json({ error: 'The "url" field is required.' });
|
return res.status(400).json({ error: 'The "url" field is required.' });
|
||||||
@@ -459,6 +459,18 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ
|
|||||||
return res.status(400).json({ error: 'Invalid URL format' });
|
return res.status(400).json({ error: 'Invalid URL format' });
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Validate format
|
||||||
|
const validFormats = ['markdown', 'html'];
|
||||||
|
|
||||||
|
if (!Array.isArray(formats) || formats.length === 0) {
|
||||||
|
return res.status(400).json({ error: 'At least one output format must be selected.' });
|
||||||
|
}
|
||||||
|
|
||||||
|
const invalid = formats.filter(f => !validFormats.includes(f));
|
||||||
|
if (invalid.length > 0) {
|
||||||
|
return res.status(400).json({ error: `Invalid formats: ${invalid.join(', ')}` });
|
||||||
|
}
|
||||||
|
|
||||||
const robotName = name || `Markdown Robot - ${new URL(url).hostname}`;
|
const robotName = name || `Markdown Robot - ${new URL(url).hostname}`;
|
||||||
const currentTimestamp = new Date().toLocaleString();
|
const currentTimestamp = new Date().toLocaleString();
|
||||||
const robotId = uuid();
|
const robotId = uuid();
|
||||||
@@ -473,8 +485,9 @@ router.post('/recordings/markdown', requireSignIn, async (req: AuthenticatedRequ
|
|||||||
updatedAt: currentTimestamp,
|
updatedAt: currentTimestamp,
|
||||||
pairs: 0,
|
pairs: 0,
|
||||||
params: [],
|
params: [],
|
||||||
type: 'markdown',
|
type: 'scrape',
|
||||||
url: url,
|
url: url,
|
||||||
|
formats: formats,
|
||||||
},
|
},
|
||||||
recording: { workflow: [] },
|
recording: { workflow: [] },
|
||||||
google_sheet_email: null,
|
google_sheet_email: null,
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ import { WorkflowFile } from "maxun-core";
|
|||||||
import { Page } from "playwright";
|
import { Page } from "playwright";
|
||||||
import { sendWebhook } from "../../routes/webhook";
|
import { sendWebhook } from "../../routes/webhook";
|
||||||
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
|
import { airtableUpdateTasks, processAirtableUpdates } from "../integrations/airtable";
|
||||||
import { convertPageToMarkdown } from "../../markdownify/scrape";
|
import { convertPageToMarkdown, convertPageToHTML } from "../../markdownify/scrape";
|
||||||
chromium.use(stealthPlugin());
|
chromium.use(stealthPlugin());
|
||||||
|
|
||||||
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
async function createWorkflowAndStoreMetadata(id: string, userId: string) {
|
||||||
@@ -208,12 +208,14 @@ async function executeRun(id: string, userId: string) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
if (recording.recording_meta.type === 'markdown') {
|
if (recording.recording_meta.type === 'scrape') {
|
||||||
logger.log('info', `Executing markdown robot for scheduled run ${id}`);
|
logger.log('info', `Executing scrape robot for scheduled run ${id}`);
|
||||||
|
|
||||||
|
const formats = recording.recording_meta.formats || ['markdown'];
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'running',
|
status: 'running',
|
||||||
log: 'Converting page to markdown'
|
log: `Converting page to: ${formats.join(', ')}`
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -226,9 +228,15 @@ async function executeRun(id: string, userId: string) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
|
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-started', runStartedData);
|
||||||
logger.log('info', `Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`);
|
logger.log(
|
||||||
|
'info',
|
||||||
|
`Markdown robot run started notification sent for run: ${plainRun.runId} to user-${userId}`
|
||||||
|
);
|
||||||
} catch (socketError: any) {
|
} catch (socketError: any) {
|
||||||
logger.log('warn', `Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`);
|
logger.log(
|
||||||
|
'warn',
|
||||||
|
`Failed to send run-started notification for markdown robot run ${plainRun.runId}: ${socketError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -238,20 +246,33 @@ async function executeRun(id: string, userId: string) {
|
|||||||
throw new Error('No URL specified for markdown robot');
|
throw new Error('No URL specified for markdown robot');
|
||||||
}
|
}
|
||||||
|
|
||||||
const markdown = await convertPageToMarkdown(url);
|
let markdown = '';
|
||||||
|
let html = '';
|
||||||
|
const serializableOutput: any = {};
|
||||||
|
|
||||||
|
// Markdown conversion
|
||||||
|
if (formats.includes('markdown')) {
|
||||||
|
markdown = await convertPageToMarkdown(url);
|
||||||
|
serializableOutput.markdown = [{ content: markdown }];
|
||||||
|
}
|
||||||
|
|
||||||
|
// HTML conversion
|
||||||
|
if (formats.includes('html')) {
|
||||||
|
html = await convertPageToHTML(url);
|
||||||
|
serializableOutput.html = [{ content: html }];
|
||||||
|
}
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'success',
|
status: 'success',
|
||||||
finishedAt: new Date().toLocaleString(),
|
finishedAt: new Date().toLocaleString(),
|
||||||
log: 'Markdown conversion completed successfully',
|
log: `${formats.join(', ')} conversion completed successfully`,
|
||||||
serializableOutput: {
|
serializableOutput,
|
||||||
markdown: [{ content: markdown }]
|
|
||||||
},
|
|
||||||
binaryOutput: {},
|
binaryOutput: {},
|
||||||
});
|
});
|
||||||
|
|
||||||
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
|
logger.log('info', `Markdown robot execution completed for scheduled run ${id}`);
|
||||||
|
|
||||||
|
// Run-completed socket notifications
|
||||||
try {
|
try {
|
||||||
const completionData = {
|
const completionData = {
|
||||||
runId: plainRun.runId,
|
runId: plainRun.runId,
|
||||||
@@ -264,40 +285,53 @@ async function executeRun(id: string, userId: string) {
|
|||||||
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
|
serverIo.of(plainRun.browserId).emit('run-completed', completionData);
|
||||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
|
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', completionData);
|
||||||
} catch (socketError: any) {
|
} catch (socketError: any) {
|
||||||
logger.log('warn', `Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`);
|
logger.log(
|
||||||
|
'warn',
|
||||||
|
`Failed to send run-completed notification for markdown robot run ${id}: ${socketError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
const webhookPayload = {
|
// Webhook payload
|
||||||
|
const webhookPayload: any = {
|
||||||
robot_id: plainRun.robotMetaId,
|
robot_id: plainRun.robotMetaId,
|
||||||
run_id: plainRun.runId,
|
run_id: plainRun.runId,
|
||||||
robot_name: recording.recording_meta.name,
|
robot_name: recording.recording_meta.name,
|
||||||
status: 'success',
|
status: 'success',
|
||||||
started_at: plainRun.startedAt,
|
started_at: plainRun.startedAt,
|
||||||
finished_at: new Date().toLocaleString(),
|
finished_at: new Date().toLocaleString(),
|
||||||
markdown: markdown,
|
|
||||||
metadata: {
|
metadata: {
|
||||||
browser_id: plainRun.browserId,
|
browser_id: plainRun.browserId,
|
||||||
user_id: userId,
|
user_id: userId,
|
||||||
}
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (formats.includes('markdown')) webhookPayload.markdown = markdown;
|
||||||
|
if (formats.includes('html')) webhookPayload.html = html;
|
||||||
|
|
||||||
try {
|
try {
|
||||||
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
await sendWebhook(plainRun.robotMetaId, 'run_completed', webhookPayload);
|
||||||
logger.log('info', `Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`);
|
logger.log(
|
||||||
|
'info',
|
||||||
|
`Webhooks sent successfully for markdown robot scheduled run ${plainRun.runId}`
|
||||||
|
);
|
||||||
} catch (webhookError: any) {
|
} catch (webhookError: any) {
|
||||||
logger.log('warn', `Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`);
|
logger.log(
|
||||||
|
'warn',
|
||||||
|
`Failed to send webhooks for markdown robot run ${plainRun.runId}: ${webhookError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||||
|
|
||||||
return true;
|
return true;
|
||||||
|
|
||||||
} catch (error: any) {
|
} catch (error: any) {
|
||||||
logger.log('error', `Markdown conversion failed for scheduled run ${id}: ${error.message}`);
|
logger.log('error', `${formats.join(', ')} conversion failed for scheduled run ${id}: ${error.message}`);
|
||||||
|
|
||||||
await run.update({
|
await run.update({
|
||||||
status: 'failed',
|
status: 'failed',
|
||||||
finishedAt: new Date().toLocaleString(),
|
finishedAt: new Date().toLocaleString(),
|
||||||
log: `Markdown conversion failed: ${error.message}`,
|
log: `${formats.join(', ')} conversion failed: ${error.message}`,
|
||||||
});
|
});
|
||||||
|
|
||||||
try {
|
try {
|
||||||
@@ -312,7 +346,10 @@ async function executeRun(id: string, userId: string) {
|
|||||||
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
|
serverIo.of(plainRun.browserId).emit('run-completed', failureData);
|
||||||
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
|
serverIo.of('/queued-run').to(`user-${userId}`).emit('run-completed', failureData);
|
||||||
} catch (socketError: any) {
|
} catch (socketError: any) {
|
||||||
logger.log('warn', `Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`);
|
logger.log(
|
||||||
|
'warn',
|
||||||
|
`Failed to send run-failed notification for markdown robot run ${id}: ${socketError.message}`
|
||||||
|
);
|
||||||
}
|
}
|
||||||
|
|
||||||
await destroyRemoteBrowser(plainRun.browserId, userId);
|
await destroyRemoteBrowser(plainRun.browserId, userId);
|
||||||
|
|||||||
@@ -110,7 +110,10 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
|
|||||||
case 'integrate':
|
case 'integrate':
|
||||||
return (
|
return (
|
||||||
<MemoizedTableCell key={column.id} align={column.align}>
|
<MemoizedTableCell key={column.id} align={column.align}>
|
||||||
<MemoizedIntegrateButton handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} />
|
<MemoizedIntegrateButton
|
||||||
|
handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])}
|
||||||
|
robotType={row.type}
|
||||||
|
/>
|
||||||
</MemoizedTableCell>
|
</MemoizedTableCell>
|
||||||
);
|
);
|
||||||
case 'options':
|
case 'options':
|
||||||
@@ -121,6 +124,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
|
|||||||
handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])}
|
handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])}
|
||||||
handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])}
|
handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])}
|
||||||
handleDelete={() => handlers.handleDelete(row.id)}
|
handleDelete={() => handlers.handleDelete(row.id)}
|
||||||
|
robotType={row.type}
|
||||||
/>
|
/>
|
||||||
</MemoizedTableCell>
|
</MemoizedTableCell>
|
||||||
);
|
);
|
||||||
@@ -709,13 +713,22 @@ const ScheduleButton = ({ handleSchedule }: ScheduleButtonProps) => {
|
|||||||
|
|
||||||
interface IntegrateButtonProps {
|
interface IntegrateButtonProps {
|
||||||
handleIntegrate: () => void;
|
handleIntegrate: () => void;
|
||||||
|
robotType: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
const IntegrateButton = ({ handleIntegrate }: IntegrateButtonProps) => {
|
const IntegrateButton = ({ handleIntegrate, robotType }: IntegrateButtonProps) => {
|
||||||
|
const isDisabled = robotType === 'scrape';
|
||||||
|
|
||||||
return (
|
return (
|
||||||
<IconButton aria-label="add" size="small" onClick={() => {
|
<IconButton
|
||||||
handleIntegrate();
|
aria-label="integrate"
|
||||||
}}
|
size="small"
|
||||||
|
onClick={isDisabled ? undefined : handleIntegrate}
|
||||||
|
disabled={isDisabled}
|
||||||
|
sx={{
|
||||||
|
opacity: isDisabled ? 0.4 : 1,
|
||||||
|
cursor: isDisabled ? 'not-allowed' : 'pointer',
|
||||||
|
}}
|
||||||
>
|
>
|
||||||
<Power />
|
<Power />
|
||||||
</IconButton>
|
</IconButton>
|
||||||
@@ -742,9 +755,10 @@ interface OptionsButtonProps {
|
|||||||
handleEdit: () => void;
|
handleEdit: () => void;
|
||||||
handleDelete: () => void;
|
handleDelete: () => void;
|
||||||
handleDuplicate: () => void;
|
handleDuplicate: () => void;
|
||||||
|
robotType: string;
|
||||||
}
|
}
|
||||||
|
|
||||||
const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate }: OptionsButtonProps) => {
|
const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate, robotType }: OptionsButtonProps) => {
|
||||||
const [anchorEl, setAnchorEl] = React.useState<null | HTMLElement>(null);
|
const [anchorEl, setAnchorEl] = React.useState<null | HTMLElement>(null);
|
||||||
|
|
||||||
const handleClick = (event: React.MouseEvent<HTMLElement>) => {
|
const handleClick = (event: React.MouseEvent<HTMLElement>) => {
|
||||||
@@ -771,34 +785,33 @@ const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicat
|
|||||||
open={Boolean(anchorEl)}
|
open={Boolean(anchorEl)}
|
||||||
onClose={handleClose}
|
onClose={handleClose}
|
||||||
>
|
>
|
||||||
<MenuItem onClick={() => { handleRetrain(); handleClose(); }}>
|
{robotType !== 'scrape' && (
|
||||||
<ListItemIcon>
|
<MenuItem onClick={() => { handleRetrain(); handleClose(); }}>
|
||||||
<Refresh fontSize="small" />
|
<ListItemIcon>
|
||||||
</ListItemIcon>
|
<Refresh fontSize="small" />
|
||||||
<ListItemText>{t('recordingtable.retrain')}</ListItemText>
|
</ListItemIcon>
|
||||||
</MenuItem>
|
<ListItemText>Retrain</ListItemText>
|
||||||
|
</MenuItem>
|
||||||
|
)}
|
||||||
|
|
||||||
<MenuItem onClick={() => { handleEdit(); handleClose(); }}>
|
<MenuItem onClick={() => { handleEdit(); handleClose(); }}>
|
||||||
<ListItemIcon>
|
<ListItemIcon><Edit fontSize="small" /></ListItemIcon>
|
||||||
<Edit fontSize="small" />
|
<ListItemText>Edit</ListItemText>
|
||||||
</ListItemIcon>
|
|
||||||
<ListItemText>{t('recordingtable.edit')}</ListItemText>
|
|
||||||
</MenuItem>
|
</MenuItem>
|
||||||
|
|
||||||
<MenuItem onClick={() => { handleDelete(); handleClose(); }}>
|
<MenuItem onClick={() => { handleDelete(); handleClose(); }}>
|
||||||
<ListItemIcon>
|
<ListItemIcon><DeleteForever fontSize="small" /></ListItemIcon>
|
||||||
<DeleteForever fontSize="small" />
|
<ListItemText>Delete</ListItemText>
|
||||||
</ListItemIcon>
|
|
||||||
<ListItemText>{t('recordingtable.delete')}</ListItemText>
|
|
||||||
</MenuItem>
|
</MenuItem>
|
||||||
|
|
||||||
<MenuItem onClick={() => { handleDuplicate(); handleClose(); }}>
|
{robotType !== 'scrape' && (
|
||||||
<ListItemIcon>
|
<MenuItem onClick={() => { handleDuplicate(); handleClose(); }}>
|
||||||
<ContentCopy fontSize="small" />
|
<ListItemIcon><ContentCopy fontSize="small" /></ListItemIcon>
|
||||||
</ListItemIcon>
|
<ListItemText>Duplicate</ListItemText>
|
||||||
<ListItemText>{t('recordingtable.duplicate')}</ListItemText>
|
</MenuItem>
|
||||||
</MenuItem>
|
)}
|
||||||
</Menu>
|
</Menu>
|
||||||
|
|
||||||
</>
|
</>
|
||||||
);
|
);
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -15,12 +15,16 @@ import {
|
|||||||
Container,
|
Container,
|
||||||
CardContent,
|
CardContent,
|
||||||
Tabs,
|
Tabs,
|
||||||
Tab
|
Tab,
|
||||||
|
RadioGroup,
|
||||||
|
Radio,
|
||||||
|
FormControl,
|
||||||
|
FormLabel
|
||||||
} from '@mui/material';
|
} from '@mui/material';
|
||||||
import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material';
|
import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material';
|
||||||
import { useGlobalInfoStore } from '../../../context/globalInfo';
|
import { useGlobalInfoStore } from '../../../context/globalInfo';
|
||||||
import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
|
import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
|
||||||
import { createMarkdownRobot } from "../../../api/storage";
|
import { createScrapeRobot } from "../../../api/storage";
|
||||||
import { AuthContext } from '../../../context/auth';
|
import { AuthContext } from '../../../context/auth';
|
||||||
import { GenericModal } from '../../ui/GenericModal';
|
import { GenericModal } from '../../ui/GenericModal';
|
||||||
|
|
||||||
@@ -54,11 +58,12 @@ const RobotCreate: React.FC = () => {
|
|||||||
|
|
||||||
const [tabValue, setTabValue] = useState(0);
|
const [tabValue, setTabValue] = useState(0);
|
||||||
const [url, setUrl] = useState('');
|
const [url, setUrl] = useState('');
|
||||||
const [markdownRobotName, setMarkdownRobotName] = useState('');
|
const [scrapeRobotName, setScrapeRobotName] = useState('');
|
||||||
const [needsLogin, setNeedsLogin] = useState(false);
|
const [needsLogin, setNeedsLogin] = useState(false);
|
||||||
const [isLoading, setIsLoading] = useState(false);
|
const [isLoading, setIsLoading] = useState(false);
|
||||||
const [isWarningModalOpen, setWarningModalOpen] = useState(false);
|
const [isWarningModalOpen, setWarningModalOpen] = useState(false);
|
||||||
const [activeBrowserId, setActiveBrowserId] = useState('');
|
const [activeBrowserId, setActiveBrowserId] = useState('');
|
||||||
|
const [outputFormats, setOutputFormats] = useState<string[]>([]);
|
||||||
|
|
||||||
const { state } = React.useContext(AuthContext);
|
const { state } = React.useContext(AuthContext);
|
||||||
const { user } = state;
|
const { user } = state;
|
||||||
@@ -200,7 +205,7 @@ const RobotCreate: React.FC = () => {
|
|||||||
}}
|
}}
|
||||||
>
|
>
|
||||||
<Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
|
<Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
|
||||||
<Tab label="Markdown" id="markdown-robot" aria-controls="markdown-robot" />
|
<Tab label="Scrape" id="scrape-robot" aria-controls="scrape-robot" />
|
||||||
</Tabs>
|
</Tabs>
|
||||||
</Box>
|
</Box>
|
||||||
|
|
||||||
@@ -370,7 +375,7 @@ const RobotCreate: React.FC = () => {
|
|||||||
/>
|
/>
|
||||||
|
|
||||||
<Typography variant="body2" color="text.secondary" mb={3}>
|
<Typography variant="body2" color="text.secondary" mb={3}>
|
||||||
Turn websites into LLM-ready Markdown content for AI apps.
|
Turn websites into LLM-ready Markdown or clean HTML content for AI apps.
|
||||||
</Typography>
|
</Typography>
|
||||||
|
|
||||||
<Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
|
<Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
|
||||||
@@ -378,8 +383,8 @@ const RobotCreate: React.FC = () => {
|
|||||||
placeholder="Example: YC Companies Scraper"
|
placeholder="Example: YC Companies Scraper"
|
||||||
variant="outlined"
|
variant="outlined"
|
||||||
fullWidth
|
fullWidth
|
||||||
value={markdownRobotName}
|
value={scrapeRobotName}
|
||||||
onChange={(e) => setMarkdownRobotName(e.target.value)}
|
onChange={(e) => setScrapeRobotName(e.target.value)}
|
||||||
sx={{ mb: 2 }}
|
sx={{ mb: 2 }}
|
||||||
label="Robot Name"
|
label="Robot Name"
|
||||||
/>
|
/>
|
||||||
@@ -390,7 +395,44 @@ const RobotCreate: React.FC = () => {
|
|||||||
value={url}
|
value={url}
|
||||||
onChange={(e) => setUrl(e.target.value)}
|
onChange={(e) => setUrl(e.target.value)}
|
||||||
label="Website URL"
|
label="Website URL"
|
||||||
|
sx={{ mb: 2 }}
|
||||||
/>
|
/>
|
||||||
|
|
||||||
|
<FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
|
||||||
|
<FormLabel component="legend" sx={{ mb: 1 }}>Output Format (Select at least one)</FormLabel>
|
||||||
|
|
||||||
|
<FormControlLabel
|
||||||
|
control={
|
||||||
|
<Checkbox
|
||||||
|
checked={outputFormats.includes('markdown')}
|
||||||
|
onChange={(e) => {
|
||||||
|
if (e.target.checked) {
|
||||||
|
setOutputFormats([...outputFormats, 'markdown']);
|
||||||
|
} else {
|
||||||
|
setOutputFormats(outputFormats.filter(f => f !== 'markdown'));
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
}
|
||||||
|
label="Markdown"
|
||||||
|
/>
|
||||||
|
|
||||||
|
<FormControlLabel
|
||||||
|
control={
|
||||||
|
<Checkbox
|
||||||
|
checked={outputFormats.includes('html')}
|
||||||
|
onChange={(e) => {
|
||||||
|
if (e.target.checked) {
|
||||||
|
setOutputFormats([...outputFormats, 'html']);
|
||||||
|
} else {
|
||||||
|
setOutputFormats(outputFormats.filter(f => f !== 'html'));
|
||||||
|
}
|
||||||
|
}}
|
||||||
|
/>
|
||||||
|
}
|
||||||
|
label="HTML"
|
||||||
|
/>
|
||||||
|
</FormControl>
|
||||||
</Box>
|
</Box>
|
||||||
|
|
||||||
<Button
|
<Button
|
||||||
@@ -401,23 +443,28 @@ const RobotCreate: React.FC = () => {
|
|||||||
notify('error', 'Please enter a valid URL');
|
notify('error', 'Please enter a valid URL');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (!markdownRobotName.trim()) {
|
if (!scrapeRobotName.trim()) {
|
||||||
notify('error', 'Please enter a robot name');
|
notify('error', 'Please enter a robot name');
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
if (outputFormats.length === 0) {
|
||||||
|
notify('error', 'Please select at least one output format');
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
setIsLoading(true);
|
setIsLoading(true);
|
||||||
const result = await createMarkdownRobot(url, markdownRobotName);
|
const result = await createScrapeRobot(url, scrapeRobotName, outputFormats);
|
||||||
setIsLoading(false);
|
setIsLoading(false);
|
||||||
|
|
||||||
if (result) {
|
if (result) {
|
||||||
setRerenderRobots(true);
|
setRerenderRobots(true);
|
||||||
notify('success', `${markdownRobotName} created successfully!`);
|
notify('success', `${scrapeRobotName} created successfully!`);
|
||||||
navigate('/robots');
|
navigate('/robots');
|
||||||
} else {
|
} else {
|
||||||
notify('error', 'Failed to create markdown robot');
|
notify('error', 'Failed to create markdown robot');
|
||||||
}
|
}
|
||||||
}}
|
}}
|
||||||
disabled={!url.trim() || !markdownRobotName.trim() || isLoading}
|
disabled={!url.trim() || !scrapeRobotName.trim() || outputFormats.length === 0 || isLoading}
|
||||||
sx={{
|
sx={{
|
||||||
bgcolor: '#ff00c3',
|
bgcolor: '#ff00c3',
|
||||||
py: 1.4,
|
py: 1.4,
|
||||||
@@ -428,7 +475,10 @@ const RobotCreate: React.FC = () => {
|
|||||||
}}
|
}}
|
||||||
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
|
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
|
||||||
>
|
>
|
||||||
{isLoading ? 'Turning...' : 'Turn to Markdown'}
|
{isLoading
|
||||||
|
? "Creating..."
|
||||||
|
: `Create ${outputFormats.join(" + ").toUpperCase()} Robot`
|
||||||
|
}
|
||||||
</Button>
|
</Button>
|
||||||
</Box>
|
</Box>
|
||||||
</Card>
|
</Card>
|
||||||
|
|||||||
@@ -24,8 +24,9 @@ interface RobotMeta {
|
|||||||
pairs: number;
|
pairs: number;
|
||||||
updatedAt: string;
|
updatedAt: string;
|
||||||
params: any[];
|
params: any[];
|
||||||
type?: 'traditional' | 'markdown';
|
type?: 'extract' | 'scrape';
|
||||||
url?: string;
|
url?: string;
|
||||||
|
formats?: ('markdown' | 'html')[];
|
||||||
}
|
}
|
||||||
|
|
||||||
interface RobotWorkflow {
|
interface RobotWorkflow {
|
||||||
|
|||||||
@@ -24,8 +24,9 @@ interface RobotMeta {
|
|||||||
pairs: number;
|
pairs: number;
|
||||||
updatedAt: string;
|
updatedAt: string;
|
||||||
params: any[];
|
params: any[];
|
||||||
type?: 'traditional' | 'markdown';
|
type?: 'extract' | 'scrape';
|
||||||
url?: string;
|
url?: string;
|
||||||
|
formats?: ('markdown' | 'html')[];
|
||||||
}
|
}
|
||||||
|
|
||||||
interface RobotWorkflow {
|
interface RobotWorkflow {
|
||||||
|
|||||||
@@ -16,8 +16,9 @@ interface RobotMeta {
|
|||||||
pairs: number;
|
pairs: number;
|
||||||
updatedAt: string;
|
updatedAt: string;
|
||||||
params: any[];
|
params: any[];
|
||||||
type?: 'traditional' | 'markdown';
|
type?: 'extract' | 'scrape';
|
||||||
url?: string;
|
url?: string;
|
||||||
|
formats?: ('markdown' | 'html')[];
|
||||||
}
|
}
|
||||||
|
|
||||||
interface RobotWorkflow {
|
interface RobotWorkflow {
|
||||||
|
|||||||
@@ -27,8 +27,9 @@ interface RobotMeta {
|
|||||||
pairs: number;
|
pairs: number;
|
||||||
updatedAt: string;
|
updatedAt: string;
|
||||||
params: any[];
|
params: any[];
|
||||||
type?: 'traditional' | 'markdown';
|
type?: 'extract' | 'scrape';
|
||||||
url?: string;
|
url?: string;
|
||||||
|
formats?: ('markdown' | 'html')[];
|
||||||
}
|
}
|
||||||
|
|
||||||
interface RobotWorkflow {
|
interface RobotWorkflow {
|
||||||
|
|||||||
Reference in New Issue
Block a user