feat: add server-side crawl and search logic

This commit is contained in:
Rohit Rajan
2026-01-02 15:46:10 +05:30
parent 3689eb96bd
commit 9898dc410d
15 changed files with 1577 additions and 124 deletions

View File

@@ -251,21 +251,18 @@ function handleWorkflowActions(workflow: any[], credentials: Credentials) {
router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, res) => {
try {
const { id } = req.params;
const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body;
const { name, limits, credentials, targetUrl, workflow: incomingWorkflow } = req.body;
// Validate input
if (!name && !limits && !credentials && !targetUrl) {
if (!name && !limits && !credentials && !targetUrl && !incomingWorkflow) {
return res.status(400).json({ error: 'Either "name", "limits", "credentials" or "target_url" must be provided.' });
}
// Fetch the robot by ID
const robot = await Robot.findOne({ where: { 'recording_meta.id': id } });
if (!robot) {
return res.status(404).json({ error: 'Robot not found.' });
}
// Update fields if provided
if (name) {
robot.set('recording_meta', { ...robot.recording_meta, name });
}
@@ -274,7 +271,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
robot.set('recording_meta', { ...robot.recording_meta, url: targetUrl });
const updatedWorkflow = [...robot.recording.workflow];
let foundGoto = false;
for (let i = updatedWorkflow.length - 1; i >= 0; i--) {
const step = updatedWorkflow[i];
@@ -289,7 +285,6 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
robot.set('recording', { ...robot.recording, workflow: updatedWorkflow });
robot.changed('recording', true);
foundGoto = true;
i = -1;
break;
}
@@ -299,10 +294,9 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
await robot.save();
// Start with existing workflow or allow client to supply a full workflow replacement
let workflow = incomingWorkflow && Array.isArray(incomingWorkflow)
? JSON.parse(JSON.stringify(incomingWorkflow))
: [...robot.recording.workflow]; // Create a copy of the workflow
: [...robot.recording.workflow];
if (credentials) {
workflow = handleWorkflowActions(workflow, credentials);
@@ -344,7 +338,7 @@ router.put('/recordings/:id', requireSignIn, async (req: AuthenticatedRequest, r
where: { 'recording_meta.id': id }
});
const updatedRobot = await Robot.findOne({ where: { 'recording_meta.id': id } });
await Robot.findOne({ where: { 'recording_meta.id': id } });
logger.log('info', `Robot with ID ${id} was updated successfully.`);
@@ -1322,4 +1316,198 @@ export async function recoverOrphanedRuns() {
}
}
/**
 * POST endpoint for creating a crawl robot.
 *
 * Request body:
 *  - `url` (required): absolute http(s) URL to crawl.
 *  - `crawlConfig` (required): object passed verbatim as the single argument
 *    of the generated `crawl` action.
 *  - `name` (optional): robot name; defaults to `Crawl Robot - <hostname>`.
 *
 * Responses:
 *  - 201 with `{ message, robot }` when the robot was created.
 *  - 400 when `url` / `crawlConfig` are missing or malformed.
 *  - 401 when `req.user` is not set.
 *  - 500 with the error message when creation throws.
 *
 * @route POST /recordings/crawl
 * @auth requireSignIn - JWT authentication required
 */
router.post('/recordings/crawl', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    const { url, name, crawlConfig } = req.body;

    if (!url || !crawlConfig) {
      return res.status(400).json({ error: 'URL and crawl configuration are required.' });
    }

    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    // The config is embedded as the argument of the `crawl` action, so reject
    // primitives and arrays instead of persisting a robot with a bogus config.
    if (typeof crawlConfig !== 'object' || Array.isArray(crawlConfig)) {
      return res.status(400).json({ error: 'Crawl configuration must be an object.' });
    }

    // Parse once and reuse the result. Non-string values are rejected up front:
    // `new URL()` stringifies its argument, so e.g. `["https://a.b"]` would
    // otherwise pass validation and be stored as an array.
    let parsedUrl: URL | null = null;
    if (typeof url === 'string') {
      try {
        parsedUrl = new URL(url);
      } catch (err) {
        parsedUrl = null;
      }
    }
    if (!parsedUrl) {
      return res.status(400).json({ error: 'Invalid URL format' });
    }

    // The URL ends up in a `goto` action; only allow web schemes so that
    // `file:`, `javascript:`, `data:` etc. cannot be smuggled into a workflow.
    if (parsedUrl.protocol !== 'http:' && parsedUrl.protocol !== 'https:') {
      return res.status(400).json({ error: 'Only http and https URLs are supported.' });
    }

    const robotName = name || `Crawl Robot - ${parsedUrl.hostname}`;
    const currentTimestamp = new Date().toLocaleString('en-US');
    // `recording_meta.id` is distinct from the row's primary `id` generated below.
    const robotId = uuid();

    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        // NOTE(review): the workflow below contains two pairs while `pairs` is 1 —
        // confirm what this field is expected to count.
        pairs: 1,
        params: [],
        type: 'crawl',
        url: url,
      },
      recording: {
        workflow: [
          {
            // Pair keyed on the target URL: flag the run, then crawl.
            where: { url },
            what: [
              { action: 'flag', args: ['generated'] },
              {
                action: 'crawl',
                args: [crawlConfig],
                name: 'Crawl'
              }
            ]
          },
          {
            // Pair keyed on the blank page: navigate to the target URL and
            // wait for the network to go idle.
            where: { url: 'about:blank' },
            what: [
              {
                action: 'goto',
                args: [url]
              },
              {
                action: 'waitForLoadState',
                args: ['networkidle']
              }
            ]
          }
        ]
      },
      // No integrations, schedule or webhooks are configured on creation.
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      airtable_base_id: null,
      airtable_base_name: null,
      airtable_table_name: null,
      airtable_table_id: null,
      airtable_access_token: null,
      airtable_refresh_token: null,
      schedule: null,
      webhooks: null
    });

    logger.log('info', `Crawl robot created with id: ${newRobot.id}`);

    capture('maxun-oss-robot-created', {
      userId: req.user.id.toString(),
      robotId: robotId,
      robotName: robotName,
      url: url,
      robotType: 'crawl',
      crawlConfig: crawlConfig
    });

    return res.status(201).json({
      message: 'Crawl robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating crawl robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating crawl robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
/**
 * POST endpoint for creating a search robot.
 *
 * Request body:
 *  - `searchConfig` (required): object with a non-empty string `query`; it is
 *    passed verbatim as the single argument of the generated `search` action.
 *  - `name` (optional): robot name; defaults to `Search Robot - <first 50
 *    characters of the query>`.
 *
 * Responses:
 *  - 201 with `{ message, robot }` when the robot was created.
 *  - 400 when `searchConfig` or a usable `query` is missing.
 *  - 401 when `req.user` is not set.
 *  - 500 with the error message when creation throws.
 *
 * @route POST /recordings/search
 * @auth requireSignIn - JWT authentication required
 */
router.post('/recordings/search', requireSignIn, async (req: AuthenticatedRequest, res) => {
  try {
    const { searchConfig, name } = req.body;

    // `query` must be a real, non-blank string: a truthy non-string (number,
    // object, array) would otherwise reach `.substring()` below and surface as
    // a 500 instead of a client error.
    if (
      !searchConfig ||
      typeof searchConfig.query !== 'string' ||
      searchConfig.query.trim() === ''
    ) {
      return res.status(400).json({ error: 'Search configuration with query is required.' });
    }

    if (!req.user) {
      return res.status(401).send({ error: 'Unauthorized' });
    }

    // Keep the default name short by using only the start of the query.
    const robotName = name || `Search Robot - ${searchConfig.query.substring(0, 50)}`;
    const currentTimestamp = new Date().toLocaleString('en-US');
    // `recording_meta.id` is distinct from the row's primary `id` generated below.
    const robotId = uuid();

    const newRobot = await Robot.create({
      id: uuid(),
      userId: req.user.id,
      recording_meta: {
        name: robotName,
        id: robotId,
        createdAt: currentTimestamp,
        updatedAt: currentTimestamp,
        pairs: 1,
        params: [],
        type: 'search',
      },
      recording: {
        workflow: [
          {
            // Single pair keyed on the blank page that runs the search action.
            where: { url: 'about:blank' },
            what: [{
              action: 'search',
              args: [searchConfig],
              name: 'Search'
            }]
          }
        ]
      },
      // No integrations, schedule or webhooks are configured on creation.
      google_sheet_email: null,
      google_sheet_name: null,
      google_sheet_id: null,
      google_access_token: null,
      google_refresh_token: null,
      airtable_base_id: null,
      airtable_base_name: null,
      airtable_table_name: null,
      airtable_table_id: null,
      airtable_access_token: null,
      airtable_refresh_token: null,
      schedule: null,
      webhooks: null
    });

    logger.log('info', `Search robot created with id: ${newRobot.id}`);

    capture('maxun-oss-robot-created', {
      userId: req.user.id.toString(),
      robotId: robotId,
      robotName: robotName,
      robotType: 'search',
      searchQuery: searchConfig.query,
      // Values reported when the client did not specify a provider / limit.
      searchProvider: searchConfig.provider || 'duckduckgo',
      searchLimit: searchConfig.limit || 10
    });

    return res.status(201).json({
      message: 'Search robot created successfully.',
      robot: newRobot,
    });
  } catch (error) {
    if (error instanceof Error) {
      logger.log('error', `Error creating search robot: ${error.message}`);
      return res.status(500).json({ error: error.message });
    } else {
      logger.log('error', 'Unknown error creating search robot');
      return res.status(500).json({ error: 'An unknown error occurred.' });
    }
  }
});
export { processQueuedRuns };