Merge pull request #946 from getmaxun/auto-search
feat: add auto search logic
This commit is contained in:
@@ -870,24 +870,46 @@ router.post("/sdk/search", requireAPIKey, async (req: AuthenticatedRequest, res:
|
|||||||
/**
|
/**
|
||||||
* LLM-based extraction - generate workflow from natural language prompt
|
* LLM-based extraction - generate workflow from natural language prompt
|
||||||
* POST /api/sdk/extract/llm
|
* POST /api/sdk/extract/llm
|
||||||
|
* URL is optional - if not provided, the system will search for the target website based on the prompt
|
||||||
*/
|
*/
|
||||||
router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
|
router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest, res: Response) => {
|
||||||
try {
|
try {
|
||||||
const user = req.user
|
const user = req.user
|
||||||
const { url, prompt, llmProvider, llmModel, llmApiKey, llmBaseUrl, robotName } = req.body;
|
const { url, prompt, llmProvider, llmModel, llmApiKey, llmBaseUrl, robotName } = req.body;
|
||||||
|
|
||||||
if (!url || !prompt) {
|
if (!prompt) {
|
||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
error: "URL and prompt are required"
|
error: "Prompt is required"
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
const workflowResult = await WorkflowEnricher.generateWorkflowFromPrompt(url, prompt, user.id, {
|
if (url) {
|
||||||
|
try {
|
||||||
|
new URL(url);
|
||||||
|
} catch (err) {
|
||||||
|
return res.status(400).json({
|
||||||
|
error: "Invalid URL format"
|
||||||
|
});
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const llmConfig = {
|
||||||
provider: llmProvider,
|
provider: llmProvider,
|
||||||
model: llmModel,
|
model: llmModel,
|
||||||
apiKey: llmApiKey,
|
apiKey: llmApiKey,
|
||||||
baseUrl: llmBaseUrl
|
baseUrl: llmBaseUrl
|
||||||
});
|
};
|
||||||
|
|
||||||
|
let workflowResult: any;
|
||||||
|
let finalUrl: string;
|
||||||
|
|
||||||
|
if (url) {
|
||||||
|
workflowResult = await WorkflowEnricher.generateWorkflowFromPrompt(url, prompt, user.id, llmConfig);
|
||||||
|
finalUrl = workflowResult.url || url;
|
||||||
|
} else {
|
||||||
|
workflowResult = await WorkflowEnricher.generateWorkflowFromPromptWithSearch(prompt, user.id, llmConfig);
|
||||||
|
finalUrl = workflowResult.url || '';
|
||||||
|
}
|
||||||
|
|
||||||
if (!workflowResult.success || !workflowResult.workflow) {
|
if (!workflowResult.success || !workflowResult.workflow) {
|
||||||
return res.status(400).json({
|
return res.status(400).json({
|
||||||
@@ -907,8 +929,8 @@ router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest,
|
|||||||
pairs: workflowResult.workflow.length,
|
pairs: workflowResult.workflow.length,
|
||||||
params: [],
|
params: [],
|
||||||
type: 'extract',
|
type: 'extract',
|
||||||
url: workflowResult.url,
|
url: finalUrl,
|
||||||
isLLM: true,
|
isLLM: true
|
||||||
};
|
};
|
||||||
|
|
||||||
const robot = await Robot.create({
|
const robot = await Robot.create({
|
||||||
@@ -925,7 +947,7 @@ router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest,
|
|||||||
capture("maxun-oss-llm-robot-created", {
|
capture("maxun-oss-llm-robot-created", {
|
||||||
robot_meta: robot.recording_meta,
|
robot_meta: robot.recording_meta,
|
||||||
recording: robot.recording,
|
recording: robot.recording,
|
||||||
prompt: prompt,
|
prompt: prompt
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(200).json({
|
return res.status(200).json({
|
||||||
@@ -934,7 +956,7 @@ router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest,
|
|||||||
robotId: metaId,
|
robotId: metaId,
|
||||||
name: robotMeta.name,
|
name: robotMeta.name,
|
||||||
description: prompt,
|
description: prompt,
|
||||||
url: workflowResult.url,
|
url: finalUrl,
|
||||||
workflow: workflowResult.workflow
|
workflow: workflowResult.workflow
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
|
|||||||
@@ -442,33 +442,51 @@ router.post('/recordings/scrape', requireSignIn, async (req: AuthenticatedReques
|
|||||||
|
|
||||||
/**
|
/**
|
||||||
* POST endpoint for creating an LLM-powered extraction robot
|
* POST endpoint for creating an LLM-powered extraction robot
|
||||||
|
* URL is optional - if not provided, the system will search for the target website based on the prompt
|
||||||
*/
|
*/
|
||||||
router.post('/recordings/llm', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
router.post('/recordings/llm', requireSignIn, async (req: AuthenticatedRequest, res) => {
|
||||||
try {
|
try {
|
||||||
const { url, prompt, llmProvider, llmModel, llmApiKey, llmBaseUrl, robotName } = req.body;
|
const { url, prompt, llmProvider, llmModel, llmApiKey, llmBaseUrl, robotName } = req.body;
|
||||||
|
|
||||||
if (!url || !prompt) {
|
if (!prompt) {
|
||||||
return res.status(400).json({ error: 'Both "url" and "prompt" fields are required.' });
|
return res.status(400).json({ error: 'The "prompt" field is required.' });
|
||||||
}
|
}
|
||||||
|
|
||||||
if (!req.user) {
|
if (!req.user) {
|
||||||
return res.status(401).send({ error: 'Unauthorized' });
|
return res.status(401).send({ error: 'Unauthorized' });
|
||||||
}
|
}
|
||||||
|
|
||||||
try {
|
// Validate URL format if provided
|
||||||
new URL(url);
|
if (url) {
|
||||||
} catch (err) {
|
try {
|
||||||
return res.status(400).json({ error: 'Invalid URL format' });
|
new URL(url);
|
||||||
|
} catch (err) {
|
||||||
|
return res.status(400).json({ error: 'Invalid URL format' });
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
logger.log('info', `Starting LLM workflow generation for URL: ${url}`);
|
let workflowResult: any;
|
||||||
|
let finalUrl: string;
|
||||||
|
|
||||||
const workflowResult = await WorkflowEnricher.generateWorkflowFromPrompt(url, prompt, req.user.id, {
|
const llmConfig = {
|
||||||
provider: llmProvider || 'ollama',
|
provider: llmProvider || 'ollama',
|
||||||
model: llmModel,
|
model: llmModel,
|
||||||
apiKey: llmApiKey,
|
apiKey: llmApiKey,
|
||||||
baseUrl: llmBaseUrl
|
baseUrl: llmBaseUrl
|
||||||
});
|
};
|
||||||
|
|
||||||
|
if (url) {
|
||||||
|
logger.log('info', `Starting LLM workflow generation for provided URL: ${url}`);
|
||||||
|
workflowResult = await WorkflowEnricher.generateWorkflowFromPrompt(url, prompt, req.user.id, llmConfig);
|
||||||
|
finalUrl = workflowResult.url || url;
|
||||||
|
} else {
|
||||||
|
logger.log('info', `Starting LLM workflow generation with automatic URL detection for prompt: "${prompt}"`);
|
||||||
|
workflowResult = await WorkflowEnricher.generateWorkflowFromPromptWithSearch(prompt, req.user.id, llmConfig);
|
||||||
|
finalUrl = workflowResult.url || '';
|
||||||
|
if (finalUrl) {
|
||||||
|
logger.log('info', `Auto-detected URL: ${finalUrl}`);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
if (!workflowResult.success || !workflowResult.workflow) {
|
if (!workflowResult.success || !workflowResult.workflow) {
|
||||||
logger.log('error', `Failed to generate workflow: ${JSON.stringify(workflowResult.errors)}`);
|
logger.log('error', `Failed to generate workflow: ${JSON.stringify(workflowResult.errors)}`);
|
||||||
@@ -493,7 +511,7 @@ router.post('/recordings/llm', requireSignIn, async (req: AuthenticatedRequest,
|
|||||||
pairs: workflowResult.workflow.length,
|
pairs: workflowResult.workflow.length,
|
||||||
params: [],
|
params: [],
|
||||||
type: 'extract',
|
type: 'extract',
|
||||||
url: workflowResult.url || url,
|
url: finalUrl,
|
||||||
isLLM: true,
|
isLLM: true,
|
||||||
},
|
},
|
||||||
recording: { workflow: workflowResult.workflow },
|
recording: { workflow: workflowResult.workflow },
|
||||||
@@ -511,6 +529,7 @@ router.post('/recordings/llm', requireSignIn, async (req: AuthenticatedRequest,
|
|||||||
recording: newRobot.recording,
|
recording: newRobot.recording,
|
||||||
llm_provider: llmProvider || 'ollama',
|
llm_provider: llmProvider || 'ollama',
|
||||||
prompt: prompt,
|
prompt: prompt,
|
||||||
|
urlAutoDetected: !url,
|
||||||
});
|
});
|
||||||
|
|
||||||
return res.status(201).json({
|
return res.status(201).json({
|
||||||
|
|||||||
@@ -1529,4 +1529,521 @@ Return ONLY the list name, nothing else:`;
|
|||||||
|
|
||||||
return workflow;
|
return workflow;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/**
 * Generate a workflow from a prompt when the caller did not supply a URL.
 *
 * Steps: open a remote browser for the user, derive a search query from the
 * prompt, search DuckDuckGo, choose a result URL, load it, capture a
 * screenshot plus the page HTML and element groups, ask the LLM for an
 * extraction decision and build the workflow from that decision.
 *
 * Never throws: every failure is reported as `{ success: false, errors }`.
 * The validator and the remote browser are released in `finally`, so they are
 * cleaned up on every exit path; a cleanup failure is only logged and does not
 * turn an already-built workflow into an error.
 *
 * @param userPrompt Natural-language description of what to extract.
 * @param userId Owner of the remote browser session.
 * @param llmConfig Optional provider/model/credentials forwarded to the LLM helpers.
 * @returns On success the workflow and the URL it was generated for; otherwise the error messages.
 */
static async generateWorkflowFromPromptWithSearch(
  userPrompt: string,
  userId: string,
  llmConfig?: {
    provider?: 'anthropic' | 'openai' | 'ollama';
    model?: string;
    apiKey?: string;
    baseUrl?: string;
  }
): Promise<{
  success: boolean;
  workflow?: any[];
  url?: string;
  errors?: string[];
}> {
  let browserId: string | null = null;
  let validator: SelectorValidator | null = null;

  try {
    const { browserId: id, page } = await createRemoteBrowserForValidation(userId);
    browserId = id;

    const intent = await this.parseSearchIntent(userPrompt, llmConfig);

    const searchResults = await this.performDuckDuckGoSearch(intent.searchQuery, page);
    if (searchResults.length === 0) {
      return {
        success: false,
        errors: [`No search results found for query: "${intent.searchQuery}". Please provide a URL manually or refine your prompt.`]
      };
    }

    const selection = await this.selectBestUrlFromResults(searchResults, userPrompt, llmConfig);

    await page.goto(selection.url, { waitUntil: 'networkidle', timeout: 30000 });
    // Fixed settle delay after network idle, before the page is analysed.
    await page.waitForTimeout(2000);

    validator = new SelectorValidator();
    await validator.initialize(page, selection.url);

    // NOTE(review): reads the validator's internal `page` field through an `any` cast;
    // presumably the same page that was passed to initialize() — confirm.
    const validatorPage = (validator as any).page;
    const screenshotBuffer = await validatorPage.screenshot({
      fullPage: true,
      type: 'jpeg',
      quality: 85
    });
    const screenshotBase64 = screenshotBuffer.toString('base64');

    const elementGroups = await this.analyzePageGroups(validator);
    const pageHTML = await validatorPage.content();

    const llmDecision = await this.getLLMDecisionWithVision(
      userPrompt,
      screenshotBase64,
      elementGroups,
      pageHTML,
      llmConfig
    );

    // A limit parsed from the prompt takes precedence over the one in the LLM decision.
    if (intent.limit !== undefined && intent.limit !== null) {
      llmDecision.limit = intent.limit;
    }

    const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, selection.url, validator, userPrompt, llmConfig);

    return {
      success: true,
      workflow,
      url: selection.url
    };
  } catch (error: unknown) {
    logger.error('Error in generateWorkflowFromPromptWithSearch:', error);
    return {
      success: false,
      errors: [error instanceof Error ? error.message : String(error)]
    };
  } finally {
    // Best-effort cleanup, in the same order the success path used: validator first, then browser.
    if (validator) {
      try {
        await validator.close();
      } catch (closeError) {
        logger.warn('Failed to close SelectorValidator:', closeError);
      }
    }
    if (browserId) {
      try {
        await destroyRemoteBrowser(browserId, userId);
      } catch (cleanupError) {
        logger.warn('Failed to cleanup RemoteBrowser:', cleanupError);
      }
    }
  }
}
|
||||||
|
|
||||||
|
/**
 * Turn the user's free-form prompt into a search intent.
 *
 * Asks the configured LLM (Ollama by default, or Anthropic / OpenAI) for a JSON
 * object with `searchQuery`, `extractionGoal` and `limit`, tolerating replies
 * wrapped in Markdown code fences or surrounded by extra text. The parsed
 * values are validated: both text fields must be non-empty strings, and `limit`
 * is kept only when it is a positive whole number (numeric strings are
 * accepted), otherwise it becomes `null`.
 *
 * Any failure (request error, unsupported provider, unparsable or incomplete
 * reply) is logged and answered with a regex heuristic instead of throwing:
 * the text after the word "from" (up to the next comma or period) becomes the
 * query, falling back to the first 50 characters of the prompt, and the first
 * run of digits becomes the limit.
 *
 * @param userPrompt The user's extraction request.
 * @param llmConfig Optional provider, model, API key and base URL overrides.
 * @returns The search query, the extraction goal and an optional item limit.
 */
private static async parseSearchIntent(
  userPrompt: string,
  llmConfig?: {
    provider?: 'anthropic' | 'openai' | 'ollama';
    model?: string;
    apiKey?: string;
    baseUrl?: string;
  }
): Promise<{
  searchQuery: string;
  extractionGoal: string;
  limit?: number | null;
}> {
  // Only a positive whole number is a usable limit; everything else means "no limit".
  const toLimit = (raw: unknown): number | null => {
    const candidate = typeof raw === 'string' && raw.trim() !== '' ? Number(raw) : raw;
    return typeof candidate === 'number' && Number.isInteger(candidate) && candidate > 0
      ? candidate
      : null;
  };

  const systemPrompt = `You are a search query extractor. Analyze the user's extraction request and identify:
1. The website or page they want to extract from (for searching)
2. What data they want to extract
3. Any limit/quantity specified

Examples:
- "Extract top 10 company data from YCombinator Companies site" → searchQuery: "YCombinator Companies", goal: "company data", limit: 10
- "Get first 20 laptop names and prices from Amazon" → searchQuery: "Amazon laptops", goal: "laptop names and prices", limit: 20
- "Scrape articles from TechCrunch AI section" → searchQuery: "TechCrunch AI section", goal: "articles", limit: null

Return ONLY valid JSON: {"searchQuery": "...", "extractionGoal": "...", "limit": NUMBER_OR_NULL}`;

  const userMessage = `User request: "${userPrompt}"

Extract the search query, extraction goal, and limit. Return JSON only.`;

  try {
    const provider = llmConfig?.provider || 'ollama';
    const axios = require('axios');

    let llmResponse: string;

    if (provider === 'ollama') {
      const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
      const ollamaModel = llmConfig?.model || 'llama3.2-vision';

      // Schema passed as Ollama's `format` so the reply is constrained to this shape.
      const jsonSchema = {
        type: 'object',
        required: ['searchQuery', 'extractionGoal'],
        properties: {
          searchQuery: { type: 'string' },
          extractionGoal: { type: 'string' },
          limit: { type: ['integer', 'null'] }
        }
      };

      const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
        model: ollamaModel,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage }
        ],
        stream: false,
        format: jsonSchema,
        options: { temperature: 0.1 }
      });

      llmResponse = response.data.message.content;

    } else if (provider === 'anthropic') {
      const anthropic = new Anthropic({
        apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
      });
      const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';

      const response = await anthropic.messages.create({
        model: anthropicModel,
        max_tokens: 256,
        temperature: 0.1,
        messages: [{ role: 'user', content: userMessage }],
        system: systemPrompt
      });

      const textContent = response.content.find((c: any) => c.type === 'text');
      llmResponse = textContent?.type === 'text' ? textContent.text : '';

    } else if (provider === 'openai') {
      const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
      const openaiModel = llmConfig?.model || 'gpt-4o-mini';

      const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
        model: openaiModel,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage }
        ],
        max_tokens: 256,
        temperature: 0.1,
        response_format: { type: 'json_object' }
      }, {
        headers: {
          'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
          'Content-Type': 'application/json'
        }
      });

      llmResponse = response.data.choices[0].message.content;

    } else {
      throw new Error(`Unsupported LLM provider: ${provider}`);
    }

    logger.info(`[WorkflowEnricher] Intent parsing response: ${llmResponse}`);

    // Strip an optional Markdown code fence, then isolate the object that holds "searchQuery".
    let jsonStr = llmResponse.trim();
    const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
    if (jsonMatch) {
      jsonStr = jsonMatch[1].trim();
    }

    const objectMatch = jsonStr.match(/\{[\s\S]*"searchQuery"[\s\S]*\}/);
    if (objectMatch) {
      jsonStr = objectMatch[0];
    }

    const intent = JSON.parse(jsonStr);

    // The reply is untrusted model output: accept the text fields only as non-empty strings.
    const searchQuery = typeof intent?.searchQuery === 'string' ? intent.searchQuery.trim() : '';
    const extractionGoal = typeof intent?.extractionGoal === 'string' ? intent.extractionGoal.trim() : '';
    if (!searchQuery || !extractionGoal) {
      throw new Error('Invalid intent parsing response - missing required fields');
    }

    return {
      searchQuery,
      extractionGoal,
      limit: toLimit(intent.limit)
    };

  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.warn(`Failed to parse intent with LLM: ${reason}`);
    logger.info('Using fallback heuristic intent parsing');

    // `\b` keeps words that merely contain "from" from being treated as the keyword.
    const fromMatch = userPrompt.match(/\bfrom\s+([^,\.]+)/i);
    const searchQuery = fromMatch?.[1]?.trim() || userPrompt.slice(0, 50);

    const numberMatch = userPrompt.match(/(\d+)/);
    const limit = numberMatch ? toLimit(parseInt(numberMatch[1], 10)) : null;

    return {
      searchQuery,
      extractionGoal: userPrompt,
      limit
    };
  }
}
|
||||||
|
|
||||||
|
/**
 * Search DuckDuckGo in the given browser page and return the first usable result URL.
 *
 * Loads the results page with small randomized delays, then scans the result
 * elements in page order and takes the first one whose link resolves to an
 * external http(s) URL. DuckDuckGo redirect links are unwrapped through their
 * `uddg` query parameter, and links that point back at duckduckgo.com are
 * skipped.
 *
 * @param query Search terms; URL-encoded before being placed in the request.
 * @param page Browser page used for navigation and DOM evaluation (the calls made
 *   here — goto, waitForLoadState, waitForSelector, evaluate — match a Playwright Page).
 * @returns An array with at most one entry: the URL found and its 1-based position
 *   among the result elements. `title` and `description` are always empty strings.
 *   Empty when no usable URL exists.
 * @throws Error prefixed with "DuckDuckGo search failed:" when navigation or evaluation fails.
 */
private static async performDuckDuckGoSearch(
  query: string,
  page: any
): Promise<Array<{ url: string; title: string; description: string; position: number }>> {
  logger.info(`[WorkflowEnricher] Searching DuckDuckGo for: "${query}"`);

  try {
    const searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}`;

    // Randomized pause before the request.
    const initialDelay = 500 + Math.random() * 1000;
    await new Promise(resolve => setTimeout(resolve, initialDelay));

    await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
    await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
      logger.warn('[WorkflowEnricher] Load state timeout, continuing anyway');
    });

    // Randomized pause after load, before looking for results.
    const pageLoadDelay = 2000 + Math.random() * 1500;
    await new Promise(resolve => setTimeout(resolve, pageLoadDelay));

    await page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
      logger.warn('[WorkflowEnricher] DuckDuckGo results not found on initial wait');
    });

    // Runs inside the page: the callback must be self-contained and return plain data.
    const firstHit: { url: string; position: number } | null = await page.evaluate(() => {
      const selectors = [
        '[data-testid="result"]',
        'article[data-testid="result"]',
        'li[data-layout="organic"]',
        '.result',
        'article[data-testid]'
      ];

      // Use the first selector that matches anything.
      let allElements: Element[] = [];
      for (const selector of selectors) {
        const elements = Array.from(document.querySelectorAll(selector));
        if (elements.length > 0) {
          console.log(`Found ${elements.length} DDG elements with: ${selector}`);
          allElements = elements;
          break;
        }
      }

      if (allElements.length === 0) {
        console.error('No search result elements found');
        return null;
      }

      for (let index = 0; index < allElements.length; index++) {
        const element = allElements[index];
        const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');

        // Prefer the link inside the title; otherwise fall back to the first link in the result.
        const linkEl = (titleEl?.querySelector('a[href]') || element.querySelector('a[href]')) as HTMLAnchorElement | null;
        if (!linkEl || !linkEl.href) continue;

        let actualUrl = linkEl.href;
        let parsed: URL;
        try {
          parsed = new URL(actualUrl);

          // Redirect links carry the real target in `uddg`. searchParams.get() already
          // percent-decodes the value, so it must not be decoded a second time.
          const uddgUrl = parsed.searchParams.get('uddg');
          if (uddgUrl) {
            parsed = new URL(uddgUrl);
            actualUrl = uddgUrl;
          }
        } catch (e) {
          console.log('Failed to parse uddg parameter:', e);
          continue;
        }

        if (parsed.protocol !== 'http:' && parsed.protocol !== 'https:') continue;

        // Compare the hostname, not the whole URL, so external pages that merely
        // mention duckduckgo.com in their path or query are still accepted.
        const host = parsed.hostname.toLowerCase();
        if (host === 'duckduckgo.com' || host.endsWith('.duckduckgo.com')) {
          console.log(`Skipping DDG internal URL: ${actualUrl}`);
          continue;
        }

        return { url: actualUrl, position: index + 1 };
      }

      return null;
    });

    if (!firstHit) {
      logger.error('[WorkflowEnricher] No valid URL found in search results');
      return [];
    }

    logger.info(`[WorkflowEnricher] Successfully extracted first URL: ${firstHit.url}`);

    return [{
      url: firstHit.url,
      title: '',
      description: '',
      position: firstHit.position
    }];

  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`[WorkflowEnricher] Search failed: ${reason}`);
    throw new Error(`DuckDuckGo search failed: ${reason}`);
  }
}
|
||||||
|
|
||||||
|
/**
 * Pick the search result most likely to contain the data the user asked for.
 *
 * A single result is returned directly without calling the LLM. Otherwise the
 * configured LLM (Ollama by default, or Anthropic / OpenAI) is given the
 * results and the prompt and asked for
 * `{"selectedIndex", "confidence", "reasoning"}`. The reply is validated:
 * `selectedIndex` must be an integer inside the result range (numeric strings
 * are accepted), `confidence` is clamped to [0, 1] and defaults to 0.5 when it
 * is not a finite number, and `reasoning` defaults to a placeholder when it is
 * missing or blank.
 *
 * If the LLM call fails or the reply is invalid, the first result is returned
 * with confidence 0.6.
 *
 * @param searchResults Candidates; each is read for `url`, `title` and `description`.
 * @param userPrompt The user's extraction request.
 * @param llmConfig Optional provider, model, API key and base URL overrides.
 * @returns The chosen URL with a confidence score and a short justification.
 * @throws Error when `searchResults` is empty, since there is nothing to select.
 */
private static async selectBestUrlFromResults(
  searchResults: any[],
  userPrompt: string,
  llmConfig?: {
    provider?: 'anthropic' | 'openai' | 'ollama';
    model?: string;
    apiKey?: string;
    baseUrl?: string;
  }
): Promise<{
  url: string;
  confidence: number;
  reasoning: string;
}> {
  // Fail fast with a clear message instead of a TypeError from the fallback below.
  if (searchResults.length === 0) {
    throw new Error('No search results to select from');
  }

  if (searchResults.length === 1) {
    return {
      url: searchResults[0].url,
      confidence: 0.8,
      reasoning: 'Selected first search result from DuckDuckGo'
    };
  }

  const systemPrompt = `You are a URL selector. Given a list of search results and a user's extraction request, select the BEST URL that is most likely to contain the data the user wants.

Consider:
1. Title and description relevance to the user's request
2. Official/authoritative sources are usually better than aggregators
3. List/directory pages are better than individual item pages
4. The URL path often gives hints about the page content

Return ONLY valid JSON: {"selectedIndex": NUMBER, "confidence": NUMBER_0_TO_1, "reasoning": "brief explanation"}`;

  const resultsDescription = searchResults.map((r, i) =>
    `Result ${i}:
- Title: ${r.title}
- URL: ${r.url}
- Description: ${r.description}`
  ).join('\n\n');

  const userMessage = `User wants to: "${userPrompt}"

Available search results:
${resultsDescription}

Select the BEST result index (0-${searchResults.length - 1}). Return JSON only.`;

  try {
    const provider = llmConfig?.provider || 'ollama';
    const axios = require('axios');

    let llmResponse: string;

    if (provider === 'ollama') {
      const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
      const ollamaModel = llmConfig?.model || 'llama3.2-vision';

      // Schema passed as Ollama's `format` so the reply is constrained to this shape.
      const jsonSchema = {
        type: 'object',
        required: ['selectedIndex', 'confidence', 'reasoning'],
        properties: {
          selectedIndex: { type: 'integer' },
          confidence: { type: 'number' },
          reasoning: { type: 'string' }
        }
      };

      const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
        model: ollamaModel,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage }
        ],
        stream: false,
        format: jsonSchema,
        options: { temperature: 0.1 }
      });

      llmResponse = response.data.message.content;

    } else if (provider === 'anthropic') {
      const anthropic = new Anthropic({
        apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
      });
      const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';

      const response = await anthropic.messages.create({
        model: anthropicModel,
        max_tokens: 256,
        temperature: 0.1,
        messages: [{ role: 'user', content: userMessage }],
        system: systemPrompt
      });

      const textContent = response.content.find((c: any) => c.type === 'text');
      llmResponse = textContent?.type === 'text' ? textContent.text : '';

    } else if (provider === 'openai') {
      const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
      const openaiModel = llmConfig?.model || 'gpt-4o-mini';

      const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
        model: openaiModel,
        messages: [
          { role: 'system', content: systemPrompt },
          { role: 'user', content: userMessage }
        ],
        max_tokens: 256,
        temperature: 0.1,
        response_format: { type: 'json_object' }
      }, {
        headers: {
          'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
          'Content-Type': 'application/json'
        }
      });

      llmResponse = response.data.choices[0].message.content;

    } else {
      throw new Error(`Unsupported LLM provider: ${provider}`);
    }

    logger.info(`[WorkflowEnricher] URL selection response: ${llmResponse}`);

    // Strip an optional Markdown code fence, then isolate the object that holds "selectedIndex".
    let jsonStr = llmResponse.trim();
    const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
    if (jsonMatch) {
      jsonStr = jsonMatch[1].trim();
    }

    const objectMatch = jsonStr.match(/\{[\s\S]*"selectedIndex"[\s\S]*\}/);
    if (objectMatch) {
      jsonStr = objectMatch[0];
    }

    const decision = JSON.parse(jsonStr);

    // The reply is untrusted model output: require a real in-range integer index.
    const rawIndex = decision?.selectedIndex;
    const selectedIndex = typeof rawIndex === 'string' && rawIndex.trim() !== '' ? Number(rawIndex) : rawIndex;
    if (
      typeof selectedIndex !== 'number' ||
      !Number.isInteger(selectedIndex) ||
      selectedIndex < 0 ||
      selectedIndex >= searchResults.length
    ) {
      throw new Error(`Invalid selectedIndex: ${rawIndex}`);
    }

    // Keep a legitimate 0 and clamp out-of-range scores; default only when unusable.
    const rawConfidence = decision.confidence;
    const confidence = typeof rawConfidence === 'number' && Number.isFinite(rawConfidence)
      ? Math.min(1, Math.max(0, rawConfidence))
      : 0.5;

    const reasoning = typeof decision.reasoning === 'string' && decision.reasoning.trim() !== ''
      ? decision.reasoning
      : 'No reasoning provided';

    return {
      url: searchResults[selectedIndex].url,
      confidence,
      reasoning
    };

  } catch (error: unknown) {
    const reason = error instanceof Error ? error.message : String(error);
    logger.warn(`[WorkflowEnricher] Failed to select URL with LLM: ${reason}`);
    logger.info('[WorkflowEnricher] Using fallback: selecting first search result');

    return {
      url: searchResults[0].url,
      confidence: 0.6,
      reasoning: 'Selected first search result (LLM selection failed)'
    };
  }
}
|
||||||
}
|
}
|
||||||
@@ -59,7 +59,7 @@ export const createScrapeRobot = async (
|
|||||||
};
|
};
|
||||||
|
|
||||||
export const createLLMRobot = async (
|
export const createLLMRobot = async (
|
||||||
url: string,
|
url: string | undefined,
|
||||||
prompt: string,
|
prompt: string,
|
||||||
llmProvider?: 'anthropic' | 'openai' | 'ollama',
|
llmProvider?: 'anthropic' | 'openai' | 'ollama',
|
||||||
llmModel?: string,
|
llmModel?: string,
|
||||||
@@ -71,7 +71,7 @@ export const createLLMRobot = async (
|
|||||||
const response = await axios.post(
|
const response = await axios.post(
|
||||||
`${apiUrl}/storage/recordings/llm`,
|
`${apiUrl}/storage/recordings/llm`,
|
||||||
{
|
{
|
||||||
url,
|
url: url || undefined,
|
||||||
prompt,
|
prompt,
|
||||||
llmProvider,
|
llmProvider,
|
||||||
llmModel,
|
llmModel,
|
||||||
|
|||||||
@@ -97,7 +97,7 @@ const LoadingRobotRow = memo(({ row, columns }: any) => {
|
|||||||
} else if (column.id === 'interpret') {
|
} else if (column.id === 'interpret') {
|
||||||
return (
|
return (
|
||||||
<MemoizedTableCell key={column.id} align={column.align}>
|
<MemoizedTableCell key={column.id} align={column.align}>
|
||||||
<CircularProgress size={20} />
|
<Box sx={{ opacity: 0.3 }}>-</Box>
|
||||||
</MemoizedTableCell>
|
</MemoizedTableCell>
|
||||||
);
|
);
|
||||||
} else {
|
} else {
|
||||||
|
|||||||
@@ -65,7 +65,7 @@ const RobotCreate: React.FC = () => {
|
|||||||
const [isWarningModalOpen, setWarningModalOpen] = useState(false);
|
const [isWarningModalOpen, setWarningModalOpen] = useState(false);
|
||||||
const [activeBrowserId, setActiveBrowserId] = useState('');
|
const [activeBrowserId, setActiveBrowserId] = useState('');
|
||||||
const [outputFormats, setOutputFormats] = useState<string[]>([]);
|
const [outputFormats, setOutputFormats] = useState<string[]>([]);
|
||||||
const [generationMode, setGenerationMode] = useState<'agent' | 'recorder' | null>(null);
|
const [generationMode, setGenerationMode] = useState<'agent' | 'recorder' | null>('recorder');
|
||||||
|
|
||||||
const [aiPrompt, setAiPrompt] = useState('');
|
const [aiPrompt, setAiPrompt] = useState('');
|
||||||
const [llmProvider, setLlmProvider] = useState<'anthropic' | 'openai' | 'ollama'>('ollama');
|
const [llmProvider, setLlmProvider] = useState<'anthropic' | 'openai' | 'ollama'>('ollama');
|
||||||
@@ -323,17 +323,6 @@ const RobotCreate: React.FC = () => {
|
|||||||
<Typography variant="body2" color="text.secondary" mb={3}>
|
<Typography variant="body2" color="text.secondary" mb={3}>
|
||||||
Extract structured data from websites using AI or record your own extraction workflow.
|
Extract structured data from websites using AI or record your own extraction workflow.
|
||||||
</Typography>
|
</Typography>
|
||||||
<Box sx={{ width: '100%', maxWidth: 700, mb: 3 }}>
|
|
||||||
<TextField
|
|
||||||
placeholder="Example: https://www.ycombinator.com/companies/"
|
|
||||||
variant="outlined"
|
|
||||||
fullWidth
|
|
||||||
value={url}
|
|
||||||
onChange={(e) => setUrl(e.target.value)}
|
|
||||||
label="Website URL"
|
|
||||||
/>
|
|
||||||
</Box>
|
|
||||||
|
|
||||||
<Box sx={{ width: '100%', maxWidth: 700, mb: 3 }}>
|
<Box sx={{ width: '100%', maxWidth: 700, mb: 3 }}>
|
||||||
<Typography variant="subtitle1" gutterBottom sx={{ mb: 2 }} color="text.secondary">
|
<Typography variant="subtitle1" gutterBottom sx={{ mb: 2 }} color="text.secondary">
|
||||||
Choose How to Build
|
Choose How to Build
|
||||||
@@ -432,6 +421,17 @@ const RobotCreate: React.FC = () => {
|
|||||||
/>
|
/>
|
||||||
</Box>
|
</Box>
|
||||||
|
|
||||||
|
<Box sx={{ mb: 3 }}>
|
||||||
|
<TextField
|
||||||
|
placeholder="Example: https://www.ycombinator.com/companies/"
|
||||||
|
variant="outlined"
|
||||||
|
fullWidth
|
||||||
|
value={url}
|
||||||
|
onChange={(e) => setUrl(e.target.value)}
|
||||||
|
label="Website URL (Optional)"
|
||||||
|
/>
|
||||||
|
</Box>
|
||||||
|
|
||||||
<Box sx={{ display: 'flex', gap: 2, mb: 3 }}>
|
<Box sx={{ display: 'flex', gap: 2, mb: 3 }}>
|
||||||
<FormControl sx={{ flex: 1 }}>
|
<FormControl sx={{ flex: 1 }}>
|
||||||
<InputLabel>LLM Provider</InputLabel>
|
<InputLabel>LLM Provider</InputLabel>
|
||||||
@@ -517,10 +517,7 @@ const RobotCreate: React.FC = () => {
|
|||||||
variant="contained"
|
variant="contained"
|
||||||
fullWidth
|
fullWidth
|
||||||
onClick={async () => {
|
onClick={async () => {
|
||||||
if (!url.trim()) {
|
// URL is optional for AI mode - it will auto-search if not provided
|
||||||
notify('error', 'Please enter a valid URL');
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
if (!extractRobotName.trim()) {
|
if (!extractRobotName.trim()) {
|
||||||
notify('error', 'Please enter a robot name');
|
notify('error', 'Please enter a robot name');
|
||||||
return;
|
return;
|
||||||
@@ -543,7 +540,7 @@ const RobotCreate: React.FC = () => {
|
|||||||
pairs: 0,
|
pairs: 0,
|
||||||
params: [],
|
params: [],
|
||||||
type: 'extract',
|
type: 'extract',
|
||||||
url: url,
|
url: url || '(auto-detecting...)',
|
||||||
},
|
},
|
||||||
recording: { workflow: [] },
|
recording: { workflow: [] },
|
||||||
isLoading: true,
|
isLoading: true,
|
||||||
@@ -552,12 +549,14 @@ const RobotCreate: React.FC = () => {
|
|||||||
|
|
||||||
addOptimisticRobot(optimisticRobot);
|
addOptimisticRobot(optimisticRobot);
|
||||||
|
|
||||||
notify('info', `Robot ${robotDisplayName} creation started`);
|
notify('info', url.trim()
|
||||||
|
? `Robot ${robotDisplayName} creation started`
|
||||||
|
: `Robot ${robotDisplayName} creation started (searching for website...)`);
|
||||||
navigate('/robots');
|
navigate('/robots');
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const result = await createLLMRobot(
|
const result = await createLLMRobot(
|
||||||
url,
|
url.trim() || undefined,
|
||||||
aiPrompt,
|
aiPrompt,
|
||||||
llmProvider,
|
llmProvider,
|
||||||
llmModel === 'default' ? undefined : llmModel,
|
llmModel === 'default' ? undefined : llmModel,
|
||||||
@@ -617,7 +616,7 @@ const RobotCreate: React.FC = () => {
|
|||||||
notify('error', error?.message || 'Failed to create and run AI robot');
|
notify('error', error?.message || 'Failed to create and run AI robot');
|
||||||
}
|
}
|
||||||
}}
|
}}
|
||||||
disabled={!url.trim() || !extractRobotName.trim() || !aiPrompt.trim() || isLoading}
|
disabled={!extractRobotName.trim() || !aiPrompt.trim() || isLoading}
|
||||||
sx={{
|
sx={{
|
||||||
bgcolor: '#ff00c3',
|
bgcolor: '#ff00c3',
|
||||||
py: 1.4,
|
py: 1.4,
|
||||||
@@ -633,6 +632,17 @@ const RobotCreate: React.FC = () => {
|
|||||||
)}
|
)}
|
||||||
|
|
||||||
{generationMode === 'recorder' && (
|
{generationMode === 'recorder' && (
|
||||||
|
<>
|
||||||
|
<Box sx={{ width: '100%', maxWidth: 700, mb: 3 }}>
|
||||||
|
<TextField
|
||||||
|
placeholder="Example: https://www.ycombinator.com/companies/"
|
||||||
|
variant="outlined"
|
||||||
|
fullWidth
|
||||||
|
value={url}
|
||||||
|
onChange={(e) => setUrl(e.target.value)}
|
||||||
|
label="Website URL"
|
||||||
|
/>
|
||||||
|
</Box>
|
||||||
<Box sx={{ width: '100%', maxWidth: 700 }}>
|
<Box sx={{ width: '100%', maxWidth: 700 }}>
|
||||||
<Button
|
<Button
|
||||||
variant="contained"
|
variant="contained"
|
||||||
@@ -651,6 +661,7 @@ const RobotCreate: React.FC = () => {
|
|||||||
{isLoading ? 'Starting...' : 'Start Recording'}
|
{isLoading ? 'Starting...' : 'Start Recording'}
|
||||||
</Button>
|
</Button>
|
||||||
</Box>
|
</Box>
|
||||||
|
</>
|
||||||
)}
|
)}
|
||||||
</Box>
|
</Box>
|
||||||
</Card>
|
</Card>
|
||||||
|
|||||||
Reference in New Issue
Block a user