Revert "Revert "feat: add auto search logic""

This reverts commit 536f046b60.
2026-01-23 17:50:26 +05:30
parent 4e1a3fdc5d
commit 3bf77e4f8c
6 changed files with 610 additions and 41 deletions
--- a/server/src/sdk/workflowEnricher.ts
+++ b/server/src/sdk/workflowEnricher.ts
@@ -1529,4 +1529,521 @@ Return ONLY the list name, nothing else:`;

    return workflow;
  }
+
+  /**
+   * Generate a workflow from a prompt, discovering the target URL via web search.
+   *
+   * Steps: create a remote browser for the user, parse the prompt into a search
+   * intent, search DuckDuckGo, pick a URL from the results, load that page,
+   * capture a full-page screenshot plus the HTML, pass them to
+   * `getLLMDecisionWithVision` and build the workflow from its decision.
+   *
+   * @param userPrompt - Natural-language description of what to extract.
+   * @param userId - User the remote browser is created for (and destroyed under).
+   * @param llmConfig - Optional provider/model/credentials passed to the LLM helpers.
+   * @returns `{ success: true, workflow, url }` on success, otherwise
+   *   `{ success: false, errors }`; failures are reported via `errors`, not thrown.
+   */
+  static async generateWorkflowFromPromptWithSearch(
+    userPrompt: string,
+    userId: string,
+    llmConfig?: {
+      provider?: 'anthropic' | 'openai' | 'ollama';
+      model?: string;
+      apiKey?: string;
+      baseUrl?: string;
+    }
+  ): Promise<{
+    success: boolean;
+    workflow?: any[];
+    url?: string;
+    errors?: string[];
+  }> {
+    let browserId: string | null = null;
+    let validator: SelectorValidator | null = null;
+
+    try {
+      const { browserId: id, page } = await createRemoteBrowserForValidation(userId);
+      browserId = id;
+
+      const intent = await this.parseSearchIntent(userPrompt, llmConfig);
+
+      const searchResults = await this.performDuckDuckGoSearch(intent.searchQuery, page);
+      if (searchResults.length === 0) {
+        return {
+          success: false,
+          errors: [`No search results found for query: "${intent.searchQuery}". Please provide a URL manually or refine your prompt.`]
+        };
+      }
+
+      const selection = await this.selectBestUrlFromResults(searchResults, userPrompt, llmConfig);
+
+      await page.goto(selection.url, { waitUntil: 'networkidle', timeout: 30000 });
+      await page.waitForTimeout(2000);
+
+      validator = new SelectorValidator();
+      await validator.initialize(page, selection.url);
+
+      // `page` is read through an `any` cast, presumably because it is not public.
+      const validatorPage = (validator as any).page;
+      const screenshotBuffer = await validatorPage.screenshot({ fullPage: true, type: 'jpeg', quality: 85 });
+      const screenshotBase64 = screenshotBuffer.toString('base64');
+
+      const elementGroups = await this.analyzePageGroups(validator);
+      const pageHTML = await validatorPage.content();
+
+      const llmDecision = await this.getLLMDecisionWithVision(
+        userPrompt, screenshotBase64, elementGroups, pageHTML, llmConfig
+      );
+
+      // A limit parsed from the prompt overrides the one in the LLM decision.
+      if (intent.limit !== undefined && intent.limit !== null) {
+        llmDecision.limit = intent.limit;
+      }
+
+      const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, selection.url, validator, userPrompt, llmConfig);
+
+      return { success: true, workflow, url: selection.url };
+    } catch (error: any) {
+      logger.error('Error in generateWorkflowFromPromptWithSearch:', error);
+      return {
+        success: false,
+        // Non-Error throwables have no `message`; fall back to their string form.
+        errors: [error?.message ?? String(error)]
+      };
+    } finally {
+      // Runs on every exit path; a cleanup failure is logged, never propagated,
+      // so it cannot turn a successful generation into a failure.
+      if (validator) {
+        try {
+          await validator.close();
+        } catch (cleanupError) {
+          logger.warn('Failed to close SelectorValidator:', cleanupError);
+        }
+      }
+      if (browserId) {
+        try {
+          await destroyRemoteBrowser(browserId, userId);
+        } catch (cleanupError) {
+          logger.warn('Failed to cleanup RemoteBrowser:', cleanupError);
+        }
+      }
+    }
+  }
+
+  /**
+   * Parse the user's prompt into a search intent using the configured LLM.
+   *
+   * Sends the prompt to Ollama (the default), Anthropic or OpenAI and expects a
+   * JSON object with `searchQuery`, `extractionGoal` and an optional `limit`.
+   * If the request or the parsing fails, a regex heuristic over the prompt is
+   * used instead, so failures are logged rather than thrown.
+   *
+   * @param userPrompt - Raw extraction request from the user.
+   * @param llmConfig - Optional provider, model, API key and base URL overrides.
+   * @returns The search query, the extraction goal and `limit`: a positive
+   *   integer, or `null` when absent or not a usable number.
+   */
+  private static async parseSearchIntent(
+    userPrompt: string,
+    llmConfig?: {
+      provider?: 'anthropic' | 'openai' | 'ollama';
+      model?: string;
+      apiKey?: string;
+      baseUrl?: string;
+    }
+  ): Promise<{
+    searchQuery: string;
+    extractionGoal: string;
+    limit?: number | null;
+  }> {
+    const systemPrompt = `You are a search query extractor. Analyze the user's extraction request and identify:
+1. The website or page they want to extract from (for searching)
+2. What data they want to extract
+3. Any limit/quantity specified
+
+Examples:
+- "Extract top 10 company data from YCombinator Companies site" → searchQuery: "YCombinator Companies", goal: "company data", limit: 10
+- "Get first 20 laptop names and prices from Amazon" → searchQuery: "Amazon laptops", goal: "laptop names and prices", limit: 20
+- "Scrape articles from TechCrunch AI section" → searchQuery: "TechCrunch AI section", goal: "articles", limit: null
+
+Return ONLY valid JSON: {"searchQuery": "...", "extractionGoal": "...", "limit": NUMBER_OR_NULL}`;
+
+    const userMessage = `User request: "${userPrompt}"
+
+Extract the search query, extraction goal, and limit. Return JSON only.`;
+
+    // Only a positive whole number is a usable limit ("10" is accepted, 0 is not).
+    const toLimit = (value: unknown): number | null => {
+      const parsed = typeof value === 'string' ? Number(value.trim()) : value;
+      return typeof parsed === 'number' && Number.isInteger(parsed) && parsed > 0 ? parsed : null;
+    };
+
+    try {
+      const provider = llmConfig?.provider || 'ollama';
+      const axios = require('axios');
+      let llmResponse: string;
+
+      if (provider === 'ollama') {
+        const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
+        const ollamaModel = llmConfig?.model || 'llama3.2-vision';
+
+        const jsonSchema = {
+          type: 'object',
+          required: ['searchQuery', 'extractionGoal'],
+          properties: {
+            searchQuery: { type: 'string' },
+            extractionGoal: { type: 'string' },
+            limit: { type: ['integer', 'null'] }
+          }
+        };
+
+        const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
+          model: ollamaModel,
+          messages: [
+            { role: 'system', content: systemPrompt },
+            { role: 'user', content: userMessage }
+          ],
+          stream: false,
+          format: jsonSchema,
+          options: { temperature: 0.1 }
+        });
+        llmResponse = response.data.message.content;
+      } else if (provider === 'anthropic') {
+        const anthropic = new Anthropic({
+          apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
+        });
+        const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';
+
+        const response = await anthropic.messages.create({
+          model: anthropicModel,
+          max_tokens: 256,
+          temperature: 0.1,
+          messages: [{ role: 'user', content: userMessage }],
+          system: systemPrompt
+        });
+        const textContent = response.content.find((c: any) => c.type === 'text');
+        llmResponse = textContent?.type === 'text' ? textContent.text : '';
+      } else if (provider === 'openai') {
+        const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
+        const openaiModel = llmConfig?.model || 'gpt-4o-mini';
+
+        const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
+          model: openaiModel,
+          messages: [
+            { role: 'system', content: systemPrompt },
+            { role: 'user', content: userMessage }
+          ],
+          max_tokens: 256,
+          temperature: 0.1,
+          response_format: { type: 'json_object' }
+        }, {
+          headers: {
+            'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
+            'Content-Type': 'application/json'
+          }
+        });
+        llmResponse = response.data.choices[0].message.content;
+      } else {
+        throw new Error(`Unsupported LLM provider: ${provider}`);
+      }
+
+      logger.info(`[WorkflowEnricher] Intent parsing response: ${llmResponse}`);
+
+      let jsonStr = llmResponse.trim();
+      const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
+      if (jsonMatch) {
+        jsonStr = jsonMatch[1].trim();
+      }
+
+      const objectMatch = jsonStr.match(/\{[\s\S]*"searchQuery"[\s\S]*\}/);
+      if (objectMatch) {
+        jsonStr = objectMatch[0];
+      }
+
+      // The parsed JSON is untrusted: both fields must be non-empty strings.
+      const intent = JSON.parse(jsonStr);
+      const searchQuery = typeof intent?.searchQuery === 'string' ? intent.searchQuery.trim() : '';
+      const extractionGoal = typeof intent?.extractionGoal === 'string' ? intent.extractionGoal.trim() : '';
+      if (!searchQuery || !extractionGoal) {
+        throw new Error('Invalid intent parsing response - missing required fields');
+      }
+
+      return { searchQuery, extractionGoal, limit: toLimit(intent.limit) };
+    } catch (error: any) {
+      logger.warn(`Failed to parse intent with LLM: ${error?.message ?? error}`);
+      logger.info('Using fallback heuristic intent parsing');
+
+      // Heuristic: text after "from" is the query, the first number is the limit.
+      const fromMatch = userPrompt.match(/from\s+([^,\.]+)/i);
+      const searchQuery = fromMatch?.[1]?.trim() || userPrompt.slice(0, 50);
+      const numberMatch = userPrompt.match(/(\d+)/);
+      return { searchQuery, extractionGoal: userPrompt, limit: toLimit(numberMatch?.[1]) };
+    }
+  }
+
+  /**
+   * Search DuckDuckGo and return the first usable external result URL.
+   *
+   * Result elements are scanned in page order. DuckDuckGo redirect links
+   * (`uddg` parameter) are unwrapped to their target; links that still point
+   * at a DuckDuckGo host, or are not http(s), are skipped.
+   *
+   * @param query - Search terms, URL-encoded into the `q` parameter.
+   * @param page - Browser page used for navigation and in-page evaluation.
+   * @returns One entry for the first usable URL (`title`/`description` are left
+   *   empty, `position` is its 1-based rank), or `[]` when none is found.
+   * @throws Error prefixed with "DuckDuckGo search failed:" if navigation or evaluation fails.
+   */
+  private static async performDuckDuckGoSearch(
+    query: string,
+    page: any
+  ): Promise<Array<{ url: string; title: string; description: string; position: number }>> {
+    logger.info(`[WorkflowEnricher] Searching DuckDuckGo for: "${query}"`);
+
+    try {
+      const searchUrl = `https://duckduckgo.com/?q=${encodeURIComponent(query)}`;
+      const initialDelay = 500 + Math.random() * 1000;
+      await new Promise(resolve => setTimeout(resolve, initialDelay));
+
+      await page.goto(searchUrl, { waitUntil: 'domcontentloaded', timeout: 30000 });
+      await page.waitForLoadState('load', { timeout: 10000 }).catch(() => {
+        logger.warn('[WorkflowEnricher] Load state timeout, continuing anyway');
+      });
+
+      const pageLoadDelay = 2000 + Math.random() * 1500;
+      await new Promise(resolve => setTimeout(resolve, pageLoadDelay));
+
+      await page.waitForSelector('[data-testid="result"], .result', { timeout: 5000 }).catch(() => {
+        logger.warn('[WorkflowEnricher] DuckDuckGo results not found on initial wait');
+      });
+
+      // The callback runs inside the page, so it must not reference outer scope.
+      const firstResult = await page.evaluate(() => {
+        const selectors = [
+          '[data-testid="result"]',
+          'article[data-testid="result"]',
+          'li[data-layout="organic"]',
+          '.result',
+          'article[data-testid]'
+        ];
+
+        let allElements: Element[] = [];
+        for (const selector of selectors) {
+          const elements = Array.from(document.querySelectorAll(selector));
+          if (elements.length > 0) {
+            console.log(`Found ${elements.length} DDG elements with: ${selector}`);
+            allElements = elements;
+            break;
+          }
+        }
+
+        if (allElements.length === 0) {
+          console.error('No search result elements found');
+          return null;
+        }
+
+        const ddgHost = /(^|\.)duckduckgo\.com$/;
+        // Walk every result: the first element may be an ad or an internal link.
+        for (let i = 0; i < allElements.length; i++) {
+          const element = allElements[i];
+          const titleEl = element.querySelector('h2, [data-testid="result-title-a"], h3, [data-testid="result-title"]');
+          const linkEl = (titleEl?.querySelector('a[href]') || element.querySelector('a[href]')) as HTMLAnchorElement | null;
+          if (!linkEl || !linkEl.href) continue;
+
+          try {
+            let candidate = linkEl.href;
+            const hrefUrl = new URL(candidate);
+            // searchParams.get() already percent-decodes; decoding again would corrupt the target.
+            const wrapped = ddgHost.test(hrefUrl.hostname) ? hrefUrl.searchParams.get('uddg') : null;
+            if (wrapped) candidate = wrapped;
+
+            const target = new URL(candidate);
+            const isHttp = target.protocol === 'http:' || target.protocol === 'https:';
+            if (isHttp && !ddgHost.test(target.hostname)) {
+              return { url: candidate, position: i + 1 };
+            }
+            console.log(`Skipping unusable result URL: ${candidate}`);
+          } catch (e) {
+            console.log('Failed to parse result URL:', e);
+          }
+        }
+        return null;
+      });
+
+      if (!firstResult) {
+        logger.error('[WorkflowEnricher] No valid URL found in search results');
+        return [];
+      }
+
+      logger.info(`[WorkflowEnricher] Successfully extracted first URL: ${firstResult.url}`);
+
+      return [{ url: firstResult.url, title: '', description: '', position: firstResult.position }];
+    } catch (error: any) {
+      logger.error(`[WorkflowEnricher] Search failed: ${error.message}`);
+      throw new Error(`DuckDuckGo search failed: ${error.message}`);
+    }
+  }
+
+  /**
+   * Pick the URL most likely to hold the requested data from search results.
+   *
+   * A single result is returned directly. With several, the configured LLM
+   * (Ollama by default, Anthropic or OpenAI) is asked for an index; if that call
+   * fails or the answer is unusable, the first result is used instead.
+   *
+   * @param searchResults - Candidates with `url`, `title` and `description`; must not be empty.
+   * @param userPrompt - The user's extraction request, shown to the LLM.
+   * @param llmConfig - Optional provider, model, API key and base URL overrides.
+   * @returns The chosen URL with a confidence and a short reasoning string.
+   * @throws Error when `searchResults` is empty.
+   */
+  private static async selectBestUrlFromResults(
+    searchResults: any[],
+    userPrompt: string,
+    llmConfig?: {
+      provider?: 'anthropic' | 'openai' | 'ollama';
+      model?: string;
+      apiKey?: string;
+      baseUrl?: string;
+    }
+  ): Promise<{
+    url: string;
+    confidence: number;
+    reasoning: string;
+  }> {
+    // Fail fast: the first-result fallback below would otherwise read `url` of undefined.
+    if (!Array.isArray(searchResults) || searchResults.length === 0) {
+      throw new Error('No search results to select a URL from');
+    }
+
+    if (searchResults.length === 1) {
+      return {
+        url: searchResults[0].url,
+        confidence: 0.8,
+        reasoning: 'Selected first search result from DuckDuckGo'
+      };
+    }
+
+    const systemPrompt = `You are a URL selector. Given a list of search results and a user's extraction request, select the BEST URL that is most likely to contain the data the user wants.
+
+Consider:
+1. Title and description relevance to the user's request
+2. Official/authoritative sources are usually better than aggregators
+3. List/directory pages are better than individual item pages
+4. The URL path often gives hints about the page content
+
+Return ONLY valid JSON: {"selectedIndex": NUMBER, "confidence": NUMBER_0_TO_1, "reasoning": "brief explanation"}`;
+
+    const resultsDescription = searchResults.map((r, i) =>
+      `Result ${i}:
+- Title: ${r.title}
+- URL: ${r.url}
+- Description: ${r.description}`
+    ).join('\n\n');
+
+    const userMessage = `User wants to: "${userPrompt}"
+
+Available search results:
+${resultsDescription}
+
+Select the BEST result index (0-${searchResults.length - 1}). Return JSON only.`;
+
+    try {
+      const provider = llmConfig?.provider || 'ollama';
+      const axios = require('axios');
+      let llmResponse: string;
+      if (provider === 'ollama') {
+        const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
+        const ollamaModel = llmConfig?.model || 'llama3.2-vision';
+
+        const jsonSchema = {
+          type: 'object',
+          required: ['selectedIndex', 'confidence', 'reasoning'],
+          properties: {
+            selectedIndex: { type: 'integer' },
+            confidence: { type: 'number' },
+            reasoning: { type: 'string' }
+          }
+        };
+
+        const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
+          model: ollamaModel,
+          messages: [
+            { role: 'system', content: systemPrompt },
+            { role: 'user', content: userMessage }
+          ],
+          stream: false,
+          format: jsonSchema,
+          options: { temperature: 0.1 }
+        });
+        llmResponse = response.data.message.content;
+      } else if (provider === 'anthropic') {
+        const anthropic = new Anthropic({
+          apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
+        });
+        const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';
+
+        const response = await anthropic.messages.create({
+          model: anthropicModel,
+          max_tokens: 256,
+          temperature: 0.1,
+          messages: [{ role: 'user', content: userMessage }],
+          system: systemPrompt
+        });
+        const textContent = response.content.find((c: any) => c.type === 'text');
+        llmResponse = textContent?.type === 'text' ? textContent.text : '';
+      } else if (provider === 'openai') {
+        const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
+        const openaiModel = llmConfig?.model || 'gpt-4o-mini';
+
+        const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
+          model: openaiModel,
+          messages: [
+            { role: 'system', content: systemPrompt },
+            { role: 'user', content: userMessage }
+          ],
+          max_tokens: 256,
+          temperature: 0.1,
+          response_format: { type: 'json_object' }
+        }, {
+          headers: {
+            'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
+            'Content-Type': 'application/json'
+          }
+        });
+        llmResponse = response.data.choices[0].message.content;
+      } else {
+        throw new Error(`Unsupported LLM provider: ${provider}`);
+      }
+
+      logger.info(`[WorkflowEnricher] URL selection response: ${llmResponse}`);
+
+      let jsonStr = llmResponse.trim();
+      const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
+      if (jsonMatch) {
+        jsonStr = jsonMatch[1].trim();
+      }
+
+      const objectMatch = jsonStr.match(/\{[\s\S]*"selectedIndex"[\s\S]*\}/);
+      if (objectMatch) {
+        jsonStr = objectMatch[0];
+      }
+
+      const decision = JSON.parse(jsonStr);
+      const index = decision?.selectedIndex;
+      // Must be a whole number inside the array; "1" or 1.5 would index nothing.
+      if (!Number.isInteger(index) || index < 0 || index >= searchResults.length) {
+        throw new Error(`Invalid selectedIndex: ${index}`);
+      }
+      // 0 is a legitimate confidence, so default only when no number was given.
+      const confidence = typeof decision.confidence === 'number' ? decision.confidence : 0.5;
+      const reasoning = decision.reasoning || 'No reasoning provided';
+      return { url: searchResults[index].url, confidence, reasoning };
+    } catch (error: any) {
+      logger.warn(`[WorkflowEnricher] Failed to select URL with LLM: ${error.message}`);
+      logger.info('[WorkflowEnricher] Using fallback: selecting first search result');
+      return { url: searchResults[0].url, confidence: 0.6, reasoning: 'Selected first search result (LLM selection failed)' };
+    }
+  }
 }