From b649ce3a33ad5beb0671d356d1944b5ae90ab960 Mon Sep 17 00:00:00 2001 From: Rohit Rajan Date: Thu, 11 Dec 2025 16:58:33 +0530 Subject: [PATCH] feat: filter the fields based on prompt --- server/src/sdk/workflowEnricher.ts | 278 +++++++++++++++++++++++++++-- 1 file changed, 259 insertions(+), 19 deletions(-) diff --git a/server/src/sdk/workflowEnricher.ts b/server/src/sdk/workflowEnricher.ts index cb57f264..476c82f9 100644 --- a/server/src/sdk/workflowEnricher.ts +++ b/server/src/sdk/workflowEnricher.ts @@ -415,11 +415,11 @@ export class WorkflowEnricher { 3. Extract any numeric limit from their request CRITICAL GROUP SELECTION RULES: -- Groups with "Has text content: YES" are usually better than groups with NO text content -- Match the sample content to what the user is asking for -- Avoid groups that only show images/icons (Has text content: NO) -- The group with the most relevant sample content should be selected, NOT just the first group -- Analyze the keywords in the user's request and find the group whose sample content contains related text +- Match the sample content to what the user is asking for - this is the PRIMARY criterion +- Groups with text content are often easier to match, but image galleries, icon grids, or data-attribute based groups can also be correct +- Analyze the keywords in the user's request and find the group whose sample content or structure best matches +- Consider the context: product sites may have image grids, job sites have text listings, etc. +- The group with the most relevant content should be selected, NOT just the first group or the group with most text LIMIT EXTRACTION: - Look for numbers in the request that indicate quantity (e.g., "50", "25", "100", "first 30", "top 10") @@ -683,15 +683,16 @@ ${samples.slice(0, 3).map((s, i) => ` ${i + 1}. "${s}"`).join('\n')}`; const systemPrompt = `You are a data field labeling assistant. Your job is to generate clear, semantic field names for extracted data based on the user's request and the actual field content. RULES FOR FIELD NAMING: -1. Use clear, descriptive names that match the content (e.g., "Product Name", "Price", "Rating") +1. Use clear, descriptive names that match the content and context 2. Keep names concise (2-4 words maximum) 3. Use Title Case for field names 4. Match the user's terminology when possible -5. Be specific - if it's a product name, call it "Product Name" not just "Name" -6. For images, include "Image" or "Photo" in the name (e.g., "Product Image") -7. For links/URLs, you can use "URL" or "Link" (e.g., "Product URL" or "Details Link") +5. Be specific - include context when needed (e.g., "Product Name", "Job Title", "Article Headline", "Company Name") +6. For images, include "Image" or "Photo" in the name (e.g., "Product Image", "Profile Photo", "Thumbnail") +7. For links/URLs, you can use "URL" or "Link" (e.g., "Details Link", "Company Website") 8. Avoid generic terms like "Text", "Field", "Data" unless absolutely necessary -9. If you can't determine the meaning, use a descriptive observation (e.g., "Star Rating", "Numeric Value") +9. If you can't determine the meaning, use a descriptive observation based on the content type +10. Adapt to the domain: e-commerce (Product, Price), jobs (Title, Company), articles (Headline, Author), etc. You must return a JSON object mapping each generic label to its semantic name.`; @@ -714,14 +715,6 @@ Return a JSON object with this exact structure: "Label 1": "Semantic Field Name 1", "Label 2": "Semantic Field Name 2", ... -} - -Example - if extracting products: -{ - "Label 1": "Product Name", - "Label 2": "Price", - "Label 3": "Product Image", - "Label 4": "Rating" }`; let llmResponse: string; @@ -881,6 +874,231 @@ Example - if extracting products: } } + /** + * Filter fields based on user intent using LLM with confidence scoring + */ + private static async filterFieldsByIntent( + labeledFields: Record, + fieldSamples: Record, + prompt: string, + llmConfig?: { + provider?: 'anthropic' | 'openai' | 'ollama'; + model?: string; + apiKey?: string; + baseUrl?: string; + } + ): Promise<{ + selectedFields: Record; + confidence: number; + reasoning: string; + needsUserConfirmation: boolean; + }> { + try { + const provider = llmConfig?.provider || 'ollama'; + const axios = require('axios'); + + const fieldDescriptions = Object.entries(labeledFields).map(([fieldName, fieldInfo]) => { + const samples = fieldSamples[fieldName] || []; + const sampleText = samples.length > 0 + ? samples.slice(0, 3).map((s, i) => ` ${i + 1}. "${s}"`).join('\n') + : ' (no samples available)'; + + return `${fieldName}: + Type: ${fieldInfo.tag || 'unknown'} + Attribute: ${fieldInfo.attribute || 'innerText'} + Sample values: +${sampleText}`; + }).join('\n\n'); + + const systemPrompt = `You are a field filter assistant. Your job is to analyze the user's extraction request and select ONLY the fields that match their intent. + +CRITICAL RULES: +1. Only include fields explicitly mentioned or clearly implied by the user's request +2. Use semantic matching (e.g., "quotes" matches "Quote Text", "company names" matches "Company Name") +3. If the user specifies a count (e.g., "20 quotes"), note it but return the matching fields +4. Be strict: when in doubt, exclude the field rather than include it +5. Return high confidence (0.9-1.0) only if matches are exact or obvious +6. Return medium confidence (0.6-0.8) if matches are semantic/implied +7. Return low confidence (<0.6) if uncertain + +You must return a JSON object with selectedFields, confidence, and reasoning.`; + + const userPrompt = `User's extraction request: "${prompt}" + +Available labeled fields: +${fieldDescriptions} + +TASK: Determine which fields the user wants to extract based on their request. + +Return a JSON object with this exact structure: +{ + "selectedFields": ["Field Name 1", "Field Name 2"], + "confidence": 0.95, + "reasoning": "Brief explanation of why these fields were selected and confidence level" +} + +Rules: +- selectedFields: Array of field names that match the user's intent +- confidence: Number between 0 and 1 (1.0 = exact match, 0.8+ = semantic match, <0.7 = uncertain) +- reasoning: Explain which keywords from the user's request matched which fields`; + + let llmResponse: string; + + if (provider === 'ollama') { + const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434'; + const ollamaModel = llmConfig?.model || 'llama3.2-vision'; + + const jsonSchema = { + type: 'object', + required: ['selectedFields', 'confidence', 'reasoning'], + properties: { + selectedFields: { + type: 'array', + items: { type: 'string' }, + description: 'Array of field names that match user intent' + }, + confidence: { + type: 'number', + minimum: 0, + maximum: 1, + description: 'Confidence score from 0 to 1' + }, + reasoning: { + type: 'string', + description: 'Explanation of field selection and confidence' + } + } + }; + + const response = await axios.post(`${ollamaBaseUrl}/api/chat`, { + model: ollamaModel, + messages: [ + { + role: 'system', + content: systemPrompt + }, + { + role: 'user', + content: userPrompt + } + ], + stream: false, + format: jsonSchema, + options: { + temperature: 0.1, + top_p: 0.9 + } + }); + + llmResponse = response.data.message.content; + + } else if (provider === 'anthropic') { + const anthropic = new Anthropic({ + apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY + }); + const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022'; + + const response = await anthropic.messages.create({ + model: anthropicModel, + max_tokens: 1024, + temperature: 0.1, + messages: [{ + role: 'user', + content: userPrompt + }], + system: systemPrompt + }); + + const textContent = response.content.find((c: any) => c.type === 'text'); + llmResponse = textContent?.type === 'text' ? textContent.text : ''; + + } else if (provider === 'openai') { + const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1'; + const openaiModel = llmConfig?.model || 'gpt-4o-mini'; + + const response = await axios.post(`${openaiBaseUrl}/chat/completions`, { + model: openaiModel, + messages: [ + { + role: 'system', + content: systemPrompt + }, + { + role: 'user', + content: userPrompt + } + ], + max_tokens: 1024, + temperature: 0.1, + response_format: { type: 'json_object' } + }, { + headers: { + 'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`, + 'Content-Type': 'application/json' + } + }); + + llmResponse = response.data.choices[0].message.content; + + } else { + throw new Error(`Unsupported LLM provider: ${provider}`); + } + + logger.info(`LLM Field Filtering Response: ${llmResponse}`); + + // Parse JSON response + let jsonStr = llmResponse.trim(); + + const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/); + if (jsonMatch) { + jsonStr = jsonMatch[1].trim(); + } + + const objectMatch = jsonStr.match(/\{[\s\S]*\}/); + if (objectMatch) { + jsonStr = objectMatch[0]; + } + + const filterResult = JSON.parse(jsonStr); + + if (!Array.isArray(filterResult.selectedFields)) { + throw new Error('Invalid response: selectedFields must be an array'); + } + + if (typeof filterResult.confidence !== 'number' || filterResult.confidence < 0 || filterResult.confidence > 1) { + throw new Error('Invalid response: confidence must be a number between 0 and 1'); + } + + const filteredFields: Record = {}; + for (const fieldName of filterResult.selectedFields) { + if (labeledFields[fieldName]) { + filteredFields[fieldName] = labeledFields[fieldName]; + } else { + logger.warn(`LLM selected field "${fieldName}" but it doesn't exist in labeled fields`); + } + } + + const needsUserConfirmation = filterResult.confidence < 0.8 || Object.keys(filteredFields).length === 0; + + return { + selectedFields: filteredFields, + confidence: filterResult.confidence, + reasoning: filterResult.reasoning || 'No reasoning provided', + needsUserConfirmation + }; + + } catch (error: any) { + logger.error(`Error filtering fields by intent: ${error.message}`); + + return { + selectedFields: labeledFields, + confidence: 0.5, + reasoning: 'Error during filtering, returning all fields as fallback', + needsUserConfirmation: true + }; + } + } + /** * Extract sample data from fields for LLM labeling */ @@ -1024,6 +1242,28 @@ Example - if extracting products: const semanticLabel = fieldLabels[genericLabel] || genericLabel; renamedFields[semanticLabel] = fieldInfo; }); + + const renamedSamples: Record = {}; + Object.entries(fieldSamples).forEach(([genericLabel, samples]) => { + const semanticLabel = fieldLabels[genericLabel] || genericLabel; + renamedSamples[semanticLabel] = samples; + }); + + const filterResult = await this.filterFieldsByIntent( + renamedFields, + renamedSamples, + prompt || 'Extract list data', + llmConfig + ); + + let finalFields = renamedFields; + if (filterResult.confidence >= 0.8 && Object.keys(filterResult.selectedFields).length > 0) { + finalFields = filterResult.selectedFields; + } else if (filterResult.confidence >= 0.6 && Object.keys(filterResult.selectedFields).length > 0) { + finalFields = filterResult.selectedFields; + } else { + logger.warn(`Low confidence (${filterResult.confidence}) or no fields selected. Using all detected fields as fallback.`); + } let paginationType = 'none'; let paginationSelector = ''; @@ -1041,7 +1281,7 @@ Example - if extracting products: actionId: `list-${uuid()}`, name: 'List 1', args: [{ - fields: renamedFields, + fields: finalFields, listSelector: autoDetectResult.listSelector, pagination: { type: paginationType,