diff --git a/server/src/sdk/workflowEnricher.ts b/server/src/sdk/workflowEnricher.ts
index c15ba14d..d6bb06eb 100644
--- a/server/src/sdk/workflowEnricher.ts
+++ b/server/src/sdk/workflowEnricher.ts
@@ -316,7 +316,7 @@ export class WorkflowEnricher {
       );
 
       logger.info(`LLM decided action type: ${llmDecision.actionType}`);
 
-      const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, url, validator);
+      const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, url, validator, prompt, llmConfig);
 
       await validator.close();
 
@@ -638,13 +638,346 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
     };
   }
 
+  /**
+   * Generate semantic field labels using LLM based on content and context.
+   *
+   * @param fields       Detected field descriptors keyed by generic label ("Label 1", ...).
+   * @param fieldSamples Up-to-3 sample values per generic label, used as labeling evidence.
+   * @param prompt       The user's original extraction request (guides terminology).
+   * @param url          Page URL, included in the LLM prompt for context.
+   * @param llmConfig    Optional provider/model/credentials override; defaults to Ollama.
+   * @returns Mapping of generic label -> semantic field name. On any failure, falls back
+   *          to an identity mapping so extraction still proceeds with generic labels.
+   */
+  private static async generateFieldLabels(
+    fields: Record<string, any>,
+    fieldSamples: Record<string, string[]>,
+    prompt: string,
+    url: string,
+    llmConfig?: {
+      provider?: 'anthropic' | 'openai' | 'ollama';
+      model?: string;
+      apiKey?: string;
+      baseUrl?: string;
+    }
+  ): Promise<Record<string, string>> {
+    try {
+      const provider = llmConfig?.provider || 'ollama';
+      const axios = require('axios');
+
+      // Describe each field (tag, attribute, sample values) so the LLM can infer meaning.
+      const fieldDescriptions = Object.entries(fieldSamples).map(([genericLabel, samples]) => {
+        const fieldInfo = fields[genericLabel];
+        const tagType = fieldInfo?.tag?.toLowerCase() || 'unknown';
+        const attribute = fieldInfo?.attribute || 'innerText';
+
+        let typeHint = '';
+        if (attribute === 'href') typeHint = '(link/URL)';
+        else if (attribute === 'src') typeHint = '(image)';
+        else if (tagType === 'img') typeHint = '(image)';
+        else if (tagType === 'a') typeHint = '(link)';
+
+        return `${genericLabel}:
+  Type: ${tagType} ${typeHint}
+  Attribute: ${attribute}
+  Sample values:
+${samples.slice(0, 3).map((s, i) => `    ${i + 1}. "${s}"`).join('\n')}`;
+      }).join('\n\n');
+
+      const systemPrompt = `You are a data field labeling assistant. Your job is to generate clear, semantic field names for extracted data based on the user's request and the actual field content.
+
+RULES FOR FIELD NAMING:
+1. Use clear, descriptive names that match the content (e.g., "Product Name", "Price", "Rating")
+2. Keep names concise (2-4 words maximum)
+3. Use Title Case for field names
+4. Match the user's terminology when possible
+5. Be specific - if it's a product name, call it "Product Name" not just "Name"
+6. For images, include "Image" or "Photo" in the name (e.g., "Product Image")
+7. For links/URLs, you can use "URL" or "Link" (e.g., "Product URL" or "Details Link")
+8. Avoid generic terms like "Text", "Field", "Data" unless absolutely necessary
+9. If you can't determine the meaning, use a descriptive observation (e.g., "Star Rating", "Numeric Value")
+
+You must return a JSON object mapping each generic label to its semantic name.`;
+
+      const userPrompt = `URL: ${url}
+
+User's extraction request: "${prompt}"
+
+Detected fields with sample data:
+${fieldDescriptions}
+
+TASK: Generate a semantic name for each field that accurately describes what it contains.
+Consider:
+- What the user is trying to extract (from their request)
+- The actual content in the sample values
+- The HTML element type and attribute being extracted
+- Common naming conventions for this type of data
+
+Return a JSON object with this exact structure:
+{
+  "Label 1": "Semantic Field Name 1",
+  "Label 2": "Semantic Field Name 2",
+  ...
+}
+
+Example - if extracting products:
+{
+  "Label 1": "Product Name",
+  "Label 2": "Price",
+  "Label 3": "Product Image",
+  "Label 4": "Rating"
+}`;
+
+      let llmResponse: string;
+
+      if (provider === 'ollama') {
+        const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
+        const ollamaModel = llmConfig?.model || 'llama3.2-vision';
+
+        logger.info(`Using Ollama at ${ollamaBaseUrl} with model ${ollamaModel}`);
+
+        // Structured-output schema: constrain Ollama to a { fieldLabels: { "Label N": name } } object.
+        const jsonSchema = {
+          type: 'object',
+          required: ['fieldLabels'],
+          properties: {
+            fieldLabels: {
+              type: 'object',
+              description: 'Mapping of generic labels to semantic field names',
+              patternProperties: {
+                '^Label \\d+$': {
+                  type: 'string',
+                  description: 'Semantic field name in Title Case'
+                }
+              }
+            }
+          }
+        };
+
+        try {
+          const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
+            model: ollamaModel,
+            messages: [
+              {
+                role: 'system',
+                content: systemPrompt
+              },
+              {
+                role: 'user',
+                content: userPrompt
+              }
+            ],
+            stream: false,
+            format: jsonSchema,
+            options: {
+              temperature: 0.1,
+              top_p: 0.9
+            }
+          });
+
+          llmResponse = response.data.message.content;
+        } catch (ollamaError: any) {
+          logger.error(`Ollama request failed: ${ollamaError.message}`);
+          if (ollamaError.response) {
+            logger.error(`Ollama response status: ${ollamaError.response.status}`);
+            logger.error(`Ollama response data: ${JSON.stringify(ollamaError.response.data)}`);
+          }
+          throw new Error(`Ollama API error: ${ollamaError.message}. Make sure Ollama is running at ${ollamaBaseUrl}`);
+        }
+
+      } else if (provider === 'anthropic') {
+        const anthropic = new Anthropic({
+          apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
+        });
+        const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';
+
+        const response = await anthropic.messages.create({
+          model: anthropicModel,
+          max_tokens: 2048,
+          temperature: 0.1,
+          messages: [{
+            role: 'user',
+            content: userPrompt
+          }],
+          system: systemPrompt
+        });
+
+        const textContent = response.content.find((c: any) => c.type === 'text');
+        llmResponse = textContent?.type === 'text' ? textContent.text : '';
+
+      } else if (provider === 'openai') {
+        const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
+        const openaiModel = llmConfig?.model || 'gpt-4o-mini';
+
+        const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
+          model: openaiModel,
+          messages: [
+            {
+              role: 'system',
+              content: systemPrompt
+            },
+            {
+              role: 'user',
+              content: userPrompt
+            }
+          ],
+          max_tokens: 2048,
+          temperature: 0.1,
+          response_format: { type: 'json_object' }
+        }, {
+          headers: {
+            'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
+            'Content-Type': 'application/json'
+          }
+        });
+
+        llmResponse = response.data.choices[0].message.content;
+
+      } else {
+        throw new Error(`Unsupported LLM provider: ${provider}`);
+      }
+
+      logger.info(`LLM Field Labeling Response: ${llmResponse}`);
+
+      // Models often wrap JSON in markdown fences or surrounding prose; strip both.
+      let jsonStr = llmResponse.trim();
+
+      const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
+      if (jsonMatch) {
+        jsonStr = jsonMatch[1].trim();
+      }
+
+      const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
+      if (objectMatch) {
+        jsonStr = objectMatch[0];
+      }
+
+      const parsedResponse = JSON.parse(jsonStr);
+
+      // Accept either the schema-wrapped form ({ fieldLabels: {...} }) or a bare mapping.
+      let labelMapping: Record<string, string>;
+      if (parsedResponse.fieldLabels) {
+        labelMapping = parsedResponse.fieldLabels;
+      } else {
+        labelMapping = parsedResponse;
+      }
+
+      // Any field the LLM skipped keeps its generic label so no field is dropped.
+      const missingLabels: string[] = [];
+      Object.keys(fields).forEach(genericLabel => {
+        if (!labelMapping[genericLabel]) {
+          missingLabels.push(genericLabel);
+        }
+      });
+
+      if (missingLabels.length > 0) {
+        logger.warn(`LLM did not provide labels for: ${missingLabels.join(', ')}`);
+        missingLabels.forEach(label => {
+          labelMapping[label] = label;
+        });
+      }
+
+      logger.info(`Generated semantic field labels: ${JSON.stringify(labelMapping, null, 2)}`);
+      return labelMapping;
+
+    } catch (error: any) {
+      // Labeling is best-effort: on any error keep the generic labels rather than failing enrichment.
+      logger.error(`Error generating field labels with LLM: ${error.message}`);
+      logger.error(`Using fallback: keeping generic field labels`);
+      const fallbackLabels: Record<string, string> = {};
+      Object.keys(fields).forEach(label => {
+        fallbackLabels[label] = label;
+      });
+      return fallbackLabels;
+    }
+  }
+
+  /**
+   * Extract sample data from fields for LLM labeling.
+   *
+   * Evaluates each field selector inside the first 5 list items on the live page
+   * and collects up to one distinct value per item (truncated to 200 chars).
+   *
+   * @param fields       Field descriptors (selector + attribute) keyed by generic label.
+   * @param listSelector CSS or XPath selector for the repeating list items.
+   * @param validator    SelectorValidator whose underlying Playwright page is reused.
+   * @returns Samples per generic label; empty arrays for every label on failure.
+   */
+  private static async extractFieldSamples(
+    fields: Record<string, any>,
+    listSelector: string,
+    validator: SelectorValidator
+  ): Promise<Record<string, string[]>> {
+    const fieldSamples: Record<string, string[]> = {};
+
+    try {
+      // NOTE(review): reaches into the validator's private page — assumes it is still open.
+      const page = (validator as any).page;
+      if (!page) {
+        throw new Error('Page not available');
+      }
+
+      const samples = await page.evaluate((args: { fieldsData: any; listSel: string }) => {
+        const results: Record<string, string[]> = {};
+
+        // Supports both XPath ("//..." / "(//...") and CSS selectors.
+        function evaluateSelector(selector: string, doc: Document): Element[] {
+          const isXPath = selector.startsWith('//') || selector.startsWith('(//');
+          if (isXPath) {
+            const result = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
+            const elements: Element[] = [];
+            for (let i = 0; i < result.snapshotLength; i++) {
+              const node = result.snapshotItem(i);
+              if (node && node.nodeType === Node.ELEMENT_NODE) {
+                elements.push(node as Element);
+              }
+            }
+            return elements;
+          } else {
+            return Array.from(doc.querySelectorAll(selector));
+          }
+        }
+
+        const listItems = evaluateSelector(args.listSel, document).slice(0, 5);
+
+        Object.entries(args.fieldsData).forEach(([fieldLabel, fieldInfo]: [string, any]) => {
+          const samples: string[] = [];
+          const selector = fieldInfo.selector;
+          const attribute = fieldInfo.attribute || 'innerText';
+
+          listItems.forEach((listItem: Element) => {
+            try {
+              // Field selectors are document-scoped; pick the match contained in this list item.
+              const elements = evaluateSelector(selector, document);
+
+              const matchingElement = elements.find((el: Element) => {
+                return listItem.contains(el);
+              });
+
+              if (matchingElement) {
+                let value = '';
+                if (attribute === 'innerText') {
+                  value = (matchingElement.textContent || '').trim();
+                } else {
+                  value = matchingElement.getAttribute(attribute) || '';
+                }
+
+                if (value && value.length > 0 && !samples.includes(value)) {
+                  samples.push(value.substring(0, 200));
+                }
+              }
+            } catch (e) {
+              // Best-effort per item: a bad selector on one item must not abort sampling.
+            }
+          });
+
+          results[fieldLabel] = samples;
+        });
+
+        return results;
+      }, { fieldsData: fields, listSel: listSelector });
+
+      logger.info(`Extracted field samples: ${JSON.stringify(Object.keys(samples).map(key => ({ field: key, sampleCount: samples[key].length })))}`);
+      return samples;
+
+    } catch (error: any) {
+      logger.error(`Error extracting field samples: ${error.message}`);
+      logger.error(`Error stack: ${error.stack}`);
+      Object.keys(fields).forEach(label => {
+        fieldSamples[label] = [];
+      });
+      return fieldSamples;
+    }
+  }
+
   /**
    * Build workflow from LLM decision
    */
   private static async buildWorkflowFromLLMDecision(
     llmDecision: any,
     url: string,
-    validator: SelectorValidator
+    validator: SelectorValidator,
+    prompt?: string,
+    llmConfig?: {
+      provider?: 'anthropic' | 'openai' | 'ollama';
+      model?: string;
+      apiKey?: string;
+      baseUrl?: string;
+    }
   ): Promise<any[]> {
     const workflow: any[] = [];
 
@@ -667,6 +1000,30 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
 
       logger.info(`Auto-detected ${Object.keys(autoDetectResult.fields).length} fields`);
 
+      logger.info('Extracting field samples for semantic labeling...');
+      const fieldSamples = await this.extractFieldSamples(
+        autoDetectResult.fields,
+        autoDetectResult.listSelector || '',
+        validator
+      );
+
+      logger.info('Generating semantic field labels with LLM...');
+      const fieldLabels = await this.generateFieldLabels(
+        autoDetectResult.fields,
+        fieldSamples,
+        prompt || 'Extract list data',
+        url,
+        llmConfig
+      );
+
+      // Re-key the detected fields by their semantic names before building the workflow.
+      const renamedFields: Record<string, any> = {};
+      Object.entries(autoDetectResult.fields).forEach(([genericLabel, fieldInfo]) => {
+        const semanticLabel = fieldLabels[genericLabel] || genericLabel;
+        renamedFields[semanticLabel] = fieldInfo;
+      });
+
+      logger.info(`Renamed fields: ${JSON.stringify(Object.keys(renamedFields))}`);
+
       let paginationType = 'none';
       let paginationSelector = '';
 
@@ -688,7 +1045,7 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
         actionId: `list-${uuid()}`,
         name: 'List 1',
         args: [{
-          fields: autoDetectResult.fields,
+          fields: renamedFields,
           listSelector: autoDetectResult.listSelector,
           pagination: {
             type: paginationType,