feat: auto label llm extracted fields

This commit is contained in:
Rohit Rajan
2025-12-11 11:43:51 +05:30
parent 5fcc176fe5
commit 8bbcadafa7

View File

@@ -316,7 +316,7 @@ export class WorkflowEnricher {
);
logger.info(`LLM decided action type: ${llmDecision.actionType}`);
const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, url, validator);
const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, url, validator, prompt, llmConfig);
await validator.close();
@@ -638,13 +638,346 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
};
}
/**
* Generate semantic field labels using LLM based on content and context
*/
private static async generateFieldLabels(
fields: Record<string, any>,
fieldSamples: Record<string, string[]>,
prompt: string,
url: string,
llmConfig?: {
provider?: 'anthropic' | 'openai' | 'ollama';
model?: string;
apiKey?: string;
baseUrl?: string;
}
): Promise<Record<string, string>> {
try {
const provider = llmConfig?.provider || 'ollama';
const axios = require('axios');
const fieldDescriptions = Object.entries(fieldSamples).map(([genericLabel, samples]) => {
const fieldInfo = fields[genericLabel];
const tagType = fieldInfo?.tag?.toLowerCase() || 'unknown';
const attribute = fieldInfo?.attribute || 'innerText';
let typeHint = '';
if (attribute === 'href') typeHint = '(link/URL)';
else if (attribute === 'src') typeHint = '(image)';
else if (tagType === 'img') typeHint = '(image)';
else if (tagType === 'a') typeHint = '(link)';
return `${genericLabel}:
Type: ${tagType} ${typeHint}
Attribute: ${attribute}
Sample values:
${samples.slice(0, 3).map((s, i) => ` ${i + 1}. "${s}"`).join('\n')}`;
}).join('\n\n');
const systemPrompt = `You are a data field labeling assistant. Your job is to generate clear, semantic field names for extracted data based on the user's request and the actual field content.
RULES FOR FIELD NAMING:
1. Use clear, descriptive names that match the content (e.g., "Product Name", "Price", "Rating")
2. Keep names concise (2-4 words maximum)
3. Use Title Case for field names
4. Match the user's terminology when possible
5. Be specific - if it's a product name, call it "Product Name" not just "Name"
6. For images, include "Image" or "Photo" in the name (e.g., "Product Image")
7. For links/URLs, you can use "URL" or "Link" (e.g., "Product URL" or "Details Link")
8. Avoid generic terms like "Text", "Field", "Data" unless absolutely necessary
9. If you can't determine the meaning, use a descriptive observation (e.g., "Star Rating", "Numeric Value")
You must return a JSON object mapping each generic label to its semantic name.`;
const userPrompt = `URL: ${url}
User's extraction request: "${prompt}"
Detected fields with sample data:
${fieldDescriptions}
TASK: Generate a semantic name for each field that accurately describes what it contains.
Consider:
- What the user is trying to extract (from their request)
- The actual content in the sample values
- The HTML element type and attribute being extracted
- Common naming conventions for this type of data
Return a JSON object with this exact structure:
{
"Label 1": "Semantic Field Name 1",
"Label 2": "Semantic Field Name 2",
...
}
Example - if extracting products:
{
"Label 1": "Product Name",
"Label 2": "Price",
"Label 3": "Product Image",
"Label 4": "Rating"
}`;
let llmResponse: string;
if (provider === 'ollama') {
const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
const ollamaModel = llmConfig?.model || 'llama3.2-vision';
logger.info(`Using Ollama at ${ollamaBaseUrl} with model ${ollamaModel}`);
const jsonSchema = {
type: 'object',
required: ['fieldLabels'],
properties: {
fieldLabels: {
type: 'object',
description: 'Mapping of generic labels to semantic field names',
patternProperties: {
'^Label \\d+$': {
type: 'string',
description: 'Semantic field name in Title Case'
}
}
}
}
};
try {
const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
model: ollamaModel,
messages: [
{
role: 'system',
content: systemPrompt
},
{
role: 'user',
content: userPrompt
}
],
stream: false,
format: jsonSchema,
options: {
temperature: 0.1,
top_p: 0.9
}
});
llmResponse = response.data.message.content;
} catch (ollamaError: any) {
logger.error(`Ollama request failed: ${ollamaError.message}`);
if (ollamaError.response) {
logger.error(`Ollama response status: ${ollamaError.response.status}`);
logger.error(`Ollama response data: ${JSON.stringify(ollamaError.response.data)}`);
}
throw new Error(`Ollama API error: ${ollamaError.message}. Make sure Ollama is running at ${ollamaBaseUrl}`);
}
} else if (provider === 'anthropic') {
const anthropic = new Anthropic({
apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
});
const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';
const response = await anthropic.messages.create({
model: anthropicModel,
max_tokens: 2048,
temperature: 0.1,
messages: [{
role: 'user',
content: userPrompt
}],
system: systemPrompt
});
const textContent = response.content.find((c: any) => c.type === 'text');
llmResponse = textContent?.type === 'text' ? textContent.text : '';
} else if (provider === 'openai') {
const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
const openaiModel = llmConfig?.model || 'gpt-4o-mini';
const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
model: openaiModel,
messages: [
{
role: 'system',
content: systemPrompt
},
{
role: 'user',
content: userPrompt
}
],
max_tokens: 2048,
temperature: 0.1,
response_format: { type: 'json_object' }
}, {
headers: {
'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
'Content-Type': 'application/json'
}
});
llmResponse = response.data.choices[0].message.content;
} else {
throw new Error(`Unsupported LLM provider: ${provider}`);
}
logger.info(`LLM Field Labeling Response: ${llmResponse}`);
let jsonStr = llmResponse.trim();
const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
if (jsonMatch) {
jsonStr = jsonMatch[1].trim();
}
const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
if (objectMatch) {
jsonStr = objectMatch[0];
}
const parsedResponse = JSON.parse(jsonStr);
let labelMapping: Record<string, string>;
if (parsedResponse.fieldLabels) {
labelMapping = parsedResponse.fieldLabels;
} else {
labelMapping = parsedResponse;
}
const missingLabels: string[] = [];
Object.keys(fields).forEach(genericLabel => {
if (!labelMapping[genericLabel]) {
missingLabels.push(genericLabel);
}
});
if (missingLabels.length > 0) {
logger.warn(`LLM did not provide labels for: ${missingLabels.join(', ')}`);
missingLabels.forEach(label => {
labelMapping[label] = label;
});
}
logger.info(`Generated semantic field labels: ${JSON.stringify(labelMapping, null, 2)}`);
return labelMapping;
} catch (error: any) {
logger.error(`Error generating field labels with LLM: ${error.message}`);
logger.error(`Using fallback: keeping generic field labels`);
const fallbackLabels: Record<string, string> = {};
Object.keys(fields).forEach(label => {
fallbackLabels[label] = label;
});
return fallbackLabels;
}
}
/**
* Extract sample data from fields for LLM labeling
*/
private static async extractFieldSamples(
fields: Record<string, any>,
listSelector: string,
validator: SelectorValidator
): Promise<Record<string, string[]>> {
const fieldSamples: Record<string, string[]> = {};
try {
const page = (validator as any).page;
if (!page) {
throw new Error('Page not available');
}
const samples = await page.evaluate((args: { fieldsData: any; listSel: string }) => {
const results: Record<string, string[]> = {};
function evaluateSelector(selector: string, doc: Document): Element[] {
const isXPath = selector.startsWith('//') || selector.startsWith('(//');
if (isXPath) {
const result = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
const elements: Element[] = [];
for (let i = 0; i < result.snapshotLength; i++) {
const node = result.snapshotItem(i);
if (node && node.nodeType === Node.ELEMENT_NODE) {
elements.push(node as Element);
}
}
return elements;
} else {
return Array.from(doc.querySelectorAll(selector));
}
}
const listItems = evaluateSelector(args.listSel, document).slice(0, 5);
Object.entries(args.fieldsData).forEach(([fieldLabel, fieldInfo]: [string, any]) => {
const samples: string[] = [];
const selector = fieldInfo.selector;
const attribute = fieldInfo.attribute || 'innerText';
listItems.forEach((listItem: Element) => {
try {
const elements = evaluateSelector(selector, document);
const matchingElement = elements.find((el: Element) => {
return listItem.contains(el);
});
if (matchingElement) {
let value = '';
if (attribute === 'innerText') {
value = (matchingElement.textContent || '').trim();
} else {
value = matchingElement.getAttribute(attribute) || '';
}
if (value && value.length > 0 && !samples.includes(value)) {
samples.push(value.substring(0, 200));
}
}
} catch (e) {
}
});
results[fieldLabel] = samples;
});
return results;
}, { fieldsData: fields, listSel: listSelector });
logger.info(`Extracted field samples: ${JSON.stringify(Object.keys(samples).map(key => ({ field: key, sampleCount: samples[key].length })))}`);
return samples;
} catch (error: any) {
logger.error(`Error extracting field samples: ${error.message}`);
logger.error(`Error stack: ${error.stack}`);
Object.keys(fields).forEach(label => {
fieldSamples[label] = [];
});
return fieldSamples;
}
}
/**
* Build workflow from LLM decision
*/
private static async buildWorkflowFromLLMDecision(
llmDecision: any,
url: string,
validator: SelectorValidator
validator: SelectorValidator,
prompt?: string,
llmConfig?: {
provider?: 'anthropic' | 'openai' | 'ollama';
model?: string;
apiKey?: string;
baseUrl?: string;
}
): Promise<any[]> {
const workflow: any[] = [];
@@ -667,6 +1000,30 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
logger.info(`Auto-detected ${Object.keys(autoDetectResult.fields).length} fields`);
logger.info('Extracting field samples for semantic labeling...');
const fieldSamples = await this.extractFieldSamples(
autoDetectResult.fields,
autoDetectResult.listSelector || '',
validator
);
logger.info('Generating semantic field labels with LLM...');
const fieldLabels = await this.generateFieldLabels(
autoDetectResult.fields,
fieldSamples,
prompt || 'Extract list data',
url,
llmConfig
);
const renamedFields: Record<string, any> = {};
Object.entries(autoDetectResult.fields).forEach(([genericLabel, fieldInfo]) => {
const semanticLabel = fieldLabels[genericLabel] || genericLabel;
renamedFields[semanticLabel] = fieldInfo;
});
logger.info(`Renamed fields: ${JSON.stringify(Object.keys(renamedFields))}`);
let paginationType = 'none';
let paginationSelector = '';
@@ -688,7 +1045,7 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
actionId: `list-${uuid()}`,
name: 'List 1',
args: [{
fields: autoDetectResult.fields,
fields: renamedFields,
listSelector: autoDetectResult.listSelector,
pagination: {
type: paginationType,