feat: auto-label LLM-extracted fields
This commit is contained in:
@@ -316,7 +316,7 @@ export class WorkflowEnricher {
|
|||||||
);
|
);
|
||||||
logger.info(`LLM decided action type: ${llmDecision.actionType}`);
|
logger.info(`LLM decided action type: ${llmDecision.actionType}`);
|
||||||
|
|
||||||
const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, url, validator);
|
const workflow = await this.buildWorkflowFromLLMDecision(llmDecision, url, validator, prompt, llmConfig);
|
||||||
|
|
||||||
await validator.close();
|
await validator.close();
|
||||||
|
|
||||||
@@ -638,13 +638,346 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
|
|||||||
};
|
};
|
||||||
}
|
}
|
||||||
|
|
||||||
|
  /**
   * Generate semantic field labels using an LLM, based on the user's extraction
   * prompt and sample values observed for each auto-detected field.
   *
   * Builds a per-field description (tag, extracted attribute, up to 3 sample
   * values), sends it to the configured provider (Ollama by default, or
   * Anthropic / OpenAI), and parses the JSON response into a mapping from
   * generic labels (e.g. "Label 1") to semantic names (e.g. "Product Name").
   *
   * @param fields       Auto-detected field metadata keyed by generic label
   *                     (each entry is expected to carry `tag` / `attribute`).
   * @param fieldSamples Sample extracted values keyed by the same generic labels.
   * @param prompt       The user's original extraction request, used for context.
   * @param url          Page URL, included in the prompt for additional context.
   * @param llmConfig    Optional provider/model/credential overrides.
   * @returns Mapping of generic label -> semantic field name. Never throws:
   *          on any failure, falls back to an identity mapping (each label
   *          maps to itself) so downstream renaming is a no-op.
   */
  private static async generateFieldLabels(
    fields: Record<string, any>,
    fieldSamples: Record<string, string[]>,
    prompt: string,
    url: string,
    llmConfig?: {
      provider?: 'anthropic' | 'openai' | 'ollama';
      model?: string;
      apiKey?: string;
      baseUrl?: string;
    }
  ): Promise<Record<string, string>> {
    try {
      // Default to a local Ollama instance when no provider is configured.
      const provider = llmConfig?.provider || 'ollama';
      const axios = require('axios');

      // Describe each field for the LLM: element type, extracted attribute,
      // and up to three sample values so the model can infer semantics.
      const fieldDescriptions = Object.entries(fieldSamples).map(([genericLabel, samples]) => {
        const fieldInfo = fields[genericLabel];
        const tagType = fieldInfo?.tag?.toLowerCase() || 'unknown';
        const attribute = fieldInfo?.attribute || 'innerText';

        // Hint at the field's nature from the attribute/tag so the model can
        // name link and image fields appropriately.
        let typeHint = '';
        if (attribute === 'href') typeHint = '(link/URL)';
        else if (attribute === 'src') typeHint = '(image)';
        else if (tagType === 'img') typeHint = '(image)';
        else if (tagType === 'a') typeHint = '(link)';

        return `${genericLabel}:
  Type: ${tagType} ${typeHint}
  Attribute: ${attribute}
  Sample values:
${samples.slice(0, 3).map((s, i) => `  ${i + 1}. "${s}"`).join('\n')}`;
      }).join('\n\n');

      const systemPrompt = `You are a data field labeling assistant. Your job is to generate clear, semantic field names for extracted data based on the user's request and the actual field content.

RULES FOR FIELD NAMING:
1. Use clear, descriptive names that match the content (e.g., "Product Name", "Price", "Rating")
2. Keep names concise (2-4 words maximum)
3. Use Title Case for field names
4. Match the user's terminology when possible
5. Be specific - if it's a product name, call it "Product Name" not just "Name"
6. For images, include "Image" or "Photo" in the name (e.g., "Product Image")
7. For links/URLs, you can use "URL" or "Link" (e.g., "Product URL" or "Details Link")
8. Avoid generic terms like "Text", "Field", "Data" unless absolutely necessary
9. If you can't determine the meaning, use a descriptive observation (e.g., "Star Rating", "Numeric Value")

You must return a JSON object mapping each generic label to its semantic name.`;

      const userPrompt = `URL: ${url}

User's extraction request: "${prompt}"

Detected fields with sample data:
${fieldDescriptions}

TASK: Generate a semantic name for each field that accurately describes what it contains.
Consider:
- What the user is trying to extract (from their request)
- The actual content in the sample values
- The HTML element type and attribute being extracted
- Common naming conventions for this type of data

Return a JSON object with this exact structure:
{
  "Label 1": "Semantic Field Name 1",
  "Label 2": "Semantic Field Name 2",
  ...
}

Example - if extracting products:
{
  "Label 1": "Product Name",
  "Label 2": "Price",
  "Label 3": "Product Image",
  "Label 4": "Rating"
}`;

      let llmResponse: string;

      if (provider === 'ollama') {
        const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
        const ollamaModel = llmConfig?.model || 'llama3.2-vision';

        logger.info(`Using Ollama at ${ollamaBaseUrl} with model ${ollamaModel}`);

        // JSON schema passed via Ollama's `format` option to constrain the
        // response to a { fieldLabels: { "Label N": string } } object.
        const jsonSchema = {
          type: 'object',
          required: ['fieldLabels'],
          properties: {
            fieldLabels: {
              type: 'object',
              description: 'Mapping of generic labels to semantic field names',
              patternProperties: {
                '^Label \\d+$': {
                  type: 'string',
                  description: 'Semantic field name in Title Case'
                }
              }
            }
          }
        };

        try {
          const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
            model: ollamaModel,
            messages: [
              {
                role: 'system',
                content: systemPrompt
              },
              {
                role: 'user',
                content: userPrompt
              }
            ],
            stream: false,
            format: jsonSchema,
            options: {
              // Low temperature for deterministic, conservative naming.
              temperature: 0.1,
              top_p: 0.9
            }
          });

          llmResponse = response.data.message.content;
        } catch (ollamaError: any) {
          // Surface connection/HTTP details before rethrowing; the outer catch
          // converts this into the identity-mapping fallback.
          logger.error(`Ollama request failed: ${ollamaError.message}`);
          if (ollamaError.response) {
            logger.error(`Ollama response status: ${ollamaError.response.status}`);
            logger.error(`Ollama response data: ${JSON.stringify(ollamaError.response.data)}`);
          }
          throw new Error(`Ollama API error: ${ollamaError.message}. Make sure Ollama is running at ${ollamaBaseUrl}`);
        }

      } else if (provider === 'anthropic') {
        const anthropic = new Anthropic({
          apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
        });
        const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';

        const response = await anthropic.messages.create({
          model: anthropicModel,
          max_tokens: 2048,
          temperature: 0.1,
          messages: [{
            role: 'user',
            content: userPrompt
          }],
          system: systemPrompt
        });

        // The SDK returns a content array; pick the first text part.
        const textContent = response.content.find((c: any) => c.type === 'text');
        llmResponse = textContent?.type === 'text' ? textContent.text : '';

      } else if (provider === 'openai') {
        const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
        const openaiModel = llmConfig?.model || 'gpt-4o-mini';

        const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
          model: openaiModel,
          messages: [
            {
              role: 'system',
              content: systemPrompt
            },
            {
              role: 'user',
              content: userPrompt
            }
          ],
          max_tokens: 2048,
          temperature: 0.1,
          // Force a JSON object response (OpenAI JSON mode).
          response_format: { type: 'json_object' }
        }, {
          headers: {
            'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
            'Content-Type': 'application/json'
          }
        });

        llmResponse = response.data.choices[0].message.content;

      } else {
        throw new Error(`Unsupported LLM provider: ${provider}`);
      }

      logger.info(`LLM Field Labeling Response: ${llmResponse}`);

      let jsonStr = llmResponse.trim();

      // Strip a ```json ... ``` (or bare ``` ... ```) fence if the model
      // wrapped its answer in one.
      const jsonMatch = jsonStr.match(/```json\s*([\s\S]*?)\s*```/) || jsonStr.match(/```\s*([\s\S]*?)\s*```/);
      if (jsonMatch) {
        jsonStr = jsonMatch[1].trim();
      }

      // Then narrow to the outermost {...} in case of surrounding prose.
      const objectMatch = jsonStr.match(/\{[\s\S]*\}/);
      if (objectMatch) {
        jsonStr = objectMatch[0];
      }

      const parsedResponse = JSON.parse(jsonStr);

      // Accept either the schema-shaped { fieldLabels: {...} } (Ollama path)
      // or a bare label->name object (Anthropic/OpenAI paths).
      let labelMapping: Record<string, string>;
      if (parsedResponse.fieldLabels) {
        labelMapping = parsedResponse.fieldLabels;
      } else {
        labelMapping = parsedResponse;
      }

      // Any field the model skipped keeps its generic label so no field is
      // dropped during renaming.
      const missingLabels: string[] = [];
      Object.keys(fields).forEach(genericLabel => {
        if (!labelMapping[genericLabel]) {
          missingLabels.push(genericLabel);
        }
      });

      if (missingLabels.length > 0) {
        logger.warn(`LLM did not provide labels for: ${missingLabels.join(', ')}`);
        missingLabels.forEach(label => {
          labelMapping[label] = label;
        });
      }

      logger.info(`Generated semantic field labels: ${JSON.stringify(labelMapping, null, 2)}`);
      return labelMapping;

    } catch (error: any) {
      // Any failure (network, parsing, unsupported provider) degrades to an
      // identity mapping: the workflow keeps its generic labels rather than
      // failing the whole enrichment.
      logger.error(`Error generating field labels with LLM: ${error.message}`);
      logger.error(`Using fallback: keeping generic field labels`);
      const fallbackLabels: Record<string, string> = {};
      Object.keys(fields).forEach(label => {
        fallbackLabels[label] = label;
      });
      return fallbackLabels;
    }
  }
||||||
|
|
||||||
|
/**
|
||||||
|
* Extract sample data from fields for LLM labeling
|
||||||
|
*/
|
||||||
|
private static async extractFieldSamples(
|
||||||
|
fields: Record<string, any>,
|
||||||
|
listSelector: string,
|
||||||
|
validator: SelectorValidator
|
||||||
|
): Promise<Record<string, string[]>> {
|
||||||
|
const fieldSamples: Record<string, string[]> = {};
|
||||||
|
|
||||||
|
try {
|
||||||
|
const page = (validator as any).page;
|
||||||
|
if (!page) {
|
||||||
|
throw new Error('Page not available');
|
||||||
|
}
|
||||||
|
|
||||||
|
const samples = await page.evaluate((args: { fieldsData: any; listSel: string }) => {
|
||||||
|
const results: Record<string, string[]> = {};
|
||||||
|
|
||||||
|
function evaluateSelector(selector: string, doc: Document): Element[] {
|
||||||
|
const isXPath = selector.startsWith('//') || selector.startsWith('(//');
|
||||||
|
if (isXPath) {
|
||||||
|
const result = doc.evaluate(selector, doc, null, XPathResult.ORDERED_NODE_SNAPSHOT_TYPE, null);
|
||||||
|
const elements: Element[] = [];
|
||||||
|
for (let i = 0; i < result.snapshotLength; i++) {
|
||||||
|
const node = result.snapshotItem(i);
|
||||||
|
if (node && node.nodeType === Node.ELEMENT_NODE) {
|
||||||
|
elements.push(node as Element);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
return elements;
|
||||||
|
} else {
|
||||||
|
return Array.from(doc.querySelectorAll(selector));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
const listItems = evaluateSelector(args.listSel, document).slice(0, 5);
|
||||||
|
|
||||||
|
Object.entries(args.fieldsData).forEach(([fieldLabel, fieldInfo]: [string, any]) => {
|
||||||
|
const samples: string[] = [];
|
||||||
|
const selector = fieldInfo.selector;
|
||||||
|
const attribute = fieldInfo.attribute || 'innerText';
|
||||||
|
|
||||||
|
listItems.forEach((listItem: Element) => {
|
||||||
|
try {
|
||||||
|
const elements = evaluateSelector(selector, document);
|
||||||
|
|
||||||
|
const matchingElement = elements.find((el: Element) => {
|
||||||
|
return listItem.contains(el);
|
||||||
|
});
|
||||||
|
|
||||||
|
if (matchingElement) {
|
||||||
|
let value = '';
|
||||||
|
if (attribute === 'innerText') {
|
||||||
|
value = (matchingElement.textContent || '').trim();
|
||||||
|
} else {
|
||||||
|
value = matchingElement.getAttribute(attribute) || '';
|
||||||
|
}
|
||||||
|
|
||||||
|
if (value && value.length > 0 && !samples.includes(value)) {
|
||||||
|
samples.push(value.substring(0, 200));
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} catch (e) {
|
||||||
|
}
|
||||||
|
});
|
||||||
|
|
||||||
|
results[fieldLabel] = samples;
|
||||||
|
});
|
||||||
|
|
||||||
|
return results;
|
||||||
|
}, { fieldsData: fields, listSel: listSelector });
|
||||||
|
|
||||||
|
logger.info(`Extracted field samples: ${JSON.stringify(Object.keys(samples).map(key => ({ field: key, sampleCount: samples[key].length })))}`);
|
||||||
|
return samples;
|
||||||
|
|
||||||
|
} catch (error: any) {
|
||||||
|
logger.error(`Error extracting field samples: ${error.message}`);
|
||||||
|
logger.error(`Error stack: ${error.stack}`);
|
||||||
|
Object.keys(fields).forEach(label => {
|
||||||
|
fieldSamples[label] = [];
|
||||||
|
});
|
||||||
|
return fieldSamples;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Build workflow from LLM decision
|
* Build workflow from LLM decision
|
||||||
*/
|
*/
|
||||||
private static async buildWorkflowFromLLMDecision(
|
private static async buildWorkflowFromLLMDecision(
|
||||||
llmDecision: any,
|
llmDecision: any,
|
||||||
url: string,
|
url: string,
|
||||||
validator: SelectorValidator
|
validator: SelectorValidator,
|
||||||
|
prompt?: string,
|
||||||
|
llmConfig?: {
|
||||||
|
provider?: 'anthropic' | 'openai' | 'ollama';
|
||||||
|
model?: string;
|
||||||
|
apiKey?: string;
|
||||||
|
baseUrl?: string;
|
||||||
|
}
|
||||||
): Promise<any[]> {
|
): Promise<any[]> {
|
||||||
const workflow: any[] = [];
|
const workflow: any[] = [];
|
||||||
|
|
||||||
@@ -667,6 +1000,30 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
|
|||||||
|
|
||||||
logger.info(`Auto-detected ${Object.keys(autoDetectResult.fields).length} fields`);
|
logger.info(`Auto-detected ${Object.keys(autoDetectResult.fields).length} fields`);
|
||||||
|
|
||||||
|
logger.info('Extracting field samples for semantic labeling...');
|
||||||
|
const fieldSamples = await this.extractFieldSamples(
|
||||||
|
autoDetectResult.fields,
|
||||||
|
autoDetectResult.listSelector || '',
|
||||||
|
validator
|
||||||
|
);
|
||||||
|
|
||||||
|
logger.info('Generating semantic field labels with LLM...');
|
||||||
|
const fieldLabels = await this.generateFieldLabels(
|
||||||
|
autoDetectResult.fields,
|
||||||
|
fieldSamples,
|
||||||
|
prompt || 'Extract list data',
|
||||||
|
url,
|
||||||
|
llmConfig
|
||||||
|
);
|
||||||
|
|
||||||
|
const renamedFields: Record<string, any> = {};
|
||||||
|
Object.entries(autoDetectResult.fields).forEach(([genericLabel, fieldInfo]) => {
|
||||||
|
const semanticLabel = fieldLabels[genericLabel] || genericLabel;
|
||||||
|
renamedFields[semanticLabel] = fieldInfo;
|
||||||
|
});
|
||||||
|
|
||||||
|
logger.info(`Renamed fields: ${JSON.stringify(Object.keys(renamedFields))}`);
|
||||||
|
|
||||||
let paginationType = 'none';
|
let paginationType = 'none';
|
||||||
let paginationSelector = '';
|
let paginationSelector = '';
|
||||||
|
|
||||||
@@ -688,7 +1045,7 @@ Note: selectedGroupIndex must be between 0 and ${elementGroups.length - 1}`;
|
|||||||
actionId: `list-${uuid()}`,
|
actionId: `list-${uuid()}`,
|
||||||
name: 'List 1',
|
name: 'List 1',
|
||||||
args: [{
|
args: [{
|
||||||
fields: autoDetectResult.fields,
|
fields: renamedFields,
|
||||||
listSelector: autoDetectResult.listSelector,
|
listSelector: autoDetectResult.listSelector,
|
||||||
pagination: {
|
pagination: {
|
||||||
type: paginationType,
|
type: paginationType,
|
||||||
|
|||||||
Reference in New Issue
Block a user