Merge branch 'develop' into crawl-search

This commit is contained in:
Rohit
2026-01-04 18:21:47 +05:30
committed by GitHub
17 changed files with 462 additions and 53 deletions

View File

@@ -88,6 +88,7 @@ router.post("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res:
type,
url: extractedUrl,
formats: (workflowFile.meta as any).formats || [],
isLLM: (workflowFile.meta as any).isLLM,
};
const robot = await Robot.create({
@@ -102,10 +103,14 @@ router.post("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res:
const eventName = robotMeta.isLLM
? "maxun-oss-llm-robot-created"
: "maxun-oss-robot-created";
capture(eventName, {
const telemetryData: any = {
robot_meta: robot.recording_meta,
recording: robot.recording,
});
};
if (robotMeta.isLLM && (workflowFile.meta as any).prompt) {
telemetryData.prompt = (workflowFile.meta as any).prompt;
}
capture(eventName, telemetryData);
return res.status(201).json({
data: robot,
@@ -916,6 +921,7 @@ router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest,
capture("maxun-oss-llm-robot-created", {
robot_meta: robot.recording_meta,
recording: robot.recording,
prompt: prompt,
});
return res.status(200).json({

View File

@@ -7,6 +7,7 @@ interface UserAttributes {
password: string;
api_key_name?: string | null;
api_key?: string | null;
api_key_created_at?: Date | null;
proxy_url?: string | null;
proxy_username?: string | null;
proxy_password?: string | null;
@@ -20,6 +21,7 @@ class User extends Model<UserAttributes, UserCreationAttributes> implements User
public password!: string;
public api_key_name!: string | null;
public api_key!: string | null;
public api_key_created_at!: Date | null;
public proxy_url!: string | null;
public proxy_username!: string | null;
public proxy_password!: string | null;
@@ -53,6 +55,10 @@ User.init(
type: DataTypes.STRING,
allowNull: true,
},
api_key_created_at: {
type: DataTypes.DATE,
allowNull: true,
},
proxy_url: {
type: DataTypes.STRING,
allowNull: true,

View File

@@ -255,8 +255,9 @@ router.post(
return res.status(400).json({ message: "API key already exists" });
}
const apiKey = genAPIKey();
const createdAt = new Date();
await user.update({ api_key: apiKey });
await user.update({ api_key: apiKey, api_key_created_at: createdAt })
capture("maxun-oss-api-key-created", {
user_id: user.id,
@@ -266,6 +267,7 @@ router.post(
return res.status(200).json({
message: "API key generated successfully",
api_key: apiKey,
api_key_created_at: createdAt,
});
} catch (error) {
return res
@@ -290,7 +292,7 @@ router.get(
const user = await User.findByPk(req.user.id, {
raw: true,
attributes: ["api_key"],
attributes: ["api_key", "api_key_created_at"]
});
if (!user) {
@@ -305,6 +307,7 @@ router.get(
ok: true,
message: "API key fetched successfully",
api_key: user.api_key || null,
api_key_created_at: user.api_key_created_at || null,
});
} catch (error) {
console.error('API Key fetch error:', error);
@@ -336,7 +339,7 @@ router.delete(
return res.status(404).json({ message: "API Key not found" });
}
await User.update({ api_key: null }, { where: { id: req.user.id } });
await User.update({ api_key: null, api_key_created_at: null }, { where: { id: req.user.id } });
capture("maxun-oss-api-key-deleted", {
user_id: user.id,

View File

@@ -583,6 +583,7 @@ router.post('/recordings/llm', requireSignIn, async (req: AuthenticatedRequest,
robot_meta: newRobot.recording_meta,
recording: newRobot.recording,
llm_provider: llmProvider || 'ollama',
prompt: prompt,
});
return res.status(201).json({

View File

@@ -1240,6 +1240,168 @@ Rules:
}
}
/**
 * Generate a semantic list name using an LLM, based on the user's prompt and
 * the detected field names.
 *
 * The model is asked for a 1-3 word Title Case name. Its reply is reduced to
 * the first line, stripped of one surrounding quote character on each side,
 * whitespace-normalised and re-cased word by word. This method never rejects:
 * any failure (request error, unsupported provider, empty reply, or a reply
 * longer than 50 characters) is logged and the fallback name "List 1" is
 * returned instead.
 *
 * @param prompt - The user's extraction request, quoted into the LLM prompt.
 * @param url - URL of the page being extracted, included in the LLM prompt.
 * @param fieldNames - Detected field names; only the first 10 are sent to the model.
 * @param llmConfig - Optional provider settings. Provider defaults to 'ollama';
 *   each provider branch supplies its own default model (and, where used,
 *   base URL / API key from the environment).
 * @returns The generated list name, or "List 1" when no usable name was obtained.
 */
private static async generateListName(
  prompt: string,
  url: string,
  fieldNames: string[],
  llmConfig?: {
    provider?: 'anthropic' | 'openai' | 'ollama';
    model?: string;
    apiKey?: string;
    baseUrl?: string;
  }
): Promise<string> {
  try {
    const provider = llmConfig?.provider || 'ollama';
    // axios is resolved lazily inside the method, as in the original code.
    const axios = require('axios');

    // Cap the field list at 10 entries so the prompt stays small.
    const fieldContext = fieldNames.length > 0
      ? `\n\nDetected fields in the list:\n${fieldNames.slice(0, 10).map((name, idx) => `${idx + 1}. ${name}`).join('\n')}`
      : '';

    const systemPrompt = `You are a list naming assistant. Your job is to generate a clear, concise name for a data list based on the user's extraction request and the fields being extracted.
RULES FOR LIST NAMING:
1. Use 1-3 words maximum (prefer 2 words)
2. Use Title Case (e.g., "Product Listings", "Job Postings")
3. Be specific and descriptive
4. Match the user's terminology when possible
5. Adapt to the domain: e-commerce (Products, Listings), jobs (Jobs, Postings), articles (Articles, News), etc.
6. Avoid generic terms like "List", "Data", "Items" unless absolutely necessary
7. Focus on WHAT is being extracted, not HOW
Examples:
- User wants "product listings" → "Product Listings" or "Products"
- User wants "job postings" → "Job Postings" or "Jobs"
- User wants "article titles" → "Articles"
- User wants "company information" → "Companies"
- User wants "quotes from page" → "Quotes"
You must return ONLY the list name, nothing else. No JSON, no explanation, just the name.`;

    const userPrompt = `URL: ${url}
User's extraction request: "${prompt}"
${fieldContext}
TASK: Generate a concise, descriptive name for this list (1-3 words in Title Case).
Return ONLY the list name, nothing else:`;

    let llmResponse: string;

    if (provider === 'ollama') {
      const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434';
      const ollamaModel = llmConfig?.model || 'llama3.2-vision';
      try {
        const response = await axios.post(`${ollamaBaseUrl}/api/chat`, {
          model: ollamaModel,
          messages: [
            {
              role: 'system',
              content: systemPrompt
            },
            {
              role: 'user',
              content: userPrompt
            }
          ],
          stream: false,
          options: {
            temperature: 0.1,
            top_p: 0.9,
            num_predict: 20
          }
        });
        llmResponse = response.data.message.content;
      } catch (ollamaError: unknown) {
        // Ollama failures get their own log line before falling back.
        const reason = ollamaError instanceof Error ? ollamaError.message : String(ollamaError);
        logger.error(`Ollama request failed for list naming: ${reason}`);
        logger.info('Using fallback list name: "List 1"');
        return 'List 1';
      }
    } else if (provider === 'anthropic') {
      const anthropic = new Anthropic({
        apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY
      });
      const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022';
      const response = await anthropic.messages.create({
        model: anthropicModel,
        max_tokens: 20,
        temperature: 0.1,
        messages: [{
          role: 'user',
          content: userPrompt
        }],
        system: systemPrompt
      });
      const textContent = response.content.find((c: any) => c.type === 'text');
      llmResponse = textContent?.type === 'text' ? textContent.text : '';
    } else if (provider === 'openai') {
      const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1';
      const openaiModel = llmConfig?.model || 'gpt-4o-mini';
      const response = await axios.post(`${openaiBaseUrl}/chat/completions`, {
        model: openaiModel,
        messages: [
          {
            role: 'system',
            content: systemPrompt
          },
          {
            role: 'user',
            content: userPrompt
          }
        ],
        max_tokens: 20,
        temperature: 0.1
      }, {
        headers: {
          'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`,
          'Content-Type': 'application/json'
        }
      });
      llmResponse = response.data.choices[0].message.content;
    } else {
      throw new Error(`Unsupported LLM provider: ${provider}`);
    }

    const rawResponse = (llmResponse || '').trim();
    logger.info(`LLM List Naming Response: "${rawResponse}"`);

    // Keep only the first line BEFORE stripping quotes: if the model appends
    // extra lines, the closing quote sits at the end of line one, not at the
    // end of the whole reply, and would otherwise be left dangling.
    const [firstLine = ''] = rawResponse.split('\n');
    let listName = firstLine
      .trim()
      .replace(/^["']|["']$/g, '')
      .trim()
      // Collapse internal whitespace runs so words are single-space separated.
      .replace(/\s+/g, ' ');

    if (!listName || listName.length === 0) {
      throw new Error('LLM returned empty list name');
    }
    if (listName.length > 50) {
      throw new Error('LLM returned list name that is too long');
    }

    // Force "Xxxx Yyyy" casing on every word.
    listName = listName.split(' ')
      .map((word: string) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase())
      .join(' ');

    logger.info(`✓ Generated list name: "${listName}"`);
    return listName;
  } catch (error: unknown) {
    // Narrow before reading `.message` so a non-Error throw cannot raise a
    // second exception here and defeat the fallback.
    const reason = error instanceof Error ? error.message : String(error);
    logger.error(`Error in generateListName: ${reason}`);
    logger.info('Using fallback list name: "List 1"');
    return 'List 1';
  }
}
/**
* Build workflow from LLM decision
*/
@@ -1333,10 +1495,19 @@ Rules:
const limit = llmDecision.limit || 100;
logger.info(`Using limit: ${limit}`);
logger.info('Generating semantic list name with LLM...');
const listName = await this.generateListName(
prompt || 'Extract list data',
url,
Object.keys(finalFields),
llmConfig
);
logger.info(`Using list name: "${listName}"`);
workflow[0].what.push({
action: 'scrapeList',
actionId: `list-${uuid()}`,
name: 'List 1',
name: listName,
args: [{
fields: finalFields,
listSelector: autoDetectResult.listSelector,

View File

@@ -580,6 +580,13 @@ export class WorkflowInterpreter {
setActionName: (name: string) => {
this.currentActionName = name;
},
// Forwards a progress report (current step, total steps, percentage) as a
// 'workflowProgress' event emitted on `this.socket.nsp`.
// NOTE(review): presumably `nsp` is the socket's namespace, so every client
// connected to it receives the event — confirm this is intended.
progressUpdate: (current: number, total: number, percentage: number) => {
  const progress = { current, total, percentage };
  this.socket.nsp.emit('workflowProgress', progress);
},
},
serializableCallback: async (data: any) => {
try {