diff --git a/README.md b/README.md index 0b10399e..69e4a958 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@

✨ Turn any website into clean, contextualized data pipelines for your AI applications ✨
-Maxun is the easiest way to extract web data with no code. The modern open-source alternative to BrowseAI, Octoparse and similar tools. +Maxun is the easiest way to extract web data with no code.

diff --git a/maxun-core/src/interpret.ts b/maxun-core/src/interpret.ts index 7670a3e0..c6dd653b 100644 --- a/maxun-core/src/interpret.ts +++ b/maxun-core/src/interpret.ts @@ -48,6 +48,7 @@ interface InterpreterOptions { debugMessage: (msg: string) => void, setActionType: (type: string) => void, incrementScrapeListIndex: () => void, + progressUpdate: (current: number, total: number, percentage: number) => void, }> } @@ -86,6 +87,10 @@ export default class Interpreter extends EventEmitter { private scrapeListCounter: number = 0; + private totalActions: number = 0; + + private executedActions: number = 0; + constructor(workflow: WorkflowFile, options?: Partial) { super(); this.workflow = workflow.workflow; @@ -2346,6 +2351,17 @@ export default class Interpreter extends EventEmitter { workflowCopy.splice(actionId, 1); console.log(`Action with ID ${action.id} removed from the workflow copy.`); + + this.executedActions++; + const percentage = Math.round((this.executedActions / this.totalActions) * 100); + + if (this.options.debugChannel?.progressUpdate) { + this.options.debugChannel.progressUpdate( + this.executedActions, + this.totalActions, + percentage + ); + } // const newSelectors = this.getPreviousSelectors(workflow, actionId); // const newSelectors = this.getSelectors(workflowCopy); @@ -2436,6 +2452,13 @@ export default class Interpreter extends EventEmitter { */ this.initializedWorkflow = Preprocessor.initWorkflow(this.workflow, params); + this.totalActions = this.initializedWorkflow.length; + this.executedActions = 0; + + if (this.options.debugChannel?.progressUpdate) { + this.options.debugChannel.progressUpdate(0, this.totalActions, 0); + } + await this.ensureScriptsLoaded(page); this.stopper = () => { diff --git a/server/src/api/sdk.ts b/server/src/api/sdk.ts index e6adfa7d..dabd309c 100644 --- a/server/src/api/sdk.ts +++ b/server/src/api/sdk.ts @@ -88,6 +88,7 @@ router.post("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res: type, url: extractedUrl, formats: (workflowFile.meta as any).formats || [], + isLLM: (workflowFile.meta as any).isLLM, }; const robot = await Robot.create({ @@ -102,10 +103,14 @@ router.post("/sdk/robots", requireAPIKey, async (req: AuthenticatedRequest, res: const eventName = robotMeta.isLLM ? "maxun-oss-llm-robot-created" : "maxun-oss-robot-created"; - capture(eventName, { + const telemetryData: any = { robot_meta: robot.recording_meta, recording: robot.recording, - }); + }; + if (robotMeta.isLLM && (workflowFile.meta as any).prompt) { + telemetryData.prompt = (workflowFile.meta as any).prompt; + } + capture(eventName, telemetryData); return res.status(201).json({ data: robot, @@ -916,6 +921,7 @@ router.post("/sdk/extract/llm", requireAPIKey, async (req: AuthenticatedRequest, capture("maxun-oss-llm-robot-created", { robot_meta: robot.recording_meta, recording: robot.recording, + prompt: prompt, }); return res.status(200).json({ diff --git a/server/src/models/User.ts b/server/src/models/User.ts index 6664f381..1076bc07 100644 --- a/server/src/models/User.ts +++ b/server/src/models/User.ts @@ -7,6 +7,7 @@ interface UserAttributes { password: string; api_key_name?: string | null; api_key?: string | null; + api_key_created_at?: Date | null; proxy_url?: string | null; proxy_username?: string | null; proxy_password?: string | null; @@ -20,6 +21,7 @@ class User extends Model implements User public password!: string; public api_key_name!: string | null; public api_key!: string | null; + public api_key_created_at!: Date | null; public proxy_url!: string | null; public proxy_username!: string | null; public proxy_password!: string | null; @@ -53,6 +55,10 @@ User.init( type: DataTypes.STRING, allowNull: true, }, + api_key_created_at: { + type: DataTypes.DATE, + allowNull: true, + }, proxy_url: { type: DataTypes.STRING, allowNull: true, diff --git a/server/src/routes/auth.ts b/server/src/routes/auth.ts index 5a758ee9..4ddbafe1 100644 --- a/server/src/routes/auth.ts +++ b/server/src/routes/auth.ts @@ -255,8 +255,9 @@ router.post( return res.status(400).json({ message: "API key already exists" }); } const apiKey = genAPIKey(); + const createdAt = new Date(); - await user.update({ api_key: apiKey }); + await user.update({ api_key: apiKey, api_key_created_at: createdAt }) capture("maxun-oss-api-key-created", { user_id: user.id, @@ -266,6 +267,7 @@ router.post( return res.status(200).json({ message: "API key generated successfully", api_key: apiKey, + api_key_created_at: createdAt, }); } catch (error) { return res @@ -290,7 +292,7 @@ router.get( const user = await User.findByPk(req.user.id, { raw: true, - attributes: ["api_key"], + attributes: ["api_key", "api_key_created_at"] }); if (!user) { @@ -305,6 +307,7 @@ router.get( ok: true, message: "API key fetched successfully", api_key: user.api_key || null, + api_key_created_at: user.api_key_created_at || null, }); } catch (error) { console.error('API Key fetch error:', error); @@ -336,7 +339,7 @@ router.delete( return res.status(404).json({ message: "API Key not found" }); } - await User.update({ api_key: null }, { where: { id: req.user.id } }); + await User.update({ api_key: null, api_key_created_at: null }, { where: { id: req.user.id } }); capture("maxun-oss-api-key-deleted", { user_id: user.id, diff --git a/server/src/routes/storage.ts b/server/src/routes/storage.ts index f80a84fa..277c95d8 100644 --- a/server/src/routes/storage.ts +++ b/server/src/routes/storage.ts @@ -583,6 +583,7 @@ router.post('/recordings/llm', requireSignIn, async (req: AuthenticatedRequest, robot_meta: newRobot.recording_meta, recording: newRobot.recording, llm_provider: llmProvider || 'ollama', + prompt: prompt, }); return res.status(201).json({ diff --git a/server/src/sdk/workflowEnricher.ts b/server/src/sdk/workflowEnricher.ts index f8ae4920..9052b362 100644 --- a/server/src/sdk/workflowEnricher.ts +++ b/server/src/sdk/workflowEnricher.ts @@ -1240,6 +1240,168 @@ Rules: } } + /** + * Generate semantic list name using LLM based on user prompt and field context + */ + private static async generateListName( + prompt: string, + url: string, + fieldNames: string[], + llmConfig?: { + provider?: 'anthropic' | 'openai' | 'ollama'; + model?: string; + apiKey?: string; + baseUrl?: string; + } + ): Promise { + try { + const provider = llmConfig?.provider || 'ollama'; + const axios = require('axios'); + + const fieldContext = fieldNames.length > 0 + ? `\n\nDetected fields in the list:\n${fieldNames.slice(0, 10).map((name, idx) => `${idx + 1}. ${name}`).join('\n')}` + : ''; + + const systemPrompt = `You are a list naming assistant. Your job is to generate a clear, concise name for a data list based on the user's extraction request and the fields being extracted. + +RULES FOR LIST NAMING: +1. Use 1-3 words maximum (prefer 2 words) +2. Use Title Case (e.g., "Product Listings", "Job Postings") +3. Be specific and descriptive +4. Match the user's terminology when possible +5. Adapt to the domain: e-commerce (Products, Listings), jobs (Jobs, Postings), articles (Articles, News), etc. +6. Avoid generic terms like "List", "Data", "Items" unless absolutely necessary +7. Focus on WHAT is being extracted, not HOW + +Examples: +- User wants "product listings" → "Product Listings" or "Products" +- User wants "job postings" → "Job Postings" or "Jobs" +- User wants "article titles" → "Articles" +- User wants "company information" → "Companies" +- User wants "quotes from page" → "Quotes" + +You must return ONLY the list name, nothing else. No JSON, no explanation, just the name.`; + + const userPrompt = `URL: ${url} + +User's extraction request: "${prompt}" +${fieldContext} + +TASK: Generate a concise, descriptive name for this list (1-3 words in Title Case). + +Return ONLY the list name, nothing else:`; + + let llmResponse: string; + + if (provider === 'ollama') { + const ollamaBaseUrl = llmConfig?.baseUrl || process.env.OLLAMA_BASE_URL || 'http://localhost:11434'; + const ollamaModel = llmConfig?.model || 'llama3.2-vision'; + + try { + const response = await axios.post(`${ollamaBaseUrl}/api/chat`, { + model: ollamaModel, + messages: [ + { + role: 'system', + content: systemPrompt + }, + { + role: 'user', + content: userPrompt + } + ], + stream: false, + options: { + temperature: 0.1, + top_p: 0.9, + num_predict: 20 + } + }); + + llmResponse = response.data.message.content; + } catch (ollamaError: any) { + logger.error(`Ollama request failed for list naming: ${ollamaError.message}`); + logger.info('Using fallback list name: "List 1"'); + return 'List 1'; + } + } else if (provider === 'anthropic') { + const anthropic = new Anthropic({ + apiKey: llmConfig?.apiKey || process.env.ANTHROPIC_API_KEY + }); + const anthropicModel = llmConfig?.model || 'claude-3-5-sonnet-20241022'; + + const response = await anthropic.messages.create({ + model: anthropicModel, + max_tokens: 20, + temperature: 0.1, + messages: [{ + role: 'user', + content: userPrompt + }], + system: systemPrompt + }); + + const textContent = response.content.find((c: any) => c.type === 'text'); + llmResponse = textContent?.type === 'text' ? textContent.text : ''; + + } else if (provider === 'openai') { + const openaiBaseUrl = llmConfig?.baseUrl || 'https://api.openai.com/v1'; + const openaiModel = llmConfig?.model || 'gpt-4o-mini'; + + const response = await axios.post(`${openaiBaseUrl}/chat/completions`, { + model: openaiModel, + messages: [ + { + role: 'system', + content: systemPrompt + }, + { + role: 'user', + content: userPrompt + } + ], + max_tokens: 20, + temperature: 0.1 + }, { + headers: { + 'Authorization': `Bearer ${llmConfig?.apiKey || process.env.OPENAI_API_KEY}`, + 'Content-Type': 'application/json' + } + }); + + llmResponse = response.data.choices[0].message.content; + } else { + throw new Error(`Unsupported LLM provider: ${provider}`); + } + + let listName = (llmResponse || '').trim(); + logger.info(`LLM List Naming Response: "${listName}"`); + + listName = listName.replace(/^["']|["']$/g, ''); + listName = listName.split('\n')[0]; + listName = listName.trim(); + + if (!listName || listName.length === 0) { + throw new Error('LLM returned empty list name'); + } + + if (listName.length > 50) { + throw new Error('LLM returned list name that is too long'); + } + + listName = listName.split(' ') + .map((word: string) => word.charAt(0).toUpperCase() + word.slice(1).toLowerCase()) + .join(' '); + + logger.info(`✓ Generated list name: "${listName}"`); + return listName; + } catch (error: any) { + logger.error(`Error in generateListName: ${error.message}`); + logger.info('Using fallback list name: "List 1"'); + return 'List 1'; + } + } + /** * Build workflow from LLM decision */ @@ -1333,10 +1495,19 @@ Rules: const limit = llmDecision.limit || 100; logger.info(`Using limit: ${limit}`); + logger.info('Generating semantic list name with LLM...'); + const listName = await this.generateListName( + prompt || 'Extract list data', + url, + Object.keys(finalFields), + llmConfig + ); + logger.info(`Using list name: "${listName}"`); + workflow[0].what.push({ action: 'scrapeList', actionId: `list-${uuid()}`, - name: 'List 1', + name: listName, args: [{ fields: finalFields, listSelector: autoDetectResult.listSelector, diff --git a/server/src/workflow-management/classes/Interpreter.ts b/server/src/workflow-management/classes/Interpreter.ts index 62570069..af4af624 100644 --- a/server/src/workflow-management/classes/Interpreter.ts +++ b/server/src/workflow-management/classes/Interpreter.ts @@ -580,6 +580,13 @@ export class WorkflowInterpreter { setActionName: (name: string) => { this.currentActionName = name; }, + progressUpdate: (current: number, total: number, percentage: number) => { + this.socket.nsp.emit('workflowProgress', { + current, + total, + percentage + }); + }, }, serializableCallback: async (data: any) => { try { diff --git a/src/components/api/ApiKey.tsx b/src/components/api/ApiKey.tsx index 27897169..73f95dc7 100644 --- a/src/components/api/ApiKey.tsx +++ b/src/components/api/ApiKey.tsx @@ -34,6 +34,7 @@ const ApiKeyManager = () => { const { t } = useTranslation(); const [apiKey, setApiKey] = useState(null); const [apiKeyName, setApiKeyName] = useState(t('apikey.default_name')); + const [apiKeyCreatedAt, setApiKeyCreatedAt] = useState(null); const [loading, setLoading] = useState(true); const [showKey, setShowKey] = useState(false); const [copySuccess, setCopySuccess] = useState(false); @@ -44,6 +45,7 @@ const ApiKeyManager = () => { try { const { data } = await axios.get(`${apiUrl}/auth/api-key`); setApiKey(data.api_key); + setApiKeyCreatedAt(data.api_key_created_at); } catch (error: any) { notify('error', t('apikey.notifications.fetch_error', { error: error.message })); } finally { @@ -60,7 +62,7 @@ const ApiKeyManager = () => { try { const { data } = await axios.post(`${apiUrl}/auth/generate-api-key`); setApiKey(data.api_key); - + setApiKeyCreatedAt(data.api_key_created_at); notify('success', t('apikey.notifications.generate_success')); } catch (error: any) { notify('error', t('apikey.notifications.generate_error', { error: error.message })); @@ -74,6 +76,7 @@ const ApiKeyManager = () => { try { await axios.delete(`${apiUrl}/auth/delete-api-key`); setApiKey(null); + setApiKeyCreatedAt(null); notify('success', t('apikey.notifications.delete_success')); } catch (error: any) { notify('error', t('apikey.notifications.delete_error', { error: error.message })); @@ -128,12 +131,13 @@ const ApiKeyManager = () => { {apiKey ? ( - +
{t('apikey.table.name')} {t('apikey.table.key')} - {t('apikey.table.actions')} + {apiKeyCreatedAt && Created On} + {t('apikey.table.actions')} @@ -144,7 +148,16 @@ const ApiKeyManager = () => { {showKey ? `${apiKey?.substring(0, 10)}...` : '**********'} - + {apiKeyCreatedAt && ( + + {new Date(apiKeyCreatedAt).toLocaleDateString('en-US', { + month: 'short', + day: 'numeric', + year: 'numeric', + })} + + )} + diff --git a/src/components/dashboard/NavBar.tsx b/src/components/dashboard/NavBar.tsx index 4240d8af..f9b4bc90 100644 --- a/src/components/dashboard/NavBar.tsx +++ b/src/components/dashboard/NavBar.tsx @@ -113,7 +113,7 @@ export const NavBar: React.FC = ({ if (data.ok) { dispatch({ type: "LOGOUT" }); window.localStorage.removeItem("user"); - notify('success', t('navbar.notifications.success.logout')); + // notify('success', t('navbar.notifications.success.logout')); navigate("/login"); } } catch (error: any) { diff --git a/src/components/robot/RecordingsTable.tsx b/src/components/robot/RecordingsTable.tsx index 704a7f97..1d748a4d 100644 --- a/src/components/robot/RecordingsTable.tsx +++ b/src/components/robot/RecordingsTable.tsx @@ -5,7 +5,6 @@ import Table from '@mui/material/Table'; import TableBody from '@mui/material/TableBody'; import TableCell from '@mui/material/TableCell'; import TableContainer from '@mui/material/TableContainer'; -import TableHead from '@mui/material/TableHead'; import TablePagination from '@mui/material/TablePagination'; import TableRow from '@mui/material/TableRow'; import { memo, useCallback, useEffect, useMemo } from "react"; @@ -116,7 +115,6 @@ const LoadingRobotRow = memo(({ row, columns }: any) => { // Virtualized row component for efficient rendering const TableRowMemoized = memo(({ row, columns, handlers }: any) => { - // If robot is loading, show loading row if (row.isLoading) { return ; } @@ -592,7 +590,6 @@ export const RecordingsTable = ({ <>
- {columns.map((column) => ( ))} - {visibleRows.map((row) => ( )} diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index 3e32827e..445f3b4c 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -704,14 +704,46 @@ const RobotCreate: React.FC = () => { value={outputFormats} label="Output Formats *" onChange={(e) => { - const value = typeof e.target.value === 'string' ? e.target.value.split(',') : e.target.value; + const value = + typeof e.target.value === 'string' + ? e.target.value.split(',') + : e.target.value; setOutputFormats(value); }} renderValue={(selected) => { if (selected.length === 0) { return Select formats; } - return `${selected.length} format${selected.length > 1 ? 's' : ''} selected`; + + const OUTPUT_FORMAT_LABELS: Record = { + markdown: 'Markdown', + html: 'HTML', + 'screenshot-visible': 'Screenshot (Visible)', + 'screenshot-fullpage': 'Screenshot (Full Page)', + }; + + const labels = selected.map( + (value) => OUTPUT_FORMAT_LABELS[value] ?? value + ); + + const MAX_ITEMS = 2; // Show only first 2, then ellipsis + + const display = + labels.length > MAX_ITEMS + ? `${labels.slice(0, MAX_ITEMS).join(', ')}…` + : labels.join(', '); + + return ( + + {display} + + ); }} MenuProps={{ PaperProps: { @@ -1097,4 +1129,4 @@ const modalStyle = { height: 'fit-content', display: 'block', padding: '20px', -}; \ No newline at end of file +}; diff --git a/src/components/run/ColapsibleRow.tsx b/src/components/run/ColapsibleRow.tsx index 67e82bf0..46e94ec9 100644 --- a/src/components/run/ColapsibleRow.tsx +++ b/src/components/run/ColapsibleRow.tsx @@ -12,6 +12,45 @@ import { GenericModal } from "../ui/GenericModal"; import { getUserById } from "../../api/auth"; import { useTranslation } from "react-i18next"; import { useTheme } from "@mui/material/styles"; +import { io, Socket } from "socket.io-client"; +import { apiUrl } from "../../apiConfig"; + +const socketCache = new Map(); +const progressCallbacks = new Map void>>(); + +function getOrCreateSocket(browserId: string): Socket { + if (socketCache.has(browserId)) { + return socketCache.get(browserId)!; + } + + const socket = io(`${apiUrl}/${browserId}`, { + transports: ["websocket"], + rejectUnauthorized: false + }); + + socket.on('workflowProgress', (data: any) => { + const callbacks = progressCallbacks.get(browserId); + if (callbacks) { + callbacks.forEach(cb => cb(data)); + } + }); + + socketCache.set(browserId, socket); + return socket; +} + +function cleanupSocketIfUnused(browserId: string) { + const callbacks = progressCallbacks.get(browserId); + + if (!callbacks || callbacks.size === 0) { + const socket = socketCache.get(browserId); + if (socket) { + socket.disconnect(); + socketCache.delete(browserId); + progressCallbacks.delete(browserId); + } + } +} interface RunTypeChipProps { runByUserId?: string; @@ -54,11 +93,52 @@ export const CollapsibleRow = ({ row, handleDelete, isOpen, onToggleExpanded, cu const logEndRef = useRef(null); - const scrollToLogBottom = () => { - if (logEndRef.current) { - logEndRef.current.scrollIntoView({ behavior: "smooth" }); + const [workflowProgress, setWorkflowProgress] = useState<{ + current: number; + total: number; + percentage: number; + } | null>(null); + + // Subscribe to progress updates using module-level socket cache + useEffect(() => { + if (!row.browserId) return; + + // Get or create socket (from module cache) + getOrCreateSocket(row.browserId); + + // Register callback + if (!progressCallbacks.has(row.browserId)) { + progressCallbacks.set(row.browserId, new Set()); } - } + + const callback = (data: any) => { + setWorkflowProgress(data); + }; + + progressCallbacks.get(row.browserId)!.add(callback); + + // Cleanup: remove callback and cleanup socket if no callbacks remain + return () => { + const callbacks = progressCallbacks.get(row.browserId); + if (callbacks) { + callbacks.delete(callback); + // Cleanup socket if this was the last callback + cleanupSocketIfUnused(row.browserId); + } + }; + }, [row.browserId]); + + // Clear progress UI when run completes and trigger socket cleanup + useEffect(() => { + if (row.status !== 'running' && row.status !== 'queued') { + setWorkflowProgress(null); + // Attempt to cleanup socket when run completes + // (will only cleanup if no other callbacks exist) + if (row.browserId) { + cleanupSocketIfUnused(row.browserId); + } + } + }, [row.status, row.browserId]); const handleAbort = () => { abortRunHandler(row.runId, row.name, row.browserId); @@ -67,12 +147,7 @@ export const CollapsibleRow = ({ row, handleDelete, isOpen, onToggleExpanded, cu const handleRowExpand = () => { const newOpen = !isOpen; onToggleExpanded(newOpen); - //scrollToLogBottom(); }; - - // useEffect(() => { - // scrollToLogBottom(); - // }, [currentLog]) useEffect(() => { const fetchUserEmail = async () => { @@ -196,7 +271,8 @@ export const CollapsibleRow = ({ row, handleDelete, isOpen, onToggleExpanded, cu + logEndRef={logEndRef} interpretationInProgress={runningRecordingName === row.name} + workflowProgress={workflowProgress} /> diff --git a/src/components/run/RunContent.tsx b/src/components/run/RunContent.tsx index 3f8ce161..08b295d5 100644 --- a/src/components/run/RunContent.tsx +++ b/src/components/run/RunContent.tsx @@ -30,9 +30,14 @@ interface RunContentProps { interpretationInProgress: boolean, logEndRef: React.RefObject, abortRunHandler: () => void, + workflowProgress: { + current: number; + total: number; + percentage: number; + } | null, } -export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler }: RunContentProps) => { +export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRef, abortRunHandler, workflowProgress }: RunContentProps) => { const { t } = useTranslation(); const { darkMode } = useThemeMode(); const [tab, setTab] = React.useState('output'); @@ -73,6 +78,15 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe setTab(tab); }, [interpretationInProgress]); + const getProgressMessage = (percentage: number): string => { + if (percentage === 0) return 'Initializing workflow...'; + if (percentage < 25) return 'Starting execution...'; + if (percentage < 50) return 'Processing actions...'; + if (percentage < 75) return 'Extracting data...'; + if (percentage < 100) return 'Finalizing results...'; + return 'Completing...'; + }; + useEffect(() => { setMarkdownContent(''); setHtmlContent(''); @@ -925,7 +939,20 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe {row.status === 'running' || row.status === 'queued' ? ( <> - + {workflowProgress ? ( + <> + + {getProgressMessage(workflowProgress.percentage)} + + ) : ( + <> + + {t('run_content.loading')} + + )} {t('run_content.loading')}