diff --git a/src/api/storage.ts b/src/api/storage.ts index 4ac2f01b..3c9d6135 100644 --- a/src/api/storage.ts +++ b/src/api/storage.ts @@ -335,4 +335,81 @@ export const deleteSchedule = async (id: string): Promise => { console.log(error); return false; } -} \ No newline at end of file +} + +export const createCrawlRobot = async ( + url: string, + name: string, + crawlConfig: { + mode: 'domain' | 'subdomain' | 'path'; + limit: number; + maxDepth: number; + includePaths: string[]; + excludePaths: string[]; + useSitemap: boolean; + followLinks: boolean; + respectRobots: boolean; + } +): Promise => { + try { + const response = await axios.post( + `${apiUrl}/recordings/crawl`, + { + url, + name, + crawlConfig, + }, + { + headers: { 'Content-Type': 'application/json' }, + withCredentials: true, + } + ); + + if (response.status === 201) { + return response.data; + } else { + throw new Error('Failed to create crawl robot'); + } + } catch (error: any) { + console.error('Error creating crawl robot:', error); + return null; + } +}; + +export const createSearchRobot = async ( + name: string, + searchConfig: { + query: string; + limit: number; + provider: 'google' | 'bing' | 'duckduckgo'; + filters?: { + timeRange?: 'day' | 'week' | 'month' | 'year'; + location?: string; + lang?: string; + }; + mode: 'discover' | 'scrape'; + } +): Promise => { + try { + const response = await axios.post( + `${apiUrl}/recordings/search`, + { + name, + searchConfig, + }, + { + headers: { 'Content-Type': 'application/json' }, + withCredentials: true, + } + ); + + if (response.status === 201) { + return response.data; + } else { + throw new Error('Failed to create search robot'); + } + } catch (error: any) { + console.error('Error creating search robot:', error); + return null; + } +}; \ No newline at end of file diff --git a/src/components/robot/pages/RobotCreate.tsx b/src/components/robot/pages/RobotCreate.tsx index a64564e6..3e32827e 100644 --- a/src/components/robot/pages/RobotCreate.tsx +++ b/src/components/robot/pages/RobotCreate.tsx @@ -17,12 +17,14 @@ import { FormControl, Select, MenuItem, - InputLabel + InputLabel, + Collapse, + FormControlLabel } from '@mui/material'; import { ArrowBack, AutoAwesome, HighlightAlt } from '@mui/icons-material'; import { useGlobalInfoStore, useCacheInvalidation } from '../../../context/globalInfo'; import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording'; -import { createScrapeRobot, createLLMRobot, createAndRunRecording } from "../../../api/storage"; +import { createScrapeRobot, createLLMRobot, createAndRunRecording, createCrawlRobot, createSearchRobot } from "../../../api/storage"; import { AuthContext } from '../../../context/auth'; import { GenericModal } from '../../ui/GenericModal'; @@ -72,6 +74,25 @@ const RobotCreate: React.FC = () => { const [llmBaseUrl, setLlmBaseUrl] = useState(''); const [aiRobotName, setAiRobotName] = useState(''); + const [crawlRobotName, setCrawlRobotName] = useState(''); + const [crawlUrl, setCrawlUrl] = useState(''); + const [crawlMode, setCrawlMode] = useState<'domain' | 'subdomain' | 'path'>('domain'); + const [crawlLimit, setCrawlLimit] = useState(50); + const [crawlMaxDepth, setCrawlMaxDepth] = useState(3); + const [crawlIncludePaths, setCrawlIncludePaths] = useState(''); + const [crawlExcludePaths, setCrawlExcludePaths] = useState(''); + const [crawlUseSitemap, setCrawlUseSitemap] = useState(true); + const [crawlFollowLinks, setCrawlFollowLinks] = useState(true); + const [crawlRespectRobots, setCrawlRespectRobots] = useState(true); + const [showCrawlAdvanced, setShowCrawlAdvanced] = useState(false); + + const [searchRobotName, setSearchRobotName] = useState(''); + const [searchQuery, setSearchQuery] = useState(''); + const [searchLimit, setSearchLimit] = useState(10); + const [searchProvider] = useState<'duckduckgo'>('duckduckgo'); + const [searchMode, setSearchMode] = useState<'discover' | 'scrape'>('discover'); + const [searchTimeRange, setSearchTimeRange] = useState<'day' | 'week' | 'month' | 'year' | ''>(''); + const { state } = React.useContext(AuthContext); const { user } = state; const { addOptimisticRobot, removeOptimisticRobot, invalidateRecordings, invalidateRuns, addOptimisticRun } = useCacheInvalidation(); @@ -155,6 +176,76 @@ const RobotCreate: React.FC = () => { navigate('/robots'); }; + const handleCreateCrawlRobot = async () => { + if (!crawlUrl.trim()) { + notify('error', 'Please enter a valid URL'); + return; + } + if (!crawlRobotName.trim()) { + notify('error', 'Please enter a robot name'); + return; + } + + setIsLoading(true); + const result = await createCrawlRobot( + crawlUrl, + crawlRobotName, + { + mode: crawlMode, + limit: crawlLimit, + maxDepth: crawlMaxDepth, + includePaths: crawlIncludePaths ? crawlIncludePaths.split(',').map(p => p.trim()) : [], + excludePaths: crawlExcludePaths ? crawlExcludePaths.split(',').map(p => p.trim()) : [], + useSitemap: crawlUseSitemap, + followLinks: crawlFollowLinks, + respectRobots: crawlRespectRobots + } + ); + setIsLoading(false); + + if (result) { + invalidateRecordings(); + notify('success', `${crawlRobotName} created successfully!`); + navigate('/robots'); + } else { + notify('error', 'Failed to create crawl robot'); + } + }; + + const handleCreateSearchRobot = async () => { + if (!searchQuery.trim()) { + notify('error', 'Please enter a search query'); + return; + } + if (!searchRobotName.trim()) { + notify('error', 'Please enter a robot name'); + return; + } + + setIsLoading(true); + const result = await createSearchRobot( + searchRobotName, + { + query: searchQuery, + limit: searchLimit, + provider: searchProvider, + filters: { + timeRange: searchTimeRange ? searchTimeRange as 'day' | 'week' | 'month' | 'year' : undefined + }, + mode: searchMode + } + ); + setIsLoading(false); + + if (result) { + invalidateRecordings(); + notify('success', `${searchRobotName} created successfully!`); + navigate('/robots'); + } else { + notify('error', 'Failed to create search robot'); + } + }; + return ( @@ -210,6 +301,8 @@ const RobotCreate: React.FC = () => { > + + @@ -697,6 +790,262 @@ const RobotCreate: React.FC = () => { + + + + + Maxun Logo + + + Crawl entire websites and gather data from multiple pages automatically. + + + + setCrawlRobotName(e.target.value)} + sx={{ mb: 2 }} + /> + setCrawlUrl(e.target.value)} + sx={{ mb: 2 }} + /> + + setCrawlLimit(parseInt(e.target.value) || 10)} + sx={{ mb: 2 }} + /> + + + + + + + + + Crawl Scope + + + + setCrawlMaxDepth(parseInt(e.target.value) || 3)} + sx={{ mb: 2 }} + helperText="How many links deep to follow (default: 3)" + FormHelperTextProps={{ sx: { ml: 0 } }} + /> + + setCrawlIncludePaths(e.target.value)} + sx={{ mb: 2 }} + helperText="Only crawl URLs matching these paths (comma-separated)" + FormHelperTextProps={{ sx: { ml: 0 } }} + /> + + setCrawlExcludePaths(e.target.value)} + sx={{ mb: 2 }} + helperText="Skip URLs matching these paths (comma-separated)" + FormHelperTextProps={{ sx: { ml: 0 } }} + /> + + + setCrawlUseSitemap(e.target.checked)} + /> + } + label="Use sitemap.xml for URL discovery" + /> + setCrawlFollowLinks(e.target.checked)} + /> + } + label="Follow links on pages" + /> + setCrawlRespectRobots(e.target.checked)} + /> + } + label="Respect robots.txt" + /> + + + + + + + + + + + + + + Maxun Logo + + + Search the web and gather data from relevant results. + + + + setSearchRobotName(e.target.value)} + sx={{ mb: 2 }} + /> + + setSearchQuery(e.target.value)} + sx={{ mb: 2 }} + /> + + setSearchLimit(parseInt(e.target.value) || 10)} + sx={{ mb: 2 }} + /> + + + + Mode + + + + + Time Range + + + + + + + + + diff --git a/src/components/robot/pages/RobotDuplicatePage.tsx b/src/components/robot/pages/RobotDuplicatePage.tsx index f021ee45..8607b41b 100644 --- a/src/components/robot/pages/RobotDuplicatePage.tsx +++ b/src/components/robot/pages/RobotDuplicatePage.tsx @@ -24,7 +24,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean; diff --git a/src/components/robot/pages/RobotEditPage.tsx b/src/components/robot/pages/RobotEditPage.tsx index 8914f4aa..d14421c9 100644 --- a/src/components/robot/pages/RobotEditPage.tsx +++ b/src/components/robot/pages/RobotEditPage.tsx @@ -1,4 +1,4 @@ -import React, { useState, useEffect } from "react"; +import { useState, useEffect } from "react"; import { useTranslation } from "react-i18next"; import { TextField, @@ -7,7 +7,13 @@ import { Button, IconButton, InputAdornment, - Divider, + FormControl, + InputLabel, + Select, + MenuItem, + FormControlLabel, + Checkbox, + Collapse } from "@mui/material"; import { Visibility, VisibilityOff } from "@mui/icons-material"; import { useGlobalInfoStore } from "../../../context/globalInfo"; @@ -24,7 +30,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean; @@ -97,6 +103,25 @@ interface ScrapeListLimit { currentLimit: number; } +interface CrawlConfig { + mode?: string; + limit?: number; + maxDepth?: number; + useSitemap?: boolean; + followLinks?: boolean; + excludePaths?: string[]; + includePaths?: string[]; + respectRobots?: boolean; +} + +interface SearchConfig { + mode?: 'discover' | 'scrape'; + limit?: number; + query?: string; + filters?: Record; + provider?: string; +} + export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const { t } = useTranslation(); const navigate = useNavigate(); @@ -115,6 +140,9 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { [] ); const [isLoading, setIsLoading] = useState(false); + const [crawlConfig, setCrawlConfig] = useState({}); + const [searchConfig, setSearchConfig] = useState({}); + const [showCrawlAdvanced, setShowCrawlAdvanced] = useState(false); const isEmailPattern = (value: string): boolean => { return value.includes("@"); @@ -163,6 +191,8 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { setCredentialGroups(groupCredentialsByType(extractedCredentials)); findScrapeListLimits(robot.recording.workflow); + extractCrawlConfig(robot.recording.workflow); + extractSearchConfig(robot.recording.workflow); } }, [robot]); @@ -195,6 +225,36 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { setScrapeListLimits(limits); }; + const extractCrawlConfig = (workflow: WhereWhatPair[]) => { + workflow.forEach((pair) => { + if (!pair.what) return; + + pair.what.forEach((action: any) => { + if (action.action === "crawl" && action.args && action.args.length > 0) { + const config = action.args[0]; + if (config && typeof config === "object") { + setCrawlConfig(config as CrawlConfig); + } + } + }); + }); + }; + + const extractSearchConfig = (workflow: WhereWhatPair[]) => { + workflow.forEach((pair) => { + if (!pair.what) return; + + pair.what.forEach((action: any) => { + if (action.action === "search" && action.args && action.args.length > 0) { + const config = action.args[0]; + if (config && typeof config === "object") { + setSearchConfig(config as SearchConfig); + } + } + }); + }); + }; + function extractInitialCredentials(workflow: any[]): Credentials { const credentials: Credentials = {}; @@ -475,19 +535,17 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { <> {renderCredentialFields( credentialGroups.usernames, - t("Username"), - "text" + t("Username") )} - {renderCredentialFields(credentialGroups.emails, t("Email"), "text")} + {renderCredentialFields(credentialGroups.emails, t("Email"))} {renderCredentialFields( credentialGroups.passwords, - t("Password"), - "password" + t("Password") )} - {renderCredentialFields(credentialGroups.others, t("Other"), "text")} + {renderCredentialFields(credentialGroups.others, t("Other"))} ); }; @@ -502,7 +560,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { {scrapeListLimits.map((limitInfo, index) => { - // Get the corresponding scrapeList action to extract its name const scrapeListAction = robot?.recording?.workflow?.[limitInfo.pairIndex]?.what?.[limitInfo.actionIndex]; const actionName = scrapeListAction?.name || @@ -542,7 +599,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const screenshotInputs: JSX.Element[] = []; const listInputs: JSX.Element[] = []; - let textCount = 0; let screenshotCount = 0; let listCount = 0; @@ -683,7 +739,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const renderCredentialFields = ( selectors: string[], headerText: string, - defaultType: "text" | "password" = "text" ) => { if (selectors.length === 0) return null; @@ -737,6 +792,193 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { return url; }; + const renderCrawlConfigFields = () => { + if (robot?.recording_meta.type !== 'crawl') return null; + + return ( + <> + { + const value = parseInt(e.target.value, 10); + if (value >= 1) { + setCrawlConfig((prev) => ({ ...prev, limit: value })); + } + }} + inputProps={{ min: 1 }} + style={{ marginBottom: "20px" }} + /> + + + + + + + Crawl Scope + + + + { + const value = parseInt(e.target.value, 10); + if (value >= 1) { + setCrawlConfig((prev) => ({ ...prev, maxDepth: value })); + } + }} + inputProps={{ min: 1 }} + sx={{ mb: 2 }} + helperText="How many links deep to follow (default: 3)" + /> + + { + const paths = e.target.value ? e.target.value.split(',').map(p => p.trim()) : []; + setCrawlConfig((prev) => ({ ...prev, includePaths: paths })); + }} + sx={{ mb: 2 }} + helperText="Only crawl URLs matching these paths (comma-separated)" + /> + + { + const paths = e.target.value ? e.target.value.split(',').map(p => p.trim()) : []; + setCrawlConfig((prev) => ({ ...prev, excludePaths: paths })); + }} + sx={{ mb: 2 }} + helperText="Skip URLs matching these paths (comma-separated)" + /> + + + setCrawlConfig((prev) => ({ ...prev, useSitemap: e.target.checked }))} + /> + } + label="Use sitemap.xml for URL discovery" + /> + setCrawlConfig((prev) => ({ ...prev, followLinks: e.target.checked }))} + /> + } + label="Follow links on pages" + /> + setCrawlConfig((prev) => ({ ...prev, respectRobots: e.target.checked }))} + /> + } + label="Respect robots.txt" + /> + + + + + ); + }; + + const renderSearchConfigFields = () => { + if (robot?.recording_meta.type !== 'search') return null; + + return ( + <> + { + setSearchConfig((prev) => ({ ...prev, query: e.target.value })); + }} + sx={{ mb: 2 }} + /> + + { + const value = parseInt(e.target.value, 10); + if (value >= 1) { + setSearchConfig((prev) => ({ ...prev, limit: value })); + } + }} + inputProps={{ min: 1 }} + sx={{ mb: 2 }} + /> + + + Mode + + + + + Time Range + + + + ); + }; + const handleSave = async () => { if (!robot) return; @@ -757,6 +999,48 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { const targetUrl = getTargetUrl(); + let updatedWorkflow = robot.recording.workflow; + if (robot.recording_meta.type === 'crawl') { + updatedWorkflow = updatedWorkflow.map((pair: any) => { + if (!pair.what) return pair; + + return { + ...pair, + what: pair.what.map((action: any) => { + if (action.action === 'crawl') { + return { + ...action, + args: [{ ...crawlConfig }] + }; + } + return action; + }) + }; + }); + } + + if (robot.recording_meta.type === 'search') { + updatedWorkflow = updatedWorkflow.map((pair: any) => { + if (!pair.what) return pair; + + return { + ...pair, + what: pair.what.map((action: any) => { + if (action.action === 'search') { + return { + ...action, + args: [{ + ...searchConfig, + provider: 'duckduckgo' + }] + }; + } + return action; + }) + }; + }); + } + const payload: any = { name: robot.recording_meta.name, limits: scrapeListLimits.map((limit) => ({ @@ -767,7 +1051,6 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { })), credentials: credentialsForPayload, targetUrl: targetUrl, - // send the (possibly edited) workflow so backend can persist action name changes workflow: robot.recording.workflow, }; @@ -825,19 +1108,12 @@ export const RobotEditPage = ({ handleStart }: RobotSettingsProps) => { onChange={(e) => handleTargetUrlChange(e.target.value)} style={{ marginBottom: "20px" }} /> - {renderScrapeListLimitFields() && ( - <> - - {renderScrapeListLimitFields()} - - )} + + {renderCrawlConfigFields()} + {renderSearchConfigFields()} - {renderActionNameFields() && ( - <> - - {renderActionNameFields()} - - )} + {renderScrapeListLimitFields()} + {renderActionNameFields()} )} diff --git a/src/components/robot/pages/RobotSettingsPage.tsx b/src/components/robot/pages/RobotSettingsPage.tsx index a5618d4c..684107e8 100644 --- a/src/components/robot/pages/RobotSettingsPage.tsx +++ b/src/components/robot/pages/RobotSettingsPage.tsx @@ -1,7 +1,6 @@ -import React, { useState, useEffect } from "react"; +import { useState, useEffect } from "react"; import { useTranslation } from "react-i18next"; -import { TextField, Typography, Box, Card, CardContent } from "@mui/material"; -import { Settings, Info } from "@mui/icons-material"; +import { TextField, Box } from "@mui/material"; import { useGlobalInfoStore } from "../../../context/globalInfo"; import { getStoredRecording } from "../../../api/storage"; import { WhereWhatPair } from "maxun-core"; @@ -16,7 +15,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean; @@ -116,19 +115,11 @@ export const RobotSettingsPage = ({ handleStart }: RobotSettingsProps) => { fetchUserEmail(); }, [robot?.userId]); - const handleCancel = () => { - const basePath = location.pathname.includes("/prebuilt-robots") - ? "/prebuilt-robots" - : "/robots"; - navigate(basePath); - }; - const targetUrl = getTargetUrl(); return ( { {robot && ( <> - + {robot.recording_meta.type !== 'search' && ( + + )} { const { t } = useTranslation(); + const { darkMode } = useThemeMode(); const [tab, setTab] = React.useState('output'); const [markdownContent, setMarkdownContent] = useState(''); const [htmlContent, setHtmlContent] = useState(''); @@ -50,6 +51,15 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe const [listKeys, setListKeys] = useState([]); const [currentListIndex, setCurrentListIndex] = useState(0); + const [crawlData, setCrawlData] = useState([]); + const [crawlColumns, setCrawlColumns] = useState([]); + const [crawlKeys, setCrawlKeys] = useState([]); + const [currentCrawlIndex, setCurrentCrawlIndex] = useState(0); + + const [searchData, setSearchData] = useState([]); + const [searchMode, setSearchMode] = useState<'discover' | 'scrape'>('discover'); + const [currentSearchIndex, setCurrentSearchIndex] = useState(0); + const [screenshotKeys, setScreenshotKeys] = useState([]); const [screenshotKeyMap, setScreenshotKeyMap] = useState>({}); const [currentScreenshotIndex, setCurrentScreenshotIndex] = useState(0); @@ -93,6 +103,10 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe setListData([]); setListColumns([]); setListKeys([]); + setCrawlData([]); + setCrawlColumns([]); + setCrawlKeys([]); + setSearchData([]); setLegacyData([]); setLegacyColumns([]); setIsLegacyData(false); @@ -104,7 +118,7 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe const hasLegacySchema = row.serializableOutput.scrapeSchema && Array.isArray(row.serializableOutput.scrapeSchema); const hasLegacyList = row.serializableOutput.scrapeList && Array.isArray(row.serializableOutput.scrapeList); - const hasOldFormat = !row.serializableOutput.scrapeSchema && !row.serializableOutput.scrapeList && Object.keys(row.serializableOutput).length > 0; + const hasOldFormat = !row.serializableOutput.scrapeSchema && !row.serializableOutput.scrapeList && !row.serializableOutput.crawl && !row.serializableOutput.search && Object.keys(row.serializableOutput).length > 0; if (hasLegacySchema || hasLegacyList || hasOldFormat) { processLegacyData(row.serializableOutput); @@ -121,6 +135,14 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe if (row.serializableOutput.scrapeList) { processScrapeList(row.serializableOutput.scrapeList); } + + if (row.serializableOutput.crawl) { + processCrawl(row.serializableOutput.crawl); + } + + if (row.serializableOutput.search) { + processSearch(row.serializableOutput.search); + } }, [row.serializableOutput, row.status]); useEffect(() => { @@ -139,7 +161,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe let normalizedScreenshotKeys: string[]; if (isLegacyPattern) { - // Legacy unnamed screenshots → Screenshot 1, Screenshot 2... normalizedScreenshotKeys = rawKeys.map((_, index) => `Screenshot ${index + 1}`); } else { normalizedScreenshotKeys = rawKeys.map((key, index) => { @@ -342,6 +363,76 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe setCurrentListIndex(0); }; + const processCrawl = (crawlDataInput: any) => { + const tablesList: any[][] = []; + const columnsList: string[][] = []; + const keys: string[] = []; + + if (typeof crawlDataInput === 'object') { + Object.keys(crawlDataInput).forEach(key => { + const tableData = crawlDataInput[key]; + + if (Array.isArray(tableData) && tableData.length > 0) { + const filteredData = tableData.filter(row => + row && typeof row === 'object' && Object.values(row).some(value => value !== undefined && value !== "") + ); + + if (filteredData.length > 0) { + tablesList.push(filteredData); + keys.push(key); + const tableColumns = new Set(); + filteredData.forEach(item => { + Object.keys(item).forEach(key => tableColumns.add(key)); + }); + columnsList.push(Array.from(tableColumns)); + } + } + }); + } + + setCrawlData(tablesList); + setCrawlColumns(columnsList); + const normalizedCrawlKeys = keys.map((key, index) => { + if (!key || key.toLowerCase().includes("crawl")) { + return `Crawl ${index + 1}`; + } + return key; + }); + + setCrawlKeys(normalizedCrawlKeys); + setCurrentCrawlIndex(0); + }; + + const processSearch = (searchDataInput: any) => { + if (typeof searchDataInput === 'object') { + const keys = Object.keys(searchDataInput); + + if (keys.length > 0) { + const searchKey = keys[0]; + const searchInfo = searchDataInput[searchKey]; + + if (searchInfo && searchInfo.results && Array.isArray(searchInfo.results)) { + const mode = searchInfo.mode || 'discover'; + setSearchMode(mode); + + if (mode === 'scrape') { + setSearchData(searchInfo.results); + } else { + const normalizedResults = searchInfo.results.map((result: any, index: number) => ({ + title: result.title || '-', + url: result.url || '-', + description: result.description || '-', + position: result.position || index + 1, + })); + setSearchData(normalizedResults); + } + + setCurrentSearchIndex(0); + } + } + } + }; + const convertToCSV = (data: any[], columns: string[], isSchemaData: boolean = false, isTabular: boolean = false): string => { if (isSchemaData && !isTabular && data.length === 1) { const header = 'Label,Value'; @@ -362,7 +453,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe } }; - // Function to download a specific dataset as CSV const downloadCSV = (data: any[], columns: string[], filename: string, isSchemaData: boolean = false, isTabular: boolean = false) => { const csvContent = convertToCSV(data, columns, isSchemaData, isTabular); const blob = new Blob([csvContent], { type: 'text/csv;charset=utf-8;' }); @@ -413,6 +503,33 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe }, 100); }; + const downloadAllCrawlsAsZip = async (crawlDataArray: any[], zipFilename: string) => { + const zip = new JSZip(); + + crawlDataArray.forEach((item, index) => { + const url = item?.metadata?.url || item?.url || ''; + const filename = url + ? url.replace(/^https?:\/\//, '').replace(/\//g, '_').replace(/[^a-zA-Z0-9_.-]/g, '_') + '.json' + : `crawl_url_${index + 1}.json`; + + const jsonContent = JSON.stringify(item, null, 2); + zip.file(filename, jsonContent); + }); + + const blob = await zip.generateAsync({ type: 'blob' }); + const url = URL.createObjectURL(blob); + + const link = document.createElement("a"); + link.href = url; + link.setAttribute("download", zipFilename); + document.body.appendChild(link); + link.click(); + document.body.removeChild(link); + + setTimeout(() => { + URL.revokeObjectURL(url); + }, 100); + }; const renderDataTable = ( data: any[], @@ -420,14 +537,13 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe title: string, csvFilename: string, jsonFilename: string, - isPaginatedList: boolean = false, isSchemaData: boolean = false ) => { if (data.length === 0) return null; const shouldShowAsKeyValue = isSchemaData && !isSchemaTabular && data.length === 1; - if (title === '') { + if (!title || title.trim() === '') { return ( <> @@ -673,7 +789,7 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe ); }; - const hasData = schemaData.length > 0 || listData.length > 0 || legacyData.length > 0; + const hasData = schemaData.length > 0 || listData.length > 0 || crawlData.length > 0 || searchData.length > 0 || legacyData.length > 0; const hasScreenshots = row.binaryOutput && Object.keys(row.binaryOutput).length > 0; const hasMarkdown = markdownContent.length > 0; const hasHTML = htmlContent.length > 0; @@ -805,7 +921,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe )} ) : ( - // Extract robot output <> {row.status === 'running' || row.status === 'queued' ? ( <> @@ -884,7 +999,6 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe '', `${schemaKeys[currentSchemaIndex] || 'schema_data'}.csv`, `${schemaKeys[currentSchemaIndex] || 'schema_data'}.json`, - false, true )} @@ -1033,6 +1147,588 @@ export const RunContent = ({ row, currentLog, interpretationInProgress, logEndRe )} )} + + {crawlData.length > 0 && crawlData[0] && crawlData[0].length > 0 && ( + + }> + + + Crawl Results + + + + + + {crawlData[0].map((item: any, idx: number) => { + const url = item?.metadata?.url || item?.url || `URL ${idx + 1}`; + + return ( + setCurrentCrawlIndex(idx)} + sx={{ + px: 2, + py: 1, + cursor: 'pointer', + backgroundColor: currentCrawlIndex === idx + ? darkMode ? '#121111ff' : '#e9ecef' + : 'transparent', + borderBottom: currentCrawlIndex === idx ? '3px solid #FF00C3' : 'none', + color: darkMode ? '#fff' : '#000', + whiteSpace: 'nowrap', + fontSize: '0.875rem', + flexShrink: 0, + }} + title={url} + > + Link {idx + 1} + + ); + })} + + + {crawlData[0][currentCrawlIndex] && ( + <> + + }> + + + Metadata + + + + + + + + {crawlData[0][currentCrawlIndex].metadata && + Object.entries(crawlData[0][currentCrawlIndex].metadata).map(([key, value]: [string, any]) => ( + + + {key} + + + {value === undefined || value === '' ? '-' : String(value)} + + + )) + } + +
+
+
+
+ + {crawlData[0][currentCrawlIndex].text && ( + + }> + + + Text Content + + + + + + + {crawlData[0][currentCrawlIndex].text} + + + + + )} + + {crawlData[0][currentCrawlIndex].html && ( + + }> + + + HTML + + + + + + + {crawlData[0][currentCrawlIndex].html} + + + + + )} + + {crawlData[0][currentCrawlIndex].links && crawlData[0][currentCrawlIndex].links.length > 0 && ( + + }> + + + Links ({crawlData[0][currentCrawlIndex].links.length}) + + + + + + {crawlData[0][currentCrawlIndex].links.map((link: string, idx: number) => ( + + {link} + + ))} + + + + )} + + + + + + + + )} +
+
+ )} + + {searchData.length > 0 && ( + + }> + + + Search Results + + + + + {searchMode === 'scrape' && searchData.length > 0 ? ( + <> + + {searchData.map((item: any, idx: number) => { + const url = item?.metadata?.url || item?.url || `Result ${idx + 1}`; + + return ( + setCurrentSearchIndex(idx)} + sx={{ + px: 2, + py: 1, + cursor: 'pointer', + backgroundColor: currentSearchIndex === idx + ? darkMode ? '#121111ff' : '#e9ecef' + : 'transparent', + borderBottom: currentSearchIndex === idx ? '3px solid #FF00C3' : 'none', + color: darkMode ? '#fff' : '#000', + whiteSpace: 'nowrap', + fontSize: '0.875rem', + flexShrink: 0, + }} + title={url} + > + Link {idx + 1} + + ); + })} + + + {searchData[currentSearchIndex] && ( + <> + + }> + + + Metadata + + + + + + + + {searchData[currentSearchIndex].metadata && + Object.entries(searchData[currentSearchIndex].metadata).map(([key, value]: [string, any]) => ( + + + {key} + + + {value === undefined || value === '' ? '-' : String(value)} + + + )) + } + +
+
+
+
+ + {searchData[currentSearchIndex].text && ( + + }> + + + Text Content + + + + + + + {searchData[currentSearchIndex].text} + + + + + )} + + {searchData[currentSearchIndex].html && ( + + }> + + + HTML + + + + + + + {searchData[currentSearchIndex].html} + + + + + )} + + {searchData[currentSearchIndex].markdown && ( + + }> + + + Markdown + + + + + + + {searchData[currentSearchIndex].markdown} + + + + + )} + + {searchData[currentSearchIndex].links && searchData[currentSearchIndex].links.length > 0 && ( + + }> + + + Links ({searchData[currentSearchIndex].links.length}) + + + + + + {searchData[currentSearchIndex].links.map((link: string, idx: number) => ( + + {link} + + ))} + + + + )} + + + + + + )} + + ) : ( + <> + + + + + + Title + + + URL + + + Description + + + + + + {searchData.map((result: any, idx: number) => ( + + + {result.title || '-'} + + + {result.url ? ( + + {result.url} + + ) : '-'} + + + {result.description || '-'} + + + ))} + +
+
+ + + + + + )} +
+
+ )}
)} diff --git a/src/components/run/RunsTable.tsx b/src/components/run/RunsTable.tsx index 65e57049..c24b90e4 100644 --- a/src/components/run/RunsTable.tsx +++ b/src/components/run/RunsTable.tsx @@ -56,6 +56,7 @@ export interface Data { runByScheduleId?: string; browserId: string; runByAPI?: boolean; + runBySDK?: boolean; log: string; runId: string; robotId: string; diff --git a/src/context/globalInfo.tsx b/src/context/globalInfo.tsx index 5db9239d..5f958d75 100644 --- a/src/context/globalInfo.tsx +++ b/src/context/globalInfo.tsx @@ -27,7 +27,7 @@ interface RobotMeta { pairs: number; updatedAt: string; params: any[]; - type?: 'extract' | 'scrape'; + type?: 'extract' | 'scrape' | 'crawl' | 'search'; url?: string; formats?: ('markdown' | 'html' | 'screenshot-visible' | 'screenshot-fullpage')[]; isLLM?: boolean;