feat: add html scrape support

This commit is contained in:
Rohit Rajan
2025-11-20 18:49:39 +05:30
parent fef038b8cf
commit e90cd9961e
12 changed files with 366 additions and 105 deletions

View File

@@ -110,7 +110,10 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
case 'integrate':
return (
<MemoizedTableCell key={column.id} align={column.align}>
<MemoizedIntegrateButton handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])} />
<MemoizedIntegrateButton
handleIntegrate={() => handlers.handleIntegrateRecording(row.id, row.name, row.params || [])}
robotType={row.type}
/>
</MemoizedTableCell>
);
case 'options':
@@ -121,6 +124,7 @@ const TableRowMemoized = memo(({ row, columns, handlers }: any) => {
handleEdit={() => handlers.handleEditRobot(row.id, row.name, row.params || [])}
handleDuplicate={() => handlers.handleDuplicateRobot(row.id, row.name, row.params || [])}
handleDelete={() => handlers.handleDelete(row.id)}
robotType={row.type}
/>
</MemoizedTableCell>
);
@@ -709,13 +713,22 @@ const ScheduleButton = ({ handleSchedule }: ScheduleButtonProps) => {
/**
 * Props accepted by the IntegrateButton component.
 */
interface IntegrateButtonProps {
/** Click callback; IntegrateButton only wires it to onClick while the button is enabled. */
handleIntegrate: () => void;
/** Robot type for the row; IntegrateButton disables itself when this equals 'scrape'. */
robotType: string;
}
const IntegrateButton = ({ handleIntegrate }: IntegrateButtonProps) => {
const IntegrateButton = ({ handleIntegrate, robotType }: IntegrateButtonProps) => {
const isDisabled = robotType === 'scrape';
return (
<IconButton aria-label="add" size="small" onClick={() => {
handleIntegrate();
}}
<IconButton
aria-label="integrate"
size="small"
onClick={isDisabled ? undefined : handleIntegrate}
disabled={isDisabled}
sx={{
opacity: isDisabled ? 0.4 : 1,
cursor: isDisabled ? 'not-allowed' : 'pointer',
}}
>
<Power />
</IconButton>
@@ -742,9 +755,10 @@ interface OptionsButtonProps {
handleEdit: () => void;
handleDelete: () => void;
handleDuplicate: () => void;
robotType: string;
}
const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate }: OptionsButtonProps) => {
const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicate, robotType }: OptionsButtonProps) => {
const [anchorEl, setAnchorEl] = React.useState<null | HTMLElement>(null);
const handleClick = (event: React.MouseEvent<HTMLElement>) => {
@@ -771,34 +785,33 @@ const OptionsButton = ({ handleRetrain, handleEdit, handleDelete, handleDuplicat
open={Boolean(anchorEl)}
onClose={handleClose}
>
<MenuItem onClick={() => { handleRetrain(); handleClose(); }}>
<ListItemIcon>
<Refresh fontSize="small" />
</ListItemIcon>
<ListItemText>{t('recordingtable.retrain')}</ListItemText>
</MenuItem>
{robotType !== 'scrape' && (
<MenuItem onClick={() => { handleRetrain(); handleClose(); }}>
<ListItemIcon>
<Refresh fontSize="small" />
</ListItemIcon>
<ListItemText>Retrain</ListItemText>
</MenuItem>
)}
<MenuItem onClick={() => { handleEdit(); handleClose(); }}>
<ListItemIcon>
<Edit fontSize="small" />
</ListItemIcon>
<ListItemText>{t('recordingtable.edit')}</ListItemText>
<ListItemIcon><Edit fontSize="small" /></ListItemIcon>
<ListItemText>Edit</ListItemText>
</MenuItem>
<MenuItem onClick={() => { handleDelete(); handleClose(); }}>
<ListItemIcon>
<DeleteForever fontSize="small" />
</ListItemIcon>
<ListItemText>{t('recordingtable.delete')}</ListItemText>
<ListItemIcon><DeleteForever fontSize="small" /></ListItemIcon>
<ListItemText>Delete</ListItemText>
</MenuItem>
<MenuItem onClick={() => { handleDuplicate(); handleClose(); }}>
<ListItemIcon>
<ContentCopy fontSize="small" />
</ListItemIcon>
<ListItemText>{t('recordingtable.duplicate')}</ListItemText>
</MenuItem>
{robotType !== 'scrape' && (
<MenuItem onClick={() => { handleDuplicate(); handleClose(); }}>
<ListItemIcon><ContentCopy fontSize="small" /></ListItemIcon>
<ListItemText>Duplicate</ListItemText>
</MenuItem>
)}
</Menu>
</>
);
};

View File

@@ -15,12 +15,16 @@ import {
Container,
CardContent,
Tabs,
Tab
Tab,
RadioGroup,
Radio,
FormControl,
FormLabel
} from '@mui/material';
import { ArrowBack, PlayCircleOutline, Article, Code, Description } from '@mui/icons-material';
import { useGlobalInfoStore } from '../../../context/globalInfo';
import { canCreateBrowserInState, getActiveBrowserId, stopRecording } from '../../../api/recording';
import { createMarkdownRobot } from "../../../api/storage";
import { createScrapeRobot } from "../../../api/storage";
import { AuthContext } from '../../../context/auth';
import { GenericModal } from '../../ui/GenericModal';
@@ -54,11 +58,12 @@ const RobotCreate: React.FC = () => {
const [tabValue, setTabValue] = useState(0);
const [url, setUrl] = useState('');
const [markdownRobotName, setMarkdownRobotName] = useState('');
const [scrapeRobotName, setScrapeRobotName] = useState('');
const [needsLogin, setNeedsLogin] = useState(false);
const [isLoading, setIsLoading] = useState(false);
const [isWarningModalOpen, setWarningModalOpen] = useState(false);
const [activeBrowserId, setActiveBrowserId] = useState('');
const [outputFormats, setOutputFormats] = useState<string[]>([]);
const { state } = React.useContext(AuthContext);
const { user } = state;
@@ -200,7 +205,7 @@ const RobotCreate: React.FC = () => {
}}
>
<Tab label="Extract" id="extract-robot" aria-controls="extract-robot" />
<Tab label="Markdown" id="markdown-robot" aria-controls="markdown-robot" />
<Tab label="Scrape" id="scrape-robot" aria-controls="scrape-robot" />
</Tabs>
</Box>
@@ -370,7 +375,7 @@ const RobotCreate: React.FC = () => {
/>
<Typography variant="body2" color="text.secondary" mb={3}>
Turn websites into LLM-ready Markdown content for AI apps.
Turn websites into LLM-ready Markdown or clean HTML content for AI apps.
</Typography>
<Box sx={{ width: '100%', maxWidth: 700, mb: 2 }}>
@@ -378,8 +383,8 @@ const RobotCreate: React.FC = () => {
placeholder="Example: YC Companies Scraper"
variant="outlined"
fullWidth
value={markdownRobotName}
onChange={(e) => setMarkdownRobotName(e.target.value)}
value={scrapeRobotName}
onChange={(e) => setScrapeRobotName(e.target.value)}
sx={{ mb: 2 }}
label="Robot Name"
/>
@@ -390,7 +395,44 @@ const RobotCreate: React.FC = () => {
value={url}
onChange={(e) => setUrl(e.target.value)}
label="Website URL"
sx={{ mb: 2 }}
/>
<FormControl component="fieldset" sx={{ width: '100%', textAlign: 'left' }}>
<FormLabel component="legend" sx={{ mb: 1 }}>Output Format (Select at least one)</FormLabel>
<FormControlLabel
control={
<Checkbox
checked={outputFormats.includes('markdown')}
onChange={(e) => {
if (e.target.checked) {
setOutputFormats([...outputFormats, 'markdown']);
} else {
setOutputFormats(outputFormats.filter(f => f !== 'markdown'));
}
}}
/>
}
label="Markdown"
/>
<FormControlLabel
control={
<Checkbox
checked={outputFormats.includes('html')}
onChange={(e) => {
if (e.target.checked) {
setOutputFormats([...outputFormats, 'html']);
} else {
setOutputFormats(outputFormats.filter(f => f !== 'html'));
}
}}
/>
}
label="HTML"
/>
</FormControl>
</Box>
<Button
@@ -401,23 +443,28 @@ const RobotCreate: React.FC = () => {
notify('error', 'Please enter a valid URL');
return;
}
if (!markdownRobotName.trim()) {
if (!scrapeRobotName.trim()) {
notify('error', 'Please enter a robot name');
return;
}
if (outputFormats.length === 0) {
notify('error', 'Please select at least one output format');
return;
}
setIsLoading(true);
const result = await createMarkdownRobot(url, markdownRobotName);
const result = await createScrapeRobot(url, scrapeRobotName, outputFormats);
setIsLoading(false);
if (result) {
setRerenderRobots(true);
notify('success', `${markdownRobotName} created successfully!`);
notify('success', `${scrapeRobotName} created successfully!`);
navigate('/robots');
} else {
notify('error', 'Failed to create markdown robot');
}
}}
disabled={!url.trim() || !markdownRobotName.trim() || isLoading}
disabled={!url.trim() || !scrapeRobotName.trim() || outputFormats.length === 0 || isLoading}
sx={{
bgcolor: '#ff00c3',
py: 1.4,
@@ -428,7 +475,10 @@ const RobotCreate: React.FC = () => {
}}
startIcon={isLoading ? <CircularProgress size={20} color="inherit" /> : null}
>
{isLoading ? 'Turning...' : 'Turn to Markdown'}
{isLoading
? "Creating..."
: `Create ${outputFormats.join(" + ").toUpperCase()} Robot`
}
</Button>
</Box>
</Card>

View File

@@ -24,8 +24,9 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
type?: 'traditional' | 'markdown';
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
}
interface RobotWorkflow {

View File

@@ -24,8 +24,9 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
type?: 'traditional' | 'markdown';
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
}
interface RobotWorkflow {

View File

@@ -16,8 +16,9 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
type?: 'traditional' | 'markdown';
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
}
interface RobotWorkflow {

View File

@@ -27,8 +27,9 @@ interface RobotMeta {
pairs: number;
updatedAt: string;
params: any[];
type?: 'traditional' | 'markdown';
type?: 'extract' | 'scrape';
url?: string;
formats?: ('markdown' | 'html')[];
}
interface RobotWorkflow {