diff --git a/skyvern-frontend/src/routes/workflows/ImportWorkflowButton.tsx b/skyvern-frontend/src/routes/workflows/ImportWorkflowButton.tsx index b618752f..9e7c40e9 100644 --- a/skyvern-frontend/src/routes/workflows/ImportWorkflowButton.tsx +++ b/skyvern-frontend/src/routes/workflows/ImportWorkflowButton.tsx @@ -67,16 +67,55 @@ function ImportWorkflowButton() { { if (event.target.files && event.target.files[0]) { - const fileTextContent = await event.target.files[0].text(); - const isJson = isJsonString(fileTextContent); - const content = isJson - ? convertToYAML(JSON.parse(fileTextContent)) - : fileTextContent; - createWorkflowFromYamlMutation.mutate(content); + const file = event.target.files[0]; + const fileName = file.name.toLowerCase(); + + if (fileName.endsWith(".pdf")) { + // Handle PDF file - send as FormData to new endpoint + const formData = new FormData(); + formData.append("file", file); + + const client = await getClient(credentialGetter); + try { + const response = await client.post( + "/workflows/import-pdf", + formData, + { + headers: { + "Content-Type": "multipart/form-data", + }, + }, + ); + + queryClient.invalidateQueries({ + queryKey: ["workflows"], + }); + navigate( + `/workflows/${response.data.workflow_permanent_id}/debug`, + ); + } catch (error) { + toast({ + title: "Import Failed", + description: + error instanceof Error + ? error.message + : "Failed to import PDF", + variant: "destructive", + }); + } + } else { + // Non-pdf files like yaml, json + const fileTextContent = await file.text(); + const isJson = isJsonString(fileTextContent); + const content = isJson + ? 
convertToYAML(JSON.parse(fileTextContent)) + : fileTextContent; + createWorkflowFromYamlMutation.mutate(content); + } } }} /> @@ -91,7 +130,7 @@ function ImportWorkflowButton() { - Import a workflow from a YAML or JSON file + Import a workflow from a YAML, JSON, or PDF file diff --git a/skyvern/forge/prompts/skyvern/build-workflow-from-pdf.j2 b/skyvern/forge/prompts/skyvern/build-workflow-from-pdf.j2 new file mode 100644 index 00000000..71c30faa --- /dev/null +++ b/skyvern/forge/prompts/skyvern/build-workflow-from-pdf.j2 @@ -0,0 +1,84 @@ +You are an AI assistant that converts Standard Operating Procedures (SOP) from text into a Skyvern workflow definition in JSON format. + +REQUIRED OUTPUT FORMAT: +Return a JSON object with this structure: +{ + "title": "workflow_name", + "workflow_definition": { + "parameters": [ + { + "key": "parameter_name", + "parameter_type": "workflow", + "workflow_parameter_type": "string", + "description": "Parameter description", + "default_value": null + } + ], + "blocks": [ + { + "label": "block_name", + "block_type": "block_type_name", + "continue_on_failure": false, + // ... 
other required fields for each block type + } + ] + } +} + +PARAMETER STRUCTURE: +- Each parameter MUST have: "key", "parameter_type", "workflow_parameter_type" +- "parameter_type" should always be "workflow" +- "workflow_parameter_type" can be: "string", "json", "credential_id", "file_url" +- Use "credential_id" for passwords/credentials +- "description" and "default_value" are optional + +AVAILABLE BLOCK TYPES (use these exact names): +- "login": For user authentication with credentials +- "navigation": For navigating to pages and filling forms +- "action": For clicking buttons or simple actions +- "extraction": For extracting data from pages +- "task": For complex tasks with both navigation and extraction +- "file_download": For downloading files +- "for_loop": For repeating actions over a list +- "validation": For validating extracted data +- "wait": For waiting/pausing +- "code": For custom code execution +- "text_prompt": For LLM text generation +- "http_request": For API calls + +BLOCK STRUCTURE REQUIREMENTS: +1. Each block MUST have: label, block_type, continue_on_failure +2. Navigation blocks need: url (can be empty string ""), navigation_goal, engine (set to "skyvern-1.0") +3. Login blocks need: url, navigation_goal, parameter_keys (empty array if no credentials), engine (set to "skyvern-1.0") +4. Extraction blocks need: url (can be empty string ""), data_extraction_goal, data_schema, engine (set to "skyvern-1.0") +5. Action blocks need: url (can be empty string ""), navigation_goal, engine (set to "skyvern-1.0") +6. Validation blocks need: complete_criterion OR terminate_criterion (at least one must be set), parameter_keys (empty array if none) +7. For_loop blocks need: loop_blocks, loop_variable_reference +8. File_download blocks need: url (can be empty string ""), navigation_goal, engine (set to "skyvern-1.0") + +CRITICAL INSTRUCTIONS - READ CAREFULLY: +1. **BE THOROUGH**: Convert EVERY automatable step from the SOP into a block. 
Do not skip or combine steps. +2. **PRESERVE SPECIFICITY**: If the SOP gives specific instructions (e.g., "click X then Y", "fill field A with value B"), create separate blocks for each action. DO NOT generalize or simplify. +3. **ONE ACTION PER BLOCK**: For "action" blocks, each should do ONE specific thing (one click, one navigation). Create multiple blocks if needed. +4. **DETAILED navigation_goal**: Copy the exact instructions from the SOP into the navigation_goal field. Be as specific as the original SOP. +5. **MAINTAIN ORDER**: Keep the exact order of steps from the SOP. Do not reorganize or optimize. +6. **INCLUDE ALL CONDITIONS**: If the SOP says "if X then Y", create conditional blocks or separate blocks for each scenario. +7. **IGNORE ONLY**: Skip only steps requiring human judgment, creativity, or physical actions. Include everything else. +8. **URL FIELD**: Most blocks need a "url" field. Use empty string "" if no specific URL is needed (browser stays on current page). +9. **AVOID VALIDATION BLOCKS**: Use "extraction" blocks for data extraction. Only use "validation" if explicitly validating previously extracted data, and always include complete_criterion. +10. Set continue_on_failure to false for critical steps, true for optional ones +11. Set engine to "skyvern-1.0" for all blocks that need it +12. Use clear, descriptive labels that match the SOP terminology + +EXAMPLES OF THOROUGHNESS: +- If SOP says "Navigate to page X, then click button Y, then fill form Z" → Create 3 separate blocks +- If SOP says "Click the 'Products' link in the top left" → Use that EXACT wording in navigation_goal +- If SOP has 20 steps → Your workflow should have ~20 blocks (one per step) + +Standard Operating Procedure: +``` +{{ sop_text }} +``` + +Return ONLY a valid JSON object following the structure above. Create a comprehensive workflow that captures EVERY automatable step from the SOP with full specificity. 
+ diff --git a/skyvern/forge/sdk/routes/agent_protocol.py b/skyvern/forge/sdk/routes/agent_protocol.py index 374d2be9..ed9c4eb4 100644 --- a/skyvern/forge/sdk/routes/agent_protocol.py +++ b/skyvern/forge/sdk/routes/agent_protocol.py @@ -98,6 +98,7 @@ from skyvern.schemas.runs import ( ) from skyvern.schemas.workflows import BlockType, WorkflowCreateYAMLRequest, WorkflowRequest, WorkflowStatus from skyvern.services import block_service, run_service, task_v1_service, task_v2_service, workflow_service +from skyvern.services.pdf_import_service import pdf_import_service from skyvern.webeye.actions.actions import Action LOG = structlog.get_logger() @@ -588,6 +589,47 @@ async def create_workflow_from_prompt( return workflow.model_dump(by_alias=True) +@legacy_base_router.post( + "/workflows/import-pdf", + response_model=dict[str, Any], + tags=["agent"], + openapi_extra={ + "x-fern-sdk-method-name": "import_workflow_from_pdf", + "x-fern-examples": [ + { + "code-samples": [ + { + "sdk": "curl", + "code": 'curl -X POST "https://api.skyvern.com/workflows/import-pdf" \\\n -H "Authorization: Bearer YOUR_API_KEY" \\\n -F "file=@sop_document.pdf"', + } + ] + } + ], + }, + description="Import a workflow from a PDF containing Standard Operating Procedures", + summary="Import workflow from PDF", + responses={ + 200: {"description": "Successfully imported workflow from PDF"}, + 400: {"description": "Invalid PDF file or no content found"}, + 422: {"description": "Failed to convert SOP to workflow"}, + 500: {"description": "Internal server error during processing"}, + }, +) +@legacy_base_router.post( + "/workflows/import-pdf/", + response_model=dict[str, Any], + include_in_schema=False, +) +async def import_workflow_from_pdf( + file: UploadFile, + current_org: Organization = Depends(org_auth_service.get_current_org), +) -> dict[str, Any]: + """Import a workflow from a PDF file containing Standard Operating Procedures.""" + analytics.capture("skyvern-oss-workflow-import-pdf") + + return 
await pdf_import_service.import_workflow_from_pdf(file, current_org) + + @legacy_base_router.put( "/workflows/{workflow_id}", openapi_extra={ diff --git a/skyvern/services/pdf_import_service.py b/skyvern/services/pdf_import_service.py new file mode 100644 index 00000000..b8defd64 --- /dev/null +++ b/skyvern/services/pdf_import_service.py @@ -0,0 +1,288 @@ +import os +import re +import tempfile +from typing import Any + +import structlog +from fastapi import HTTPException, UploadFile +from pypdf import PdfReader + +from skyvern.config import settings +from skyvern.forge import app +from skyvern.forge.prompts import prompt_engine +from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory +from skyvern.forge.sdk.schemas.organizations import Organization +from skyvern.schemas.workflows import WorkflowCreateYAMLRequest + +LOG = structlog.get_logger(__name__) + + +class PDFImportService: + @staticmethod + def _sanitize_workflow_json(raw: dict[str, Any]) -> dict[str, Any]: + """Clean LLM JSON to match Skyvern schema conventions and avoid Jinja errors. 
+ + - Replace Jinja refs like {{workflow.foo}} or {{parameters.foo}} with {{foo}} + - Auto-populate block.parameter_keys with any referenced parameter keys + - Ensure all block labels are unique by appending indices to duplicates + """ + + def strip_prefixes(text: str) -> tuple[str, set[str]]: + # Replace {{ workflow.xxx }} and {{ parameters.xxx }} with {{ xxx }} + cleaned = text + cleaned = re.sub(r"\{\{\s*workflow\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", cleaned) + cleaned = re.sub(r"\{\{\s*parameters\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", cleaned) + + # Collect jinja variable names (take first segment before any dot) + used: set[str] = set() + for match in re.finditer(r"\{\{\s*([^\}\s\|]+)\s*[^}]*\}\}", cleaned): + var = match.group(1) + # Use base segment before dot to match parameter keys + base = var.split(".")[0] + used.add(base) + return cleaned, used + + workflow_def = raw.get("workflow_definition", {}) + param_defs = workflow_def.get("parameters", []) or [] + param_keys = {p.get("key") for p in param_defs if isinstance(p, dict) and p.get("key")} + + blocks = workflow_def.get("blocks", []) or [] + + # First pass: deduplicate block labels + seen_labels: dict[str, int] = {} + deduplicated_count = 0 + for blk in blocks: + if not isinstance(blk, dict): + continue + label = blk.get("label", "") + if not label: + continue + + if label in seen_labels: + # This label has been seen before, append index + seen_labels[label] += 1 + new_label = f"{label}_{seen_labels[label]}" + LOG.info( + "Deduplicating block label", + original_label=label, + new_label=new_label, + occurrence=seen_labels[label], + ) + blk["label"] = new_label + deduplicated_count += 1 + else: + # First time seeing this label + seen_labels[label] = 1 + + if deduplicated_count > 0: + LOG.info( + "Deduplicated block labels", + total_deduplicated=deduplicated_count, + duplicate_labels=sorted([label for label, count in seen_labels.items() if count > 1]), + ) + for blk in blocks: + if not isinstance(blk, 
dict): + continue + referenced: set[str] = set() + # Fields that commonly contain Jinja + for field in [ + "url", + "navigation_goal", + "data_extraction_goal", + "complete_criterion", + "terminate_criterion", + "title", + ]: + val = blk.get(field) + if isinstance(val, str): + cleaned, used = strip_prefixes(val) + blk[field] = cleaned + referenced.update(used) + + # Ensure required fields for text_prompt blocks + if blk.get("block_type") == "text_prompt": + if not blk.get("prompt"): + # Prefer an instruction-bearing field if present + blk["prompt"] = ( + blk.get("navigation_goal") + or blk.get("title") + or blk.get("label") + or "Provide the requested text response." + ) + # Track jinja usage within the prompt + prompt_val = blk.get("prompt") + if isinstance(prompt_val, str): + cleaned, used = strip_prefixes(prompt_val) + blk["prompt"] = cleaned + referenced.update(used) + + # parameter_keys should include only known parameter keys + if param_keys: + keys_to_include = sorted(k for k in referenced if k in param_keys) + if keys_to_include: + blk["parameter_keys"] = keys_to_include + + # Ensure engine where needed + if blk.get("block_type") in {"navigation", "action", "extraction", "login", "file_download"}: + blk.setdefault("engine", "skyvern-1.0") + + # Ensure url exists (can be empty string) + if blk.get("block_type") in {"navigation", "action", "extraction", "file_download"}: + if blk.get("url") is None: + blk["url"] = "" + + return raw + + async def import_workflow_from_pdf(self, file: UploadFile, organization: Organization) -> dict[str, Any]: + LOG.info("Starting PDF import", filename=file.filename, organization_id=organization.organization_id) + + if not file.filename.lower().endswith(".pdf"): + raise HTTPException(status_code=400, detail="Only PDF files are supported.") + + # Save the uploaded file to a temporary location + LOG.info("Saving PDF to temporary file", filename=file.filename) + with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as 
temp_file: + temp_file.write(await file.read()) + temp_file_path = temp_file.name + + try: + # Extract text from PDF + LOG.info("Extracting text from PDF", filename=file.filename, temp_file=temp_file_path) + reader = PdfReader(temp_file_path) + sop_text = "" + for page_num, page in enumerate(reader.pages, 1): + page_text = page.extract_text() + sop_text += page_text + "\n" + LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text)) + + LOG.info( + "PDF text extraction complete", + total_text_length=len(sop_text), + organization_id=organization.organization_id, + ) + + if not sop_text.strip(): + raise HTTPException(status_code=400, detail="No readable content found in the PDF.") + + # Load and render the prompt template + prompt = prompt_engine.load_prompt( + "build-workflow-from-pdf", + sop_text=sop_text, + ) + + # Use the LLM to convert SOP to workflow + llm_key = settings.LLM_KEY or "gpt-4o-mini" + LOG.info( + "Calling LLM to convert SOP to workflow", + llm_key=llm_key, + prompt_length=len(prompt), + sop_text_length=len(sop_text), + sop_chars_sent=len(sop_text), + organization_id=organization.organization_id, + ) + + llm_api_handler = LLMAPIHandlerFactory.get_llm_api_handler(llm_key) + + response = await llm_api_handler( + prompt=prompt, + prompt_name="sop_to_workflow_conversion", + organization_id=organization.organization_id, + parameters={"max_completion_tokens": 32768}, # Override the default 4096 limit for PDF conversion + ) + + LOG.info( + "LLM response received", + response_type=type(response), + response_keys=list(response.keys()) if isinstance(response, dict) else None, + organization_id=organization.organization_id, + ) + + # The LLM API handler automatically parses JSON responses + # The response should be a dict with the workflow structure + if not isinstance(response, dict): + LOG.error( + "LLM returned non-dict response", + response_type=type(response), + response=str(response)[:500], + 
organization_id=organization.organization_id, + ) + raise HTTPException( + status_code=422, detail="LLM returned invalid response format - expected JSON object" + ) + + # Validate that it has the required structure + if "workflow_definition" not in response: + LOG.error( + "LLM response missing workflow_definition", + response_keys=list(response.keys()), + organization_id=organization.organization_id, + ) + raise HTTPException(status_code=422, detail="LLM response missing 'workflow_definition' field") + + if "blocks" not in response.get("workflow_definition", {}): + LOG.error( + "LLM workflow_definition missing blocks", + workflow_def_keys=list(response.get("workflow_definition", {}).keys()), + organization_id=organization.organization_id, + ) + raise HTTPException(status_code=422, detail="LLM workflow definition missing 'blocks' field") + + LOG.info( + "Workflow JSON validated", + title=response.get("title"), + block_count=len(response.get("workflow_definition", {}).get("blocks", [])), + organization_id=organization.organization_id, + ) + + LOG.info( + "Creating workflow from JSON", + response_keys=list(response.keys()), + organization_id=organization.organization_id, + ) + + try: + # Sanitize LLM output for Jinja and required fields before validation + response = self._sanitize_workflow_json(response) + workflow_create_request = WorkflowCreateYAMLRequest.model_validate(response) + except Exception as e: + LOG.error( + "Failed to validate workflow request", + error=str(e), + error_type=type(e).__name__, + response_sample=str(response)[:1000], + organization_id=organization.organization_id, + exc_info=True, + ) + raise HTTPException(status_code=422, detail=f"Failed to validate workflow structure: {str(e)}") + + try: + workflow = await app.WORKFLOW_SERVICE.create_workflow_from_request( + organization=organization, + request=workflow_create_request, + ) + except Exception as e: + LOG.error( + "Failed to create workflow", + error=str(e), + error_type=type(e).__name__, 
+ organization_id=organization.organization_id, + exc_info=True, + ) + raise HTTPException(status_code=422, detail=f"Failed to create workflow: {str(e)}") + + workflow_dict = workflow.model_dump(by_alias=True) + LOG.info( + "PDF import completed successfully", + workflow_id=workflow.workflow_permanent_id, + workflow_permanent_id_in_dict=workflow_dict.get("workflow_permanent_id"), + dict_keys=list(workflow_dict.keys()), + organization_id=organization.organization_id, + ) + return workflow_dict + + finally: + # Clean up the temporary file + os.unlink(temp_file_path) + + +pdf_import_service = PDFImportService()