Add SOP-to-blocks endpoint for workflow editor - backend (#4556)

2026-01-27 18:16:05 +03:00
parent 16945e117f
commit cb2a72775d
3 changed files with 123 additions and 2 deletions
--- a/skyvern/forge/prompts/skyvern/build-workflow-from-pdf.j2
+++ b/skyvern/forge/prompts/skyvern/build-workflow-from-pdf.j2
@@ -1,5 +1,22 @@
 You are an AI assistant that converts Standard Operating Procedures (SOP) from text into a Skyvern workflow definition in JSON format.

+FIRST: Evaluate if this content contains procedural instructions.
+
+A valid SOP MUST contain:
+- Step-by-step instructions telling someone what to DO
+- Action verbs such as: go to, click, navigate, enter, fill, submit, download, or extract
+- Sequential steps for completing a task
+
+Note: A document may include reference materials (like sample invoices, screenshots, or examples) alongside instructions - this is still a valid SOP as long as it contains procedural steps.
+
+If the content has NO procedural instructions (just data, information, or documents without steps), return ONLY this JSON:
+{
+  "error": "not_sop",
+  "reason": "This document does not contain procedural instructions or steps to automate."
+}
+
+If the content DOES contain procedural instructions, proceed with the conversion below.
+
 REQUIRED OUTPUT FORMAT:
 Return a JSON object with this structure:
 {
@@ -68,7 +85,7 @@ CRITICAL INSTRUCTIONS - READ CAREFULLY:
 9. **AVOID VALIDATION BLOCKS**: Use "extraction" blocks for data extraction. Only use "validation" if explicitly validating previous extracted data, and always include complete_criterion.
 10. Set continue_on_failure to false for critical steps, true for optional ones
 11. Set engine to "skyvern-1.0" for all blocks that need it
-12. Use clear, descriptive labels that match the SOP terminology
+12. Use SHORT descriptive labels (1-5 words, snake_case): e.g., "login", "extract_date_data", "submit_google_login_form". Avoid long labels.

 EXAMPLES OF THOROUGHNESS:
 - If SOP says "Navigate to page X, then click button Y, then fill form Z" → Create 3 separate blocks
--- a/skyvern/forge/sdk/routes/agent_protocol.py
+++ b/skyvern/forge/sdk/routes/agent_protocol.py
@@ -664,6 +664,76 @@ async def _validate_file_size(file: UploadFile) -> UploadFile:
    return file


+@legacy_base_router.post(
+    "/workflows/sop-to-blocks",
+    response_model=dict[str, Any],
+    include_in_schema=False,
+)
+@legacy_base_router.post(
+    "/workflows/sop-to-blocks/",
+    response_model=dict[str, Any],
+    include_in_schema=False,
+)
+async def convert_sop_to_blocks(
+    file: UploadFile = Depends(_validate_file_size),
+    current_org: Organization = Depends(org_auth_service.get_current_org),
+) -> dict[str, Any]:
+    """Convert a PDF SOP to workflow blocks without creating a workflow.
+
+    The uploaded PDF is read, its text is extracted in a worker thread, and the
+    text is passed to ``pdf_import_service.create_workflow_from_sop_text``. The
+    resulting blocks are then reshaped so that each block carries a
+    ``parameters`` list instead of ``parameter_keys``.
+
+    Args:
+        file: Uploaded file, already passed through ``_validate_file_size``.
+        current_org: Organization resolved by the auth dependency.
+
+    Returns:
+        A dict with two keys: ``"blocks"`` (list of block dicts, each with a
+        ``"parameters"`` list of ``{"key": ...}`` entries) and ``"parameters"``
+        (the workflow-level parameters, or an empty list).
+
+    Raises:
+        HTTPException: 400 when the filename is missing or does not end with
+            ``.pdf``; 422 when the conversion fails with an unexpected error.
+            ``HTTPException``s raised by the conversion call are re-raised
+            unchanged.
+    """
+    analytics.capture(
+        "skyvern-oss-workflow-sop-to-blocks",
+        data={"organization_id": current_org.organization_id},
+    )
+
+    # Validate PDF
+    if not file.filename or not file.filename.lower().endswith(".pdf"):
+        raise HTTPException(status_code=400, detail="Only PDF files are supported.")
+
+    # Always release the upload, even if reading fails.
+    try:
+        file_contents = await file.read()
+        file_name = file.filename
+    finally:
+        await file.close()
+
+    # Extract text from PDF (run in a thread so the event loop is not blocked)
+    sop_text = await asyncio.to_thread(
+        pdf_import_service.extract_text_from_pdf,
+        file_contents,
+        file_name,
+    )
+
+    # Convert to workflow definition via LLM
+    try:
+        result = await pdf_import_service.create_workflow_from_sop_text(sop_text, current_org)
+    except HTTPException:
+        # Preserve the status code and detail chosen by the service.
+        raise
+    except Exception as e:
+        LOG.exception(
+            "Failed to convert SOP to blocks",
+            organization_id=current_org.organization_id,
+            filename=file_name,
+        )
+        raise HTTPException(
+            status_code=422,
+            detail="Failed to convert SOP to workflow blocks. Please verify the PDF content and try again.",
+        ) from e
+
+    # `or {}` / `or []` (rather than a .get default) also covers keys that are
+    # present but explicitly null in the LLM output.
+    workflow_def = result.get("workflow_definition") or {}
+
+    # Transform blocks: convert parameter_keys (backend format) to parameters (frontend format)
+    # This is done here rather than in _sanitize_workflow_json because the import-pdf endpoint
+    # needs the backend format for WorkflowCreateYAMLRequest validation
+    # Create shallow copies to avoid mutating shared data structures
+    blocks: list[dict[str, Any]] = []
+    for raw_block in workflow_def.get("blocks") or []:
+        # Skip malformed (non-object) entries instead of failing the whole request.
+        if not isinstance(raw_block, dict):
+            continue
+        block = dict(raw_block)
+        parameter_keys = block.pop("parameter_keys", None) or []
+        block["parameters"] = [{"key": key} for key in parameter_keys]
+        blocks.append(block)
+
+    return {
+        "blocks": blocks,
+        "parameters": workflow_def.get("parameters") or [],
+    }
+
+
@legacy_base_router.post(
    "/workflows/import-pdf",
    response_model=dict[str, Any],
--- a/skyvern/services/pdf_import_service.py
+++ b/skyvern/services/pdf_import_service.py
@@ -21,6 +21,7 @@ class PDFImportService:
    def _sanitize_workflow_json(raw: dict[str, Any]) -> dict[str, Any]:
        """Clean LLM JSON to match Skyvern schema conventions and avoid Jinja errors.

+        - Replace whitespace in block labels with underscores
        - Replace Jinja refs like {{workflow.foo}} or {{parameters.foo}} with {{foo}}
        - Auto-populate block.parameter_keys with any referenced parameter keys
        - Ensure all block labels are unique by appending indices to duplicates
@@ -47,7 +48,23 @@ class PDFImportService:

        blocks = workflow_def.get("blocks", []) or []

-        # First pass: deduplicate block labels
+        # First pass: sanitize block labels (replace whitespace with underscores)
+        for blk in blocks:
+            if not isinstance(blk, dict):
+                continue
+            label = blk.get("label", "")
+            if label:
+                # Replace any whitespace with underscores (same as frontend behavior)
+                sanitized_label = re.sub(r"\s+", "_", label)
+                if sanitized_label != label:
+                    LOG.info(
+                        "Sanitizing block label",
+                        original_label=label,
+                        sanitized_label=sanitized_label,
+                    )
+                    blk["label"] = sanitized_label
+
+        # Second pass: deduplicate block labels
        seen_labels: dict[str, int] = {}
        deduplicated_count = 0
        for blk in blocks:
@@ -130,6 +147,9 @@ class PDFImportService:
                if blk.get("url") is None:
                    blk["url"] = ""

+            # Note: parameter_keys is kept in backend format for WorkflowCreateYAMLRequest validation
+            # The sop-to-blocks endpoint transforms to frontend format separately
+
        return raw

    def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
@@ -212,6 +232,20 @@ class PDFImportService:
            )
            raise HTTPException(status_code=422, detail="LLM returned invalid response format - expected JSON object")

+        # Check if LLM detected non-SOP content
+        if response.get("error") == "not_sop":
+            LOG.info(
+                "LLM detected non-SOP content",
+                reason=response.get("reason"),
+                organization_id=organization.organization_id,
+            )
+            raise HTTPException(
+                status_code=422,
+                detail=response.get(
+                    "reason", "The uploaded PDF does not appear to contain a Standard Operating Procedure."
+                ),
+            )
+
        # Validate that it has the required structure
        if "workflow_definition" not in response:
            LOG.error(