Pedro/add sop workflow builder (#3685)

This commit is contained in:
pedrohsdb
2025-10-10 12:37:42 -07:00
committed by GitHub
parent f5a313c74b
commit ae7785d426
4 changed files with 461 additions and 8 deletions

View File

@@ -67,16 +67,55 @@ function ImportWorkflowButton() {
<input <input
id={inputId} id={inputId}
type="file" type="file"
accept=".yaml,.yml,.json" accept=".yaml,.yml,.json,.pdf"
className="hidden" className="hidden"
onChange={async (event) => { onChange={async (event) => {
if (event.target.files && event.target.files[0]) { if (event.target.files && event.target.files[0]) {
const fileTextContent = await event.target.files[0].text(); const file = event.target.files[0];
const isJson = isJsonString(fileTextContent); const fileName = file.name.toLowerCase();
const content = isJson
? convertToYAML(JSON.parse(fileTextContent)) if (fileName.endsWith(".pdf")) {
: fileTextContent; // Handle PDF file - send as FormData to new endpoint
createWorkflowFromYamlMutation.mutate(content); const formData = new FormData();
formData.append("file", file);
const client = await getClient(credentialGetter);
try {
const response = await client.post<WorkflowApiResponse>(
"/workflows/import-pdf",
formData,
{
headers: {
"Content-Type": "multipart/form-data",
},
},
);
queryClient.invalidateQueries({
queryKey: ["workflows"],
});
navigate(
`/workflows/${response.data.workflow_permanent_id}/debug`,
);
} catch (error) {
toast({
title: "Import Failed",
description:
error instanceof Error
? error.message
: "Failed to import PDF",
variant: "destructive",
});
}
} else {
// Non-pdf files like yaml, json
const fileTextContent = await file.text();
const isJson = isJsonString(fileTextContent);
const content = isJson
? convertToYAML(JSON.parse(fileTextContent))
: fileTextContent;
createWorkflowFromYamlMutation.mutate(content);
}
} }
}} }}
/> />
@@ -91,7 +130,7 @@ function ImportWorkflowButton() {
</Label> </Label>
</TooltipTrigger> </TooltipTrigger>
<TooltipContent> <TooltipContent>
Import a workflow from a YAML or JSON file Import a workflow from a YAML, JSON, or PDF file
</TooltipContent> </TooltipContent>
</Tooltip> </Tooltip>
</TooltipProvider> </TooltipProvider>

View File

@@ -0,0 +1,84 @@
You are an AI assistant that converts Standard Operating Procedures (SOP) from text into a Skyvern workflow definition in JSON format.
REQUIRED OUTPUT FORMAT:
Return a JSON object with this structure:
{
"title": "workflow_name",
"workflow_definition": {
"parameters": [
{
"key": "parameter_name",
"parameter_type": "workflow",
"workflow_parameter_type": "string",
"description": "Parameter description",
"default_value": null
}
],
"blocks": [
{
"label": "block_name",
"block_type": "block_type_name",
"continue_on_failure": false,
// ... other required fields for each block type
}
]
}
}
PARAMETER STRUCTURE:
- Each parameter MUST have: "key", "parameter_type", "workflow_parameter_type"
- "parameter_type" should always be "workflow"
- "workflow_parameter_type" can be: "string", "json", "credential_id", "file_url"
- Use "credential_id" for passwords/credentials
- "description" and "default_value" are optional
AVAILABLE BLOCK TYPES (use these exact names):
- "login": For user authentication with credentials
- "navigation": For navigating to pages and filling forms
- "action": For clicking buttons or simple actions
- "extraction": For extracting data from pages
- "task": For complex tasks with both navigation and extraction
- "file_download": For downloading files
- "for_loop": For repeating actions over a list
- "validation": For validating extracted data
- "wait": For waiting/pausing
- "code": For custom code execution
- "text_prompt": For LLM text generation
- "http_request": For API calls
BLOCK STRUCTURE REQUIREMENTS:
1. Each block MUST have: label, block_type, continue_on_failure
2. Navigation blocks need: url (can be empty string ""), navigation_goal, engine (set to "skyvern-1.0")
3. Login blocks need: url, navigation_goal, parameter_keys (empty array if no credentials), engine (set to "skyvern-1.0")
4. Extraction blocks need: url (can be empty string ""), data_extraction_goal, data_schema, engine (set to "skyvern-1.0")
5. Action blocks need: url (can be empty string ""), navigation_goal, engine (set to "skyvern-1.0")
6. Validation blocks need: complete_criterion OR terminate_criterion (at least one must be set), parameter_keys (empty array if none)
7. For_loop blocks need: loop_blocks, loop_variable_reference
8. File_download blocks need: url (can be empty string ""), navigation_goal, engine (set to "skyvern-1.0")
CRITICAL INSTRUCTIONS - READ CAREFULLY:
1. **BE THOROUGH**: Convert EVERY automatable step from the SOP into a block. Do not skip or combine steps.
2. **PRESERVE SPECIFICITY**: If the SOP gives specific instructions (e.g., "click X then Y", "fill field A with value B"), create separate blocks for each action. DO NOT generalize or simplify.
3. **ONE ACTION PER BLOCK**: For "action" blocks, each should do ONE specific thing (one click, one navigation). Create multiple blocks if needed.
4. **DETAILED navigation_goal**: Copy the exact instructions from the SOP into the navigation_goal field. Be as specific as the original SOP.
5. **MAINTAIN ORDER**: Keep the exact order of steps from the SOP. Do not reorganize or optimize.
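6. **INCLUDE ALL CONDITIONS**: If the SOP says "if X then Y", create separate blocks for each scenario and state the condition explicitly in each block's goal text. Use only the block types listed above; there is no dedicated conditional block type.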
7. **IGNORE ONLY**: Skip only steps requiring human judgment, creativity, or physical actions. Include everything else.
8. **URL FIELD**: Most blocks need a "url" field. Use empty string "" if no specific URL is needed (browser stays on current page).
9. **AVOID VALIDATION BLOCKS**: Use "extraction" blocks for data extraction. Only use "validation" if explicitly validating previous extracted data, and always include complete_criterion.
10. Set continue_on_failure to false for critical steps, true for optional ones
11. Set engine to "skyvern-1.0" for all blocks that need it
12. Use clear, descriptive labels that match the SOP terminology
EXAMPLES OF THOROUGHNESS:
- If SOP says "Navigate to page X, then click button Y, then fill form Z" → Create 3 separate blocks
- If SOP says "Click the 'Products' link in the top left" → Use that EXACT wording in navigation_goal
- If SOP has 20 steps → Your workflow should have ~20 blocks (one per step)
Standard Operating Procedure:
```
{{ sop_text }}
```
Return ONLY a valid JSON object following the structure above. Create a comprehensive workflow that captures EVERY automatable step from the SOP with full specificity.

View File

@@ -98,6 +98,7 @@ from skyvern.schemas.runs import (
) )
from skyvern.schemas.workflows import BlockType, WorkflowCreateYAMLRequest, WorkflowRequest, WorkflowStatus from skyvern.schemas.workflows import BlockType, WorkflowCreateYAMLRequest, WorkflowRequest, WorkflowStatus
from skyvern.services import block_service, run_service, task_v1_service, task_v2_service, workflow_service from skyvern.services import block_service, run_service, task_v1_service, task_v2_service, workflow_service
from skyvern.services.pdf_import_service import pdf_import_service
from skyvern.webeye.actions.actions import Action from skyvern.webeye.actions.actions import Action
LOG = structlog.get_logger() LOG = structlog.get_logger()
@@ -588,6 +589,47 @@ async def create_workflow_from_prompt(
return workflow.model_dump(by_alias=True) return workflow.model_dump(by_alias=True)
@legacy_base_router.post(
    "/workflows/import-pdf",
    response_model=dict[str, Any],
    tags=["agent"],
    openapi_extra={
        "x-fern-sdk-method-name": "import_workflow_from_pdf",
        "x-fern-examples": [
            {
                "code-samples": [
                    {
                        "sdk": "curl",
                        "code": 'curl -X POST "https://api.skyvern.com/workflows/import-pdf" \\\n -H "Authorization: Bearer YOUR_API_KEY" \\\n -F "file=@sop_document.pdf"',
                    }
                ]
            }
        ],
    },
    description="Import a workflow from a PDF containing Standard Operating Procedures",
    summary="Import workflow from PDF",
    responses={
        200: {"description": "Successfully imported workflow from PDF"},
        400: {"description": "Invalid PDF file or no content found"},
        422: {"description": "Failed to convert SOP to workflow"},
        500: {"description": "Internal server error during processing"},
    },
)
@legacy_base_router.post("/workflows/import-pdf/", response_model=dict[str, Any], include_in_schema=False)
async def import_workflow_from_pdf(
    file: UploadFile,
    current_org: Organization = Depends(org_auth_service.get_current_org),
) -> dict[str, Any]:
    """Create a workflow from an uploaded SOP PDF.

    Records an analytics event, then hands the upload and the authenticated
    organization to ``pdf_import_service.import_workflow_from_pdf`` and returns
    whatever that call produces. The trailing-slash variant of the path is
    registered as well but hidden from the OpenAPI schema.
    """
    analytics.capture("skyvern-oss-workflow-import-pdf")
    imported_workflow = await pdf_import_service.import_workflow_from_pdf(file, current_org)
    return imported_workflow
@legacy_base_router.put( @legacy_base_router.put(
"/workflows/{workflow_id}", "/workflows/{workflow_id}",
openapi_extra={ openapi_extra={

View File

@@ -0,0 +1,288 @@
import os
import re
import tempfile
from typing import Any
import structlog
from fastapi import HTTPException, UploadFile
from pypdf import PdfReader
from skyvern.config import settings
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
from skyvern.forge.sdk.schemas.organizations import Organization
from skyvern.schemas.workflows import WorkflowCreateYAMLRequest
LOG = structlog.get_logger(__name__)
class PDFImportService:
    """Turn an uploaded SOP PDF into a workflow by way of an LLM.

    The public entry point is :meth:`import_workflow_from_pdf`. The remaining
    members are private helpers that extract text from the PDF, check the shape
    of the LLM response and clean the LLM-produced JSON before it is validated
    against ``WorkflowCreateYAMLRequest``.
    """

    # ``{{ workflow.foo }}`` / ``{{ parameters.foo }}`` are rewritten to ``{{ foo }}``.
    _WORKFLOW_PREFIX_RE = re.compile(r"\{\{\s*workflow\.([a-zA-Z0-9_\.]+)\s*\}\}")
    _PARAMETERS_PREFIX_RE = re.compile(r"\{\{\s*parameters\.([a-zA-Z0-9_\.]+)\s*\}\}")
    # Captures the first token of every ``{{ ... }}`` expression.
    _JINJA_VARIABLE_RE = re.compile(r"\{\{\s*([^\}\s\|]+)\s*[^}]*\}\}")

    # Block fields that commonly contain Jinja.
    _JINJA_FIELDS = (
        "url",
        "navigation_goal",
        "data_extraction_goal",
        "complete_criterion",
        "terminate_criterion",
        "title",
    )
    _ENGINE_BLOCK_TYPES = frozenset({"navigation", "action", "extraction", "login", "file_download"})
    _URL_BLOCK_TYPES = frozenset({"navigation", "action", "extraction", "file_download"})

    @staticmethod
    def _strip_jinja_prefixes(text: str) -> tuple[str, set[str]]:
        """Drop ``workflow.`` / ``parameters.`` prefixes from Jinja references.

        Returns:
            The cleaned text and the set of variable names referenced in it.
            Only the segment before the first dot of each variable is kept so
            it can be compared with parameter keys.
        """
        cleaned = PDFImportService._WORKFLOW_PREFIX_RE.sub(r"{{ \1 }}", text)
        cleaned = PDFImportService._PARAMETERS_PREFIX_RE.sub(r"{{ \1 }}", cleaned)
        used = {
            match.group(1).split(".")[0]
            for match in PDFImportService._JINJA_VARIABLE_RE.finditer(cleaned)
        }
        return cleaned, used

    @staticmethod
    def _dedupe_block_labels(blocks: list[Any]) -> list[tuple[str, str, int]]:
        """Rename repeated block labels in place so every label is unique.

        The first block carrying a label keeps it. Later blocks with the same
        label get ``<label>_<n>``, where ``n`` is bumped until the result does
        not clash with any other label in ``blocks`` - including labels the LLM
        itself produced (``a, a, a_2`` becomes ``a, a_3, a_2``). Entries that
        are not dicts, or whose label is empty or not a string, are skipped.

        Returns:
            One ``(original_label, new_label, suffix)`` tuple per renamed block,
            in block order.
        """

        def label_of(blk: Any) -> str:
            label = blk.get("label") if isinstance(blk, dict) else None
            return label if isinstance(label, str) else ""

        # Reserve every label already present so a generated name never reuses one.
        taken = {label_of(blk) for blk in blocks} - {""}
        last_suffix: dict[str, int] = {}
        renames: list[tuple[str, str, int]] = []
        for blk in blocks:
            label = label_of(blk)
            if not label:
                continue
            if label not in last_suffix:
                # First time seeing this label
                last_suffix[label] = 1
                continue
            suffix = last_suffix[label] + 1
            candidate = f"{label}_{suffix}"
            while candidate in taken:
                suffix += 1
                candidate = f"{label}_{suffix}"
            last_suffix[label] = suffix
            taken.add(candidate)
            blk["label"] = candidate
            renames.append((label, candidate, suffix))
        return renames

    @staticmethod
    def _sanitize_block(blk: dict[str, Any], param_keys: set[str]) -> None:
        """Clean a single block dict in place.

        - strips ``workflow.`` / ``parameters.`` prefixes from Jinja references
        - fills in ``prompt`` for ``text_prompt`` blocks that lack one
        - adds referenced, known parameter keys to ``parameter_keys``
        - defaults ``engine`` and ``url`` for the block types that need them
        """
        referenced: set[str] = set()
        for field in PDFImportService._JINJA_FIELDS:
            val = blk.get(field)
            if isinstance(val, str):
                blk[field], used = PDFImportService._strip_jinja_prefixes(val)
                referenced |= used

        block_type = blk.get("block_type")

        # Ensure required fields for text_prompt blocks
        if block_type == "text_prompt":
            if not blk.get("prompt"):
                # Prefer an instruction-bearing field if present
                blk["prompt"] = (
                    blk.get("navigation_goal")
                    or blk.get("title")
                    or blk.get("label")
                    or "Provide the requested text response."
                )
            # Track jinja usage within the prompt
            prompt_val = blk.get("prompt")
            if isinstance(prompt_val, str):
                blk["prompt"], used = PDFImportService._strip_jinja_prefixes(prompt_val)
                referenced |= used

        # Only references that match a declared parameter key are added.
        known_refs = sorted(referenced & param_keys)
        if known_refs:
            # Merge instead of overwrite: keys the LLM set explicitly (for
            # example a credential key on a login block that never appears in
            # a Jinja expression) must not be lost.
            existing = blk.get("parameter_keys")
            merged = list(existing) if isinstance(existing, list) else []
            additions = [key for key in known_refs if key not in merged]
            blk["parameter_keys"] = merged + additions

        # Ensure engine where needed
        if block_type in PDFImportService._ENGINE_BLOCK_TYPES:
            blk.setdefault("engine", "skyvern-1.0")

        # Ensure url exists (can be empty string)
        if block_type in PDFImportService._URL_BLOCK_TYPES and blk.get("url") is None:
            blk["url"] = ""

    @staticmethod
    def _sanitize_workflow_json(raw: dict[str, Any]) -> dict[str, Any]:
        """Clean LLM JSON to match Skyvern schema conventions and avoid Jinja errors.

        - Replace Jinja refs like {{workflow.foo}} or {{parameters.foo}} with {{foo}}
        - Auto-populate block.parameter_keys with any referenced parameter keys
          (existing entries are kept)
        - Ensure all block labels are unique by appending indices to duplicates

        ``raw`` is modified in place and also returned. If ``workflow_definition``
        is not a dict, or its ``blocks`` is not a list, ``raw`` is returned
        untouched so that schema validation can report the problem.

        NOTE(review): only top-level blocks are processed; blocks nested under a
        for_loop's ``loop_blocks`` are not visited - confirm whether they should be.
        """
        workflow_def = raw.get("workflow_definition")
        if not isinstance(workflow_def, dict):
            return raw
        blocks = workflow_def.get("blocks")
        if not isinstance(blocks, list):
            return raw

        param_defs = workflow_def.get("parameters")
        if not isinstance(param_defs, list):
            param_defs = []
        param_keys = {
            p["key"] for p in param_defs if isinstance(p, dict) and isinstance(p.get("key"), str) and p["key"]
        }

        # First pass: deduplicate block labels
        renames = PDFImportService._dedupe_block_labels(blocks)
        for original_label, new_label, occurrence in renames:
            LOG.info(
                "Deduplicating block label",
                original_label=original_label,
                new_label=new_label,
                occurrence=occurrence,
            )
        if renames:
            LOG.info(
                "Deduplicated block labels",
                total_deduplicated=len(renames),
                duplicate_labels=sorted({original_label for original_label, _, _ in renames}),
            )

        # Second pass: per-block cleanup
        for blk in blocks:
            if isinstance(blk, dict):
                PDFImportService._sanitize_block(blk, param_keys)
        return raw

    @staticmethod
    def _extract_pdf_text(pdf_path: str) -> str:
        """Return the text of every page of the PDF at ``pdf_path``, one page per line group."""
        reader = PdfReader(pdf_path)
        page_texts: list[str] = []
        for page_num, page in enumerate(reader.pages, 1):
            # Guard against a page yielding no text object at all.
            page_text = page.extract_text() or ""
            page_texts.append(page_text)
            LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))
        return "".join(f"{page_text}\n" for page_text in page_texts)

    async def _read_sop_text(self, file: UploadFile) -> str:
        """Spool the upload to a temporary file and extract its text.

        The temporary file is removed before returning, whether or not
        extraction succeeds.

        Raises:
            HTTPException: 400 if the file cannot be parsed as a PDF.
        """
        temp_file_path = None
        try:
            LOG.info("Saving PDF to temporary file", filename=file.filename)
            with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
                # Record the path before writing so a failed read/write is still cleaned up.
                temp_file_path = temp_file.name
                temp_file.write(await file.read())

            LOG.info("Extracting text from PDF", filename=file.filename, temp_file=temp_file_path)
            try:
                return self._extract_pdf_text(temp_file_path)
            except Exception as e:
                # The upload is untrusted input; any parser failure is a client error, not a 500.
                LOG.error(
                    "Failed to read PDF",
                    filename=file.filename,
                    error=str(e),
                    error_type=type(e).__name__,
                    exc_info=True,
                )
                raise HTTPException(status_code=400, detail="Invalid or unreadable PDF file.") from e
        finally:
            if temp_file_path is not None:
                try:
                    os.unlink(temp_file_path)
                except OSError:
                    # Best effort: a failed cleanup must not mask the real outcome.
                    LOG.warning("Failed to remove temporary PDF file", temp_file=temp_file_path, exc_info=True)

    @staticmethod
    def _validate_llm_response(response: Any, organization_id: Any) -> None:
        """Check that the LLM response is a dict with ``workflow_definition.blocks``.

        Raises:
            HTTPException: 422 when the response does not have the expected shape.
        """
        if not isinstance(response, dict):
            LOG.error(
                "LLM returned non-dict response",
                response_type=type(response),
                response=str(response)[:500],
                organization_id=organization_id,
            )
            raise HTTPException(
                status_code=422, detail="LLM returned invalid response format - expected JSON object"
            )

        if "workflow_definition" not in response:
            LOG.error(
                "LLM response missing workflow_definition",
                response_keys=list(response.keys()),
                organization_id=organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM response missing 'workflow_definition' field")

        workflow_def = response["workflow_definition"]
        # A non-dict value (e.g. null) cannot contain blocks; report it the same way.
        if not isinstance(workflow_def, dict) or "blocks" not in workflow_def:
            LOG.error(
                "LLM workflow_definition missing blocks",
                workflow_def_keys=list(workflow_def.keys()) if isinstance(workflow_def, dict) else None,
                organization_id=organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM workflow definition missing 'blocks' field")

    async def import_workflow_from_pdf(self, file: UploadFile, organization: Organization) -> dict[str, Any]:
        """Build and persist a workflow from an uploaded SOP PDF.

        Steps: extract the PDF text, render the ``build-workflow-from-pdf``
        prompt with it, ask the LLM for a workflow JSON, sanitize and validate
        that JSON, then create the workflow for ``organization``.

        Args:
            file: The uploaded file; its name must end with ``.pdf``.
            organization: Organization the workflow is created for.

        Returns:
            The created workflow dumped to a dict (``model_dump(by_alias=True)``).

        Raises:
            HTTPException: 400 for a non-PDF, unreadable or text-less upload;
                422 when the LLM output is malformed, fails validation, or the
                workflow cannot be created.
        """
        organization_id = organization.organization_id
        LOG.info("Starting PDF import", filename=file.filename, organization_id=organization_id)

        # An upload may arrive without a filename; treat that as "not a PDF" instead of failing on None.
        if not (file.filename or "").lower().endswith(".pdf"):
            raise HTTPException(status_code=400, detail="Only PDF files are supported.")

        sop_text = await self._read_sop_text(file)
        LOG.info(
            "PDF text extraction complete",
            total_text_length=len(sop_text),
            organization_id=organization_id,
        )
        if not sop_text.strip():
            raise HTTPException(status_code=400, detail="No readable content found in the PDF.")

        # Load and render the prompt template
        prompt = prompt_engine.load_prompt(
            "build-workflow-from-pdf",
            sop_text=sop_text,
        )

        # Use the LLM to convert SOP to workflow
        llm_key = settings.LLM_KEY or "gpt-4o-mini"
        LOG.info(
            "Calling LLM to convert SOP to workflow",
            llm_key=llm_key,
            prompt_length=len(prompt),
            sop_text_length=len(sop_text),
            sop_chars_sent=len(sop_text),
            organization_id=organization_id,
        )
        llm_api_handler = LLMAPIHandlerFactory.get_llm_api_handler(llm_key)
        response = await llm_api_handler(
            prompt=prompt,
            prompt_name="sop_to_workflow_conversion",
            organization_id=organization_id,
            parameters={"max_completion_tokens": 32768},  # Override the default 4096 limit for PDF conversion
        )
        LOG.info(
            "LLM response received",
            response_type=type(response),
            response_keys=list(response.keys()) if isinstance(response, dict) else None,
            organization_id=organization_id,
        )

        # The response should be a dict with the workflow structure
        self._validate_llm_response(response, organization_id)

        blocks = response["workflow_definition"].get("blocks")
        LOG.info(
            "Workflow JSON validated",
            title=response.get("title"),
            block_count=len(blocks) if isinstance(blocks, list) else None,
            organization_id=organization_id,
        )
        LOG.info(
            "Creating workflow from JSON",
            response_keys=list(response.keys()),
            organization_id=organization_id,
        )

        try:
            # Sanitize LLM output for Jinja and required fields before validation
            response = self._sanitize_workflow_json(response)
            workflow_create_request = WorkflowCreateYAMLRequest.model_validate(response)
        except Exception as e:
            LOG.error(
                "Failed to validate workflow request",
                error=str(e),
                error_type=type(e).__name__,
                response_sample=str(response)[:1000],
                organization_id=organization_id,
                exc_info=True,
            )
            raise HTTPException(status_code=422, detail=f"Failed to validate workflow structure: {str(e)}") from e

        try:
            workflow = await app.WORKFLOW_SERVICE.create_workflow_from_request(
                organization=organization,
                request=workflow_create_request,
            )
        except Exception as e:
            LOG.error(
                "Failed to create workflow",
                error=str(e),
                error_type=type(e).__name__,
                organization_id=organization_id,
                exc_info=True,
            )
            raise HTTPException(status_code=422, detail=f"Failed to create workflow: {str(e)}") from e

        workflow_dict = workflow.model_dump(by_alias=True)
        LOG.info(
            "PDF import completed successfully",
            workflow_id=workflow.workflow_permanent_id,
            workflow_permanent_id_in_dict=workflow_dict.get("workflow_permanent_id"),
            dict_keys=list(workflow_dict.keys()),
            organization_id=organization_id,
        )
        return workflow_dict
# Module-level instance of the service; the workflow routes import this name
# and call ``import_workflow_from_pdf`` on it.
pdf_import_service = PDFImportService()