# skyvern/services/pdf_import_service.py
import os
import re
import tempfile
from typing import Any
import structlog
from fastapi import HTTPException
from pypdf import PdfReader
from skyvern.config import settings
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
from skyvern.forge.sdk.schemas.organizations import Organization
from skyvern.schemas.workflows import WorkflowCreateYAMLRequest
LOG = structlog.get_logger(__name__)
class PDFImportService:
    """Convert an uploaded SOP (standard operating procedure) PDF into a Skyvern workflow.

    Pipeline: extract text from the PDF (`extract_text_from_pdf`), ask the LLM to
    turn the SOP text into workflow JSON (`create_workflow_from_sop_text`), sanitize
    the JSON so it passes schema validation (`_sanitize_workflow_json`), and return
    the validated request as a dict. The caller is responsible for actually creating
    the workflow from the returned payload.
    """

    @staticmethod
    def _sanitize_workflow_json(raw: dict[str, Any]) -> dict[str, Any]:
        """Clean LLM JSON to match Skyvern schema conventions and avoid Jinja errors.

        - Replace Jinja refs like {{workflow.foo}} or {{parameters.foo}} with {{foo}}
        - Auto-populate block.parameter_keys with any referenced parameter keys
        - Ensure all block labels are unique by appending indices to duplicates

        Mutates ``raw`` in place and also returns it.
        """

        def strip_prefixes(text: str) -> tuple[str, set[str]]:
            # Replace {{ workflow.xxx }} and {{ parameters.xxx }} with {{ xxx }}
            cleaned = re.sub(r"\{\{\s*workflow\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", text)
            cleaned = re.sub(r"\{\{\s*parameters\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", cleaned)
            # Collect jinja variable names (take first segment before any dot)
            used: set[str] = set()
            for match in re.finditer(r"\{\{\s*([^\}\s\|]+)\s*[^}]*\}\}", cleaned):
                var = match.group(1)
                # Use base segment before dot to match parameter keys
                used.add(var.split(".")[0])
            return cleaned, used

        workflow_def = raw.get("workflow_definition", {})
        param_defs = workflow_def.get("parameters", []) or []
        param_keys = {p.get("key") for p in param_defs if isinstance(p, dict) and p.get("key")}
        blocks = workflow_def.get("blocks", []) or []

        # First pass: deduplicate block labels.
        seen_labels: dict[str, int] = {}
        deduplicated_count = 0
        for blk in blocks:
            if not isinstance(blk, dict):
                continue
            label = blk.get("label", "")
            if not label:
                continue
            if label in seen_labels:
                # Duplicate label: append an increasing index. Keep incrementing
                # past candidates that collide with labels already present
                # (e.g. the LLM emitted both "task" twice and an explicit
                # "task_2"); the previous implementation could emit such a
                # colliding label and defeat the whole dedup pass.
                while True:
                    seen_labels[label] += 1
                    new_label = f"{label}_{seen_labels[label]}"
                    if new_label not in seen_labels:
                        break
                # Reserve the generated label so later occurrences of it are
                # themselves deduplicated.
                seen_labels[new_label] = 1
                LOG.info(
                    "Deduplicating block label",
                    original_label=label,
                    new_label=new_label,
                    occurrence=seen_labels[label],
                )
                blk["label"] = new_label
                deduplicated_count += 1
            else:
                # First time seeing this label
                seen_labels[label] = 1
        if deduplicated_count > 0:
            LOG.info(
                "Deduplicated block labels",
                total_deduplicated=deduplicated_count,
                duplicate_labels=sorted([label for label, count in seen_labels.items() if count > 1]),
            )

        # Second pass: strip Jinja prefixes, fill required per-block fields, and
        # record which declared parameters each block actually references.
        for blk in blocks:
            if not isinstance(blk, dict):
                continue
            referenced: set[str] = set()
            # Fields that commonly contain Jinja
            for field in [
                "url",
                "navigation_goal",
                "data_extraction_goal",
                "complete_criterion",
                "terminate_criterion",
                "title",
            ]:
                val = blk.get(field)
                if isinstance(val, str):
                    cleaned, used = strip_prefixes(val)
                    blk[field] = cleaned
                    referenced.update(used)
            # Ensure required fields for text_prompt blocks
            if blk.get("block_type") == "text_prompt":
                if not blk.get("prompt"):
                    # Prefer an instruction-bearing field if present
                    blk["prompt"] = (
                        blk.get("navigation_goal")
                        or blk.get("title")
                        or blk.get("label")
                        or "Provide the requested text response."
                    )
                # Track jinja usage within the prompt
                prompt_val = blk.get("prompt")
                if isinstance(prompt_val, str):
                    cleaned, used = strip_prefixes(prompt_val)
                    blk["prompt"] = cleaned
                    referenced.update(used)
            # parameter_keys should include only known parameter keys
            if param_keys:
                keys_to_include = sorted(k for k in referenced if k in param_keys)
                if keys_to_include:
                    blk["parameter_keys"] = keys_to_include
            # Ensure engine where needed
            if blk.get("block_type") in {"navigation", "action", "extraction", "login", "file_download"}:
                blk.setdefault("engine", "skyvern-1.0")
            # Ensure url exists (can be empty string)
            if blk.get("block_type") in {"navigation", "action", "extraction", "file_download"}:
                if blk.get("url") is None:
                    blk["url"] = ""
        return raw

    def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
        """Extract text from PDF file contents.

        Returns the concatenated text of all pages (one trailing newline per page).

        Raises:
            HTTPException(400): if the PDF cannot be parsed or contains no
                readable text.
        """
        LOG.info("Extracting text from PDF", filename=file_name)
        # pypdf is given a filesystem path here, so persist the upload to a
        # temporary file first; it is removed in the finally block below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_contents)
            temp_file_path = temp_file.name
        try:
            reader = PdfReader(temp_file_path)
            # Accumulate pages in a list and join once (avoids quadratic +=).
            page_chunks: list[str] = []
            for page_num, page in enumerate(reader.pages, 1):
                page_text = page.extract_text() or ""
                page_chunks.append(page_text + "\n")
                LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))
            sop_text = "".join(page_chunks)
            LOG.info("PDF text extraction complete", total_text_length=len(sop_text))
            if not sop_text.strip():
                raise HTTPException(status_code=400, detail="No readable content found in the PDF.")
            return sop_text
        except HTTPException:
            # Re-raise our own "no readable content" error untouched; the broad
            # handler below would otherwise swallow it and report the misleading
            # "Invalid or unreadable PDF file." message instead.
            raise
        except Exception as e:
            LOG.warning(
                "Failed to read/extract text from PDF",
                filename=file_name,
                error=str(e),
            )
            raise HTTPException(status_code=400, detail="Invalid or unreadable PDF file.") from e
        finally:
            # Clean up the temporary file
            os.unlink(temp_file_path)

    async def create_workflow_from_sop_text(self, sop_text: str, organization: Organization) -> dict[str, Any]:
        """Convert SOP text to workflow definition using LLM (does not create the workflow).

        Returns the validated workflow-create payload as a dict (aliased field names).

        Raises:
            HTTPException(422): if the LLM response is not a JSON object, lacks
                the required structure, or fails schema validation.
        """
        # Load and render the prompt template
        prompt = prompt_engine.load_prompt(
            "build-workflow-from-pdf",
            sop_text=sop_text,
        )
        # Use the LLM to convert SOP to workflow
        llm_key = settings.LLM_KEY or "gpt-4o-mini"
        LOG.info(
            "Calling LLM to convert SOP to workflow",
            llm_key=llm_key,
            prompt_length=len(prompt),
            sop_text_length=len(sop_text),
            sop_chars_sent=len(sop_text),
            organization_id=organization.organization_id,
        )
        llm_api_handler = LLMAPIHandlerFactory.get_llm_api_handler(llm_key)
        response = await llm_api_handler(
            prompt=prompt,
            prompt_name="sop_to_workflow_conversion",
            organization_id=organization.organization_id,
            parameters={"max_completion_tokens": 32768},  # Override the default 4096 limit for PDF conversion
        )
        LOG.info(
            "LLM response received",
            response_type=type(response),
            response_keys=list(response.keys()) if isinstance(response, dict) else None,
            organization_id=organization.organization_id,
        )
        # The LLM API handler automatically parses JSON responses.
        # The response should be a dict with the workflow structure.
        if not isinstance(response, dict):
            LOG.error(
                "LLM returned non-dict response",
                response_type=type(response),
                response=str(response)[:500],
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM returned invalid response format - expected JSON object")
        # Validate that it has the required structure
        if "workflow_definition" not in response:
            LOG.error(
                "LLM response missing workflow_definition",
                response_keys=list(response.keys()),
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM response missing 'workflow_definition' field")
        if "blocks" not in response.get("workflow_definition", {}):
            LOG.error(
                "LLM workflow_definition missing blocks",
                workflow_def_keys=list(response.get("workflow_definition", {}).keys()),
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM workflow definition missing 'blocks' field")
        try:
            # Sanitize LLM output for Jinja and required fields before validation
            response = self._sanitize_workflow_json(response)
            workflow_create_request = WorkflowCreateYAMLRequest.model_validate(response)
            LOG.info(
                "Workflow JSON validated successfully",
                title=response.get("title"),
                block_count=len(response.get("workflow_definition", {}).get("blocks", [])),
                organization_id=organization.organization_id,
            )
        except Exception as e:
            LOG.error(
                "Failed to validate workflow request",
                error=str(e),
                error_type=type(e).__name__,
                organization_id=organization.organization_id,
                exc_info=True,
            )
            raise HTTPException(
                status_code=422,
                detail=f"Failed to validate workflow structure: {e!s}",
            ) from e
        # Return the validated request as a dict (caller will create the workflow)
        return workflow_create_request.model_dump(by_alias=True)
# Module-level singleton used by API routes; the class holds no per-request state.
pdf_import_service = PDFImportService()