# skyvern/services/pdf_import_service.py
import os
import re
import tempfile
from typing import Any
import structlog
from fastapi import HTTPException
from pypdf import PdfReader
from skyvern.config import settings
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
from skyvern.forge.sdk.schemas.organizations import Organization
from skyvern.schemas.workflows import WorkflowCreateYAMLRequest
LOG = structlog.get_logger(__name__)
class PDFImportService:
    """Convert an uploaded SOP (standard operating procedure) PDF into a Skyvern workflow.

    Pipeline: extract text from the PDF (`extract_text_from_pdf`), ask the LLM to
    turn the SOP text into workflow JSON (`create_workflow_from_sop_text`), sanitize
    the JSON so it passes schema validation (`_sanitize_workflow_json`), and return
    the validated request as a dict. The caller is responsible for actually creating
    the workflow from the returned payload.
    """

    @staticmethod
    def _sanitize_workflow_json(raw: dict[str, Any]) -> dict[str, Any]:
        """Clean LLM JSON to match Skyvern schema conventions and avoid Jinja errors.

        - Replace Jinja refs like {{workflow.foo}} or {{parameters.foo}} with {{foo}}
        - Auto-populate block.parameter_keys with any referenced parameter keys
        - Ensure all block labels are unique by appending indices to duplicates

        Mutates ``raw`` in place and also returns it.
        """

        def strip_prefixes(text: str) -> tuple[str, set[str]]:
            # Replace {{ workflow.xxx }} and {{ parameters.xxx }} with {{ xxx }}
            cleaned = re.sub(r"\{\{\s*workflow\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", text)
            cleaned = re.sub(r"\{\{\s*parameters\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", cleaned)
            # Collect jinja variable names (take first segment before any dot)
            used: set[str] = set()
            for match in re.finditer(r"\{\{\s*([^\}\s\|]+)\s*[^}]*\}\}", cleaned):
                var = match.group(1)
                # Use base segment before dot to match parameter keys
                used.add(var.split(".")[0])
            return cleaned, used

        workflow_def = raw.get("workflow_definition", {})
        param_defs = workflow_def.get("parameters", []) or []
        param_keys = {p.get("key") for p in param_defs if isinstance(p, dict) and p.get("key")}
        blocks = workflow_def.get("blocks", []) or []

        # First pass: deduplicate block labels.
        seen_labels: dict[str, int] = {}
        deduplicated_count = 0
        for blk in blocks:
            if not isinstance(blk, dict):
                continue
            label = blk.get("label", "")
            if not label:
                continue
            if label in seen_labels:
                # Duplicate label: append an increasing index. Keep incrementing
                # past candidates that collide with labels already present
                # (e.g. the LLM emitted both "task" twice and an explicit
                # "task_2"); the previous implementation could emit such a
                # colliding label and defeat the whole dedup pass.
                while True:
                    seen_labels[label] += 1
                    new_label = f"{label}_{seen_labels[label]}"
                    if new_label not in seen_labels:
                        break
                # Reserve the generated label so later occurrences of it are
                # themselves deduplicated.
                seen_labels[new_label] = 1
                LOG.info(
                    "Deduplicating block label",
                    original_label=label,
                    new_label=new_label,
                    occurrence=seen_labels[label],
                )
                blk["label"] = new_label
                deduplicated_count += 1
            else:
                # First time seeing this label
                seen_labels[label] = 1
        if deduplicated_count > 0:
            LOG.info(
                "Deduplicated block labels",
                total_deduplicated=deduplicated_count,
                duplicate_labels=sorted([label for label, count in seen_labels.items() if count > 1]),
            )

        # Second pass: strip Jinja prefixes, fill required per-block fields, and
        # record which declared parameters each block actually references.
        for blk in blocks:
            if not isinstance(blk, dict):
                continue
            referenced: set[str] = set()
            # Fields that commonly contain Jinja
            for field in [
                "url",
                "navigation_goal",
                "data_extraction_goal",
                "complete_criterion",
                "terminate_criterion",
                "title",
            ]:
                val = blk.get(field)
                if isinstance(val, str):
                    cleaned, used = strip_prefixes(val)
                    blk[field] = cleaned
                    referenced.update(used)
            # Ensure required fields for text_prompt blocks
            if blk.get("block_type") == "text_prompt":
                if not blk.get("prompt"):
                    # Prefer an instruction-bearing field if present
                    blk["prompt"] = (
                        blk.get("navigation_goal")
                        or blk.get("title")
                        or blk.get("label")
                        or "Provide the requested text response."
                    )
                # Track jinja usage within the prompt
                prompt_val = blk.get("prompt")
                if isinstance(prompt_val, str):
                    cleaned, used = strip_prefixes(prompt_val)
                    blk["prompt"] = cleaned
                    referenced.update(used)
            # parameter_keys should include only known parameter keys
            if param_keys:
                keys_to_include = sorted(k for k in referenced if k in param_keys)
                if keys_to_include:
                    blk["parameter_keys"] = keys_to_include
            # Ensure engine where needed
            if blk.get("block_type") in {"navigation", "action", "extraction", "login", "file_download"}:
                blk.setdefault("engine", "skyvern-1.0")
            # Ensure url exists (can be empty string)
            if blk.get("block_type") in {"navigation", "action", "extraction", "file_download"}:
                if blk.get("url") is None:
                    blk["url"] = ""
        return raw

    def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
        """Extract text from PDF file contents.

        Returns the concatenated text of all pages (one trailing newline per page).

        Raises:
            HTTPException(400): if the PDF cannot be parsed or contains no
                readable text.
        """
        LOG.info("Extracting text from PDF", filename=file_name)
        # pypdf is given a filesystem path here, so persist the upload to a
        # temporary file first; it is removed in the finally block below.
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_contents)
            temp_file_path = temp_file.name
        try:
            reader = PdfReader(temp_file_path)
            # Accumulate pages in a list and join once (avoids quadratic +=).
            page_chunks: list[str] = []
            for page_num, page in enumerate(reader.pages, 1):
                page_text = page.extract_text() or ""
                page_chunks.append(page_text + "\n")
                LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))
            sop_text = "".join(page_chunks)
            LOG.info("PDF text extraction complete", total_text_length=len(sop_text))
            if not sop_text.strip():
                raise HTTPException(status_code=400, detail="No readable content found in the PDF.")
            return sop_text
        except HTTPException:
            # Re-raise our own "no readable content" error untouched; the broad
            # handler below would otherwise swallow it and report the misleading
            # "Invalid or unreadable PDF file." message instead.
            raise
        except Exception as e:
            LOG.warning(
                "Failed to read/extract text from PDF",
                filename=file_name,
                error=str(e),
            )
            raise HTTPException(status_code=400, detail="Invalid or unreadable PDF file.") from e
        finally:
            # Clean up the temporary file
            os.unlink(temp_file_path)

    async def create_workflow_from_sop_text(self, sop_text: str, organization: Organization) -> dict[str, Any]:
        """Convert SOP text to workflow definition using LLM (does not create the workflow).

        Returns the validated workflow-create payload as a dict (aliased field names).

        Raises:
            HTTPException(422): if the LLM response is not a JSON object, lacks
                the required structure, or fails schema validation.
        """
        # Load and render the prompt template
        prompt = prompt_engine.load_prompt(
            "build-workflow-from-pdf",
            sop_text=sop_text,
        )
        # Use the LLM to convert SOP to workflow
        llm_key = settings.LLM_KEY or "gpt-4o-mini"
        LOG.info(
            "Calling LLM to convert SOP to workflow",
            llm_key=llm_key,
            prompt_length=len(prompt),
            sop_text_length=len(sop_text),
            sop_chars_sent=len(sop_text),
            organization_id=organization.organization_id,
        )
        llm_api_handler = LLMAPIHandlerFactory.get_llm_api_handler(llm_key)
        response = await llm_api_handler(
            prompt=prompt,
            prompt_name="sop_to_workflow_conversion",
            organization_id=organization.organization_id,
            parameters={"max_completion_tokens": 32768},  # Override the default 4096 limit for PDF conversion
        )
        LOG.info(
            "LLM response received",
            response_type=type(response),
            response_keys=list(response.keys()) if isinstance(response, dict) else None,
            organization_id=organization.organization_id,
        )
        # The LLM API handler automatically parses JSON responses.
        # The response should be a dict with the workflow structure.
        if not isinstance(response, dict):
            LOG.error(
                "LLM returned non-dict response",
                response_type=type(response),
                response=str(response)[:500],
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM returned invalid response format - expected JSON object")
        # Validate that it has the required structure
        if "workflow_definition" not in response:
            LOG.error(
                "LLM response missing workflow_definition",
                response_keys=list(response.keys()),
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM response missing 'workflow_definition' field")
        if "blocks" not in response.get("workflow_definition", {}):
            LOG.error(
                "LLM workflow_definition missing blocks",
                workflow_def_keys=list(response.get("workflow_definition", {}).keys()),
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM workflow definition missing 'blocks' field")
        try:
            # Sanitize LLM output for Jinja and required fields before validation
            response = self._sanitize_workflow_json(response)
            workflow_create_request = WorkflowCreateYAMLRequest.model_validate(response)
            LOG.info(
                "Workflow JSON validated successfully",
                title=response.get("title"),
                block_count=len(response.get("workflow_definition", {}).get("blocks", [])),
                organization_id=organization.organization_id,
            )
        except Exception as e:
            LOG.error(
                "Failed to validate workflow request",
                error=str(e),
                error_type=type(e).__name__,
                organization_id=organization.organization_id,
                exc_info=True,
            )
            raise HTTPException(
                status_code=422,
                detail=f"Failed to validate workflow structure: {e!s}",
            ) from e
        # Return the validated request as a dict (caller will create the workflow)
        return workflow_create_request.model_dump(by_alias=True)
# Module-level singleton used by API routes; the class holds no per-request state.
pdf_import_service = PDFImportService()