Dorod-Sky/skyvern/services/pdf_import_service.py

import os
import re
import tempfile
from typing import Any

import structlog
from fastapi import HTTPException
from pypdf import PdfReader

from skyvern.config import settings
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
from skyvern.forge.sdk.schemas.organizations import Organization
from skyvern.schemas.workflows import WorkflowCreateYAMLRequest

LOG = structlog.get_logger(__name__)


class PDFImportService:
    @staticmethod
    def _sanitize_workflow_json(raw: dict[str, Any]) -> dict[str, Any]:
        """Clean LLM JSON to match Skyvern schema conventions and avoid Jinja errors.

        - Replace Jinja refs like {{workflow.foo}} or {{parameters.foo}} with {{foo}}
        - Auto-populate block.parameter_keys with any referenced parameter keys
        - Ensure all block labels are unique by appending indices to duplicates
        """

        def strip_prefixes(text: str) -> tuple[str, set[str]]:
            # Replace {{ workflow.xxx }} and {{ parameters.xxx }} with {{ xxx }}
            cleaned = text
            cleaned = re.sub(r"\{\{\s*workflow\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", cleaned)
            cleaned = re.sub(r"\{\{\s*parameters\.([a-zA-Z0-9_\.]+)\s*\}\}", r"{{ \1 }}", cleaned)

            # Collect jinja variable names (take first segment before any dot)
            used: set[str] = set()
            for match in re.finditer(r"\{\{\s*([^\}\s\|]+)\s*[^}]*\}\}", cleaned):
                var = match.group(1)
                # Use base segment before dot to match parameter keys
                base = var.split(".")[0]
                used.add(base)
            return cleaned, used

        workflow_def = raw.get("workflow_definition", {})
        param_defs = workflow_def.get("parameters", []) or []
        param_keys = {p.get("key") for p in param_defs if isinstance(p, dict) and p.get("key")}

        blocks = workflow_def.get("blocks", []) or []

        # First pass: deduplicate block labels
        seen_labels: dict[str, int] = {}
        deduplicated_count = 0
        for blk in blocks:
            if not isinstance(blk, dict):
                continue
            label = blk.get("label", "")
            if not label:
                continue

            if label in seen_labels:
                # This label has been seen before, append index
                seen_labels[label] += 1
                new_label = f"{label}_{seen_labels[label]}"
                LOG.info(
                    "Deduplicating block label",
                    original_label=label,
                    new_label=new_label,
                    occurrence=seen_labels[label],
                )
                blk["label"] = new_label
                deduplicated_count += 1
            else:
                # First time seeing this label
                seen_labels[label] = 1

        if deduplicated_count > 0:
            LOG.info(
                "Deduplicated block labels",
                total_deduplicated=deduplicated_count,
                duplicate_labels=sorted([label for label, count in seen_labels.items() if count > 1]),
            )
        for blk in blocks:
            if not isinstance(blk, dict):
                continue
            referenced: set[str] = set()
            # Fields that commonly contain Jinja
            for field in [
                "url",
                "navigation_goal",
                "data_extraction_goal",
                "complete_criterion",
                "terminate_criterion",
                "title",
            ]:
                val = blk.get(field)
                if isinstance(val, str):
                    cleaned, used = strip_prefixes(val)
                    blk[field] = cleaned
                    referenced.update(used)

            # Ensure required fields for text_prompt blocks
            if blk.get("block_type") == "text_prompt":
                if not blk.get("prompt"):
                    # Prefer an instruction-bearing field if present
                    blk["prompt"] = (
                        blk.get("navigation_goal")
                        or blk.get("title")
                        or blk.get("label")
                        or "Provide the requested text response."
                    )
                # Track jinja usage within the prompt
                prompt_val = blk.get("prompt")
                if isinstance(prompt_val, str):
                    cleaned, used = strip_prefixes(prompt_val)
                    blk["prompt"] = cleaned
                    referenced.update(used)

            # parameter_keys should include only known parameter keys
            if param_keys:
                keys_to_include = sorted(k for k in referenced if k in param_keys)
                if keys_to_include:
                    blk["parameter_keys"] = keys_to_include

            # Ensure engine where needed
            if blk.get("block_type") in {"navigation", "action", "extraction", "login", "file_download"}:
                blk.setdefault("engine", "skyvern-1.0")

            # Ensure url exists (can be empty string)
            if blk.get("block_type") in {"navigation", "action", "extraction", "file_download"}:
                if blk.get("url") is None:
                    blk["url"] = ""

        return raw

    def extract_text_from_pdf(self, file_contents: bytes, file_name: str) -> str:
        """Extract text from PDF file contents. Raises HTTPException if invalid."""
        LOG.info("Extracting text from PDF", filename=file_name)

        # Save the uploaded file to a temporary location
        with tempfile.NamedTemporaryFile(delete=False, suffix=".pdf") as temp_file:
            temp_file.write(file_contents)
            temp_file_path = temp_file.name

        try:
            reader = PdfReader(temp_file_path)
            sop_text = ""
            for page_num, page in enumerate(reader.pages, 1):
                page_text = page.extract_text() or ""
                sop_text += page_text + "\n"
                LOG.debug("Extracted text from page", page=page_num, text_length=len(page_text))

            LOG.info("PDF text extraction complete", total_text_length=len(sop_text))

            if not sop_text.strip():
                raise HTTPException(status_code=400, detail="No readable content found in the PDF.")

            return sop_text
        except Exception as e:
            LOG.warning(
                "Failed to read/extract text from PDF",
                filename=file_name,
                error=str(e),
            )
            raise HTTPException(status_code=400, detail="Invalid or unreadable PDF file.") from e
        finally:
            # Clean up the temporary file
            os.unlink(temp_file_path)

    async def create_workflow_from_sop_text(self, sop_text: str, organization: Organization) -> dict[str, Any]:
        """Convert SOP text to workflow definition using LLM (does not create the workflow)."""
        # Load and render the prompt template
        prompt = prompt_engine.load_prompt(
            "build-workflow-from-pdf",
            sop_text=sop_text,
        )

        # Use the LLM to convert SOP to workflow
        llm_key = settings.LLM_KEY or "gpt-4o-mini"
        LOG.info(
            "Calling LLM to convert SOP to workflow",
            llm_key=llm_key,
            prompt_length=len(prompt),
            sop_text_length=len(sop_text),
            sop_chars_sent=len(sop_text),
            organization_id=organization.organization_id,
        )

        llm_api_handler = LLMAPIHandlerFactory.get_llm_api_handler(llm_key)

        response = await llm_api_handler(
            prompt=prompt,
            prompt_name="sop_to_workflow_conversion",
            organization_id=organization.organization_id,
            parameters={"max_completion_tokens": 32768},  # Override the default 4096 limit for PDF conversion
        )

        LOG.info(
            "LLM response received",
            response_type=type(response),
            response_keys=list(response.keys()) if isinstance(response, dict) else None,
            organization_id=organization.organization_id,
        )

        # The LLM API handler automatically parses JSON responses
        # The response should be a dict with the workflow structure
        if not isinstance(response, dict):
            LOG.error(
                "LLM returned non-dict response",
                response_type=type(response),
                response=str(response)[:500],
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM returned invalid response format - expected JSON object")

        # Validate that it has the required structure
        if "workflow_definition" not in response:
            LOG.error(
                "LLM response missing workflow_definition",
                response_keys=list(response.keys()),
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM response missing 'workflow_definition' field")

        if "blocks" not in response.get("workflow_definition", {}):
            LOG.error(
                "LLM workflow_definition missing blocks",
                workflow_def_keys=list(response.get("workflow_definition", {}).keys()),
                organization_id=organization.organization_id,
            )
            raise HTTPException(status_code=422, detail="LLM workflow definition missing 'blocks' field")

        try:
            # Sanitize LLM output for Jinja and required fields before validation
            response = self._sanitize_workflow_json(response)
            workflow_create_request = WorkflowCreateYAMLRequest.model_validate(response)

            LOG.info(
                "Workflow JSON validated successfully",
                title=response.get("title"),
                block_count=len(response.get("workflow_definition", {}).get("blocks", [])),
                organization_id=organization.organization_id,
            )
        except Exception as e:
            LOG.error(
                "Failed to validate workflow request",
                error=str(e),
                error_type=type(e).__name__,
                organization_id=organization.organization_id,
                exc_info=True,
            )
            raise HTTPException(
                status_code=422,
                detail=f"Failed to validate workflow structure: {e!s}",
            ) from e

        # Return the validated request as a dict (caller will create the workflow)
        return workflow_create_request.model_dump(by_alias=True)


pdf_import_service = PDFImportService()