Add PDF Parser Block (#1600)

2025-01-20 12:33:54 -08:00
parent 8f3941b6b5
commit 1796af6df9
6 changed files with 143 additions and 2 deletions
--- a/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2
+++ b/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2
@@ -0,0 +1,13 @@
+You are given text that has been extracted from a file. Your task is to extract information from this text.
+
+Extract information from the text and output it in the specified JSON schema format: {{ json_schema }}
+
+Add as much detail as possible to the output JSON object while conforming to the output JSON schema.
+
+Do not ever include anything other than the JSON object in your output, and do not ever include any additional fields in the JSON object.
+
+If you are unable to extract the requested information for a specific field in the json schema, please output a null value for that field.
+
+You are given the following text:
+
+{{ extracted_text_content }}
--- a/skyvern/forge/sdk/workflow/models/block.py
+++ b/skyvern/forge/sdk/workflow/models/block.py
@@ -21,6 +21,8 @@ import structlog
 from email_validator import EmailNotValidError, validate_email
 from jinja2 import Template
 from pydantic import BaseModel, Field
+from pypdf import PdfReader
+from pypdf.errors import PdfReadError

 from skyvern.config import settings
 from skyvern.exceptions import (
@@ -84,6 +86,7 @@ class BlockType(StrEnum):
    WAIT = "wait"
    FILE_DOWNLOAD = "file_download"
    GOTO_URL = "goto_url"
+    PDF_PARSER = "pdf_parser"


 class BlockStatus(StrEnum):
@@ -1832,6 +1835,112 @@ class FileParserBlock(Block):
        )


+class PDFParserBlock(Block):
+    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER
+
+    file_url: str
+    json_schema: dict[str, Any] | None = None
+
+    def get_all_parameters(
+        self,
+        workflow_run_id: str,
+    ) -> list[PARAMETER_TYPE]:
+        workflow_run_context = self.get_workflow_run_context(workflow_run_id)
+        if self.file_url and workflow_run_context.has_parameter(self.file_url):
+            return [workflow_run_context.get_parameter(self.file_url)]
+        return []
+
+    def format_potential_template_parameters(self, workflow_run_context: WorkflowRunContext) -> None:
+        self.file_url = self.format_block_parameter_template_from_workflow_run_context(
+            self.file_url, workflow_run_context
+        )
+
+    async def execute(
+        self,
+        workflow_run_id: str,
+        workflow_run_block_id: str,
+        organization_id: str | None = None,
+        browser_session_id: str | None = None,
+        **kwargs: dict,
+    ) -> BlockResult:
+        workflow_run_context = self.get_workflow_run_context(workflow_run_id)
+        if (
+            self.file_url
+            and workflow_run_context.has_parameter(self.file_url)
+            and workflow_run_context.has_value(self.file_url)
+        ):
+            file_url_parameter_value = workflow_run_context.get_value(self.file_url)
+            if file_url_parameter_value:
+                LOG.info(
+                    "PDFParserBlock: File URL is parameterized, using parameter value",
+                    file_url_parameter_value=file_url_parameter_value,
+                    file_url_parameter_key=self.file_url,
+                )
+                self.file_url = file_url_parameter_value
+
+        try:
+            self.format_potential_template_parameters(workflow_run_context)
+        except Exception as e:
+            return await self.build_block_result(
+                success=False,
+                failure_reason=f"Failed to format jinja template: {str(e)}",
+                output_parameter_value=None,
+                status=BlockStatus.failed,
+                workflow_run_block_id=workflow_run_block_id,
+                organization_id=organization_id,
+            )
+
+        # Download the file
+        file_path = None
+        if self.file_url.startswith("s3://"):
+            file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
+        else:
+            file_path = await download_file(self.file_url)
+
+        extracted_text = ""
+        try:
+            reader = PdfReader(file_path)
+            page_count = len(reader.pages)
+            for i in range(page_count):
+                extracted_text += reader.pages[i].extract_text() + "\n"
+
+        except PdfReadError:
+            return await self.build_block_result(
+                success=False,
+                failure_reason="Failed to parse PDF file",
+                output_parameter_value=None,
+                status=BlockStatus.failed,
+                workflow_run_block_id=workflow_run_block_id,
+                organization_id=organization_id,
+            )
+
+        if not self.json_schema:
+            self.json_schema = {
+                "type": "object",
+                "properties": {
+                    "extracted_information": {
+                        "type": "object",
+                        "description": "Information extracted from the text",
+                    }
+                },
+            }
+
+        llm_prompt = prompt_engine.load_prompt(
+            "extract-information-from-file-text", extracted_text_content=extracted_text, json_schema=self.json_schema
+        )
+        llm_response = await app.LLM_API_HANDLER(prompt=llm_prompt)
+        # Record the parsed data
+        await self.record_output_parameter_value(workflow_run_context, workflow_run_id, llm_response)
+        return await self.build_block_result(
+            success=True,
+            failure_reason=None,
+            output_parameter_value=llm_response,
+            status=BlockStatus.completed,
+            workflow_run_block_id=workflow_run_block_id,
+            organization_id=organization_id,
+        )
+
+
 class WaitBlock(Block):
    block_type: Literal[BlockType.WAIT] = BlockType.WAIT

@@ -1952,6 +2061,7 @@ BlockSubclasses = Union[
    UploadToS3Block,
    SendEmailBlock,
    FileParserBlock,
+    PDFParserBlock,
    ValidationBlock,
    ActionBlock,
    NavigationBlock,
--- a/skyvern/forge/sdk/workflow/models/yaml.py
+++ b/skyvern/forge/sdk/workflow/models/yaml.py
@@ -215,6 +215,13 @@ class FileParserBlockYAML(BlockYAML):
    file_type: FileType


+class PDFParserBlockYAML(BlockYAML):  # YAML-side definition of a "pdf_parser" workflow block
+    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER  # type: ignore
+    # The two fields below are passed unchanged to PDFParserBlock by the workflow service.
+    file_url: str  # becomes PDFParserBlock.file_url
+    json_schema: dict[str, Any] | None = None  # optional; becomes PDFParserBlock.json_schema
+
+
 class ValidationBlockYAML(BlockYAML):
    block_type: Literal[BlockType.VALIDATION] = BlockType.VALIDATION  # type: ignore

@@ -343,6 +350,7 @@ BLOCK_YAML_SUBCLASSES = (
    | WaitBlockYAML
    | FileDownloadBlockYAML
    | UrlBlockYAML
+    | PDFParserBlockYAML
 )
 BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]

--- a/skyvern/forge/sdk/workflow/service.py
+++ b/skyvern/forge/sdk/workflow/service.py
@@ -46,6 +46,7 @@ from skyvern.forge.sdk.workflow.models.block import (
    ForLoopBlock,
    LoginBlock,
    NavigationBlock,
+    PDFParserBlock,
    SendEmailBlock,
    TaskBlock,
    TextPromptBlock,
@@ -1469,6 +1470,14 @@ class WorkflowService:
                file_type=block_yaml.file_type,
                continue_on_failure=block_yaml.continue_on_failure,
            )
+        elif block_yaml.block_type == BlockType.PDF_PARSER:
+            return PDFParserBlock(
+                label=block_yaml.label,
+                output_parameter=output_parameter,
+                file_url=block_yaml.file_url,
+                json_schema=block_yaml.json_schema,
+                continue_on_failure=block_yaml.continue_on_failure,
+            )
        elif block_yaml.block_type == BlockType.VALIDATION:
            validation_block_parameters = (
                [parameters[parameter_key] for parameter_key in block_yaml.parameter_keys]