From 1796af6df94dffcd3628604c11b1c5b701618704 Mon Sep 17 00:00:00 2001
From: Shuchang Zheng <wintonzheng0325@gmail.com>
Date: Mon, 20 Jan 2025 12:33:54 -0800
Subject: [PATCH] Add PDF Parser Block (#1600)

---
 poetry.lock                                   |   4 +-
 pyproject.toml                                |   1 +
 .../extract-information-from-file-text.j2     |  13 +++
 skyvern/forge/sdk/workflow/models/block.py    | 110 ++++++++++++++++++
 skyvern/forge/sdk/workflow/models/yaml.py     |   8 ++
 skyvern/forge/sdk/workflow/service.py         |   9 ++
 6 files changed, 143 insertions(+), 2 deletions(-)
 create mode 100644 skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2

diff --git a/poetry.lock b/poetry.lock
index 75b5bcff..3a6d477d 100644
--- a/poetry.lock
+++ b/poetry.lock
@@ -1,4 +1,4 @@
-# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
+# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
 
 [[package]]
 name = "aioboto3"
@@ -6182,4 +6182,4 @@ type = ["pytest-mypy"]
 [metadata]
 lock-version = "2.0"
 python-versions = "^3.11,<3.12"
-content-hash = "908718e1dfd3b4a34106c187c59a8627f00b3491fead6b7fac6e48143afb6f17"
+content-hash = "995b84f916f59c9166e84ce633f5cf848312fc85d80caa76486d3571c7321668"
diff --git a/pyproject.toml b/pyproject.toml
index f25a33f7..d6d6a76d 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,7 @@ aiofiles = "^24.1.0"
 pyotp = "^2.9.0"
 asyncpg = "^0.30.0"
 json-repair = "^0.34.0"
+pypdf = "^5.1.0"
 
 [tool.poetry.group.dev.dependencies]
 isort = "^5.13.2"
diff --git a/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2 b/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2
new file mode 100644
index 00000000..b1f03e6f
--- /dev/null
+++ b/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2
@@ -0,0 +1,13 @@
+You are given text that has been extracted from a file. Your task is to extract information from this text.
+
+Extract information from the text and output it in the specified JSON schema format: {{ json_schema }}
+
+Add as much detail as possible to the output JSON object while conforming to the output JSON schema.
+
+Do not ever include anything other than the JSON object in your output, and do not ever include any additional fields in the JSON object.
+
+If you are unable to extract the requested information for a specific field in the json schema, please output a null value for that field.
+
+You are given the following text:
+
+{{ extracted_text_content }}
diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py
index b237ca1d..3ecb31aa 100644
--- a/skyvern/forge/sdk/workflow/models/block.py
+++ b/skyvern/forge/sdk/workflow/models/block.py
@@ -21,6 +21,8 @@ import structlog
 from email_validator import EmailNotValidError, validate_email
 from jinja2 import Template
 from pydantic import BaseModel, Field
+from pypdf import PdfReader
+from pypdf.errors import PdfReadError
 
 from skyvern.config import settings
 from skyvern.exceptions import (
@@ -84,6 +86,7 @@ class BlockType(StrEnum):
     WAIT = "wait"
     FILE_DOWNLOAD = "file_download"
     GOTO_URL = "goto_url"
+    PDF_PARSER = "pdf_parser"
 
 
 class BlockStatus(StrEnum):
@@ -1832,6 +1835,112 @@ class FileParserBlock(Block):
         )
 
 
+class PDFParserBlock(Block):
+    """Workflow block that extracts the text of a PDF file and asks the LLM to structure it as JSON."""
+
+    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER
+
+    file_url: str  # URL, s3:// URI, workflow parameter key or jinja template; resolved in execute
+    json_schema: dict[str, Any] | None = None  # output schema for the LLM; a generic one is used if unset
+
+    def get_all_parameters(
+        self,
+        workflow_run_id: str,
+    ) -> list[PARAMETER_TYPE]:
+        """Return the workflow parameter named by ``file_url`` (if it is a parameter key), else ``[]``."""
+        workflow_run_context = self.get_workflow_run_context(workflow_run_id)
+        if self.file_url and workflow_run_context.has_parameter(self.file_url):
+            return [workflow_run_context.get_parameter(self.file_url)]
+        return []
+
+    def format_potential_template_parameters(self, workflow_run_context: WorkflowRunContext) -> None:
+        """Render ``file_url`` as a template against the workflow run context, in place."""
+        self.file_url = self.format_block_parameter_template_from_workflow_run_context(
+            self.file_url, workflow_run_context
+        )
+
+    async def execute(
+        self,
+        workflow_run_id: str,
+        workflow_run_block_id: str,
+        organization_id: str | None = None,
+        browser_session_id: str | None = None,
+        **kwargs: dict,
+    ) -> BlockResult:
+        """Resolve ``file_url``, download the PDF, extract its text and return the LLM's structured output."""
+        workflow_run_context = self.get_workflow_run_context(workflow_run_id)
+        if (
+            self.file_url
+            and workflow_run_context.has_parameter(self.file_url)
+            and workflow_run_context.has_value(self.file_url)
+        ):
+            file_url_parameter_value = workflow_run_context.get_value(self.file_url)
+            if file_url_parameter_value:
+                LOG.info(
+                    "PDFParserBlock: File URL is parameterized, using parameter value",
+                    file_url_parameter_value=file_url_parameter_value,
+                    file_url_parameter_key=self.file_url,
+                )
+                self.file_url = file_url_parameter_value
+
+        try:
+            self.format_potential_template_parameters(workflow_run_context)
+        except Exception as e:
+            return await self.build_block_result(
+                success=False,
+                failure_reason=f"Failed to format jinja template: {str(e)}",
+                output_parameter_value=None,
+                status=BlockStatus.failed,
+                workflow_run_block_id=workflow_run_block_id,
+                organization_id=organization_id,
+            )
+
+        # Download the file, then join the text of every page (one trailing newline per page)
+        if self.file_url.startswith("s3://"):
+            file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
+        else:
+            file_path = await download_file(self.file_url)
+        try:
+            reader = PdfReader(file_path)
+            extracted_text = "".join(page.extract_text() + "\n" for page in reader.pages)
+        except PdfReadError:
+            LOG.warning("PDFParserBlock: Failed to parse PDF file", file_url=self.file_url, exc_info=True)
+            return await self.build_block_result(
+                success=False,
+                failure_reason="Failed to parse PDF file",
+                output_parameter_value=None,
+                status=BlockStatus.failed,
+                workflow_run_block_id=workflow_run_block_id,
+                organization_id=organization_id,
+            )
+
+        if not self.json_schema:
+            self.json_schema = {
+                "type": "object",
+                "properties": {
+                    "extracted_information": {
+                        "type": "object",
+                        "description": "Information extracted from the text",
+                    }
+                },
+            }
+
+        llm_prompt = prompt_engine.load_prompt(
+            "extract-information-from-file-text", extracted_text_content=extracted_text, json_schema=self.json_schema
+        )
+        llm_response = await app.LLM_API_HANDLER(prompt=llm_prompt)
+        # Record the parsed data
+        await self.record_output_parameter_value(workflow_run_context, workflow_run_id, llm_response)
+        return await self.build_block_result(
+            success=True,
+            failure_reason=None,
+            output_parameter_value=llm_response,
+            status=BlockStatus.completed,
+            workflow_run_block_id=workflow_run_block_id,
+            organization_id=organization_id,
+        )
+
+
 class WaitBlock(Block):
     block_type: Literal[BlockType.WAIT] = BlockType.WAIT
 
@@ -1952,6 +2061,7 @@ BlockSubclasses = Union[
     UploadToS3Block,
     SendEmailBlock,
     FileParserBlock,
+    PDFParserBlock,
     ValidationBlock,
     ActionBlock,
     NavigationBlock,
diff --git a/skyvern/forge/sdk/workflow/models/yaml.py b/skyvern/forge/sdk/workflow/models/yaml.py
index 8e7af235..db984a9b 100644
--- a/skyvern/forge/sdk/workflow/models/yaml.py
+++ b/skyvern/forge/sdk/workflow/models/yaml.py
@@ -215,6 +215,13 @@ class FileParserBlockYAML(BlockYAML):
     file_type: FileType
 
 
+class PDFParserBlockYAML(BlockYAML):
+    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER  # type: ignore
+
+    file_url: str  # passed through to PDFParserBlock.file_url when the block is built
+    json_schema: dict[str, Any] | None = None  # optional; passed through to PDFParserBlock.json_schema
+
+
 class ValidationBlockYAML(BlockYAML):
     block_type: Literal[BlockType.VALIDATION] = BlockType.VALIDATION  # type: ignore
 
@@ -343,6 +350,7 @@ BLOCK_YAML_SUBCLASSES = (
     | WaitBlockYAML
     | FileDownloadBlockYAML
     | UrlBlockYAML
+    | PDFParserBlockYAML
 )
 BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]
 
diff --git a/skyvern/forge/sdk/workflow/service.py b/skyvern/forge/sdk/workflow/service.py
index 96f9c1be..1f713223 100644
--- a/skyvern/forge/sdk/workflow/service.py
+++ b/skyvern/forge/sdk/workflow/service.py
@@ -46,6 +46,7 @@ from skyvern.forge.sdk.workflow.models.block import (
     ForLoopBlock,
     LoginBlock,
     NavigationBlock,
+    PDFParserBlock,
     SendEmailBlock,
     TaskBlock,
     TextPromptBlock,
@@ -1469,6 +1470,14 @@ class WorkflowService:
                 file_type=block_yaml.file_type,
                 continue_on_failure=block_yaml.continue_on_failure,
             )
+        elif block_yaml.block_type == BlockType.PDF_PARSER:
+            return PDFParserBlock(
+                label=block_yaml.label,
+                output_parameter=output_parameter,
+                file_url=block_yaml.file_url,
+                json_schema=block_yaml.json_schema,
+                continue_on_failure=block_yaml.continue_on_failure,
+            )
         elif block_yaml.block_type == BlockType.VALIDATION:
             validation_block_parameters = (
                 [parameters[parameter_key] for parameter_key in block_yaml.parameter_keys]