From 1796af6df94dffcd3628604c11b1c5b701618704 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Mon, 20 Jan 2025 12:33:54 -0800 Subject: [PATCH] Add PDF Parser Block (#1600) --- poetry.lock | 4 +- pyproject.toml | 1 + .../extract-information-from-file-text.j2 | 13 +++ skyvern/forge/sdk/workflow/models/block.py | 110 ++++++++++++++++++ skyvern/forge/sdk/workflow/models/yaml.py | 8 ++ skyvern/forge/sdk/workflow/service.py | 9 ++ 6 files changed, 143 insertions(+), 2 deletions(-) create mode 100644 skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2 diff --git a/poetry.lock b/poetry.lock index 75b5bcff..3a6d477d 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand. [[package]] name = "aioboto3" @@ -6182,4 +6182,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.0" python-versions = "^3.11,<3.12" -content-hash = "908718e1dfd3b4a34106c187c59a8627f00b3491fead6b7fac6e48143afb6f17" +content-hash = "995b84f916f59c9166e84ce633f5cf848312fc85d80caa76486d3571c7321668" diff --git a/pyproject.toml b/pyproject.toml index f25a33f7..d6d6a76d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -53,6 +53,7 @@ aiofiles = "^24.1.0" pyotp = "^2.9.0" asyncpg = "^0.30.0" json-repair = "^0.34.0" +pypdf = "^5.1.0" [tool.poetry.group.dev.dependencies] isort = "^5.13.2" diff --git a/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2 b/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2 new file mode 100644 index 00000000..b1f03e6f --- /dev/null +++ b/skyvern/forge/prompts/skyvern/extract-information-from-file-text.j2 @@ -0,0 +1,13 @@ +You are given text that has been extracted from a file. Your task is to extract information from this text. 
class PDFParserBlock(Block):
    """Workflow block that turns a PDF into structured data via the LLM.

    The block downloads the PDF referenced by ``file_url``, extracts the text
    of every page with ``pypdf``, and sends that text together with
    ``json_schema`` to the LLM using the
    ``extract-information-from-file-text`` prompt. The LLM response is
    recorded as the block's output parameter value.
    """

    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER

    # Either a literal URL (``s3://`` URLs are fetched through the S3 client,
    # anything else through ``download_file``), a workflow parameter key, or a
    # jinja template that is rendered against the workflow run context.
    file_url: str
    # Schema the LLM output should conform to; a generic
    # ``extracted_information`` object schema is used when this is unset.
    json_schema: dict[str, Any] | None = None

    def get_all_parameters(
        self,
        workflow_run_id: str,
    ) -> list[PARAMETER_TYPE]:
        """Return the workflow parameter referenced by ``file_url``, if any.

        Returns a single-element list when ``file_url`` is the key of a
        parameter known to the workflow run context, otherwise an empty list.
        """
        workflow_run_context = self.get_workflow_run_context(workflow_run_id)
        if self.file_url and workflow_run_context.has_parameter(self.file_url):
            return [workflow_run_context.get_parameter(self.file_url)]
        return []

    def format_potential_template_parameters(self, workflow_run_context: WorkflowRunContext) -> None:
        """Render ``file_url`` as a template against the workflow run context (in place)."""
        self.file_url = self.format_block_parameter_template_from_workflow_run_context(
            self.file_url, workflow_run_context
        )

    async def execute(
        self,
        workflow_run_id: str,
        workflow_run_block_id: str,
        organization_id: str | None = None,
        browser_session_id: str | None = None,
        **kwargs: dict,
    ) -> BlockResult:
        """Download the PDF, extract its text and ask the LLM to structure it.

        Returns a failed ``BlockResult`` when the ``file_url`` template cannot
        be rendered or when ``pypdf`` raises ``PdfReadError``; otherwise a
        completed ``BlockResult`` whose output is the LLM response.
        """
        workflow_run_context = self.get_workflow_run_context(workflow_run_id)

        # When file_url is the key of a parameter that already has a value,
        # swap the key for that value before any template rendering.
        if (
            self.file_url
            and workflow_run_context.has_parameter(self.file_url)
            and workflow_run_context.has_value(self.file_url)
        ):
            file_url_parameter_value = workflow_run_context.get_value(self.file_url)
            if file_url_parameter_value:
                LOG.info(
                    "PDFParserBlock: File URL is parameterized, using parameter value",
                    file_url_parameter_value=file_url_parameter_value,
                    file_url_parameter_key=self.file_url,
                )
                self.file_url = file_url_parameter_value

        try:
            self.format_potential_template_parameters(workflow_run_context)
        except Exception as e:
            return await self.build_block_result(
                success=False,
                failure_reason=f"Failed to format jinja template: {str(e)}",
                output_parameter_value=None,
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        # Download the file
        if self.file_url.startswith("s3://"):
            file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
        else:
            file_path = await download_file(self.file_url)

        try:
            reader = PdfReader(file_path)
            # One newline-terminated chunk per page, joined once instead of
            # growing a string with += inside an index loop.
            extracted_text = "".join(page.extract_text() + "\n" for page in reader.pages)
        except PdfReadError:
            # Keep the traceback in the logs; the block result only carries a
            # generic failure reason.
            LOG.warning(
                "PDFParserBlock: Failed to parse PDF file",
                file_url=self.file_url,
                exc_info=True,
            )
            return await self.build_block_result(
                success=False,
                failure_reason="Failed to parse PDF file",
                output_parameter_value=None,
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        # Fall back to a generic schema so the prompt always has one to render.
        if not self.json_schema:
            self.json_schema = {
                "type": "object",
                "properties": {
                    "extracted_information": {
                        "type": "object",
                        "description": "Information extracted from the text",
                    }
                },
            }

        llm_prompt = prompt_engine.load_prompt(
            "extract-information-from-file-text",
            extracted_text_content=extracted_text,
            json_schema=self.json_schema,
        )
        llm_response = await app.LLM_API_HANDLER(prompt=llm_prompt)

        # Record the parsed data
        await self.record_output_parameter_value(workflow_run_context, workflow_run_id, llm_response)
        return await self.build_block_result(
            success=True,
            failure_reason=None,
            output_parameter_value=llm_response,
            status=BlockStatus.completed,
            workflow_run_block_id=workflow_run_block_id,
            organization_id=organization_id,
        )
PDFParserBlock, SendEmailBlock, TaskBlock, TextPromptBlock, @@ -1469,6 +1470,14 @@ class WorkflowService: file_type=block_yaml.file_type, continue_on_failure=block_yaml.continue_on_failure, ) + elif block_yaml.block_type == BlockType.PDF_PARSER: + return PDFParserBlock( + label=block_yaml.label, + output_parameter=output_parameter, + file_url=block_yaml.file_url, + json_schema=block_yaml.json_schema, + continue_on_failure=block_yaml.continue_on_failure, + ) elif block_yaml.block_type == BlockType.VALIDATION: validation_block_parameters = ( [parameters[parameter_key] for parameter_key in block_yaml.parameter_keys]