Add PDF Parser Block (#1600)

This commit is contained in:
Shuchang Zheng
2025-01-20 12:33:54 -08:00
committed by GitHub
parent 8f3941b6b5
commit 1796af6df9
6 changed files with 143 additions and 2 deletions

View File

@@ -0,0 +1,13 @@
You are given text that has been extracted from a file. Your task is to extract information from this text.
Extract information from the text and output it in the specified JSON schema format: {{ json_schema }}
Add as many details as possible to the output JSON object while conforming to the output JSON schema.
Do not ever include anything other than the JSON object in your output, and do not ever include any additional fields in the JSON object.
If you are unable to extract the requested information for a specific field in the json schema, please output a null value for that field.
You are given the following text:
{{ extracted_text_content }}

View File

@@ -21,6 +21,8 @@ import structlog
from email_validator import EmailNotValidError, validate_email
from jinja2 import Template
from pydantic import BaseModel, Field
from pypdf import PdfReader
from pypdf.errors import PdfReadError
from skyvern.config import settings
from skyvern.exceptions import (
@@ -84,6 +86,7 @@ class BlockType(StrEnum):
WAIT = "wait"
FILE_DOWNLOAD = "file_download"
GOTO_URL = "goto_url"
PDF_PARSER = "pdf_parser"
class BlockStatus(StrEnum):
@@ -1832,6 +1835,112 @@ class FileParserBlock(Block):
)
class PDFParserBlock(Block):
    """Workflow block that extracts structured information from a PDF file.

    The block downloads the file referenced by ``file_url``, extracts the text
    of every page with pypdf, and sends that text together with
    ``json_schema`` to the LLM through the
    ``extract-information-from-file-text`` prompt. The LLM response is recorded
    as the block's output parameter value.
    """

    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER

    # Either a literal URL (``s3://`` URLs are fetched through the AWS client),
    # a workflow parameter key, or a jinja template; resolved in ``execute``.
    file_url: str
    # Schema handed to the LLM prompt. When unset, ``execute`` installs a
    # generic single-object schema before prompting.
    json_schema: dict[str, Any] | None = None

    def get_all_parameters(
        self,
        workflow_run_id: str,
    ) -> list[PARAMETER_TYPE]:
        """Return the workflow parameter backing ``file_url``, if there is one.

        Returns:
            A one-element list holding the parameter whose key equals
            ``file_url`` when the workflow run context knows such a parameter,
            otherwise an empty list.
        """
        workflow_run_context = self.get_workflow_run_context(workflow_run_id)
        if self.file_url and workflow_run_context.has_parameter(self.file_url):
            return [workflow_run_context.get_parameter(self.file_url)]
        return []

    def format_potential_template_parameters(self, workflow_run_context: WorkflowRunContext) -> None:
        """Render ``file_url`` as a template against the workflow run context, in place."""
        self.file_url = self.format_block_parameter_template_from_workflow_run_context(
            self.file_url, workflow_run_context
        )

    async def execute(
        self,
        workflow_run_id: str,
        workflow_run_block_id: str,
        organization_id: str | None = None,
        browser_session_id: str | None = None,
        **kwargs: dict,
    ) -> BlockResult:
        """Download the PDF, extract its text and ask the LLM to structure it.

        Args:
            workflow_run_id: Identifies the workflow run whose context supplies
                parameter values and receives the output.
            workflow_run_block_id: Forwarded to ``build_block_result``.
            organization_id: Forwarded to ``build_block_result``.
            browser_session_id: Not used by this block.
            **kwargs: Not used by this block.

        Returns:
            A completed ``BlockResult`` carrying the LLM response, or a failed
            one when the ``file_url`` template cannot be rendered or pypdf
            raises ``PdfReadError``.

        Note:
            Errors raised while downloading the file or calling the LLM are not
            handled here and propagate to the caller.
        """
        workflow_run_context = self.get_workflow_run_context(workflow_run_id)

        # ``file_url`` may be the key of a workflow parameter rather than a URL;
        # in that case swap in the parameter's (non-empty) value.
        if (
            self.file_url
            and workflow_run_context.has_parameter(self.file_url)
            and workflow_run_context.has_value(self.file_url)
        ):
            file_url_parameter_value = workflow_run_context.get_value(self.file_url)
            if file_url_parameter_value:
                LOG.info(
                    "PDFParserBlock: File URL is parameterized, using parameter value",
                    file_url_parameter_value=file_url_parameter_value,
                    file_url_parameter_key=self.file_url,
                )
                self.file_url = file_url_parameter_value

        try:
            self.format_potential_template_parameters(workflow_run_context)
        except Exception as e:
            return await self.build_block_result(
                success=False,
                failure_reason=f"Failed to format jinja template: {str(e)}",
                output_parameter_value=None,
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        # Download the file
        if self.file_url.startswith("s3://"):
            file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
        else:
            file_path = await download_file(self.file_url)

        try:
            reader = PdfReader(file_path)
            # One newline-terminated chunk per page, assembled in a single join.
            extracted_text = "".join(page.extract_text() + "\n" for page in reader.pages)
        except PdfReadError:
            # Keep the parser error in the logs; the block result only carries
            # a generic failure reason.
            LOG.warning(
                "PDFParserBlock: Failed to parse PDF file",
                file_url=self.file_url,
                exc_info=True,
            )
            return await self.build_block_result(
                success=False,
                failure_reason="Failed to parse PDF file",
                output_parameter_value=None,
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        if not self.json_schema:
            # No schema supplied: fall back to a single free-form object.
            self.json_schema = {
                "type": "object",
                "properties": {
                    "extracted_information": {
                        "type": "object",
                        "description": "Information extracted from the text",
                    }
                },
            }

        llm_prompt = prompt_engine.load_prompt(
            "extract-information-from-file-text", extracted_text_content=extracted_text, json_schema=self.json_schema
        )
        llm_response = await app.LLM_API_HANDLER(prompt=llm_prompt)

        # Record the parsed data
        await self.record_output_parameter_value(workflow_run_context, workflow_run_id, llm_response)
        return await self.build_block_result(
            success=True,
            failure_reason=None,
            output_parameter_value=llm_response,
            status=BlockStatus.completed,
            workflow_run_block_id=workflow_run_block_id,
            organization_id=organization_id,
        )
class WaitBlock(Block):
block_type: Literal[BlockType.WAIT] = BlockType.WAIT
@@ -1952,6 +2061,7 @@ BlockSubclasses = Union[
UploadToS3Block,
SendEmailBlock,
FileParserBlock,
PDFParserBlock,
ValidationBlock,
ActionBlock,
NavigationBlock,

View File

@@ -215,6 +215,13 @@ class FileParserBlockYAML(BlockYAML):
file_type: FileType
class PDFParserBlockYAML(BlockYAML):
    """YAML-level definition of a PDF parser block.

    Carries the same ``file_url`` and ``json_schema`` fields that are passed on
    to ``PDFParserBlock`` when the workflow definition is converted into
    executable blocks.
    """

    # Discriminator value that selects this model among the YAML block types.
    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER  # type: ignore

    # Where the PDF lives; required.
    file_url: str
    # Optional schema for the extracted output; ``None`` when not provided.
    json_schema: dict[str, Any] | None = None
class ValidationBlockYAML(BlockYAML):
block_type: Literal[BlockType.VALIDATION] = BlockType.VALIDATION # type: ignore
@@ -343,6 +350,7 @@ BLOCK_YAML_SUBCLASSES = (
| WaitBlockYAML
| FileDownloadBlockYAML
| UrlBlockYAML
| PDFParserBlockYAML
)
BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]

View File

@@ -46,6 +46,7 @@ from skyvern.forge.sdk.workflow.models.block import (
ForLoopBlock,
LoginBlock,
NavigationBlock,
PDFParserBlock,
SendEmailBlock,
TaskBlock,
TextPromptBlock,
@@ -1469,6 +1470,14 @@ class WorkflowService:
file_type=block_yaml.file_type,
continue_on_failure=block_yaml.continue_on_failure,
)
elif block_yaml.block_type == BlockType.PDF_PARSER:
return PDFParserBlock(
label=block_yaml.label,
output_parameter=output_parameter,
file_url=block_yaml.file_url,
json_schema=block_yaml.json_schema,
continue_on_failure=block_yaml.continue_on_failure,
)
elif block_yaml.block_type == BlockType.VALIDATION:
validation_block_parameters = (
[parameters[parameter_key] for parameter_key in block_yaml.parameter_keys]