Add PDF Parser Block (#1600)
This commit is contained in:
4
poetry.lock
generated
4
poetry.lock
generated
@@ -1,4 +1,4 @@
|
|||||||
# This file is automatically @generated by Poetry 1.8.1 and should not be changed by hand.
|
# This file is automatically @generated by Poetry 1.8.2 and should not be changed by hand.
|
||||||
|
|
||||||
[[package]]
|
[[package]]
|
||||||
name = "aioboto3"
|
name = "aioboto3"
|
||||||
@@ -6182,4 +6182,4 @@ type = ["pytest-mypy"]
|
|||||||
[metadata]
|
[metadata]
|
||||||
lock-version = "2.0"
|
lock-version = "2.0"
|
||||||
python-versions = "^3.11,<3.12"
|
python-versions = "^3.11,<3.12"
|
||||||
content-hash = "908718e1dfd3b4a34106c187c59a8627f00b3491fead6b7fac6e48143afb6f17"
|
content-hash = "995b84f916f59c9166e84ce633f5cf848312fc85d80caa76486d3571c7321668"
|
||||||
|
|||||||
@@ -53,6 +53,7 @@ aiofiles = "^24.1.0"
|
|||||||
pyotp = "^2.9.0"
|
pyotp = "^2.9.0"
|
||||||
asyncpg = "^0.30.0"
|
asyncpg = "^0.30.0"
|
||||||
json-repair = "^0.34.0"
|
json-repair = "^0.34.0"
|
||||||
|
pypdf = "^5.1.0"
|
||||||
|
|
||||||
[tool.poetry.group.dev.dependencies]
|
[tool.poetry.group.dev.dependencies]
|
||||||
isort = "^5.13.2"
|
isort = "^5.13.2"
|
||||||
|
|||||||
@@ -0,0 +1,13 @@
|
|||||||
|
You are given text that has been extracted from a file. Your task is to extract information from this text.
|
||||||
|
|
||||||
|
Extract information from the text and output it in the specified JSON schema format: {{ json_schema }}
|
||||||
|
|
||||||
|
Add as much detail as possible to the output JSON object while conforming to the output JSON schema.
|
||||||
|
|
||||||
|
Do not ever include anything other than the JSON object in your output, and do not ever include any additional fields in the JSON object.
|
||||||
|
|
||||||
|
If you are unable to extract the requested information for a specific field in the JSON schema, please output a null value for that field.
|
||||||
|
|
||||||
|
You are given the following text:
|
||||||
|
|
||||||
|
{{ extracted_text_content }}
|
||||||
@@ -21,6 +21,8 @@ import structlog
|
|||||||
from email_validator import EmailNotValidError, validate_email
|
from email_validator import EmailNotValidError, validate_email
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
from pydantic import BaseModel, Field
|
from pydantic import BaseModel, Field
|
||||||
|
from pypdf import PdfReader
|
||||||
|
from pypdf.errors import PdfReadError
|
||||||
|
|
||||||
from skyvern.config import settings
|
from skyvern.config import settings
|
||||||
from skyvern.exceptions import (
|
from skyvern.exceptions import (
|
||||||
@@ -84,6 +86,7 @@ class BlockType(StrEnum):
|
|||||||
WAIT = "wait"
|
WAIT = "wait"
|
||||||
FILE_DOWNLOAD = "file_download"
|
FILE_DOWNLOAD = "file_download"
|
||||||
GOTO_URL = "goto_url"
|
GOTO_URL = "goto_url"
|
||||||
|
PDF_PARSER = "pdf_parser"
|
||||||
|
|
||||||
|
|
||||||
class BlockStatus(StrEnum):
|
class BlockStatus(StrEnum):
|
||||||
@@ -1832,6 +1835,112 @@ class FileParserBlock(Block):
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
|
class PDFParserBlock(Block):
    """Workflow block that reads a PDF and asks the LLM to extract structured data from it.

    The block downloads the file referenced by ``file_url``, concatenates the
    text of every page, renders the ``extract-information-from-file-text``
    prompt with that text and ``json_schema``, and records the LLM response as
    the block's output parameter value.
    """

    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER

    # Location of the PDF. May be a workflow parameter key or a template;
    # values starting with "s3://" are fetched through the AWS client.
    file_url: str
    # Schema handed to the prompt; a generic default is filled in by execute()
    # when this is empty.
    json_schema: dict[str, Any] | None = None

    def get_all_parameters(
        self,
        workflow_run_id: str,
    ) -> list[PARAMETER_TYPE]:
        """Return the parameter named by ``file_url`` if the run context has one, else an empty list."""
        workflow_run_context = self.get_workflow_run_context(workflow_run_id)
        if self.file_url and workflow_run_context.has_parameter(self.file_url):
            return [workflow_run_context.get_parameter(self.file_url)]
        return []

    def format_potential_template_parameters(self, workflow_run_context: WorkflowRunContext) -> None:
        """Render ``file_url`` as a template against the workflow run context, in place."""
        self.file_url = self.format_block_parameter_template_from_workflow_run_context(
            self.file_url, workflow_run_context
        )

    async def execute(
        self,
        workflow_run_id: str,
        workflow_run_block_id: str,
        organization_id: str | None = None,
        browser_session_id: str | None = None,
        **kwargs: dict,
    ) -> BlockResult:
        """Download the PDF, extract its text, and run the extraction prompt.

        Returns a failed ``BlockResult`` when the ``file_url`` template cannot
        be rendered or when pypdf raises ``PdfReadError``; otherwise returns a
        completed result whose output value is the LLM response. Errors raised
        while downloading the file are not handled here and propagate to the
        caller.
        """
        workflow_run_context = self.get_workflow_run_context(workflow_run_id)

        # When file_url names a workflow parameter that has a value, use that
        # value as the actual URL.
        if (
            self.file_url
            and workflow_run_context.has_parameter(self.file_url)
            and workflow_run_context.has_value(self.file_url)
        ):
            file_url_parameter_value = workflow_run_context.get_value(self.file_url)
            if file_url_parameter_value:
                LOG.info(
                    "PDFParserBlock: File URL is parameterized, using parameter value",
                    file_url_parameter_value=file_url_parameter_value,
                    file_url_parameter_key=self.file_url,
                )
                self.file_url = file_url_parameter_value

        try:
            self.format_potential_template_parameters(workflow_run_context)
        except Exception as e:
            return await self.build_block_result(
                success=False,
                failure_reason=f"Failed to format jinja template: {str(e)}",
                output_parameter_value=None,
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        # Download the file
        if self.file_url.startswith("s3://"):
            file_path = await download_from_s3(self.get_async_aws_client(), self.file_url)
        else:
            file_path = await download_file(self.file_url)

        try:
            reader = PdfReader(file_path)
            # One newline-terminated chunk per page, joined in a single pass.
            extracted_text = "".join(page.extract_text() + "\n" for page in reader.pages)
        except PdfReadError:
            # Keep the traceback in the logs; the block result only carries a
            # generic failure reason.
            LOG.warning(
                "PDFParserBlock: Failed to parse PDF file",
                file_url=self.file_url,
                exc_info=True,
            )
            return await self.build_block_result(
                success=False,
                failure_reason="Failed to parse PDF file",
                output_parameter_value=None,
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        # Fall back to a generic schema when the block did not specify one.
        if not self.json_schema:
            self.json_schema = {
                "type": "object",
                "properties": {
                    "extracted_information": {
                        "type": "object",
                        "description": "Information extracted from the text",
                    }
                },
            }

        llm_prompt = prompt_engine.load_prompt(
            "extract-information-from-file-text",
            extracted_text_content=extracted_text,
            json_schema=self.json_schema,
        )
        llm_response = await app.LLM_API_HANDLER(prompt=llm_prompt)

        # Record the parsed data
        await self.record_output_parameter_value(workflow_run_context, workflow_run_id, llm_response)
        return await self.build_block_result(
            success=True,
            failure_reason=None,
            output_parameter_value=llm_response,
            status=BlockStatus.completed,
            workflow_run_block_id=workflow_run_block_id,
            organization_id=organization_id,
        )
|
||||||
|
|
||||||
|
|
||||||
class WaitBlock(Block):
|
class WaitBlock(Block):
|
||||||
block_type: Literal[BlockType.WAIT] = BlockType.WAIT
|
block_type: Literal[BlockType.WAIT] = BlockType.WAIT
|
||||||
|
|
||||||
@@ -1952,6 +2061,7 @@ BlockSubclasses = Union[
|
|||||||
UploadToS3Block,
|
UploadToS3Block,
|
||||||
SendEmailBlock,
|
SendEmailBlock,
|
||||||
FileParserBlock,
|
FileParserBlock,
|
||||||
|
PDFParserBlock,
|
||||||
ValidationBlock,
|
ValidationBlock,
|
||||||
ActionBlock,
|
ActionBlock,
|
||||||
NavigationBlock,
|
NavigationBlock,
|
||||||
|
|||||||
@@ -215,6 +215,13 @@ class FileParserBlockYAML(BlockYAML):
|
|||||||
file_type: FileType
|
file_type: FileType
|
||||||
|
|
||||||
|
|
||||||
|
class PDFParserBlockYAML(BlockYAML):
    """YAML-side definition of a PDF parser block.

    Carries the file location and the optional JSON schema for a block whose
    ``block_type`` is ``BlockType.PDF_PARSER``.
    """

    # Discriminator value used to select this model among the block YAML types.
    block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER  # type: ignore

    # Where the PDF lives.
    file_url: str
    # Optional schema describing the data to extract; omitted means None.
    json_schema: dict[str, Any] | None = None
|
||||||
|
|
||||||
|
|
||||||
class ValidationBlockYAML(BlockYAML):
|
class ValidationBlockYAML(BlockYAML):
|
||||||
block_type: Literal[BlockType.VALIDATION] = BlockType.VALIDATION # type: ignore
|
block_type: Literal[BlockType.VALIDATION] = BlockType.VALIDATION # type: ignore
|
||||||
|
|
||||||
@@ -343,6 +350,7 @@ BLOCK_YAML_SUBCLASSES = (
|
|||||||
| WaitBlockYAML
|
| WaitBlockYAML
|
||||||
| FileDownloadBlockYAML
|
| FileDownloadBlockYAML
|
||||||
| UrlBlockYAML
|
| UrlBlockYAML
|
||||||
|
| PDFParserBlockYAML
|
||||||
)
|
)
|
||||||
BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]
|
BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]
|
||||||
|
|
||||||
|
|||||||
@@ -46,6 +46,7 @@ from skyvern.forge.sdk.workflow.models.block import (
|
|||||||
ForLoopBlock,
|
ForLoopBlock,
|
||||||
LoginBlock,
|
LoginBlock,
|
||||||
NavigationBlock,
|
NavigationBlock,
|
||||||
|
PDFParserBlock,
|
||||||
SendEmailBlock,
|
SendEmailBlock,
|
||||||
TaskBlock,
|
TaskBlock,
|
||||||
TextPromptBlock,
|
TextPromptBlock,
|
||||||
@@ -1469,6 +1470,14 @@ class WorkflowService:
|
|||||||
file_type=block_yaml.file_type,
|
file_type=block_yaml.file_type,
|
||||||
continue_on_failure=block_yaml.continue_on_failure,
|
continue_on_failure=block_yaml.continue_on_failure,
|
||||||
)
|
)
|
||||||
|
elif block_yaml.block_type == BlockType.PDF_PARSER:
|
||||||
|
return PDFParserBlock(
|
||||||
|
label=block_yaml.label,
|
||||||
|
output_parameter=output_parameter,
|
||||||
|
file_url=block_yaml.file_url,
|
||||||
|
json_schema=block_yaml.json_schema,
|
||||||
|
continue_on_failure=block_yaml.continue_on_failure,
|
||||||
|
)
|
||||||
elif block_yaml.block_type == BlockType.VALIDATION:
|
elif block_yaml.block_type == BlockType.VALIDATION:
|
||||||
validation_block_parameters = (
|
validation_block_parameters = (
|
||||||
[parameters[parameter_key] for parameter_key in block_yaml.parameter_keys]
|
[parameters[parameter_key] for parameter_key in block_yaml.parameter_keys]
|
||||||
|
|||||||
Reference in New Issue
Block a user