diff --git a/fern/workflows/introduction.mdx b/fern/workflows/introduction.mdx index 2a94f0ae..4e748ff0 100644 --- a/fern/workflows/introduction.mdx +++ b/fern/workflows/introduction.mdx @@ -19,7 +19,7 @@ Building blocks supported today: - TextPromptBlock: A text only prompt block. - SendEmailBlock: Send an email. - FileDownloadBlock: Given a goal, Skyvern downloads a file from the website. -- FileParserBlock: Given a file url, Skyvern downloads the file from the url, and returns the parsed content as the output of the block. Currently only support CSV file format. +- FileParserBlock: Given a file url, Skyvern downloads the file from the url, and returns the parsed content as the output of the block. Supports CSV, TSV, Excel, and PDF file formats. - PDFParserBlock: Given a pdf url, Skyvern downloads the PDF file from the url and returns the parsed content as the output of the block. - FileUploadBlock: Upload all the downloaded files to a desired destination. Currently only AWS S3 is supported. Please contact support@skyvern.com if you need more integrations. - WaitBlock: Wait for a given amount of time. diff --git a/fern/workflows/workflow-blocks-details.mdx b/fern/workflows/workflow-blocks-details.mdx index 294606b6..b5cbc0a0 100644 --- a/fern/workflows/workflow-blocks-details.mdx +++ b/fern/workflows/workflow-blocks-details.mdx @@ -43,7 +43,7 @@ This block sends an email. This block downloads a file from the website. ## FileParserBlock -This block parses a file from the website. +This block parses PDFs, CSVs, TSVs, and Excel files from the website. ## PDFParserBlock This block parses a PDF file from the website. diff --git a/fern/workflows/workflow-blocks.mdx b/fern/workflows/workflow-blocks.mdx index 92c1434d..55e47ce4 100644 --- a/fern/workflows/workflow-blocks.mdx +++ b/fern/workflows/workflow-blocks.mdx @@ -228,16 +228,16 @@ Inputs: Downloads and parses a file to be used within other workflow blocks. 
-**Supported types:** CSV +**Supported types:** CSV, TSV, Excel, PDF ``` - block_type: file_url_parser - label: csv_parser - file_type: csv - file_url: + label: file_parser + file_type: csv # Auto-detected from URL extension + file_url: ``` Inputs: -1. **File URL *(required):*** This block allows you to use a CSV within your workflow. +1. **File URL *(required):*** This block allows you to use CSV, TSV, Excel, and PDF files within your workflow. * Since we’re still in beta, you will need to [contact us](https://meetings.hubspot.com/skyvern/demo?uuid=7c83865f-1a92-4c44-9e52-1ba0dbc04f7a) to load a value into this block \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index a2987713..28956e52 100644 --- a/poetry.lock +++ b/poetry.lock @@ -1,4 +1,4 @@ -# This file is automatically @generated by Poetry 2.1.1 and should not be changed by hand. +# This file is automatically @generated by Poetry 2.1.3 and should not be changed by hand. [[package]] name = "about-time" @@ -1587,6 +1587,18 @@ files = [ [package.extras] mypy = ["mypy"] +[[package]] +name = "et-xmlfile" +version = "2.0.0" +description = "An implementation of lxml.xmlfile for the standard library" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "et_xmlfile-2.0.0-py3-none-any.whl", hash = "sha256:7a91720bc756843502c3b7504c77b8fe44217c85c537d85037f0f536151b2caa"}, + {file = "et_xmlfile-2.0.0.tar.gz", hash = "sha256:dab3f4764309081ce75662649be815c4c9081e88f0837825f90fd28317d4da54"}, +] + [[package]] name = "exceptiongroup" version = "1.3.0" @@ -2374,7 +2386,7 @@ description = "Lightweight in-process concurrent programming" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version == \"3.12\" or python_version == \"3.13\"" +markers = "python_version >= \"3.12\"" files = [ {file = "greenlet-3.2.3-cp310-cp310-macosx_11_0_universal2.whl", hash = "sha256:1afd685acd5597349ee6d7a88a8bec83ce13c106ac78c196ee9dde7c04fe87be"}, {file = 
"greenlet-3.2.3-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.whl", hash = "sha256:761917cac215c61e9dc7324b2606107b3b292a8349bdebb31503ab4de3f559ac"}, @@ -3183,7 +3195,7 @@ description = "Low-level, pure Python DBus protocol wrapper." optional = false python-versions = ">=3.7" groups = ["dev"] -markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"linux\"" +markers = "sys_platform == \"linux\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\"" files = [ {file = "jeepney-0.9.0-py3-none-any.whl", hash = "sha256:97e5714520c16fc0a45695e5365a2e11b81ea79bba796e26f9f1d178cb182683"}, {file = "jeepney-0.9.0.tar.gz", hash = "sha256:cf0e9e845622b81e4a28df94c40345400256ec608d0e55bb8a3feaa9163f5732"}, @@ -4829,7 +4841,7 @@ description = "ONNX Runtime is a runtime accelerator for Machine Learning models optional = false python-versions = ">=3.10" groups = ["main"] -markers = "python_version == \"3.12\" or python_version == \"3.13\"" +markers = "python_version >= \"3.12\"" files = [ {file = "onnxruntime-1.22.0-cp310-cp310-macosx_13_0_universal2.whl", hash = "sha256:85d8826cc8054e4d6bf07f779dc742a363c39094015bdad6a08b3c18cfe0ba8c"}, {file = "onnxruntime-1.22.0-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:468c9502a12f6f49ec335c2febd22fdceecc1e4cc96dfc27e419ba237dff5aff"}, @@ -4937,6 +4949,21 @@ jsonschema-path = ">=0.3.1,<0.4.0" lazy-object-proxy = ">=1.7.1,<2.0.0" openapi-schema-validator = ">=0.6.0,<0.7.0" +[[package]] +name = "openpyxl" +version = "3.1.5" +description = "A Python library to read/write Excel 2010 xlsx/xlsm files" +optional = false +python-versions = ">=3.8" +groups = ["main"] +files = [ + {file = "openpyxl-3.1.5-py2.py3-none-any.whl", hash = "sha256:5282c12b107bffeef825f4617dc029afaf41d0ea60823bbb665ef3079dc79de2"}, + {file = "openpyxl-3.1.5.tar.gz", hash = "sha256:cf0e3cf56142039133628b5acffe8ef0c12bc902d2aadd3e0fe5878dc08d1050"}, +] + 
+[package.dependencies] +et-xmlfile = "*" + [[package]] name = "opentelemetry-api" version = "1.34.1" @@ -5930,7 +5957,7 @@ description = "A high-level API to automate web browsers" optional = false python-versions = ">=3.9" groups = ["main"] -markers = "python_version == \"3.12\" or python_version == \"3.13\"" +markers = "python_version >= \"3.12\"" files = [ {file = "playwright-1.53.0-py3-none-macosx_10_13_x86_64.whl", hash = "sha256:48a1a15ce810f0ffe512b6050de9871ea193b41dd3cc1bbed87b8431012419ba"}, {file = "playwright-1.53.0-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a701f9498a5b87e3f929ec01cea3109fbde75821b19c7ba4bba54f6127b94f76"}, @@ -6227,7 +6254,7 @@ description = "PostgreSQL database adapter for Python" optional = false python-versions = ">=3.7" groups = ["main"] -markers = "python_version == \"3.12\" or python_version == \"3.11\"" +markers = "python_version < \"3.13\"" files = [ {file = "psycopg-3.1.18-py3-none-any.whl", hash = "sha256:4d5a0a5a8590906daa58ebd5f3cfc34091377354a1acced269dd10faf55da60e"}, {file = "psycopg-3.1.18.tar.gz", hash = "sha256:31144d3fb4c17d78094d9e579826f047d4af1da6a10427d91dfcfb6ecdf6f12b"}, @@ -6280,7 +6307,7 @@ description = "PostgreSQL database adapter for Python -- C optimisation distribu optional = false python-versions = ">=3.7" groups = ["main"] -markers = "(python_version == \"3.12\" or python_version == \"3.11\") and implementation_name != \"pypy\"" +markers = "python_version < \"3.13\" and implementation_name != \"pypy\"" files = [ {file = "psycopg_binary-3.1.18-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:5c323103dfa663b88204cf5f028e83c77d7a715f9b6f51d2bbc8184b99ddd90a"}, {file = "psycopg_binary-3.1.18-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:887f8d856c91510148be942c7acd702ccf761a05f59f8abc123c22ab77b5a16c"}, @@ -6731,7 +6758,7 @@ description = "A rough port of Node.js's EventEmitter to Python with a few trick optional = false python-versions = ">=3.8" groups = ["main"] -markers = "python_version 
== \"3.12\" or python_version == \"3.13\"" +markers = "python_version >= \"3.12\"" files = [ {file = "pyee-13.0.0-py3-none-any.whl", hash = "sha256:48195a3cddb3b1515ce0695ed76036b5ccc2ef3a9f963ff9f77aec0139845498"}, {file = "pyee-13.0.0.tar.gz", hash = "sha256:b391e3c5a434d1f5118a25615001dbc8f669cf410ab67d04c4d4e07c55481c37"}, @@ -7044,7 +7071,7 @@ description = "A (partial) reimplementation of pywin32 using ctypes/cffi" optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"win32\"" +markers = "sys_platform == \"win32\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\"" files = [ {file = "pywin32-ctypes-0.2.3.tar.gz", hash = "sha256:d162dc04946d704503b2edc4d55f3dba5c1d539ead017afa00142c38b9885755"}, {file = "pywin32_ctypes-0.2.3-py3-none-any.whl", hash = "sha256:8a1513379d709975552d202d942d9837758905c8d01eb82b8bcc30918929e7b8"}, @@ -7784,7 +7811,7 @@ description = "Python bindings to FreeDesktop.org Secret Service API" optional = false python-versions = ">=3.6" groups = ["dev"] -markers = "platform_machine != \"ppc64le\" and platform_machine != \"s390x\" and sys_platform == \"linux\"" +markers = "sys_platform == \"linux\" and platform_machine != \"ppc64le\" and platform_machine != \"s390x\"" files = [ {file = "SecretStorage-3.3.3-py3-none-any.whl", hash = "sha256:f356e6628222568e3af06f2eba8df495efa13b3b63081dafd4f7d9a7b7bc9f99"}, {file = "SecretStorage-3.3.3.tar.gz", hash = "sha256:2403533ef369eca6d2ba81718576c5e0f564d5cca1b58f73a8b23e7d4eeebd77"}, @@ -9128,7 +9155,7 @@ description = "Fast implementation of asyncio event loop on top of libuv" optional = false python-versions = ">=3.8.0" groups = ["main"] -markers = "sys_platform != \"win32\" and sys_platform != \"cygwin\" and platform_python_implementation != \"PyPy\"" +markers = "platform_python_implementation != \"PyPy\" and sys_platform != \"win32\" and sys_platform != \"cygwin\"" files = 
[ {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_universal2.whl", hash = "sha256:ec7e6b09a6fdded42403182ab6b832b71f4edaf7f37a9a0e371a01db5f0cb45f"}, {file = "uvloop-0.21.0-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:196274f2adb9689a289ad7d65700d37df0c0930fd8e4e743fa4834e850d7719d"}, @@ -9763,4 +9790,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.14" -content-hash = "667e626dd8d08bae4f9b852616a9c2c5df9bdd4a3a37f7ef87030d7bfdf51f3b" +content-hash = "441c7080f7fbccb87de476e56dd86d1e56e4c0b8eaa8378d13c90d94a0a42123" diff --git a/pyproject.toml b/pyproject.toml index e92ee8f0..a94cb938 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -78,6 +78,7 @@ lark = "^1.2.2" libcst = "^1.8.2" curlparser = "^0.1.0" lmnr = {extras = ["all"], version = "^0.7.0"} +openpyxl = "^3.1.5" [tool.poetry.group.dev.dependencies] isort = "^5.13.2" diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx b/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx index d2f469f8..173a883c 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx @@ -10,6 +10,8 @@ import { useDebugStore } from "@/store/useDebugStore"; import { cn } from "@/util/utils"; import { NodeHeader } from "../components/NodeHeader"; import { useParams } from "react-router-dom"; +import { WorkflowDataSchemaInputGroup } from "@/components/DataSchemaInputGroup/WorkflowDataSchemaInputGroup"; +import { dataSchemaExampleForFileExtraction } from "../types"; function FileParserNode({ id, data }: NodeProps) { const { updateNodeData } = useReactFlow(); @@ -21,8 +23,17 @@ function FileParserNode({ id, data }: NodeProps) { urlBlockLabel !== undefined && urlBlockLabel === label; const [inputs, setInputs] = useState({ fileUrl: data.fileUrl, + jsonSchema: data.jsonSchema, }); + function 
handleChange(key: string, value: unknown) { + if (!data.editable) { + return; + } + setInputs({ ...inputs, [key]: value }); + updateNodeData(id, { [key]: value }); + } + const isFirstWorkflowBlock = useIsFirstBlockInWorkflow({ id }); return ( @@ -75,15 +86,19 @@ function FileParserNode({ id, data }: NodeProps) { nodeId={id} value={inputs.fileUrl} onChange={(value) => { - if (!data.editable) { - return; - } - setInputs({ ...inputs, fileUrl: value }); - updateNodeData(id, { fileUrl: value }); + handleChange("fileUrl", value); }} className="nopan text-xs" /> + { + handleChange("jsonSchema", value); + }} + suggestionContext={{}} + /> diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/types.ts b/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/types.ts index 672b7eeb..a81fc063 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/types.ts +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/types.ts @@ -1,9 +1,11 @@ import type { Node } from "@xyflow/react"; import { NodeBaseData } from "../types"; +import { AppNode } from ".."; import { debuggableWorkflowBlockTypes } from "@/routes/workflows/types/workflowTypes"; export type FileParserNodeData = NodeBaseData & { fileUrl: string; + jsonSchema: string; }; export type FileParserNode = Node; @@ -14,5 +16,10 @@ export const fileParserNodeDefaultData: FileParserNodeData = { label: "", fileUrl: "", continueOnFailure: false, + jsonSchema: "null", model: null, } as const; + +export function isFileParserNode(node: AppNode): node is FileParserNode { + return node.type === "fileParser"; +} diff --git a/skyvern-frontend/src/routes/workflows/editor/panels/WorkflowNodeLibraryPanel.tsx b/skyvern-frontend/src/routes/workflows/editor/panels/WorkflowNodeLibraryPanel.tsx index ffa1ccc9..1e5f6f0f 100644 --- a/skyvern-frontend/src/routes/workflows/editor/panels/WorkflowNodeLibraryPanel.tsx +++ 
b/skyvern-frontend/src/routes/workflows/editor/panels/WorkflowNodeLibraryPanel.tsx @@ -162,20 +162,19 @@ const nodeLibraryItems: Array<{ /> ), title: "File Parser Block", - description: "Parse data from files", - }, - { - nodeType: "pdfParser", - icon: ( - - ), - title: "PDF Parser Block", - description: "Extract data from PDF files", + description: "Parse PDFs, CSVs, and Excel files", }, // { + // nodeType: "pdfParser", + // icon: ( + // + // ), + // title: "PDF Parser Block", + // description: "Extract data from PDF files", + // }, // nodeType: "upload", // icon: ( // ): Array { } }); + const fileParserNodes = nodes.filter(isFileParserNode); + fileParserNodes.forEach((node) => { + try { + JSON.parse(node.data.jsonSchema); + } catch { + errors.push(`${node.data.label}: Data schema is not valid JSON.`); + } + }); + const waitNodes = nodes.filter(isWaitNode); waitNodes.forEach((node) => { const waitTimeString = node.data.waitInSeconds.trim(); diff --git a/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts b/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts index b4ed76a5..efb3b655 100644 --- a/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts +++ b/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts @@ -354,7 +354,8 @@ export type SendEmailBlock = WorkflowBlockBase & { export type FileURLParserBlock = WorkflowBlockBase & { block_type: "file_url_parser"; file_url: string; - file_type: "csv"; + file_type: "csv" | "excel" | "pdf"; + json_schema: Record | null; }; export type ValidationBlock = WorkflowBlockBase & { diff --git a/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts b/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts index a3202666..f7ab86ea 100644 --- a/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts +++ b/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts @@ -308,7 +308,8 @@ export type SendEmailBlockYAML = BlockYAMLBase & { export type 
FileUrlParserBlockYAML = BlockYAMLBase & { block_type: "file_url_parser"; file_url: string; - file_type: "csv"; + file_type: "csv" | "excel" | "pdf"; + json_schema?: Record | null; }; export type ForLoopBlockYAML = BlockYAMLBase & { diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py index e4167df8..7b15c267 100644 --- a/skyvern/forge/sdk/workflow/models/block.py +++ b/skyvern/forge/sdk/workflow/models/block.py @@ -21,6 +21,7 @@ from typing import Annotated, Any, Awaitable, Callable, Literal, Union from urllib.parse import quote, urlparse import filetype +import pandas as pd import structlog from email_validator import EmailNotValidError, validate_email from jinja2.sandbox import SandboxedEnvironment @@ -2342,6 +2343,8 @@ class SendEmailBlock(Block): class FileType(StrEnum): CSV = "csv" + EXCEL = "excel" + PDF = "pdf" class FileParserBlock(Block): @@ -2349,6 +2352,7 @@ class FileParserBlock(Block): file_url: str file_type: FileType + json_schema: dict[str, Any] | None = None def get_all_parameters( self, @@ -2364,6 +2368,18 @@ class FileParserBlock(Block): self.file_url, workflow_run_context ) + def _detect_file_type_from_url(self, file_url: str) -> FileType: + """Detect file type based on file extension in the URL.""" + url_lower = file_url.lower() + if url_lower.endswith((".xlsx", ".xls", ".xlsm")): + return FileType.EXCEL + elif url_lower.endswith(".pdf"): + return FileType.PDF + elif url_lower.endswith(".tsv"): + return FileType.CSV # TSV files are handled by the CSV parser + else: + return FileType.CSV # Default to CSV for .csv and any other extensions + def validate_file_type(self, file_url_used: str, file_path: str) -> None: if self.file_type == FileType.CSV: try: @@ -2371,6 +2387,121 @@ class FileParserBlock(Block): csv.Sniffer().sniff(file.read(1024)) except csv.Error as e: raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e)) + elif self.file_type == FileType.EXCEL: + try: + 
# Try to read the file with pandas to validate it's a valid Excel file + pd.read_excel(file_path, nrows=1, engine="openpyxl") + except Exception as e: + raise InvalidFileType( + file_url=file_url_used, file_type=self.file_type, error=f"Invalid Excel file format: {str(e)}" + ) + elif self.file_type == FileType.PDF: + try: + # Try to read the file with PyPDF to validate it's a valid PDF file + reader = PdfReader(file_path) + # Just check if we can access pages, don't read content yet + _ = len(reader.pages) + except Exception as e: + raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e)) + + async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]: + """Parse CSV/TSV file and return list of dictionaries.""" + parsed_data = [] + with open(file_path) as file: + # Try to detect the delimiter (comma for CSV, tab for TSV) + sample = file.read(1024) + file.seek(0) # Reset file pointer + + # Use csv.Sniffer to detect the delimiter + try: + dialect = csv.Sniffer().sniff(sample) + delimiter = dialect.delimiter + except csv.Error: + # Default to comma if detection fails + delimiter = "," + + reader = csv.DictReader(file, delimiter=delimiter) + for row in reader: + parsed_data.append(row) + return parsed_data + + def _clean_dataframe_for_json(self, df: pd.DataFrame) -> list[dict[str, Any]]: + """Clean DataFrame to ensure it can be serialized to JSON.""" + # Replace NaN and NaT values with "nan" string + df_cleaned = df.replace({pd.NA: "nan", pd.NaT: "nan"}) + df_cleaned = df_cleaned.where(pd.notna(df_cleaned), "nan") + + # Convert to list of dictionaries + records = df_cleaned.to_dict("records") + + # Additional cleaning for any remaining problematic values + for record in records: + for key, value in record.items(): + if pd.isna(value) or value == "NaN" or value == "NaT": + record[key] = "nan" + elif isinstance(value, (pd.Timestamp, pd.DatetimeTZDtype)): + # Convert pandas timestamps to ISO format strings + record[key] = 
value.isoformat() if pd.notna(value) else "nan" + + return records + + async def _parse_excel_file(self, file_path: str) -> list[dict[str, Any]]: + """Parse Excel file and return list of dictionaries.""" + try: + # Read Excel file with pandas, specifying engine explicitly + df = pd.read_excel(file_path, engine="openpyxl") + # Clean and convert DataFrame to list of dictionaries + return self._clean_dataframe_for_json(df) + except ImportError as e: + raise InvalidFileType( + file_url=self.file_url, + file_type=self.file_type, + error=f"Missing required dependency for Excel parsing: {str(e)}. Please install openpyxl: pip install openpyxl", + ) + except Exception as e: + raise InvalidFileType( + file_url=self.file_url, file_type=self.file_type, error=f"Failed to parse Excel file: {str(e)}" + ) + + async def _parse_pdf_file(self, file_path: str) -> str: + """Parse PDF file and return extracted text.""" + try: + reader = PdfReader(file_path) + extracted_text = "" + page_count = len(reader.pages) + for i in range(page_count): + extracted_text += reader.pages[i].extract_text() + "\n" + return extracted_text + except PdfReadError as e: + raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e)) + + async def _extract_with_ai( + self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext + ) -> dict[str, Any]: + """Extract structured data using AI based on json_schema.""" + # Use local variable to avoid mutating the instance + schema_to_use = self.json_schema or { + "type": "object", + "properties": { + "output": { + "type": "object", + "description": "Information extracted from the file", + } + }, + } + + # Convert content to string for AI processing + if isinstance(content, list): + # For CSV/Excel data, convert to a readable format + content_str = json.dumps(content, indent=2) + else: + content_str = content + + llm_prompt = prompt_engine.load_prompt( + "extract-information-from-file-text", 
extracted_text_content=content_str, json_schema=schema_to_use + ) + llm_response = await app.LLM_API_HANDLER(prompt=llm_prompt, prompt_name="extract-information-from-file-text") + return llm_response async def execute( self, @@ -2381,6 +2512,7 @@ class FileParserBlock(Block): **kwargs: dict, ) -> BlockResult: workflow_run_context = self.get_workflow_run_context(workflow_run_id) + if ( self.file_url and workflow_run_context.has_parameter(self.file_url) @@ -2412,21 +2544,71 @@ class FileParserBlock(Block): file_path = await download_from_s3(self.get_async_aws_client(), self.file_url) else: file_path = await download_file(self.file_url) + + # Auto-detect file type based on file extension + detected_file_type = self._detect_file_type_from_url(self.file_url) + self.file_type = detected_file_type + # Validate the file type self.validate_file_type(self.file_url, file_path) - # Parse the file into a list of dictionaries where each dictionary represents a row in the file - parsed_data = [] - with open(file_path) as file: - if self.file_type == FileType.CSV: - reader = csv.DictReader(file) - for row in reader: - parsed_data.append(row) + + LOG.debug( + "FileParserBlock: After file type validation", + file_type=self.file_type, + json_schema_present=self.json_schema is not None, + json_schema_type=type(self.json_schema), + ) + + # Parse the file based on type + parsed_data: str | list[dict[str, Any]] + if self.file_type == FileType.CSV: + parsed_data = await self._parse_csv_file(file_path) + elif self.file_type == FileType.EXCEL: + parsed_data = await self._parse_excel_file(file_path) + elif self.file_type == FileType.PDF: + parsed_data = await self._parse_pdf_file(file_path) + else: + return await self.build_block_result( + success=False, + failure_reason=f"Unsupported file type: {self.file_type}", + output_parameter_value=None, + status=BlockStatus.failed, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + + # If json_schema is provided, 
use AI to extract structured data + final_data: str | list[dict[str, Any]] | dict[str, Any] + LOG.debug( + "FileParserBlock: JSON schema check", + has_json_schema=self.json_schema is not None, + json_schema_type=type(self.json_schema), + json_schema=self.json_schema, + ) + + if self.json_schema: + try: + ai_extracted_data = await self._extract_with_ai(parsed_data, workflow_run_context) + final_data = ai_extracted_data + except Exception as e: + return await self.build_block_result( + success=False, + failure_reason=f"Failed to extract data with AI: {str(e)}", + output_parameter_value=None, + status=BlockStatus.failed, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + else: + # Return raw parsed data + final_data = parsed_data + # Record the parsed data - await self.record_output_parameter_value(workflow_run_context, workflow_run_id, parsed_data) + await self.record_output_parameter_value(workflow_run_context, workflow_run_id, final_data) return await self.build_block_result( success=True, failure_reason=None, - output_parameter_value=parsed_data, + output_parameter_value=final_data, status=BlockStatus.completed, workflow_run_block_id=workflow_run_block_id, organization_id=organization_id, @@ -2434,6 +2616,11 @@ class FileParserBlock(Block): class PDFParserBlock(Block): + """ + DEPRECATED: Use FileParserBlock with file_type=FileType.PDF instead. + This block will be removed in a future version. 
+ """ + block_type: Literal[BlockType.PDF_PARSER] = BlockType.PDF_PARSER file_url: str diff --git a/skyvern/forge/sdk/workflow/models/yaml.py b/skyvern/forge/sdk/workflow/models/yaml.py index 0bf582bb..49cb1f52 100644 --- a/skyvern/forge/sdk/workflow/models/yaml.py +++ b/skyvern/forge/sdk/workflow/models/yaml.py @@ -244,6 +244,7 @@ class FileParserBlockYAML(BlockYAML): file_url: str file_type: FileType + json_schema: dict[str, Any] | None = None class PDFParserBlockYAML(BlockYAML): diff --git a/skyvern/forge/sdk/workflow/service.py b/skyvern/forge/sdk/workflow/service.py index bbe17856..5165672a 100644 --- a/skyvern/forge/sdk/workflow/service.py +++ b/skyvern/forge/sdk/workflow/service.py @@ -1926,6 +1926,7 @@ class WorkflowService: output_parameter=output_parameter, file_url=block_yaml.file_url, file_type=block_yaml.file_type, + json_schema=block_yaml.json_schema, continue_on_failure=block_yaml.continue_on_failure, ) elif block_yaml.block_type == BlockType.PDF_PARSER: diff --git a/tests/unit_tests/test_file_parser_block.py b/tests/unit_tests/test_file_parser_block.py new file mode 100644 index 00000000..be845c2c --- /dev/null +++ b/tests/unit_tests/test_file_parser_block.py @@ -0,0 +1,252 @@ +import os +import tempfile +from datetime import datetime +from unittest.mock import MagicMock, patch + +import pandas as pd +import pytest + +from skyvern.forge.sdk.workflow.models.block import FileParserBlock, FileType +from skyvern.forge.sdk.workflow.models.parameter import OutputParameter + + +class TestFileParserBlock: + @pytest.fixture + def file_parser_block(self): + """Create a basic FileParserBlock instance for testing.""" + # Create a mock OutputParameter with all required fields + mock_output_parameter = MagicMock(spec=OutputParameter) + mock_output_parameter.parameter_type = "output" + mock_output_parameter.key = "test_output" + mock_output_parameter.output_parameter_id = "test_id" + mock_output_parameter.workflow_id = "test_workflow_id" + 
mock_output_parameter.created_at = datetime.now() + mock_output_parameter.modified_at = datetime.now() + mock_output_parameter.deleted_at = None + + return FileParserBlock( + label="test_parser", output_parameter=mock_output_parameter, file_url="test.csv", file_type=FileType.CSV + ) + + @pytest.fixture + def csv_file(self): + """Create a temporary CSV file for testing.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".csv", delete=False) as f: + f.write("name,age,city\nJohn,30,New York\nJane,25,Boston") + temp_file = f.name + + yield temp_file + os.unlink(temp_file) + + @pytest.fixture + def excel_file(self): + """Create a temporary Excel file for testing.""" + df = pd.DataFrame({"name": ["John", "Jane"], "age": [30, 25], "city": ["New York", "Boston"]}) + + with tempfile.NamedTemporaryFile(suffix=".xlsx", delete=False) as f: + df.to_excel(f.name, index=False) + temp_file = f.name + + yield temp_file + os.unlink(temp_file) + + @pytest.fixture + def tsv_file(self): + """Create a temporary TSV file for testing.""" + with tempfile.NamedTemporaryFile(mode="w", suffix=".tsv", delete=False) as f: + f.write("name\tage\tcity\nJohn\t30\tNew York\nJane\t25\tBoston") + temp_file = f.name + + yield temp_file + os.unlink(temp_file) + + def test_file_type_enum_values(self): + """Test that FileType enum has the expected values.""" + assert FileType.CSV == "csv" + assert FileType.EXCEL == "excel" + assert FileType.PDF == "pdf" + + def test_file_parser_block_initialization(self, file_parser_block): + """Test that FileParserBlock initializes correctly.""" + assert file_parser_block.label == "test_parser" + assert file_parser_block.file_url == "test.csv" + assert file_parser_block.file_type == FileType.CSV + assert file_parser_block.json_schema is None + + def test_file_parser_block_with_schema(self): + """Test that FileParserBlock can be initialized with a schema.""" + schema = {"type": "object", "properties": {"name": {"type": "string"}, "age": {"type": "integer"}}} + + # 
Create a mock OutputParameter + mock_output_parameter = MagicMock(spec=OutputParameter) + mock_output_parameter.parameter_type = "output" + mock_output_parameter.key = "test_output" + mock_output_parameter.output_parameter_id = "test_id" + mock_output_parameter.workflow_id = "test_workflow_id" + mock_output_parameter.created_at = datetime.now() + mock_output_parameter.modified_at = datetime.now() + mock_output_parameter.deleted_at = None + + block = FileParserBlock( + label="test_parser", + output_parameter=mock_output_parameter, + file_url="test.csv", + file_type=FileType.CSV, + json_schema=schema, + ) + + assert block.json_schema == schema + + @pytest.mark.asyncio + async def test_parse_csv_file(self, file_parser_block, csv_file): + """Test CSV file parsing.""" + result = await file_parser_block._parse_csv_file(csv_file) + + expected = [{"name": "John", "age": "30", "city": "New York"}, {"name": "Jane", "age": "25", "city": "Boston"}] + + assert result == expected + + @pytest.mark.asyncio + async def test_parse_excel_file(self, file_parser_block, excel_file): + """Test Excel file parsing.""" + result = await file_parser_block._parse_excel_file(excel_file) + + expected = [{"name": "John", "age": 30, "city": "New York"}, {"name": "Jane", "age": 25, "city": "Boston"}] + + assert result == expected + + @pytest.mark.asyncio + async def test_parse_tsv_file(self, file_parser_block, tsv_file): + """Test TSV file parsing.""" + result = await file_parser_block._parse_csv_file(tsv_file) + + expected = [{"name": "John", "age": "30", "city": "New York"}, {"name": "Jane", "age": "25", "city": "Boston"}] + + assert result == expected + + @pytest.mark.asyncio + async def test_validate_csv_file_type(self, file_parser_block, csv_file): + """Test CSV file type validation.""" + # Should not raise an exception + file_parser_block.validate_file_type("test.csv", csv_file) + + @pytest.mark.asyncio + async def test_validate_excel_file_type(self, file_parser_block, excel_file): + """Test 
Excel file type validation.""" + file_parser_block.file_type = FileType.EXCEL + # Should not raise an exception + file_parser_block.validate_file_type("test.xlsx", excel_file) + + @pytest.mark.asyncio + async def test_validate_invalid_csv_file(self, file_parser_block): + """Test validation of invalid CSV file.""" + # Create a binary file that's definitely not CSV + with tempfile.NamedTemporaryFile(mode="wb", suffix=".csv", delete=False) as f: + f.write(b"\x00\x01\x02\x03\x04\x05\x06\x07\x08\x09\x0a\x0b\x0c\x0d\x0e\x0f") + temp_file = f.name + + try: + with pytest.raises(Exception): + file_parser_block.validate_file_type("test.csv", temp_file) + finally: + os.unlink(temp_file) + + @pytest.mark.asyncio + async def test_extract_with_ai_with_schema(self, file_parser_block): + """Test AI extraction with a provided schema.""" + schema = { + "type": "object", + "properties": { + "extracted_data": { + "type": "object", + "properties": { + "names": {"type": "array", "items": {"type": "string"}}, + "total_count": {"type": "integer"}, + }, + } + }, + } + + file_parser_block.json_schema = schema + + # Mock the LLM response + mock_response = {"extracted_data": {"names": ["John", "Jane"], "total_count": 2}} + + with patch("skyvern.forge.sdk.workflow.models.block.app.LLM_API_HANDLER") as mock_llm: + mock_llm.return_value = mock_response + + with patch("skyvern.forge.sdk.workflow.models.block.prompt_engine.load_prompt") as mock_prompt: + mock_prompt.return_value = "mocked prompt" + + result = await file_parser_block._extract_with_ai([{"name": "John"}, {"name": "Jane"}], MagicMock()) + + assert result == mock_response + mock_llm.assert_called_once() + mock_prompt.assert_called_once() + + @pytest.mark.asyncio + async def test_extract_with_ai_without_schema(self, file_parser_block): + """Test AI extraction without a provided schema (should use default).""" + # Mock the LLM response + mock_response = {"output": {"summary": "Extracted data from file"}} + + with 
patch("skyvern.forge.sdk.workflow.models.block.app.LLM_API_HANDLER") as mock_llm: + mock_llm.return_value = mock_response + + with patch("skyvern.forge.sdk.workflow.models.block.prompt_engine.load_prompt") as mock_prompt: + mock_prompt.return_value = "mocked prompt" + + result = await file_parser_block._extract_with_ai("Some text content", MagicMock()) + + assert result == mock_response + # Should NOT mutate the instance - json_schema should remain None + assert file_parser_block.json_schema is None + mock_llm.assert_called_once() + mock_prompt.assert_called_once() + + def test_detect_file_type_from_url(self, file_parser_block): + """Test file type detection based on URL extension.""" + # Test Excel files + assert file_parser_block._detect_file_type_from_url("https://example.com/data.xlsx") == FileType.EXCEL + assert file_parser_block._detect_file_type_from_url("https://example.com/data.xls") == FileType.EXCEL + assert file_parser_block._detect_file_type_from_url("https://example.com/data.xlsm") == FileType.EXCEL + + # Test PDF files + assert file_parser_block._detect_file_type_from_url("https://example.com/document.pdf") == FileType.PDF + + # Test CSV files (default) + assert file_parser_block._detect_file_type_from_url("https://example.com/data.csv") == FileType.CSV + assert file_parser_block._detect_file_type_from_url("https://example.com/data.tsv") == FileType.CSV + assert file_parser_block._detect_file_type_from_url("https://example.com/data.txt") == FileType.CSV + assert file_parser_block._detect_file_type_from_url("https://example.com/data") == FileType.CSV + + def test_clean_dataframe_for_json(self, file_parser_block): + """Test DataFrame cleaning for JSON serialization.""" + # Create a DataFrame with NaN, NaT, and timestamp values + df = pd.DataFrame( + { + "OrderDate": ["2018-01-01", pd.NaT, "2018-01-03"], + "Region": ["North", "South", pd.NA], + "Sales": [1000.0, pd.NA, 3000.0], + "Timestamp": [pd.Timestamp("2018-01-01"), pd.NaT, 
pd.Timestamp("2018-01-03")], + } + ) + + # Clean the DataFrame + result = file_parser_block._clean_dataframe_for_json(df) + + # Check that NaN and NaT values are converted to "nan" string + assert result[0]["OrderDate"] == "2018-01-01" + assert result[0]["Region"] == "North" + assert result[0]["Sales"] == 1000.0 + assert result[0]["Timestamp"] == "2018-01-01T00:00:00" + + assert result[1]["OrderDate"] == "nan" + assert result[1]["Region"] == "South" + assert result[1]["Sales"] == "nan" + assert result[1]["Timestamp"] == "nan" + + assert result[2]["OrderDate"] == "2018-01-03" + assert result[2]["Region"] == "nan" + assert result[2]["Sales"] == 3000.0 + assert result[2]["Timestamp"] == "2018-01-03T00:00:00"