Support OCR of image files in File Parser block (#4609)

2026-02-02 21:37:12 -08:00
parent b64c7d4032
commit fcbe7fe84f
7 changed files with 60 additions and 10 deletions
--- a/skyvern-frontend/src/routes/workflows/editor/panels/WorkflowNodeLibraryPanel.tsx
+++ b/skyvern-frontend/src/routes/workflows/editor/panels/WorkflowNodeLibraryPanel.tsx
@@ -188,7 +188,7 @@ const nodeLibraryItems: Array<{
      />
    ),
    title: "File Parser Block",
-    description: "Parse PDFs, CSVs, and Excel files",
+    description: "Parse PDFs, CSVs, Excel files, and Images",
  },
  // {
  //   nodeType: "pdfParser",
--- a/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts
+++ b/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts
@@ -415,7 +415,7 @@ export type SendEmailBlock = WorkflowBlockBase & {
 export type FileURLParserBlock = WorkflowBlockBase & {
  block_type: "file_url_parser";
  file_url: string;
-  file_type: "csv" | "excel" | "pdf";
+  file_type: "csv" | "excel" | "pdf" | "image";
  json_schema: Record<string, unknown> | null;
 };
--- a/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts
+++ b/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts
@@ -350,7 +350,7 @@ export type SendEmailBlockYAML = BlockYAMLBase & {
 export type FileUrlParserBlockYAML = BlockYAMLBase & {
  block_type: "file_url_parser";
  file_url: string;
-  file_type: "csv" | "excel" | "pdf";
+  file_type: "csv" | "excel" | "pdf" | "image";
  json_schema?: Record<string, unknown> | null;
 };
--- a/skyvern/client/types/file_type.py
+++ b/skyvern/client/types/file_type.py
@@ -2,4 +2,4 @@
 import typing
-FileType = typing.Union[typing.Literal["csv", "excel", "pdf"], typing.Any]
+FileType = typing.Union[typing.Literal["csv", "excel", "pdf", "image"], typing.Any]
--- a/skyvern/forge/prompts/skyvern/extract-text-from-image.j2
+++ b/skyvern/forge/prompts/skyvern/extract-text-from-image.j2
@@ -0,0 +1,19 @@
 Extract all visible text from this image.
 MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments, no unnecessary quotes.
 Reply in JSON format with the following keys:
 {
    "extracted_text": str // All text extracted from the image
 }
 TEXT EXTRACTION GUIDELINES:
 - Preserve reading order (top to bottom, left to right)
 - For tables: format as rows separated by newlines, columns separated by " | "
 - For multi-column layouts: extract each column separately, separated by blank lines
 - For forms: format as "Label: Value" on each line
 - Preserve line breaks where they appear meaningful (paragraphs, list items)
 - Include all visible text: headers, body text, labels, captions, watermarks
 - For handwritten text: do your best to transcribe, use [illegible] for unclear parts
 If no text is visible in the image, return an empty string for extracted_text.
--- a/skyvern/forge/sdk/workflow/models/block.py
+++ b/skyvern/forge/sdk/workflow/models/block.py
@@ -3063,6 +3063,8 @@ class FileParserBlock(Block):
            return FileType.PDF
        elif suffix == ".tsv":
            return FileType.CSV  # TSV files are handled by the CSV parser
        elif suffix in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".tiff", ".tif"):
            return FileType.IMAGE
        else:
            return FileType.CSV  # Default to CSV for .csv and any other extensions
@@ -3112,6 +3114,12 @@ class FileParserBlock(Block):
                validate_pdf_file(file_path, file_identifier=file_url_used)
            except PDFParsingError as e:
                raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
        elif self.file_type == FileType.IMAGE:
            kind = filetype.guess(file_path)
            if kind is None or not kind.mime.startswith("image/"):
                raise InvalidFileType(
                    file_url=file_url_used, file_type=self.file_type, error="File is not a valid image"
                )
    async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
        """Parse CSV/TSV file and return list of dictionaries."""
@@ -3184,6 +3192,27 @@ class FileParserBlock(Block):
        except PDFParsingError as e:
            raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
    async def _parse_image_file(self, file_path: str) -> str:
        """Parse image file using vision LLM for OCR."""
        try:
            with open(file_path, "rb") as f:
                image_bytes = f.read()
            llm_prompt = prompt_engine.load_prompt("extract-text-from-image")
            llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
                self.override_llm_key, default=app.LLM_API_HANDLER
            )
            llm_response = await llm_api_handler(
                prompt=llm_prompt,
                prompt_name="extract-text-from-image",
                screenshots=[image_bytes],
                force_dict=True,
            )
            return llm_response.get("extracted_text", "")
        except Exception:
            LOG.exception("Failed to extract text from image via OCR", file_url=self.file_url)
            raise
    async def _extract_with_ai(
        self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext
    ) -> dict[str, Any]:
@@ -3210,9 +3239,8 @@ class FileParserBlock(Block):
            "extract-information-from-file-text", extracted_text_content=content_str, json_schema=schema_to_use
        )
-        llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
+        llm_key = self.override_llm_key
-            self.override_llm_key, default=app.LLM_API_HANDLER
+        llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(llm_key, default=app.LLM_API_HANDLER)
        )
        llm_response = await llm_api_handler(
            prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False
@@ -3261,9 +3289,9 @@ class FileParserBlock(Block):
        else:
            file_path = await download_file(self.file_url)
-        # Auto-detect file type based on file extension
+        # Auto-detect file type if not explicitly set (IMAGE/EXCEL/PDF are explicit choices)
-        detected_file_type = self._detect_file_type_from_url(self.file_url)
+        if self.file_type not in (FileType.IMAGE, FileType.EXCEL, FileType.PDF):
-        self.file_type = detected_file_type
+            self.file_type = self._detect_file_type_from_url(self.file_url)
        # Validate the file type
        self.validate_file_type(self.file_url, file_path)
@@ -3283,6 +3311,8 @@ class FileParserBlock(Block):
            parsed_data = await self._parse_excel_file(file_path)
        elif self.file_type == FileType.PDF:
            parsed_data = await self._parse_pdf_file(file_path)
        elif self.file_type == FileType.IMAGE:
            parsed_data = await self._parse_image_file(file_path)
        else:
            return await self.build_block_result(
                success=False,
--- a/skyvern/schemas/workflows.py
+++ b/skyvern/schemas/workflows.py
@@ -67,6 +67,7 @@ class FileType(StrEnum):
    CSV = "csv"
    EXCEL = "excel"
    PDF = "pdf"
    IMAGE = "image"
 class PDFFormat(StrEnum):
`@@ -2,4 +2,4 @@`

	`import typing`	`import typing`

	`FileType = typing.Union[typing.Literal["csv", "excel", "pdf"], typing.Any]`	`FileType = typing.Union[typing.Literal["csv", "excel", "pdf", "image"], typing.Any]`