Support OCR of image files in File Parser block (#4609)

This commit is contained in:
Marc Kelechava
2026-02-02 21:37:12 -08:00
committed by GitHub
parent b64c7d4032
commit fcbe7fe84f
7 changed files with 60 additions and 10 deletions

View File

@@ -188,7 +188,7 @@ const nodeLibraryItems: Array<{
/> />
), ),
title: "File Parser Block", title: "File Parser Block",
description: "Parse PDFs, CSVs, and Excel files", description: "Parse PDFs, CSVs, Excel files, and Images",
}, },
// { // {
// nodeType: "pdfParser", // nodeType: "pdfParser",

View File

@@ -415,7 +415,7 @@ export type SendEmailBlock = WorkflowBlockBase & {
export type FileURLParserBlock = WorkflowBlockBase & { export type FileURLParserBlock = WorkflowBlockBase & {
block_type: "file_url_parser"; block_type: "file_url_parser";
file_url: string; file_url: string;
file_type: "csv" | "excel" | "pdf"; file_type: "csv" | "excel" | "pdf" | "image";
json_schema: Record<string, unknown> | null; json_schema: Record<string, unknown> | null;
}; };

View File

@@ -350,7 +350,7 @@ export type SendEmailBlockYAML = BlockYAMLBase & {
export type FileUrlParserBlockYAML = BlockYAMLBase & { export type FileUrlParserBlockYAML = BlockYAMLBase & {
block_type: "file_url_parser"; block_type: "file_url_parser";
file_url: string; file_url: string;
file_type: "csv" | "excel" | "pdf"; file_type: "csv" | "excel" | "pdf" | "image";
json_schema?: Record<string, unknown> | null; json_schema?: Record<string, unknown> | null;
}; };

View File

@@ -2,4 +2,4 @@
import typing import typing
FileType = typing.Union[typing.Literal["csv", "excel", "pdf"], typing.Any] FileType = typing.Union[typing.Literal["csv", "excel", "pdf", "image"], typing.Any]

View File

@@ -0,0 +1,19 @@
Extract all visible text from this image.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments, no unnecessary quotes.
Reply in JSON format with the following keys:
{
"extracted_text": str // All text extracted from the image
}
TEXT EXTRACTION GUIDELINES:
- Preserve reading order (top to bottom, left to right)
- For tables: format as rows separated by newlines, columns separated by " | "
- For multi-column layouts: extract each column separately, separated by blank lines
- For forms: format as "Label: Value" on each line
- Preserve line breaks where they appear meaningful (paragraphs, list items)
- Include all visible text: headers, body text, labels, captions, watermarks
- For handwritten text: do your best to transcribe, use [illegible] for unclear parts
If no text is visible in the image, return an empty string for extracted_text.

View File

@@ -3063,6 +3063,8 @@ class FileParserBlock(Block):
return FileType.PDF return FileType.PDF
elif suffix == ".tsv": elif suffix == ".tsv":
return FileType.CSV # TSV files are handled by the CSV parser return FileType.CSV # TSV files are handled by the CSV parser
elif suffix in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".tiff", ".tif"):
return FileType.IMAGE
else: else:
return FileType.CSV # Default to CSV for .csv and any other extensions return FileType.CSV # Default to CSV for .csv and any other extensions
@@ -3112,6 +3114,12 @@ class FileParserBlock(Block):
validate_pdf_file(file_path, file_identifier=file_url_used) validate_pdf_file(file_path, file_identifier=file_url_used)
except PDFParsingError as e: except PDFParsingError as e:
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e)) raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
elif self.file_type == FileType.IMAGE:
kind = filetype.guess(file_path)
if kind is None or not kind.mime.startswith("image/"):
raise InvalidFileType(
file_url=file_url_used, file_type=self.file_type, error="File is not a valid image"
)
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]: async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
"""Parse CSV/TSV file and return list of dictionaries.""" """Parse CSV/TSV file and return list of dictionaries."""
@@ -3184,6 +3192,27 @@ class FileParserBlock(Block):
except PDFParsingError as e: except PDFParsingError as e:
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e)) raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
async def _parse_image_file(self, file_path: str) -> str:
    """Extract the visible text of an image file with a vision LLM (OCR).

    The raw image bytes are passed to the LLM handler as a screenshot
    together with the ``extract-text-from-image`` prompt, and the
    ``extracted_text`` field of the dict reply is returned.

    Args:
        file_path: Local path of the image file to read.

    Returns:
        The text reported by the model. An empty string is returned when
        the reply has no ``extracted_text`` key or carries an explicit
        null for it.

    Raises:
        Exception: Any failure while reading the file, loading the prompt
            or calling the LLM is logged and re-raised unchanged.
    """
    try:
        with open(file_path, "rb") as f:
            image_bytes = f.read()

        llm_prompt = prompt_engine.load_prompt("extract-text-from-image")
        # Handler is looked up from the block's override key, with the
        # application-wide handler supplied as the default.
        llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
            self.override_llm_key, default=app.LLM_API_HANDLER
        )
        llm_response = await llm_api_handler(
            prompt=llm_prompt,
            prompt_name="extract-text-from-image",
            screenshots=[image_bytes],
            force_dict=True,
        )

        # dict.get's default only applies when the key is missing; a JSON
        # null decodes to None, which would violate the declared `str`
        # return type, so normalise it to an empty string here.
        extracted_text = llm_response.get("extracted_text")
        return "" if extracted_text is None else extracted_text
    except Exception:
        LOG.exception("Failed to extract text from image via OCR", file_url=self.file_url)
        raise
async def _extract_with_ai( async def _extract_with_ai(
self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext
) -> dict[str, Any]: ) -> dict[str, Any]:
@@ -3210,9 +3239,8 @@ class FileParserBlock(Block):
"extract-information-from-file-text", extracted_text_content=content_str, json_schema=schema_to_use "extract-information-from-file-text", extracted_text_content=content_str, json_schema=schema_to_use
) )
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler( llm_key = self.override_llm_key
self.override_llm_key, default=app.LLM_API_HANDLER llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(llm_key, default=app.LLM_API_HANDLER)
)
llm_response = await llm_api_handler( llm_response = await llm_api_handler(
prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False
@@ -3261,9 +3289,9 @@ class FileParserBlock(Block):
else: else:
file_path = await download_file(self.file_url) file_path = await download_file(self.file_url)
# Auto-detect file type based on file extension # Auto-detect file type if not explicitly set (IMAGE/EXCEL/PDF are explicit choices)
detected_file_type = self._detect_file_type_from_url(self.file_url) if self.file_type not in (FileType.IMAGE, FileType.EXCEL, FileType.PDF):
self.file_type = detected_file_type self.file_type = self._detect_file_type_from_url(self.file_url)
# Validate the file type # Validate the file type
self.validate_file_type(self.file_url, file_path) self.validate_file_type(self.file_url, file_path)
@@ -3283,6 +3311,8 @@ class FileParserBlock(Block):
parsed_data = await self._parse_excel_file(file_path) parsed_data = await self._parse_excel_file(file_path)
elif self.file_type == FileType.PDF: elif self.file_type == FileType.PDF:
parsed_data = await self._parse_pdf_file(file_path) parsed_data = await self._parse_pdf_file(file_path)
elif self.file_type == FileType.IMAGE:
parsed_data = await self._parse_image_file(file_path)
else: else:
return await self.build_block_result( return await self.build_block_result(
success=False, success=False,

View File

@@ -67,6 +67,7 @@ class FileType(StrEnum):
CSV = "csv" CSV = "csv"
EXCEL = "excel" EXCEL = "excel"
PDF = "pdf" PDF = "pdf"
IMAGE = "image"
class PDFFormat(StrEnum): class PDFFormat(StrEnum):