Support OCR of image files in File Parser block (#4609)

This commit is contained in:
Marc Kelechava
2026-02-02 21:37:12 -08:00
committed by GitHub
parent b64c7d4032
commit fcbe7fe84f
7 changed files with 60 additions and 10 deletions

View File

@@ -0,0 +1,19 @@
Extract all visible text from this image.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments, no unnecessary quotes.
Reply in JSON format with the following keys:
{
"extracted_text": str // All text extracted from the image
}
TEXT EXTRACTION GUIDELINES:
- Preserve reading order (top to bottom, left to right)
- For tables: format as rows separated by newlines, columns separated by " | "
- For multi-column layouts: extract each column separately, separated by blank lines
- For forms: format as "Label: Value" on each line
- Preserve line breaks where they appear meaningful (paragraphs, list items)
- Include all visible text: headers, body text, labels, captions, watermarks
- For handwritten text: do your best to transcribe, use [illegible] for unclear parts
If no text is visible in the image, return an empty string for extracted_text.

View File

@@ -3063,6 +3063,8 @@ class FileParserBlock(Block):
return FileType.PDF
elif suffix == ".tsv":
return FileType.CSV # TSV files are handled by the CSV parser
elif suffix in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".tiff", ".tif"):
return FileType.IMAGE
else:
return FileType.CSV # Default to CSV for .csv and any other extensions
@@ -3112,6 +3114,12 @@ class FileParserBlock(Block):
validate_pdf_file(file_path, file_identifier=file_url_used)
except PDFParsingError as e:
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
elif self.file_type == FileType.IMAGE:
kind = filetype.guess(file_path)
if kind is None or not kind.mime.startswith("image/"):
raise InvalidFileType(
file_url=file_url_used, file_type=self.file_type, error="File is not a valid image"
)
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
"""Parse CSV/TSV file and return list of dictionaries."""
@@ -3184,6 +3192,27 @@ class FileParserBlock(Block):
except PDFParsingError as e:
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
async def _parse_image_file(self, file_path: str) -> str:
    """Extract the text contained in an image file using a vision LLM (OCR).

    The image at ``file_path`` is read as raw bytes and passed to the LLM
    API handler as a screenshot, together with the
    ``extract-text-from-image`` prompt. The handler is resolved from
    ``self.override_llm_key`` and falls back to ``app.LLM_API_HANDLER``.

    Args:
        file_path: Path of the image file to read from disk.

    Returns:
        The value of the ``extracted_text`` key in the LLM response, or an
        empty string when that key is missing or explicitly ``None``.

    Raises:
        Exception: Any error raised while reading the file, loading the
            prompt, or calling the LLM is logged and re-raised unchanged.
    """
    try:
        with open(file_path, "rb") as f:
            image_bytes = f.read()

        llm_prompt = prompt_engine.load_prompt("extract-text-from-image")
        llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
            self.override_llm_key, default=app.LLM_API_HANDLER
        )
        llm_response = await llm_api_handler(
            prompt=llm_prompt,
            prompt_name="extract-text-from-image",
            screenshots=[image_bytes],
            force_dict=True,
        )

        # ``dict.get(key, "")`` only covers a *missing* key. A response of
        # ``{"extracted_text": null}`` would otherwise leak ``None`` out of a
        # method annotated to return ``str``, so normalise it here.
        extracted_text = llm_response.get("extracted_text")
        return "" if extracted_text is None else extracted_text
    except Exception:
        # Log with the source URL for traceability, then let the caller decide
        # how to surface the failure.
        LOG.exception("Failed to extract text from image via OCR", file_url=self.file_url)
        raise
async def _extract_with_ai(
self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext
) -> dict[str, Any]:
@@ -3210,9 +3239,8 @@ class FileParserBlock(Block):
"extract-information-from-file-text", extracted_text_content=content_str, json_schema=schema_to_use
)
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
self.override_llm_key, default=app.LLM_API_HANDLER
)
llm_key = self.override_llm_key
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(llm_key, default=app.LLM_API_HANDLER)
llm_response = await llm_api_handler(
prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False
@@ -3261,9 +3289,9 @@ class FileParserBlock(Block):
else:
file_path = await download_file(self.file_url)
# Auto-detect file type based on file extension
detected_file_type = self._detect_file_type_from_url(self.file_url)
self.file_type = detected_file_type
# Auto-detect file type if not explicitly set (IMAGE/EXCEL/PDF are explicit choices)
if self.file_type not in (FileType.IMAGE, FileType.EXCEL, FileType.PDF):
self.file_type = self._detect_file_type_from_url(self.file_url)
# Validate the file type
self.validate_file_type(self.file_url, file_path)
@@ -3283,6 +3311,8 @@ class FileParserBlock(Block):
parsed_data = await self._parse_excel_file(file_path)
elif self.file_type == FileType.PDF:
parsed_data = await self._parse_pdf_file(file_path)
elif self.file_type == FileType.IMAGE:
parsed_data = await self._parse_image_file(file_path)
else:
return await self.build_block_result(
success=False,