Support OCR of image files in File Parser block (#4609)
This commit is contained in:
@@ -188,7 +188,7 @@ const nodeLibraryItems: Array<{
|
|||||||
/>
|
/>
|
||||||
),
|
),
|
||||||
title: "File Parser Block",
|
title: "File Parser Block",
|
||||||
description: "Parse PDFs, CSVs, and Excel files",
|
description: "Parse PDFs, CSVs, Excel files, and Images",
|
||||||
},
|
},
|
||||||
// {
|
// {
|
||||||
// nodeType: "pdfParser",
|
// nodeType: "pdfParser",
|
||||||
|
|||||||
@@ -415,7 +415,7 @@ export type SendEmailBlock = WorkflowBlockBase & {
|
|||||||
export type FileURLParserBlock = WorkflowBlockBase & {
|
export type FileURLParserBlock = WorkflowBlockBase & {
|
||||||
block_type: "file_url_parser";
|
block_type: "file_url_parser";
|
||||||
file_url: string;
|
file_url: string;
|
||||||
file_type: "csv" | "excel" | "pdf";
|
file_type: "csv" | "excel" | "pdf" | "image";
|
||||||
json_schema: Record<string, unknown> | null;
|
json_schema: Record<string, unknown> | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -350,7 +350,7 @@ export type SendEmailBlockYAML = BlockYAMLBase & {
|
|||||||
export type FileUrlParserBlockYAML = BlockYAMLBase & {
|
export type FileUrlParserBlockYAML = BlockYAMLBase & {
|
||||||
block_type: "file_url_parser";
|
block_type: "file_url_parser";
|
||||||
file_url: string;
|
file_url: string;
|
||||||
file_type: "csv" | "excel" | "pdf";
|
file_type: "csv" | "excel" | "pdf" | "image";
|
||||||
json_schema?: Record<string, unknown> | null;
|
json_schema?: Record<string, unknown> | null;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
|||||||
@@ -2,4 +2,4 @@
|
|||||||
|
|
||||||
import typing
|
import typing
|
||||||
|
|
||||||
FileType = typing.Union[typing.Literal["csv", "excel", "pdf"], typing.Any]
|
FileType = typing.Union[typing.Literal["csv", "excel", "pdf", "image"], typing.Any]
|
||||||
|
|||||||
19
skyvern/forge/prompts/skyvern/extract-text-from-image.j2
Normal file
19
skyvern/forge/prompts/skyvern/extract-text-from-image.j2
Normal file
@@ -0,0 +1,19 @@
|
|||||||
|
Extract all visible text from this image.
|
||||||
|
|
||||||
|
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments, no unnecessary quotes.
|
||||||
|
|
||||||
|
Reply in JSON format with the following keys:
|
||||||
|
{
|
||||||
|
"extracted_text": str // All text extracted from the image
|
||||||
|
}
|
||||||
|
|
||||||
|
TEXT EXTRACTION GUIDELINES:
|
||||||
|
- Preserve reading order (top to bottom, left to right)
|
||||||
|
- For tables: format as rows separated by newlines, columns separated by " | "
|
||||||
|
- For multi-column layouts: extract each column separately, separated by blank lines
|
||||||
|
- For forms: format as "Label: Value" on each line
|
||||||
|
- Preserve line breaks where they appear meaningful (paragraphs, list items)
|
||||||
|
- Include all visible text: headers, body text, labels, captions, watermarks
|
||||||
|
- For handwritten text: do your best to transcribe, use [illegible] for unclear parts
|
||||||
|
|
||||||
|
If no text is visible in the image, return an empty string for extracted_text.
|
||||||
@@ -3063,6 +3063,8 @@ class FileParserBlock(Block):
|
|||||||
return FileType.PDF
|
return FileType.PDF
|
||||||
elif suffix == ".tsv":
|
elif suffix == ".tsv":
|
||||||
return FileType.CSV # TSV files are handled by the CSV parser
|
return FileType.CSV # TSV files are handled by the CSV parser
|
||||||
|
elif suffix in (".png", ".jpg", ".jpeg", ".gif", ".bmp", ".webp", ".tiff", ".tif"):
|
||||||
|
return FileType.IMAGE
|
||||||
else:
|
else:
|
||||||
return FileType.CSV # Default to CSV for .csv and any other extensions
|
return FileType.CSV # Default to CSV for .csv and any other extensions
|
||||||
|
|
||||||
@@ -3112,6 +3114,12 @@ class FileParserBlock(Block):
|
|||||||
validate_pdf_file(file_path, file_identifier=file_url_used)
|
validate_pdf_file(file_path, file_identifier=file_url_used)
|
||||||
except PDFParsingError as e:
|
except PDFParsingError as e:
|
||||||
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||||
|
elif self.file_type == FileType.IMAGE:
|
||||||
|
kind = filetype.guess(file_path)
|
||||||
|
if kind is None or not kind.mime.startswith("image/"):
|
||||||
|
raise InvalidFileType(
|
||||||
|
file_url=file_url_used, file_type=self.file_type, error="File is not a valid image"
|
||||||
|
)
|
||||||
|
|
||||||
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
||||||
"""Parse CSV/TSV file and return list of dictionaries."""
|
"""Parse CSV/TSV file and return list of dictionaries."""
|
||||||
@@ -3184,6 +3192,27 @@ class FileParserBlock(Block):
|
|||||||
except PDFParsingError as e:
|
except PDFParsingError as e:
|
||||||
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
|
raise InvalidFileType(file_url=self.file_url, file_type=self.file_type, error=str(e))
|
||||||
|
|
||||||
|
async def _parse_image_file(self, file_path: str) -> str:
    """Extract the text of an image file by sending it to a vision LLM (OCR).

    The raw image bytes are passed as a screenshot together with the
    ``extract-text-from-image`` prompt, and the ``extracted_text`` field of
    the LLM's dict response is returned.

    Args:
        file_path: Local path of the image file to read.

    Returns:
        The text reported by the LLM, or an empty string when the response
        carries no ``extracted_text`` value (missing key or null).

    Raises:
        Exception: Any failure while reading the file or calling the LLM is
            logged and then re-raised unchanged.
    """
    try:
        with open(file_path, "rb") as f:
            image_bytes = f.read()

        llm_prompt = prompt_engine.load_prompt("extract-text-from-image")
        llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
            self.override_llm_key, default=app.LLM_API_HANDLER
        )
        llm_response = await llm_api_handler(
            prompt=llm_prompt,
            prompt_name="extract-text-from-image",
            screenshots=[image_bytes],
            force_dict=True,
        )
        # `.get(key, "")` only covers a missing key; a JSON null would
        # otherwise leak None through the declared `-> str` return type.
        return llm_response.get("extracted_text") or ""
    except Exception:
        LOG.exception("Failed to extract text from image via OCR", file_url=self.file_url)
        raise
|
||||||
|
|
||||||
async def _extract_with_ai(
|
async def _extract_with_ai(
|
||||||
self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext
|
self, content: str | list[dict[str, Any]], workflow_run_context: WorkflowRunContext
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
@@ -3210,9 +3239,8 @@ class FileParserBlock(Block):
|
|||||||
"extract-information-from-file-text", extracted_text_content=content_str, json_schema=schema_to_use
|
"extract-information-from-file-text", extracted_text_content=content_str, json_schema=schema_to_use
|
||||||
)
|
)
|
||||||
|
|
||||||
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
|
llm_key = self.override_llm_key
|
||||||
self.override_llm_key, default=app.LLM_API_HANDLER
|
llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(llm_key, default=app.LLM_API_HANDLER)
|
||||||
)
|
|
||||||
|
|
||||||
llm_response = await llm_api_handler(
|
llm_response = await llm_api_handler(
|
||||||
prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False
|
prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False
|
||||||
@@ -3261,9 +3289,9 @@ class FileParserBlock(Block):
|
|||||||
else:
|
else:
|
||||||
file_path = await download_file(self.file_url)
|
file_path = await download_file(self.file_url)
|
||||||
|
|
||||||
# Auto-detect file type based on file extension
|
# Auto-detect the file type from the URL unless IMAGE, EXCEL, or PDF was explicitly selected
|
||||||
detected_file_type = self._detect_file_type_from_url(self.file_url)
|
if self.file_type not in (FileType.IMAGE, FileType.EXCEL, FileType.PDF):
|
||||||
self.file_type = detected_file_type
|
self.file_type = self._detect_file_type_from_url(self.file_url)
|
||||||
|
|
||||||
# Validate the file type
|
# Validate the file type
|
||||||
self.validate_file_type(self.file_url, file_path)
|
self.validate_file_type(self.file_url, file_path)
|
||||||
@@ -3283,6 +3311,8 @@ class FileParserBlock(Block):
|
|||||||
parsed_data = await self._parse_excel_file(file_path)
|
parsed_data = await self._parse_excel_file(file_path)
|
||||||
elif self.file_type == FileType.PDF:
|
elif self.file_type == FileType.PDF:
|
||||||
parsed_data = await self._parse_pdf_file(file_path)
|
parsed_data = await self._parse_pdf_file(file_path)
|
||||||
|
elif self.file_type == FileType.IMAGE:
|
||||||
|
parsed_data = await self._parse_image_file(file_path)
|
||||||
else:
|
else:
|
||||||
return await self.build_block_result(
|
return await self.build_block_result(
|
||||||
success=False,
|
success=False,
|
||||||
|
|||||||
@@ -67,6 +67,7 @@ class FileType(StrEnum):
|
|||||||
CSV = "csv"
|
CSV = "csv"
|
||||||
EXCEL = "excel"
|
EXCEL = "excel"
|
||||||
PDF = "pdf"
|
PDF = "pdf"
|
||||||
|
IMAGE = "image"
|
||||||
|
|
||||||
|
|
||||||
class PDFFormat(StrEnum):
|
class PDFFormat(StrEnum):
|
||||||
|
|||||||
Reference in New Issue
Block a user