Fix FileParserBlock decoding errors (#4399)

This commit is contained in:
Marc Kelechava
2026-01-06 12:12:43 -08:00
committed by GitHub
parent 4adadee82a
commit 39bf3c2237

View File

@@ -22,6 +22,7 @@ import filetype
import pandas as pd
import pyotp
import structlog
from charset_normalizer import from_bytes
from email_validator import EmailNotValidError, validate_email
from jinja2 import StrictUndefined
from jinja2.sandbox import SandboxedEnvironment
@@ -3037,10 +3038,36 @@ class FileParserBlock(Block):
else:
return FileType.CSV # Default to CSV for .csv and any other extensions
def _detect_file_encoding(self, file_path: str) -> str:
    """Detect the text encoding of a file, with deterministic fallbacks.

    Only the first 64KB of the file is read, so detection stays cheap on
    large files. Detection order:

    1. A leading UTF-8 byte-order mark yields ``"utf-8-sig"``.
    2. Otherwise the best match reported by charset-normalizer is used.
    3. If no match is reported, ``utf-8``, ``cp1252`` and ``latin-1`` are
       tried in that order with strict decoding.

    Args:
        file_path: Path of the file to inspect.

    Returns:
        A codec name suitable for ``open(..., encoding=...)``.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    import codecs  # stdlib; imported locally so the block is self-contained

    sample_size = 65536  # 64KB sample for detection
    with open(file_path, "rb") as f:
        raw_data = f.read(sample_size)

    # The plain "utf-8" codec does not strip a byte-order mark, so a BOM
    # would surface as "\ufeff" glued to the first header/cell when the
    # caller reads the file. "utf-8-sig" consumes it.
    if raw_data.startswith(codecs.BOM_UTF8):
        return "utf-8-sig"

    best_match = from_bytes(raw_data).best()
    if best_match and best_match.encoding:
        return best_match.encoding

    # When the sample filled the whole buffer, the file may continue and
    # the last multi-byte character may be cut in half. Decoding
    # incrementally with final=False tolerates that incomplete tail
    # instead of wrongly rejecting an otherwise valid encoding.
    sample_is_complete = len(raw_data) < sample_size
    for encoding in ("utf-8", "cp1252", "latin-1"):
        decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
        try:
            decoder.decode(raw_data, final=sample_is_complete)
        except UnicodeDecodeError:
            continue
        return encoding

    # latin-1 always succeeds (1:1 byte mapping), so this is a safety fallback
    return "latin-1"
def validate_file_type(self, file_url_used: str, file_path: str) -> None:
if self.file_type == FileType.CSV:
try:
with open(file_path) as file:
encoding = self._detect_file_encoding(file_path)
with open(file_path, encoding=encoding, errors="replace") as file:
csv.Sniffer().sniff(file.read(1024))
except csv.Error as e:
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
@@ -3061,7 +3088,8 @@ class FileParserBlock(Block):
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
"""Parse CSV/TSV file and return list of dictionaries."""
parsed_data = []
with open(file_path) as file:
encoding = self._detect_file_encoding(file_path)
with open(file_path, encoding=encoding, errors="replace") as file:
# Try to detect the delimiter (comma for CSV, tab for TSV)
sample = file.read(1024)
file.seek(0) # Reset file pointer