Fix FileParserBlock decoding errors (#4399)
This commit is contained in:
@@ -22,6 +22,7 @@ import filetype
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import pyotp
|
import pyotp
|
||||||
import structlog
|
import structlog
|
||||||
|
from charset_normalizer import from_bytes
|
||||||
from email_validator import EmailNotValidError, validate_email
|
from email_validator import EmailNotValidError, validate_email
|
||||||
from jinja2 import StrictUndefined
|
from jinja2 import StrictUndefined
|
||||||
from jinja2.sandbox import SandboxedEnvironment
|
from jinja2.sandbox import SandboxedEnvironment
|
||||||
@@ -3037,10 +3038,36 @@ class FileParserBlock(Block):
|
|||||||
else:
|
else:
|
||||||
return FileType.CSV # Default to CSV for .csv and any other extensions
|
return FileType.CSV # Default to CSV for .csv and any other extensions
|
||||||
|
|
||||||
|
def _detect_file_encoding(self, file_path: str) -> str:
    """Detect the encoding of a file using charset-normalizer with fallbacks.

    Reads a sample of the file (at most the first 64KB) and asks
    charset-normalizer for its best match. If no match is returned, the
    sample is test-decoded with "utf-8", "cp1252" and "latin-1", in that
    order, and the first encoding that decodes without error is returned.

    Args:
        file_path: Path of the file to inspect; opened in binary mode.

    Returns:
        The name of the detected encoding, or "latin-1" as a last resort.
    """
    # Function-scope import so this block does not depend on the file's import section.
    import codecs

    sample_size = 65536  # 64KB sample for detection
    with open(file_path, "rb") as f:
        raw_data = f.read(sample_size)

    best_match = from_bytes(raw_data).best()
    if best_match and best_match.encoding:
        return best_match.encoding

    # A sample that fills the whole read limit may have been cut in the middle
    # of a multi-byte character. Strictly decoding such a sample would reject
    # valid UTF-8, so a dangling trailing sequence is only treated as an error
    # when the sample is known to reach the end of the file.
    sample_is_complete = len(raw_data) < sample_size

    for encoding in ("utf-8", "cp1252", "latin-1"):
        decoder = codecs.getincrementaldecoder(encoding)(errors="strict")
        try:
            decoder.decode(raw_data, final=sample_is_complete)
        except UnicodeDecodeError:
            continue
        return encoding

    # latin-1 always succeeds (1:1 byte mapping), so this is a safety fallback
    return "latin-1"
|
||||||
|
|
||||||
def validate_file_type(self, file_url_used: str, file_path: str) -> None:
|
def validate_file_type(self, file_url_used: str, file_path: str) -> None:
|
||||||
if self.file_type == FileType.CSV:
|
if self.file_type == FileType.CSV:
|
||||||
try:
|
try:
|
||||||
with open(file_path) as file:
|
encoding = self._detect_file_encoding(file_path)
|
||||||
|
with open(file_path, encoding=encoding, errors="replace") as file:
|
||||||
csv.Sniffer().sniff(file.read(1024))
|
csv.Sniffer().sniff(file.read(1024))
|
||||||
except csv.Error as e:
|
except csv.Error as e:
|
||||||
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e))
|
||||||
@@ -3061,7 +3088,8 @@ class FileParserBlock(Block):
|
|||||||
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]:
|
||||||
"""Parse CSV/TSV file and return list of dictionaries."""
|
"""Parse CSV/TSV file and return list of dictionaries."""
|
||||||
parsed_data = []
|
parsed_data = []
|
||||||
with open(file_path) as file:
|
encoding = self._detect_file_encoding(file_path)
|
||||||
|
with open(file_path, encoding=encoding, errors="replace") as file:
|
||||||
# Try to detect the delimiter (comma for CSV, tab for TSV)
|
# Try to detect the delimiter (comma for CSV, tab for TSV)
|
||||||
sample = file.read(1024)
|
sample = file.read(1024)
|
||||||
file.seek(0) # Reset file pointer
|
file.seek(0) # Reset file pointer
|
||||||
|
|||||||
Reference in New Issue
Block a user