diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py index 91d47a8e..d9b87349 100644 --- a/skyvern/forge/sdk/workflow/models/block.py +++ b/skyvern/forge/sdk/workflow/models/block.py @@ -22,6 +22,7 @@ import filetype import pandas as pd import pyotp import structlog +from charset_normalizer import from_bytes from email_validator import EmailNotValidError, validate_email from jinja2 import StrictUndefined from jinja2.sandbox import SandboxedEnvironment @@ -3037,10 +3038,36 @@ class FileParserBlock(Block): else: return FileType.CSV # Default to CSV for .csv and any other extensions + def _detect_file_encoding(self, file_path: str) -> str: + """Detect the encoding of a file using charset-normalizer with fallbacks. + + Reads a sample of the file (first 64KB) to detect encoding efficiently. + Falls back through common encodings if detection fails. + """ + sample_size = 65536 # 64KB sample for detection + with open(file_path, "rb") as f: + raw_data = f.read(sample_size) + + result = from_bytes(raw_data) + best_match = result.best() + if best_match and best_match.encoding: + return best_match.encoding + + for encoding in ["utf-8", "cp1252", "latin-1"]: + try: + raw_data.decode(encoding) + return encoding + except UnicodeDecodeError: + continue + + # latin-1 always succeeds (1:1 byte mapping), so this is a safety fallback + return "latin-1" + def validate_file_type(self, file_url_used: str, file_path: str) -> None: if self.file_type == FileType.CSV: try: - with open(file_path) as file: + encoding = self._detect_file_encoding(file_path) + with open(file_path, encoding=encoding, errors="replace") as file: csv.Sniffer().sniff(file.read(1024)) except csv.Error as e: raise InvalidFileType(file_url=file_url_used, file_type=self.file_type, error=str(e)) @@ -3061,7 +3088,8 @@ class FileParserBlock(Block): async def _parse_csv_file(self, file_path: str) -> list[dict[str, Any]]: """Parse CSV/TSV file and return list of dictionaries.""" parsed_data = [] - with open(file_path) as file: + encoding = self._detect_file_encoding(file_path) + with open(file_path, encoding=encoding, errors="replace") as file: # Try to detect the delimiter (comma for CSV, tab for TSV) sample = file.read(1024) file.seek(0) # Reset file pointer