From be1c8ba060eae75a8849acf99fb943ecbfd6c4e0 Mon Sep 17 00:00:00 2001
From: Kerem Yilmaz
Date: Fri, 6 Sep 2024 11:08:33 +0300
Subject: [PATCH] More visibility into attached files and duplicate status (#776)

---
Reviewer notes (placed between the "---" marker and the diffstat, so they are
commentary and not part of the commit message or of the diff itself):
 * files.py: adds calculate_sha256(), which reads a file in 4096-byte blocks
   and returns the hex digest of its SHA-256 hash.
 * block.py: SendEmailBlock records every attached path under its content
   hash, then logs the total number of attachments, the number of distinct
   hashes, and the groups of paths that share a hash, before returning msg.
 * NOTE(review): the values recorded per hash are `path`, which the `finally`
   clause unlinks immediately afterwards; confirm whether logging `filename`
   would be more useful to whoever reads the duplicate report.
 * NOTE(review): this copy of the patch had lost its line breaks and leading
   whitespace; indentation below was reconstructed from Python syntax, so
   verify the context lines against the target tree before applying.

 skyvern/forge/sdk/api/files.py             | 10 +++++++++
 skyvern/forge/sdk/workflow/models/block.py | 24 +++++++++++++++++++++-
 2 files changed, 33 insertions(+), 1 deletion(-)

diff --git a/skyvern/forge/sdk/api/files.py b/skyvern/forge/sdk/api/files.py
index bbf3e6a3..46b56bed 100644
--- a/skyvern/forge/sdk/api/files.py
+++ b/skyvern/forge/sdk/api/files.py
@@ -1,3 +1,4 @@
+import hashlib
 import os
 import tempfile
 import zipfile
@@ -86,3 +87,12 @@ def get_number_of_files_in_directory(directory: Path, recursive: bool = False) -
             break
         count += len(files)
     return count
+
+
+def calculate_sha256(file_path: str) -> str:
+    """Return the hex-encoded SHA-256 digest of the file at file_path, read in 4096-byte blocks."""
+    sha256_hash = hashlib.sha256()
+    with open(file_path, "rb") as f:
+        for byte_block in iter(lambda: f.read(4096), b""):
+            sha256_hash.update(byte_block)
+    return sha256_hash.hexdigest()
diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py
index 56dc229b..d46ed680 100644
--- a/skyvern/forge/sdk/workflow/models/block.py
+++ b/skyvern/forge/sdk/workflow/models/block.py
@@ -6,6 +6,7 @@ import os
 import smtplib
 import textwrap
 import uuid
+from collections import defaultdict
 from dataclasses import dataclass
 from email.message import EmailMessage
 from enum import StrEnum
@@ -30,7 +31,12 @@ from skyvern.exceptions import (
 from skyvern.forge import app
 from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.api.aws import AsyncAWSClient
-from skyvern.forge.sdk.api.files import download_file, download_from_s3, get_path_for_workflow_download_directory
+from skyvern.forge.sdk.api.files import (
+    calculate_sha256,
+    download_file,
+    download_from_s3,
+    get_path_for_workflow_download_directory,
+)
 from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory
 from skyvern.forge.sdk.schemas.tasks import TaskOutput, TaskStatus
 from skyvern.forge.sdk.settings_manager import SettingsManager
@@ -905,6 +911,8 @@ class SendEmailBlock(Block):
         else:
             msg.set_content(self.body)
 
+        file_names_by_hash: dict[str, list[str]] = defaultdict(list)
+
         for filename in self._get_file_paths(workflow_run_context, workflow_run_id):
             path = None
             try:
@@ -961,10 +969,24 @@ class SendEmailBlock(Block):
                         subtype=subtype,
                         filename=attachment_filename,
                     )
+                    file_hash = calculate_sha256(path)
+                    file_names_by_hash[file_hash].append(path)
             finally:
                 if path:
                     os.unlink(path)
 
+        # Derive attachment stats from the content hashes: total, distinct contents, groups sharing one hash
+        total_files = sum(len(files) for files in file_names_by_hash.values())
+        unique_files = len(file_names_by_hash)
+        duplicate_files_list = [files for files in file_names_by_hash.values() if len(files) > 1]
+
+        # Emit each of the three statistics as its own log entry
+        LOG.info("SendEmailBlock: Total files attached", total_files=total_files)
+        LOG.info("SendEmailBlock: Unique files (based on content) attached", unique_files=unique_files)
+        LOG.info(
+            "SendEmailBlock: Duplicate files (based on content) attached", duplicate_files_list=duplicate_files_list
+        )
+
         return msg
 
     async def execute(self, workflow_run_id: str, **kwargs: dict) -> BlockResult: