diff --git a/skyvern-frontend/src/routes/workflows/WorkflowRun.tsx b/skyvern-frontend/src/routes/workflows/WorkflowRun.tsx index 4ecf8b4a..83f09e44 100644 --- a/skyvern-frontend/src/routes/workflows/WorkflowRun.tsx +++ b/skyvern-frontend/src/routes/workflows/WorkflowRun.tsx @@ -442,7 +442,10 @@ function WorkflowRun() { {fileUrls.length > 0 ? ( - fileUrls.map((url, index) => { + fileUrls.map((url) => { + // Extract filename from URL path, stripping query params from signed URLs + const urlPath = url.split("?")[0] ?? url; + const filename = urlPath.split("/").pop() || "download"; return (
@@ -450,7 +453,7 @@ function WorkflowRun() { href={url} className="underline underline-offset-4" > - {`File ${index + 1}`} + {filename}
); diff --git a/skyvern-frontend/src/routes/workflows/debugger/DebuggerRunOutput.tsx b/skyvern-frontend/src/routes/workflows/debugger/DebuggerRunOutput.tsx index ddce0547..22ad4721 100644 --- a/skyvern-frontend/src/routes/workflows/debugger/DebuggerRunOutput.tsx +++ b/skyvern-frontend/src/routes/workflows/debugger/DebuggerRunOutput.tsx @@ -152,12 +152,15 @@ function DebuggerRunOutput() {

Workflow Run Downloaded Files

{fileUrls.length > 0 ? ( - fileUrls.map((url, index) => { + fileUrls.map((url) => { + // Extract filename from URL path, stripping query params from signed URLs + const urlPath = url.split("?")[0] ?? url; + const filename = urlPath.split("/").pop() || "download"; return (
- {`File ${index + 1}`} + {filename}
); diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/HttpRequestNode.tsx b/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/HttpRequestNode.tsx index 2a322189..9b42ed0e 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/HttpRequestNode.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/HttpRequestNode.tsx @@ -67,6 +67,8 @@ const filesTooltip = const timeoutTooltip = "Request timeout in seconds."; const followRedirectsTooltip = "Whether to automatically follow HTTP redirects."; +const downloadFilenameTooltip = + "The complete filename (without extension) for downloaded files. Extension is automatically determined from the response Content-Type."; function HttpRequestNode({ id, data, type }: NodeProps) { const { editable } = data; @@ -431,6 +433,43 @@ function HttpRequestNode({ id, data, type }: NodeProps) {
+
+
+
+ + +
+ { + update({ saveResponseAsFile: checked }); + }} + disabled={!editable} + /> +
+ {data.saveResponseAsFile && ( +
+
+ + +
+ { + update({ downloadFilename: e.target.value }); + }} + placeholder="Auto-generated from URL" + className="nopan text-xs" + disabled={!editable} + /> +
+ )} +
diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/types.ts b/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/types.ts index 562f8e06..374800f3 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/types.ts +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/HttpRequestNode/types.ts @@ -11,6 +11,8 @@ export type HttpRequestNodeData = NodeBaseData & { timeout: number; followRedirects: boolean; parameterKeys: Array; + downloadFilename: string; + saveResponseAsFile: boolean; }; export type HttpRequestNode = Node; @@ -29,6 +31,8 @@ export const httpRequestNodeDefaultData: HttpRequestNodeData = { parameterKeys: [], editable: true, model: null, + downloadFilename: "", + saveResponseAsFile: false, }; export function isHttpRequestNode(node: Node): node is HttpRequestNode { diff --git a/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts b/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts index 84c2b7e8..a9ea8c5f 100644 --- a/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts +++ b/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts @@ -834,6 +834,8 @@ function convertToNode( timeout: block.timeout, followRedirects: block.follow_redirects, parameterKeys: block.parameters.map((p) => p.key), + downloadFilename: block.download_filename ?? "", + saveResponseAsFile: block.save_response_as_file ?? false, }, }; } @@ -2325,6 +2327,8 @@ function getWorkflowBlock( timeout: node.data.timeout, follow_redirects: node.data.followRedirects, parameter_keys: node.data.parameterKeys, + download_filename: node.data.downloadFilename || null, + save_response_as_file: node.data.saveResponseAsFile, }; } case "conditional": { @@ -3327,6 +3331,7 @@ function convertBlocksToBlockYAML( timeout: block.timeout, follow_redirects: block.follow_redirects, parameter_keys: block.parameters.map((p) => p.key), + download_filename: block.download_filename, }; return blockYaml; } diff --git a/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts b/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts index e2a5decb..c09a95e8 100644 --- a/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts +++ b/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts @@ -550,6 +550,8 @@ export type HttpRequestBlock = WorkflowBlockBase & { timeout: number; follow_redirects: boolean; parameters: Array; + download_filename: string | null; + save_response_as_file: boolean; }; export type WorkflowDefinition = { diff --git a/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts b/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts index ccb794aa..ebdda790 100644 --- a/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts +++ b/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts @@ -400,4 +400,6 @@ export type HttpRequestBlockYAML = BlockYAMLBase & { timeout: number; follow_redirects: boolean; parameter_keys?: Array | null; + download_filename?: string | null; + save_response_as_file?: boolean; }; diff --git a/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunOutput.tsx b/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunOutput.tsx index a8c1f1e5..1a74dba7 100644 --- a/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunOutput.tsx +++ b/skyvern-frontend/src/routes/workflows/workflowRun/WorkflowRunOutput.tsx @@ -152,12 +152,15 @@ function WorkflowRunOutput() {

Workflow Run Downloaded Files

{fileUrls.length > 0 ? ( - fileUrls.map((url, index) => { + fileUrls.map((url) => { + // Extract filename from URL path, stripping query params from signed URLs + const urlPath = url.split("?")[0] ?? url; + const filename = urlPath.split("/").pop() || "download"; return ( ); diff --git a/skyvern/client/types/http_request_block.py b/skyvern/client/types/http_request_block.py index f9b05e15..8d309c4d 100644 --- a/skyvern/client/types/http_request_block.py +++ b/skyvern/client/types/http_request_block.py @@ -34,6 +34,8 @@ class HttpRequestBlock(UniversalBaseModel): timeout: typing.Optional[int] = None follow_redirects: typing.Optional[bool] = None parameters: typing.Optional[typing.List[HttpRequestBlockParametersItem]] = None + download_filename: typing.Optional[str] = None + save_response_as_file: typing.Optional[bool] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/skyvern/client/types/http_request_block_yaml.py b/skyvern/client/types/http_request_block_yaml.py index 01544979..39c976c7 100644 --- a/skyvern/client/types/http_request_block_yaml.py +++ b/skyvern/client/types/http_request_block_yaml.py @@ -28,6 +28,8 @@ class HttpRequestBlockYaml(UniversalBaseModel): timeout: typing.Optional[int] = None follow_redirects: typing.Optional[bool] = None parameter_keys: typing.Optional[typing.List[str]] = None + download_filename: typing.Optional[str] = None + save_response_as_file: typing.Optional[bool] = None if IS_PYDANTIC_V2: model_config: typing.ClassVar[pydantic.ConfigDict] = pydantic.ConfigDict(extra="allow", frozen=True) # type: ignore # Pydantic v2 diff --git a/skyvern/config.py b/skyvern/config.py index 3e2a4410..bc35eb6d 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -95,6 +95,7 @@ class Settings(BaseSettings): # S3/AWS settings AWS_REGION: str = "us-east-1" MAX_UPLOAD_FILE_SIZE: int = 10 * 1024 * 1024 # 10 MB + MAX_HTTP_DOWNLOAD_FILE_SIZE: int = 500 * 1024 * 1024 # 500 MB PRESIGNED_URL_EXPIRATION: int = 60 * 60 * 24 # 24 hours AWS_S3_BUCKET_ARTIFACTS: str = "skyvern-artifacts" AWS_S3_BUCKET_SCREENSHOTS: str = "skyvern-screenshots" diff --git a/skyvern/forge/sdk/api/files.py b/skyvern/forge/sdk/api/files.py index 4264998f..04820a9b 100644 --- a/skyvern/forge/sdk/api/files.py +++ b/skyvern/forge/sdk/api/files.py @@ -33,7 +33,7 @@ async def download_from_s3(client: AsyncAWSClient, s3_uri: str) -> str: return file_path.name -def get_file_name_and_suffix_from_headers(headers: CIMultiDictProxy[str]) -> tuple[str, str]: +def get_file_name_and_suffix_from_headers(headers: CIMultiDictProxy[str] | dict[str, str]) -> tuple[str, str]: file_stem = "" file_suffix: str | None = "" # retrieve the stem and suffix from Content-Disposition @@ -70,6 +70,46 @@ def is_valid_mime_type(file_path: str) -> bool: return mime_type is not None +def _determine_download_filename( + filename: str | None, + response_headers: dict, + url: str, +) -> str: + """Determine the filename for a downloaded file.""" + if filename: + file_name = filename + if not os.path.splitext(file_name)[1]: + content_type = response_headers.get("Content-Type", "") + if content_type: + ext = mimetypes.guess_extension(content_type.split(";")[0].strip()) + if ext: + file_name = file_name + ext + return sanitize_filename(file_name) + + file_name = "" + file_suffix = "" + try: + file_name, file_suffix = get_file_name_and_suffix_from_headers(response_headers) + if not file_suffix: + LOG.warning("No extension name retrieved from HTTP headers") + except Exception: + LOG.exception("Failed to retrieve the file extension from HTTP headers") + + query_params = dict(parse_qsl(urlparse(url).query)) + if "download" in query_params: + file_name = query_params["download"] + + if not file_name: + LOG.info("No file name retrieved from HTTP headers, using the file name from the URL") + file_name = os.path.basename(urlparse(url).path) or "download" + + if not is_valid_mime_type(file_name) and file_suffix: + LOG.info("No file extension detected, adding the extension from HTTP headers") + file_name = file_name + file_suffix + + return sanitize_filename(file_name) + + def validate_download_url(url: str) -> bool: """Validate if a URL is supported for downloading. @@ -126,7 +166,13 @@ def validate_download_url(url: str) -> bool: return False -async def download_file(url: str, max_size_mb: int | None = None) -> str: +async def download_file( + url: str, + max_size_mb: int | None = None, + headers: dict[str, str] | None = None, + output_dir: str | None = None, + filename: str | None = None, +) -> str: try: # Check if URL is a Google Drive link if "drive.google.com" in url: @@ -175,42 +221,22 @@ async def download_file(url: str, max_size_mb: int | None = None) -> str: async with aiohttp.ClientSession(raise_for_status=True) as session: LOG.info("Starting to download file", url=url) encoded_url = encode_url(url) - async with session.get(URL(encoded_url, encoded=True)) as response: + async with session.get(URL(encoded_url, encoded=True), headers=headers) as response: # Check the content length if available if max_size_mb and response.content_length and response.content_length > max_size_mb * 1024 * 1024: # todo: move to root exception.py raise DownloadFileMaxSizeExceeded(max_size_mb) - # Parse the URL - a = urlparse(url) - # Get the file name - temp_dir = make_temp_directory(prefix="skyvern_downloads_") + if output_dir: + os.makedirs(output_dir, exist_ok=True) + download_dir = output_dir + else: + download_dir = make_temp_directory(prefix="skyvern_downloads_") - file_name = "" - file_suffix = "" - try: - file_name, file_suffix = get_file_name_and_suffix_from_headers(response.headers) - if not file_suffix: - LOG.warning("No extension name retrieved from HTTP headers") - except Exception: - LOG.exception("Failed to retrieve the file extension from HTTP headers") - - # parse the query params to get the file name - query_params = dict(parse_qsl(a.query)) - if "download" in query_params: - file_name = query_params["download"] - - if not file_name: - LOG.info("No file name retrieved from HTTP headers, using the file name from the URL") - file_name = os.path.basename(a.path) - - if not is_valid_mime_type(file_name) and file_suffix: - LOG.info("No file extension detected, adding the extension from HTTP headers") - file_name = file_name + file_suffix - - file_name = sanitize_filename(file_name) - file_path = os.path.join(temp_dir, file_name) + # Determine filename - use provided filename or derive from response/URL + file_name = _determine_download_filename(filename, dict(response.headers), url) + file_path = os.path.join(download_dir, file_name) LOG.info(f"Downloading file to {file_path}") with open(file_path, "wb") as f: diff --git a/skyvern/forge/sdk/core/aiohttp_helper.py b/skyvern/forge/sdk/core/aiohttp_helper.py index d9aa939e..790ffa56 100644 --- a/skyvern/forge/sdk/core/aiohttp_helper.py +++ b/skyvern/forge/sdk/core/aiohttp_helper.py @@ -96,11 +96,9 @@ async def aiohttp_request( async with session.request(method.upper(), **request_kwargs) as response: response_headers = dict(response.headers) - # Try to parse response as JSON try: response_body = await response.json() except (aiohttp.ContentTypeError, Exception): - # If not JSON, get as text response_body = await response.text() return response.status, response_headers, response_body diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py index dcd0f006..bf0050f4 100644 --- a/skyvern/forge/sdk/workflow/models/block.py +++ b/skyvern/forge/sdk/workflow/models/block.py @@ -18,6 +18,7 @@ from types import SimpleNamespace from typing import Annotated, Any, Awaitable, Callable, ClassVar, Literal, Union, cast from urllib.parse import quote, urlparse +import aiohttp import filetype import pandas as pd import pyotp @@ -38,6 +39,7 @@ from skyvern.constants import ( from skyvern.exceptions import ( AzureConfigurationError, ContextParameterValueNotFound, + DownloadFileMaxSizeExceeded, MissingBrowserState, MissingBrowserStatePage, PDFParsingError, @@ -54,6 +56,7 @@ from skyvern.forge.sdk.api.files import ( create_named_temporary_file, download_file, download_from_s3, + get_download_dir, get_path_for_workflow_download_directory, parse_uri_to_path, ) @@ -4003,6 +4006,8 @@ class HttpRequestBlock(Block): files: dict[str, str] | None = None # Dictionary mapping field names to file paths for multipart file uploads timeout: int = 30 follow_redirects: bool = True + download_filename: str | None = None + save_response_as_file: bool = False # Parameters for templating parameters: list[PARAMETER_TYPE] = [] @@ -4101,6 +4106,11 @@ class HttpRequestBlock(Block): if self.headers: self.headers = cast(dict[str, str], _render_templates_in_json(self.headers)) + if self.download_filename: + self.download_filename = self.format_block_parameter_template_from_workflow_run_context( + self.download_filename, workflow_run_context, **template_kwargs + ) + def validate_url(self, url: str) -> bool: """Validate if the URL is properly formatted""" try: @@ -4109,6 +4119,92 @@ class HttpRequestBlock(Block): except Exception: return False + async def _execute_file_download( + self, + workflow_run_context: WorkflowRunContext, + workflow_run_id: str, + workflow_run_block_id: str, + organization_id: str | None, + ) -> BlockResult: + if not self.url: + return await self.build_block_result( + success=False, + failure_reason="URL is required for file download", + output_parameter_value=None, + status=BlockStatus.failed, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + + try: + max_size_mb = settings.MAX_HTTP_DOWNLOAD_FILE_SIZE // (1024 * 1024) + output_dir = get_download_dir(workflow_run_id) + file_path = await download_file( + self.url, + max_size_mb=max_size_mb, + headers=self.headers, + output_dir=output_dir, + filename=self.download_filename, + ) + + response_data = { + "file_path": file_path, + "file_name": os.path.basename(file_path), + "file_size": os.path.getsize(file_path), + } + + await self.record_output_parameter_value(workflow_run_context, workflow_run_id, response_data) + + return await self.build_block_result( + success=True, + failure_reason=None, + output_parameter_value=response_data, + status=BlockStatus.completed, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + + except aiohttp.ClientResponseError as e: + error_data = {"error": f"HTTP {e.status}", "error_type": "http_error"} + await self.record_output_parameter_value(workflow_run_context, workflow_run_id, error_data) + return await self.build_block_result( + success=False, + failure_reason=f"HTTP {e.status}", + output_parameter_value=error_data, + status=BlockStatus.failed, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + except DownloadFileMaxSizeExceeded as e: + max_size_str = f"{e.max_size:.1f}" + error_data = {"error": f"File exceeds maximum size of {max_size_str}MB", "error_type": "file_too_large"} + await self.record_output_parameter_value(workflow_run_context, workflow_run_id, error_data) + return await self.build_block_result( + success=False, + failure_reason=f"File exceeds maximum size of {max_size_str}MB", + output_parameter_value=error_data, + status=BlockStatus.failed, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + except Exception as e: + error_data = {"error": str(e), "error_type": "unknown"} + LOG.warning( + "File download failed", + error=str(e), + url=self.url, + workflow_run_id=workflow_run_id, + ) + await self.record_output_parameter_value(workflow_run_context, workflow_run_id, error_data) + return await self.build_block_result( + success=False, + failure_reason=f"File download failed: {str(e)}", + output_parameter_value=error_data, + status=BlockStatus.failed, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + async def execute( self, workflow_run_id: str, @@ -4280,7 +4376,14 @@ class HttpRequestBlock(Block): # Update self.files with local file paths self.files = downloaded_files - # Execute HTTP request using the generic aiohttp_request function + if self.save_response_as_file: + return await self._execute_file_download( + workflow_run_context=workflow_run_context, + workflow_run_id=workflow_run_id, + workflow_run_block_id=workflow_run_block_id, + organization_id=organization_id, + ) + try: LOG.info( "Executing HTTP request", @@ -4292,7 +4395,6 @@ class HttpRequestBlock(Block): files=self.files, ) - # Use the generic aiohttp_request function status_code, response_headers, response_body = await aiohttp_request( method=self.method, url=self.url, @@ -4304,22 +4406,18 @@ class HttpRequestBlock(Block): ) response_data = { - # Response information "status_code": status_code, "response_headers": response_headers, "response_body": response_body, - # Request information (what was sent) "request_method": self.method, "request_url": self.url, "request_headers": self.headers, "request_body": self.body, - # Backwards compatibility "headers": response_headers, "body": response_body, "url": self.url, } - # Mask secrets in output to prevent credential exposure in DB/UI response_data = workflow_run_context.mask_secrets_in_data(response_data) LOG.info( @@ -4331,14 +4429,14 @@ class HttpRequestBlock(Block): response_data=response_data, ) - # Determine success based on status code success = 200 <= status_code < 300 + failure_reason = None if success else f"HTTP {status_code}: {response_data.get('response_body', '')}" await self.record_output_parameter_value(workflow_run_context, workflow_run_id, response_data) return await self.build_block_result( success=success, - failure_reason=None if success else f"HTTP {status_code}: {response_body}", + failure_reason=failure_reason, output_parameter_value=response_data, status=BlockStatus.completed if success else BlockStatus.failed, workflow_run_block_id=workflow_run_block_id, @@ -4358,7 +4456,7 @@ class HttpRequestBlock(Block): ) except Exception as e: error_data = {"error": str(e), "error_type": "unknown"} - LOG.warning( # Changed from LOG.exception to LOG.warning as requested + LOG.warning( "HTTP request failed with unexpected error", error=str(e), url=self.url, diff --git a/skyvern/forge/sdk/workflow/service.py b/skyvern/forge/sdk/workflow/service.py index c8754c71..507f759f 100644 --- a/skyvern/forge/sdk/workflow/service.py +++ b/skyvern/forge/sdk/workflow/service.py @@ -3752,6 +3752,8 @@ class WorkflowService: files=block_yaml.files, timeout=block_yaml.timeout, follow_redirects=block_yaml.follow_redirects, + download_filename=block_yaml.download_filename, + save_response_as_file=block_yaml.save_response_as_file, parameters=http_request_block_parameters, ) elif block_yaml.block_type == BlockType.GOTO_URL: diff --git a/skyvern/schemas/workflows.py b/skyvern/schemas/workflows.py index fce51434..37367d1d 100644 --- a/skyvern/schemas/workflows.py +++ b/skyvern/schemas/workflows.py @@ -539,6 +539,8 @@ class HttpRequestBlockYAML(BlockYAML): files: dict[str, str] | None = None # Dictionary mapping field names to file paths/URLs for multipart file uploads timeout: int = 30 follow_redirects: bool = True + download_filename: str | None = None + save_response_as_file: bool = False # Parameter keys for templating parameter_keys: list[str] | None = None