From 0e0ae816937da967643b1f017bbf9e111e2b6a23 Mon Sep 17 00:00:00 2001 From: pedrohsdb Date: Fri, 31 Oct 2025 11:41:07 -0700 Subject: [PATCH] Improve LLM error message when LLM is down (#3874) --- skyvern/forge/agent.py | 90 ++++++++++++++++++++++++- skyvern/forge/sdk/api/llm/exceptions.py | 4 ++ 2 files changed, 93 insertions(+), 1 deletion(-) diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 4e26ab64..68adcb42 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -71,6 +71,7 @@ from skyvern.forge.sdk.api.files import ( wait_for_download_finished, ) from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory, LLMCaller, LLMCallerManager +from skyvern.forge.sdk.api.llm.exceptions import LLM_PROVIDER_ERROR_RETRYABLE_TASK_TYPE, LLM_PROVIDER_ERROR_TYPE from skyvern.forge.sdk.api.llm.ui_tars_llm_caller import UITarsLLMCaller from skyvern.forge.sdk.artifact.models import ArtifactType from skyvern.forge.sdk.core import skyvern_context @@ -2862,6 +2863,8 @@ class ForgeAgent: page: Page | None, ) -> MaxStepsReasonResponse: steps_results = [] + llm_errors: list[str] = [] + try: steps = await app.DATABASE.get_task_steps( task_id=task.task_id, organization_id=organization.organization_id @@ -2888,12 +2891,37 @@ class ForgeAgent: for action, action_results in step.output.actions_and_results: if len(action_results) == 0: continue + last_result = action_results[-1] + + # Check if this is an LLM provider error + if not last_result.success: + exception_type = last_result.exception_type or "" + exception_message = last_result.exception_message or "" + if ( + exception_type in (LLM_PROVIDER_ERROR_TYPE, LLM_PROVIDER_ERROR_RETRYABLE_TASK_TYPE) + or "LLMProvider" in exception_message + ): + llm_errors.append(f"Step {step_cnt}: {exception_message}") + action_result_summary.append( - f"{action.reasoning}(action_type={action.action_type}, result={'success' if action_results[-1].success else 'failed'})" + f"{action.reasoning}(action_type={action.action_type}, result={'success' if last_result.success else 'failed'})" ) step_result["actions_result"] = action_result_summary steps_results.append(step_result) + # If we detected LLM errors, return a clear message without calling the LLM + if llm_errors: + llm_error_details = "; ".join(llm_errors) + return MaxStepsReasonResponse( + page_info="", + reasoning=( + f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. " + f"This is typically caused by rate limiting, service outages, or resource exhaustion from the LLM provider. " + f"Error details: {llm_error_details}" + ), + errors=[], + ) + scroll = True if await service_utils.is_cua_task(task=task): scroll = False @@ -2917,6 +2945,17 @@ class ForgeAgent: return MaxStepsReasonResponse.model_validate(json_response) except Exception: LOG.warning("Failed to summary the failure reason") + # Check if we have LLM errors even if the summarization failed + if llm_errors: + llm_error_details = "; ".join(llm_errors) + return MaxStepsReasonResponse( + page_info="", + reasoning=( + f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. " + f"Error details: {llm_error_details}" + ), + errors=[], + ) if steps_results: last_step_result = steps_results[-1] return MaxStepsReasonResponse( @@ -2941,11 +2980,21 @@ class ForgeAgent: html = "" screenshots: list[bytes] = [] steps_results = [] + llm_errors: list[str] = [] + steps_without_actions = 0 + try: steps = await app.DATABASE.get_task_steps( task_id=task.task_id, organization_id=organization.organization_id ) + + # Check for LLM provider errors in the failed steps for step_cnt, cur_step in enumerate(steps[-max_retries:]): + if cur_step.status == StepStatus.failed: + # If step failed with no actions, it might be an LLM error during action extraction + if not cur_step.output or not cur_step.output.actions_and_results: + steps_without_actions += 1 + if cur_step.output and cur_step.output.actions_and_results: action_result_summary: list[str] = [] step_result: dict[str, Any] = { @@ -2958,12 +3007,38 @@ class ForgeAgent: if last_result.success: continue reason = last_result.exception_message or "" + + # Check if this is an LLM provider error + exception_type = last_result.exception_type or "" + if ( + exception_type in (LLM_PROVIDER_ERROR_TYPE, LLM_PROVIDER_ERROR_RETRYABLE_TASK_TYPE) + or "LLMProvider" in reason + ): + llm_errors.append(f"Step {step_cnt}: {reason}") + action_result_summary.append( f"{action.reasoning}(action_type={action.action_type}, result=failed, reason={reason})" ) step_result["actions_result"] = action_result_summary steps_results.append(step_result) + # If we detected LLM errors, return a clear message without calling the LLM + if llm_errors: + llm_error_details = "; ".join(llm_errors) + return ( + f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. " + f"This is typically caused by rate limiting, service outages, or resource exhaustion from the LLM provider. " + f"Error details: {llm_error_details}" + ) + + # If multiple steps failed without producing any actions, it's likely an LLM error during action extraction + if steps_without_actions >= max_retries: + return ( + f"The task failed because all {max_retries} retry attempts failed to generate actions. " + f"This is typically caused by LLM service errors during action extraction, such as rate limiting, " + f"service outages, or resource exhaustion from the LLM provider. Please check the LLM service status and try again." + ) + if page is not None: skyvern_frame = await SkyvernFrame.create_instance(frame=page) html = await skyvern_frame.get_content() @@ -2987,6 +3062,19 @@ class ForgeAgent: return json_response.get("reasoning", "") except Exception: LOG.warning("Failed to summarize the failure reason for max retries") + # Check if we have LLM errors even if the summarization failed + if llm_errors: + llm_error_details = "; ".join(llm_errors) + return ( + f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. " + f"Error details: {llm_error_details}" + ) + # If multiple steps failed without actions during summarization failure, still report it + if steps_without_actions >= max_retries: + return ( + f"The task failed because all {max_retries} retry attempts failed to generate actions. " + f"This is typically caused by LLM service errors during action extraction." + ) if steps_results: last_step_result = steps_results[-1] return f"Retry Step {last_step_result['order']}: {last_step_result['actions_result']}" diff --git a/skyvern/forge/sdk/api/llm/exceptions.py b/skyvern/forge/sdk/api/llm/exceptions.py index e4622004..7cb6a9e9 100644 --- a/skyvern/forge/sdk/api/llm/exceptions.py +++ b/skyvern/forge/sdk/api/llm/exceptions.py @@ -1,5 +1,9 @@ from skyvern.exceptions import SkyvernException +# Exception type name constants +LLM_PROVIDER_ERROR_TYPE = "LLMProviderError" +LLM_PROVIDER_ERROR_RETRYABLE_TASK_TYPE = "LLMProviderErrorRetryableTask" + class BaseLLMError(SkyvernException): pass