4597 lines
199 KiB
Python
4597 lines
199 KiB
Python
import asyncio
|
|
import base64
|
|
import hashlib
|
|
import json
|
|
import os
|
|
import random
|
|
import re
|
|
import string
|
|
import uuid
|
|
from asyncio.exceptions import CancelledError
|
|
from dataclasses import dataclass
|
|
from datetime import UTC, datetime
|
|
from pathlib import Path
|
|
from typing import Any, Tuple, cast
|
|
|
|
import httpx
|
|
import structlog
|
|
from openai.types.responses.response import Response as OpenAIResponse
|
|
from playwright._impl._errors import TargetClosedError
|
|
from playwright.async_api import Page
|
|
|
|
from skyvern import analytics
|
|
from skyvern.config import settings
|
|
from skyvern.constants import (
|
|
BROWSER_DOWNLOAD_TIMEOUT,
|
|
BROWSER_DOWNLOADING_SUFFIX,
|
|
DEFAULT_MAX_SCREENSHOT_SCROLLS,
|
|
GET_DOWNLOADED_FILES_TIMEOUT,
|
|
SAVE_DOWNLOADED_FILES_TIMEOUT,
|
|
SCRAPE_TYPE_ORDER,
|
|
SPECIAL_FIELD_VERIFICATION_CODE,
|
|
ScrapeType,
|
|
)
|
|
from skyvern.errors.errors import (
|
|
GetTOTPVerificationCodeError,
|
|
ReachMaxRetriesError,
|
|
ReachMaxStepsError,
|
|
TimeoutGetTOTPVerificationCodeError,
|
|
UserDefinedError,
|
|
)
|
|
from skyvern.exceptions import (
|
|
BrowserSessionNotFound,
|
|
DownloadFileMaxWaitingTime,
|
|
EmptyScrapePage,
|
|
FailedToGetTOTPVerificationCode,
|
|
FailedToNavigateToUrl,
|
|
FailedToParseActionInstruction,
|
|
FailedToSendWebhook,
|
|
FailedToTakeScreenshot,
|
|
InvalidTaskStatusTransition,
|
|
InvalidWorkflowTaskURLState,
|
|
MissingBrowserStatePage,
|
|
MissingExtractActionsResponse,
|
|
NoTOTPVerificationCodeFound,
|
|
PDFEmbedBase64DecodeError,
|
|
ScrapingFailed,
|
|
SkyvernException,
|
|
StepTerminationError,
|
|
StepUnableToExecuteError,
|
|
TaskAlreadyCanceled,
|
|
TaskAlreadyTimeout,
|
|
TaskNotFound,
|
|
UnsupportedActionType,
|
|
UnsupportedTaskType,
|
|
)
|
|
from skyvern.forge import app
|
|
from skyvern.forge.async_operations import AgentPhase, AsyncOperationPool
|
|
from skyvern.forge.prompts import prompt_engine
|
|
from skyvern.forge.sdk.api.aws import aws_client
|
|
from skyvern.forge.sdk.api.files import (
|
|
get_path_for_workflow_download_directory,
|
|
list_downloading_files_in_directory,
|
|
list_files_in_directory,
|
|
rename_file,
|
|
wait_for_download_finished,
|
|
)
|
|
from skyvern.forge.sdk.api.llm.api_handler_factory import LLMAPIHandlerFactory, LLMCaller, LLMCallerManager
|
|
from skyvern.forge.sdk.api.llm.config_registry import LLMConfigRegistry
|
|
from skyvern.forge.sdk.api.llm.exceptions import LLM_PROVIDER_ERROR_RETRYABLE_TASK_TYPE, LLM_PROVIDER_ERROR_TYPE
|
|
from skyvern.forge.sdk.api.llm.ui_tars_llm_caller import UITarsLLMCaller
|
|
from skyvern.forge.sdk.api.llm.vertex_cache_manager import get_cache_manager
|
|
from skyvern.forge.sdk.artifact.manager import BulkArtifactCreationRequest
|
|
from skyvern.forge.sdk.artifact.models import ArtifactType
|
|
from skyvern.forge.sdk.core import skyvern_context
|
|
from skyvern.forge.sdk.core.security import generate_skyvern_webhook_signature
|
|
from skyvern.forge.sdk.core.skyvern_context import SkyvernContext
|
|
from skyvern.forge.sdk.db.enums import TaskType
|
|
from skyvern.forge.sdk.log_artifacts import save_step_logs, save_task_logs
|
|
from skyvern.forge.sdk.models import SpeculativeLLMMetadata, Step, StepStatus
|
|
from skyvern.forge.sdk.schemas.files import FileInfo
|
|
from skyvern.forge.sdk.schemas.organizations import Organization
|
|
from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest, TaskResponse, TaskStatus
|
|
from skyvern.forge.sdk.schemas.totp_codes import OTPType
|
|
from skyvern.forge.sdk.trace import TraceManager
|
|
from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
|
|
from skyvern.forge.sdk.workflow.models.block import (
|
|
ActionBlock,
|
|
BaseTaskBlock,
|
|
ValidationBlock,
|
|
)
|
|
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
|
|
from skyvern.schemas.runs import CUA_ENGINES, RunEngine
|
|
from skyvern.schemas.steps import AgentStepOutput
|
|
from skyvern.services import run_service, service_utils
|
|
from skyvern.services.action_service import get_action_history
|
|
from skyvern.services.otp_service import poll_otp_value
|
|
from skyvern.utils.image_resizer import Resolution
|
|
from skyvern.utils.prompt_engine import MaxStepsReasonResponse, load_prompt_with_elements
|
|
from skyvern.webeye.actions.action_types import ActionType
|
|
from skyvern.webeye.actions.actions import (
|
|
Action,
|
|
ActionStatus,
|
|
CompleteAction,
|
|
CompleteVerifyResult,
|
|
DecisiveAction,
|
|
DownloadFileAction,
|
|
ExtractAction,
|
|
GotoUrlAction,
|
|
ReloadPageAction,
|
|
TerminateAction,
|
|
WebAction,
|
|
)
|
|
from skyvern.webeye.actions.handler import ActionHandler
|
|
from skyvern.webeye.actions.models import DetailedAgentStepOutput
|
|
from skyvern.webeye.actions.parse_actions import (
|
|
parse_actions,
|
|
parse_anthropic_actions,
|
|
parse_cua_actions,
|
|
parse_ui_tars_actions,
|
|
)
|
|
from skyvern.webeye.actions.responses import ActionResult, ActionSuccess
|
|
from skyvern.webeye.browser_state import BrowserState
|
|
from skyvern.webeye.scraper.scraped_page import ElementTreeFormat, ScrapedPage
|
|
from skyvern.webeye.utils.page import SkyvernFrame
|
|
|
|
# Module-level structlog logger.
LOG = structlog.get_logger()
|
|
|
|
# Base name from which EXTRACT_ACTION_CACHE_KEY_PREFIX below is derived.
EXTRACT_ACTION_TEMPLATE = "extract-action"
|
|
# Same string as the default of SpeculativePlan.prompt_name; presumably the prompt name
# passed along with extract-actions LLM calls -- confirm against callers.
EXTRACT_ACTION_PROMPT_NAME = "extract-actions"
|
|
# Evaluates to "extract-action-static".
EXTRACT_ACTION_CACHE_KEY_PREFIX = f"{EXTRACT_ACTION_TEMPLATE}-static"
|
|
|
|
|
|
@dataclass
class SpeculativePlan:
    """Inputs (and optionally the LLM answer) prepared ahead of time for one step.

    ``ForgeAgent.agent_step`` pops a plan for the current step id from
    ``context.speculative_plans`` and, when one exists, takes the scraped page,
    prompt, caching flag, LLM response, metadata and prompt name from it. The
    LLM response is flagged for reuse only when ``llm_json_response`` is not
    ``None``.
    """

    # Page scrape captured when the plan was prepared.
    scraped_page: ScrapedPage
    # Extract-action prompt that belongs to ``scraped_page``.
    extract_action_prompt: str
    # Caching flag carried alongside the prompt (semantics defined by the consumer -- confirm).
    use_caching: bool
    # Parsed LLM JSON response, or None when no response is available for reuse.
    llm_json_response: dict[str, Any] | None
    # Optional metadata about the speculative LLM call.
    llm_metadata: SpeculativeLLMMetadata | None = None
    # Defaults to the module-level constant instead of repeating the literal "extract-actions".
    prompt_name: str = EXTRACT_ACTION_PROMPT_NAME
|
|
|
|
|
|
class ActionLinkedNode:
    """Node of a singly linked chain of actions: one ``Action`` plus a link to the next node."""

    def __init__(self, action: Action) -> None:
        # A new node starts detached; whoever builds the chain assigns ``next``.
        self.next: ActionLinkedNode | None = None
        self.action = action
|
|
|
|
|
|
class ForgeAgent:
|
|
def __init__(self) -> None:
    """Set up the agent with its own ``AsyncOperationPool``.

    ``register_async_operations`` adds per-task operations to this pool.
    """
    self.async_operation_pool: AsyncOperationPool = AsyncOperationPool()
|
|
|
|
async def create_task_and_step_from_block(
    self,
    task_block: BaseTaskBlock,
    workflow: Workflow,
    workflow_run: WorkflowRun,
    workflow_run_context: WorkflowRunContext,
    task_order: int,
    task_retry: int,
) -> tuple[Task, Step]:
    """Create the task row and its first step for a workflow task block.

    The navigation payload is built by resolving every block parameter through
    ``workflow_run_context``. When the block has no URL, the URL of the
    workflow run's current working page is used; if no browser state exists
    for the run, the URL falls back to an empty string.

    Args:
        task_block: Block whose goals, criteria and settings are copied onto the task.
        workflow: Workflow the run belongs to (only used for logging here).
        workflow_run: Run supplying organization, proxy, browser and header settings.
        workflow_run_context: Source of the parameter values for the navigation payload.
        task_order: Stored as the task's ``order``.
        task_retry: Stored as the task's ``retry``.

    Returns:
        The task (already updated to ``TaskStatus.running``) and its step with
        ``order=0`` and ``retry_index=0``.

    Raises:
        MissingBrowserStatePage: A browser state exists but has no working page.
        InvalidWorkflowTaskURLState: The working page is still at ``about:blank``.
    """
    run_id = workflow_run.workflow_run_id

    navigation_payload = {
        parameter.key: workflow_run_context.get_value(parameter.key) for parameter in task_block.parameters
    }

    task_url = task_block.url
    if task_url is None:
        # No explicit URL on the block: continue from wherever the run's browser currently is.
        browser_state = app.BROWSER_MANAGER.get_for_workflow_run(
            workflow_run_id=run_id,
            parent_workflow_run_id=workflow_run.parent_workflow_run_id,
        )
        if browser_state is None:
            LOG.info("No browser state found for workflow run, setting task url to empty string")
            task_url = ""
        else:
            working_page = await browser_state.get_working_page()
            if not working_page:
                LOG.error("BrowserState has no page", workflow_run_id=run_id)
                raise MissingBrowserStatePage(workflow_run_id=run_id)

            if working_page.url == "about:blank":
                raise InvalidWorkflowTaskURLState(run_id)

            task_url = working_page.url

    created_task = await app.DATABASE.create_task(
        url=task_url,
        task_type=task_block.task_type,
        complete_criterion=task_block.complete_criterion,
        terminate_criterion=task_block.terminate_criterion,
        title=task_block.title or task_block.label,
        webhook_callback_url=None,
        totp_verification_url=task_block.totp_verification_url,
        totp_identifier=task_block.totp_identifier,
        navigation_goal=task_block.navigation_goal,
        data_extraction_goal=task_block.data_extraction_goal,
        navigation_payload=navigation_payload,
        organization_id=workflow_run.organization_id,
        proxy_location=workflow_run.proxy_location,
        extracted_information_schema=task_block.data_schema,
        workflow_run_id=run_id,
        order=task_order,
        retry=task_retry,
        max_steps_per_run=task_block.max_steps_per_run,
        error_code_mapping=task_block.error_code_mapping,
        include_action_history_in_verification=task_block.include_action_history_in_verification,
        model=task_block.model,
        max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolls,
        extra_http_headers=workflow_run.extra_http_headers,
        browser_address=workflow_run.browser_address,
        browser_session_id=workflow_run.browser_session_id,
        download_timeout=task_block.download_timeout,
    )
    LOG.info(
        "Created a new task for workflow run",
        workflow_id=workflow.workflow_id,
        workflow_run_id=workflow_run.workflow_run_id,
        task_id=created_task.task_id,
        url=created_task.url,
        title=created_task.title,
        proxy_location=created_task.proxy_location,
        task_order=task_order,
        task_retry=task_retry,
    )

    # Update task status to running
    running_task = await app.DATABASE.update_task(
        task_id=created_task.task_id,
        organization_id=created_task.organization_id,
        status=TaskStatus.running,
    )

    first_step = await app.DATABASE.create_step(
        running_task.task_id,
        order=0,
        retry_index=0,
        organization_id=running_task.organization_id,
    )
    LOG.info(
        "Created new step for workflow run",
        workflow_id=workflow.workflow_id,
        workflow_run_id=workflow_run.workflow_run_id,
        order=first_step.order,
        retry_index=first_step.retry_index,
    )
    return running_task, first_step
|
|
|
|
async def create_task(self, task_request: TaskRequest, organization_id: str) -> Task:
    """Persist a new task built from an API ``TaskRequest``.

    Args:
        task_request: Request whose fields are copied onto the task.
        organization_id: Organization that owns the task (and the browser session, if one is given).

    Returns:
        The task returned by ``app.DATABASE.create_task``.

    Raises:
        BrowserSessionNotFound: ``task_request.browser_session_id`` is set but no
            persistent browser session is found for it in this organization.
    """
    # URL fields are stringified when present and passed as None otherwise.
    webhook_callback_url: str | None = None
    if task_request.webhook_callback_url:
        webhook_callback_url = str(task_request.webhook_callback_url)

    totp_verification_url: str | None = None
    if task_request.totp_verification_url:
        totp_verification_url = str(task_request.totp_verification_url)

    # validate browser session id
    if task_request.browser_session_id:
        persistent_session = await app.DATABASE.get_persistent_browser_session(
            session_id=task_request.browser_session_id,
            organization_id=organization_id,
        )
        if not persistent_session:
            raise BrowserSessionNotFound(browser_session_id=task_request.browser_session_id)

    new_task = await app.DATABASE.create_task(
        url=str(task_request.url),
        title=task_request.title,
        webhook_callback_url=webhook_callback_url,
        totp_verification_url=totp_verification_url,
        totp_identifier=task_request.totp_identifier,
        navigation_goal=task_request.navigation_goal,
        complete_criterion=task_request.complete_criterion,
        terminate_criterion=task_request.terminate_criterion,
        data_extraction_goal=task_request.data_extraction_goal,
        navigation_payload=task_request.navigation_payload,
        organization_id=organization_id,
        proxy_location=task_request.proxy_location,
        extracted_information_schema=task_request.extracted_information_schema,
        error_code_mapping=task_request.error_code_mapping,
        application=task_request.application,
        include_action_history_in_verification=task_request.include_action_history_in_verification,
        model=task_request.model,
        max_screenshot_scrolling_times=task_request.max_screenshot_scrolls,
        extra_http_headers=task_request.extra_http_headers,
        browser_session_id=task_request.browser_session_id,
        browser_address=task_request.browser_address,
    )
    LOG.info(
        "Created new task",
        task_id=new_task.task_id,
        url=new_task.url,
        proxy_location=new_task.proxy_location,
        organization_id=organization_id,
    )
    return new_task
|
|
|
|
async def register_async_operations(self, organization: Organization, task: Task, page: Page) -> None:
    """Generate the async operations for ``task`` on ``page`` and add them to the pool.

    The operations come from ``app.AGENT_FUNCTION.generate_async_operations`` and are
    stored in ``self.async_operation_pool`` under the task's id.
    """
    generated_operations = await app.AGENT_FUNCTION.generate_async_operations(organization, task, page)
    self.async_operation_pool.add_operations(task.task_id, generated_operations)
|
|
|
|
@TraceManager.traced_async(
    ignore_inputs=["api_key", "close_browser_on_completion", "task_block", "cua_response", "llm_caller"]
)
async def execute_step(
    self,
    organization: Organization,
    task: Task,
    step: Step,
    api_key: str | None = None,
    close_browser_on_completion: bool = True,
    task_block: BaseTaskBlock | None = None,
    browser_session_id: str | None = None,
    complete_verification: bool = True,
    engine: RunEngine = RunEngine.skyvern_v1,
    cua_response: OpenAIResponse | None = None,
    llm_caller: LLMCaller | None = None,
) -> Tuple[Step, DetailedAgentStepOutput | None, Step | None]:
    """Run one step of a task and decide what happens next.

    Flow, in order:
      1. Stop early when the workflow run is canceled / timed out or the task is canceled.
      2. Resolve and persist the effective ``max_steps_per_run``.
      3. Initialize the execution state; a task without any goal or criterion only
         navigates to ``task.url`` and completes.
      4. Run ``agent_step`` and, for ``complete_on_download`` blocks, complete the task
         when new files showed up in the download directory.
      5. Depending on the step status, clean up, retry, or continue with the next step
         (recursively, when a retry is needed or ``settings.execute_all_steps()`` is true).

    Args:
        organization: Organization that owns the task.
        task: Task being executed; re-read from the database before execution.
        step: Step to execute.
        api_key: Forwarded to ``clean_up_task``.
        close_browser_on_completion: Only honored when no ``browser_session_id`` is
            given and the task has no ``browser_address``.
        task_block: Workflow block this task was created from, if any.
        browser_session_id: Persistent browser session to run in, if any.
        complete_verification: Forced to ``False`` for CUA engines.
        engine: Engine used to produce actions.
        cua_response: Previous CUA response carried over between steps.
        llm_caller: Caller reused across steps for anthropic_cua / ui_tars engines.

    Returns:
        ``(step, detailed_output, next_step)``. ``next_step`` is ``None`` when the task
        ended or execution stopped.

    Raises:
        StepUnableToExecuteError: Re-raised after logging. The other exception types
            caught below are handled by failing and/or cleaning up the task.
    """
    # set the step_id and task_id in the context
    context = skyvern_context.ensure_context()
    context.step_id = step.step_id
    context.task_id = task.task_id

    # do not need to do complete verification when it's a CUA task
    # 1. CUA executes only one action step by step -- it's pretty less likely to have a hallucination for completion or forget to return a complete
    # 2. It will significantly slow down CUA tasks
    if engine in CUA_ENGINES:
        complete_verification = False

    close_browser_on_completion = (
        close_browser_on_completion and browser_session_id is None and not task.browser_address
    )

    workflow_run: WorkflowRun | None = None
    if task.workflow_run_id:
        workflow_run = await app.DATABASE.get_workflow_run(
            workflow_run_id=task.workflow_run_id,
            organization_id=organization.organization_id,
        )
        if workflow_run and workflow_run.status == WorkflowRunStatus.canceled:
            LOG.info(
                "Workflow run is canceled, stopping execution inside task",
                workflow_run_id=workflow_run.workflow_run_id,
            )
            step = await self.update_step(
                step,
                status=StepStatus.canceled,
                is_last=True,
            )
            task = await self.update_task(
                task,
                status=TaskStatus.canceled,
            )
            return step, None, None

        if workflow_run and workflow_run.status == WorkflowRunStatus.timed_out:
            LOG.info(
                "Workflow run is timed out, stopping execution inside task",
                workflow_run_id=workflow_run.workflow_run_id,
            )
            step = await self.update_step(
                step,
                status=StepStatus.canceled,
                is_last=True,
            )
            task = await self.update_task(
                task,
                status=TaskStatus.timed_out,
            )
            return step, None, None

    # Pick up status changes (e.g. a cancel) made since the caller loaded the task.
    refreshed_task = await app.DATABASE.get_task(task_id=task.task_id, organization_id=organization.organization_id)
    if refreshed_task:
        task = refreshed_task

    if task.status == TaskStatus.canceled:
        LOG.info(
            "Task is canceled, stopping execution",
            task_id=task.task_id,
        )
        step = await self.update_step(
            step,
            status=StepStatus.canceled,
            is_last=True,
        )
        await self.clean_up_task(
            task=task,
            last_step=step,
            api_key=api_key,
            need_call_webhook=True,
            browser_session_id=browser_session_id,
            close_browser_on_completion=close_browser_on_completion,
        )
        return step, None, None

    # Precedence: context override > task > organization > global setting.
    override_max_steps_per_run = context.max_steps_override or None
    max_steps_per_run = (
        override_max_steps_per_run
        or task.max_steps_per_run
        or organization.max_steps_per_run
        or settings.MAX_STEPS_PER_RUN
    )
    if max_steps_per_run and task.max_steps_per_run != max_steps_per_run:
        await app.DATABASE.update_task(
            task_id=task.task_id,
            organization_id=organization.organization_id,
            max_steps_per_run=max_steps_per_run,
        )
    next_step: Step | None = None
    detailed_output: DetailedAgentStepOutput | None = None
    list_files_before: list[str] = []
    try:
        if task.workflow_run_id:
            # Snapshot of the download directory, compared after the step to detect new files.
            list_files_before = list_files_in_directory(
                get_path_for_workflow_download_directory(
                    context.run_id if context and context.run_id else task.workflow_run_id
                )
            )
            if task.browser_session_id:
                browser_session_downloaded_files = await app.STORAGE.list_downloaded_files_in_browser_session(
                    organization_id=organization.organization_id,
                    browser_session_id=task.browser_session_id,
                )
                list_files_before = list_files_before + browser_session_downloaded_files
        # Check some conditions before executing the step, throw an exception if the step can't be executed
        await app.AGENT_FUNCTION.validate_step_execution(task, step)

        (
            step,
            browser_state,
            detailed_output,
        ) = await self.initialize_execution_state(task, step, workflow_run, browser_session_id)

        # mark step as completed and mark task as completed
        if (
            not task.navigation_goal
            and not task.data_extraction_goal
            and not task.complete_criterion
            and not task.terminate_criterion
        ):
            # most likely a GOTO_URL task block
            page = await browser_state.must_get_working_page()
            current_url = page.url
            if current_url.rstrip("/") != task.url.rstrip("/"):
                await page.goto(task.url, timeout=settings.BROWSER_LOADING_TIMEOUT_MS)
            step = await self.update_step(
                step, status=StepStatus.completed, is_last=True, output=AgentStepOutput(action_results=[])
            )
            task = await self.update_task(task, status=TaskStatus.completed)
            await self.clean_up_task(
                task=task,
                last_step=step,
                api_key=api_key,
                need_call_webhook=True,
                close_browser_on_completion=close_browser_on_completion,
                browser_session_id=browser_session_id,
            )
            return step, detailed_output, None

        if page := await browser_state.get_working_page():
            await self.register_async_operations(organization, task, page)

        if engine == RunEngine.anthropic_cua and not llm_caller:
            # see if the llm_caller is already set in memory
            llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
            if not llm_caller:
                # if not, create a new llm_caller
                llm_key = task.llm_key
                llm_caller = LLMCaller(
                    llm_key=llm_key or settings.ANTHROPIC_CUA_LLM_KEY, screenshot_scaling_enabled=True
                )

        if engine == RunEngine.ui_tars and not llm_caller:
            # see if the llm_caller is already set in memory
            llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
            if not llm_caller:
                # create a new UI-TARS llm_caller
                llm_key = task.llm_key or settings.VOLCENGINE_CUA_LLM_KEY
                ui_tars_llm_caller = UITarsLLMCaller(llm_key=llm_key, screenshot_scaling_enabled=True)
                ui_tars_llm_caller.initialize_conversation(task)
                llm_caller = ui_tars_llm_caller

        # TODO: remove the code after migrating everything to llm callers
        # currently, only anthropic cua and ui_tars tasks use llm_caller
        if engine in [RunEngine.anthropic_cua, RunEngine.ui_tars] and llm_caller:
            LLMCallerManager.set_llm_caller(task.task_id, llm_caller)

        step, detailed_output = await self.agent_step(
            task,
            step,
            browser_state,
            organization=organization,
            task_block=task_block,
            complete_verification=complete_verification,
            engine=engine,
            cua_response=cua_response,
            llm_caller=llm_caller,
        )
        await app.AGENT_FUNCTION.post_step_execution(task, step)
        task = await self.update_task_errors_from_detailed_output(task, detailed_output)  # type: ignore
        retry = False

        if task_block and task_block.complete_on_download and task.workflow_run_id:
            workflow_download_directory = get_path_for_workflow_download_directory(
                context.run_id if context and context.run_id else task.workflow_run_id
            )

            downloading_files = list_downloading_files_in_directory(workflow_download_directory)
            if task.browser_session_id:
                browser_session_downloading_files = await app.STORAGE.list_downloading_files_in_browser_session(
                    organization_id=organization.organization_id,
                    browser_session_id=task.browser_session_id,
                )
                downloading_files = downloading_files + browser_session_downloading_files
            if len(downloading_files) > 0:
                LOG.info(
                    "Detecting files are still downloading, waiting for files to be completely downloaded.",
                    downloading_files=downloading_files,
                )
                try:
                    await wait_for_download_finished(
                        downloading_files=downloading_files,
                        timeout=task_block.download_timeout or BROWSER_DOWNLOAD_TIMEOUT,
                    )
                except DownloadFileMaxWaitingTime as e:
                    # Best effort: a stuck download is only logged; unfinished files are skipped below.
                    LOG.warning(
                        "There're several long-time downloading files, these files might be broken",
                        downloading_files=e.downloading_files,
                        workflow_run_id=task.workflow_run_id,
                    )

            list_files_after = list_files_in_directory(workflow_download_directory)
            if task.browser_session_id:
                browser_session_downloaded_files_after = await app.STORAGE.list_downloaded_files_in_browser_session(
                    organization_id=organization.organization_id,
                    browser_session_id=task.browser_session_id,
                )
                list_files_after = list_files_after + browser_session_downloaded_files_after
            if len(list_files_after) > len(list_files_before):
                files_to_rename = list(set(list_files_after) - set(list_files_before))
                for file in files_to_rename:
                    if file.startswith("s3://"):
                        # Files listed as S3 URIs are copied into the local download directory first.
                        file_data = await aws_client.download_file(file, log_exception=False)
                        if not file_data:
                            continue
                        file = file.split("/")[-1]  # Extract filename from the end of S3 URI
                        with open(os.path.join(workflow_download_directory, file), "wb") as f:
                            f.write(file_data)

                    file_extension = Path(file).suffix
                    if file_extension == BROWSER_DOWNLOADING_SUFFIX:
                        LOG.warning(
                            "Detecting incompleted download file, skip the rename",
                            file=file,
                            task_id=task.task_id,
                            workflow_run_id=task.workflow_run_id,
                        )
                        continue

                    if task_block.download_suffix:
                        # Use download_suffix as the complete filename (without extension)
                        base_file_name = task_block.download_suffix
                    else:
                        # Fallback to random filename if no download_suffix provided
                        random_file_id = "".join(random.choices(string.ascii_uppercase + string.digits, k=4))
                        base_file_name = f"download-{datetime.now().strftime('%Y%m%d%H%M%S%f')}-{random_file_id}"

                    # Check if file with this name already exists. Every candidate is derived
                    # from the base name, so collisions yield name_1, name_2, ... rather than
                    # the accumulating name_1, name_1_2, name_1_2_3.
                    final_file_name = base_file_name
                    target_path = os.path.join(workflow_download_directory, final_file_name + file_extension)
                    counter = 1
                    while os.path.exists(target_path):
                        # If file exists, append counter to filename
                        final_file_name = f"{base_file_name}_{counter}"
                        target_path = os.path.join(workflow_download_directory, final_file_name + file_extension)
                        counter += 1

                    rename_file(os.path.join(workflow_download_directory, file), final_file_name + file_extension)

                LOG.info(
                    "Task marked as completed due to download",
                    task_id=task.task_id,
                    num_files_before=len(list_files_before),
                    num_files_after=len(list_files_after),
                    new_files=files_to_rename,
                )
                last_step = await self.update_step(step, is_last=True)
                completed_task = await self.update_task(
                    task,
                    status=TaskStatus.completed,
                )
                await self.clean_up_task(
                    task=completed_task,
                    last_step=last_step,
                    api_key=api_key,
                    close_browser_on_completion=close_browser_on_completion,
                    browser_session_id=browser_session_id,
                )
                return last_step, detailed_output, None

        # If the step failed, mark the step as failed and retry
        if step.status == StepStatus.failed:
            maybe_next_step = await self.handle_failed_step(organization, task, step)
            # If there is no next step, it means that the task has failed
            if maybe_next_step:
                next_step = maybe_next_step
                retry = True
            else:
                await self.clean_up_task(
                    task=task,
                    last_step=step,
                    api_key=api_key,
                    close_browser_on_completion=close_browser_on_completion,
                    browser_session_id=browser_session_id,
                )
                return step, detailed_output, None
        elif step.status == StepStatus.completed:
            # TODO (kerem): keep the task object uptodate at all times so that clean_up_task can just use it
            (
                is_task_completed,
                maybe_last_step,
                maybe_next_step,
            ) = await self.handle_completed_step(
                organization=organization,
                task=task,
                step=step,
                page=await browser_state.get_working_page(),
                task_block=task_block,
                browser_state=browser_state,
                scraped_page=detailed_output.scraped_page if detailed_output else None,
                engine=engine,
                complete_verification=complete_verification,
            )
            if is_task_completed is not None and maybe_last_step:
                last_step = maybe_last_step
                await self.clean_up_task(
                    task=task,
                    last_step=last_step,
                    api_key=api_key,
                    close_browser_on_completion=close_browser_on_completion,
                    browser_session_id=browser_session_id,
                )
                return last_step, detailed_output, None
            elif maybe_next_step:
                next_step = maybe_next_step
                retry = False
            else:
                LOG.error(
                    "Step completed but task is not completed and next step is not created.",
                    is_task_completed=is_task_completed,
                    maybe_last_step=maybe_last_step,
                    maybe_next_step=maybe_next_step,
                )
        else:
            LOG.error(
                "Unexpected step status after agent_step",
                step_status=step.status,
            )

        # Prefer the CUA response produced by this step; otherwise carry the incoming one forward.
        cua_response_param = detailed_output.cua_response if detailed_output else None
        if not cua_response_param and cua_response:
            cua_response_param = cua_response

        # A retry always continues; a regular next step continues only when all steps
        # are executed in one go. Both cases recurse with identical arguments.
        if next_step and (retry or settings.execute_all_steps()):
            return await self.execute_step(
                organization,
                task,
                next_step,
                api_key=api_key,
                close_browser_on_completion=close_browser_on_completion,
                browser_session_id=browser_session_id,
                task_block=task_block,
                complete_verification=complete_verification,
                engine=engine,
                cua_response=cua_response_param,
                llm_caller=llm_caller,
            )

        LOG.info(
            "Step executed but continuous execution is disabled.",
            is_cloud_env=settings.is_cloud_environment(),
            execute_all_steps=settings.execute_all_steps(),
            next_step_id=next_step.step_id if next_step else None,
        )

        return step, detailed_output, next_step
    # TODO (kerem): Let's add other exceptions that we know about here as custom exceptions as well
    except StepUnableToExecuteError:
        LOG.exception("Step cannot be executed. Task execution stopped")
        raise
    except TaskAlreadyTimeout:
        LOG.warning("Task is timed out, stopping execution")
        await self.clean_up_task(
            task=task,
            last_step=step,
            api_key=api_key,
            close_browser_on_completion=close_browser_on_completion,
            browser_session_id=browser_session_id,
        )
        return step, detailed_output, None
    except StepTerminationError as e:
        LOG.warning(
            "Step cannot be executed, marking task as failed",
            exc_info=True,
        )
        is_task_marked_as_failed = await self.fail_task(task, step, e.message)
        if is_task_marked_as_failed:
            await self.clean_up_task(
                task=task,
                last_step=step,
                api_key=api_key,
                close_browser_on_completion=close_browser_on_completion,
                browser_session_id=browser_session_id,
            )
        else:
            LOG.warning("Task isn't marked as failed, after step termination. NOT clean up the task")
        return step, detailed_output, None
    except FailedToSendWebhook:
        LOG.exception(
            "Failed to send webhook",
            task=task,
            step=step,
        )
        return step, detailed_output, next_step
    except FailedToNavigateToUrl as e:
        # Fail the task if we can't navigate to the URL and send the response
        LOG.exception(
            "Failed to navigate to URL, marking task as failed, and sending webhook response",
            url=e.url,
        )
        failure_reason = f"Failed to navigate to URL. URL:{e.url}, Error:{e.error_message}"
        is_task_marked_as_failed = await self.fail_task(task, step, failure_reason)
        if is_task_marked_as_failed:
            await self.clean_up_task(
                task=task,
                last_step=step,
                api_key=api_key,
                close_browser_on_completion=close_browser_on_completion,
                need_final_screenshot=False,
                browser_session_id=browser_session_id,
            )
        else:
            LOG.warning("Task isn't marked as failed, after navigation failure. NOT clean up the task")
        return step, detailed_output, next_step
    except TaskAlreadyCanceled:
        LOG.info(
            "Task is already canceled, stopping execution",
            task_id=task.task_id,
        )
        await self.clean_up_task(
            task=task,
            last_step=step,
            api_key=api_key,
            need_call_webhook=False,
            browser_session_id=browser_session_id,
            close_browser_on_completion=close_browser_on_completion,
        )
        return step, detailed_output, None
    except InvalidTaskStatusTransition:
        LOG.warning("Invalid task status transition")
        # TODO: shall we send task response here?
        await self.clean_up_task(
            task=task,
            last_step=step,
            api_key=api_key,
            need_call_webhook=False,
            browser_session_id=browser_session_id,
            close_browser_on_completion=close_browser_on_completion,
        )
        return step, detailed_output, None
    except (UnsupportedActionType, UnsupportedTaskType, FailedToParseActionInstruction) as e:
        LOG.warning(
            "unsupported task type or action type, marking the task as failed",
            step_order=step.order,
            step_retry=step.retry_index,
        )
        await self.fail_task(task, step, e.message)
        await self.clean_up_task(
            task=task,
            last_step=step,
            api_key=api_key,
            need_call_webhook=False,
            browser_session_id=browser_session_id,
            close_browser_on_completion=close_browser_on_completion,
        )
        return step, detailed_output, None
    except ScrapingFailed as sfe:
        LOG.warning(
            "Scraping failed, marking the task as failed",
            exc_info=True,
        )

        await self.fail_task(
            task,
            step,
            sfe.reason
            or "Skyvern failed to load the website. This usually happens when the website is not properly designed, and crashes the browser as a result.",
        )
        await self.clean_up_task(
            task=task,
            last_step=step,
            api_key=api_key,
            close_browser_on_completion=close_browser_on_completion,
            browser_session_id=browser_session_id,
        )
        return step, detailed_output, None
    except MissingBrowserStatePage:
        LOG.warning("Missing browser state page, marking the task as failed")
        await self.fail_task(
            task,
            step,
            "The browser does not have a valid page for skyvern to operate. This may be due to the website being empty or the browser crashing.",
        )
        await self.clean_up_task(
            task=task,
            last_step=step,
            api_key=api_key,
            close_browser_on_completion=close_browser_on_completion,
            browser_session_id=browser_session_id,
        )
        return step, detailed_output, None
    except Exception as e:
        LOG.exception("Got an unexpected exception in step, marking task as failed")

        failure_reason = f"Unexpected error: {str(e)}"
        if isinstance(e, SkyvernException):
            failure_reason = f"unexpected SkyvernException({e.__class__.__name__}): {str(e)}"

        is_task_marked_as_failed = await self.fail_task(task, step, failure_reason)
        if is_task_marked_as_failed:
            await self.clean_up_task(
                task=task,
                last_step=step,
                api_key=api_key,
                close_browser_on_completion=close_browser_on_completion,
                browser_session_id=browser_session_id,
            )
        else:
            LOG.warning("Task isn't marked as failed, after unexpected exception. NOT clean up the task")
        return step, detailed_output, None
    finally:
        # remove the step_id from the context
        context = skyvern_context.ensure_context()
        context.step_id = None
        context.task_id = None
|
|
|
|
async def fail_task(self, task: Task, step: Step | None, reason: str | None) -> bool:
    """Mark ``step`` (when given) and ``task`` as failed.

    Args:
        task: Task to move to ``TaskStatus.failed``.
        step: Step to move to ``StepStatus.failed``; skipped when ``None``.
        reason: Stored as the task's ``failure_reason``.

    Returns:
        ``False`` when the update is rejected with ``TaskAlreadyCanceled`` or
        ``InvalidTaskStatusTransition``; ``True`` otherwise, including when an
        unexpected exception was logged.
    """
    try:
        if step is not None:
            await self.update_step(step=step, status=StepStatus.failed)

        await self.update_task(task, status=TaskStatus.failed, failure_reason=reason)
    except TaskAlreadyCanceled:
        LOG.info("Task is already canceled. Can't fail the task.")
        return False
    except InvalidTaskStatusTransition:
        LOG.warning("Invalid task status transition while failing a task")
        return False
    except Exception:
        # NOTE(review): True is still returned after this failure, so callers that gate
        # clean-up on the result proceed with it -- confirm this is intended.
        LOG.exception(
            "Failed to update status and failure reason in database. Task might going to be time_out",
            reason=reason,
        )
    return True
|
|
|
|
@TraceManager.traced_async(
    ignore_inputs=["browser_state", "organization", "task_block", "cua_response", "llm_caller"]
)
async def agent_step(
    self,
    task: Task,
    step: Step,
    browser_state: BrowserState,
    engine: RunEngine = RunEngine.skyvern_v1,
    organization: Organization | None = None,
    task_block: BaseTaskBlock | None = None,
    complete_verification: bool = True,
    cua_response: OpenAIResponse | None = None,
    llm_caller: LLMCaller | None = None,
) -> tuple[Step, DetailedAgentStepOutput]:
    """Run one agent step: plan actions for the current page and execute them.

    The step is first marked as running. The scraped page and the
    extract-action prompt are taken from a speculative plan stored on the
    current skyvern context for this step id (when one exists) or built with
    ``build_and_record_step_prompt``. Actions are then produced by the
    engine-specific generator (OpenAI CUA, Anthropic CUA, UI-TARS) or from the
    LLM extract-actions response, executed one at a time through
    ``ActionHandler.handle_action``, and the step is updated to completed or
    failed.

    Args:
        task: Task the step belongs to.
        step: Step to execute; rebound to the object returned by ``update_step``.
        browser_state: Browser state used to obtain/reload the working page.
        engine: Selects which action generator is used.
        organization: Forwarded to ``prepare_step_execution``.
        task_block: Optional block; inspected for ``ValidationBlock`` /
            ``ActionBlock`` and for ``complete_on_download``.
        complete_verification: When False, a ``CompleteAction`` is flagged as
            verified before being handled and parallel verification is not
            enabled.
        cua_response: Previous OpenAI CUA response, forwarded to
            ``_generate_cua_actions``.
        llm_caller: Asserted to be non-None for the ``anthropic_cua`` and
            ``ui_tars`` engines.

    Returns:
        The updated step together with the detailed step output.

    Raises:
        UnsupportedActionType, UnsupportedTaskType,
        FailedToParseActionInstruction, ScrapingFailed,
        MissingBrowserStatePage: re-raised unchanged. ``CancelledError`` and
        any other ``Exception`` are logged, recorded in ``step_exception`` and
        turned into a failed step that is returned instead of raised.
    """
    # Created up front so the except handlers can always persist a (partial) output.
    detailed_agent_step_output = DetailedAgentStepOutput(
        scraped_page=None,
        extract_action_prompt=None,
        llm_response=None,
        actions=None,
        action_results=None,
        actions_and_results=None,
        cua_response=None,
    )
    try:
        LOG.info(
            "Starting agent step",
            step_order=step.order,
            step_retry=step.retry_index,
        )

        # Update context with step_id for auto action/screenshot creation
        context = skyvern_context.current()
        if context:
            context.step_id = step.step_id

        step = await self.update_step(step=step, status=StepStatus.running)
        await app.AGENT_FUNCTION.prepare_step_execution(
            organization=organization, task=task, step=step, browser_state=browser_state
        )

        speculative_plan: SpeculativePlan | None = None
        reuse_speculative_llm_response = False
        speculative_llm_metadata: SpeculativeLLMMetadata | None = None
        if context:
            # pop() so a plan is consumed at most once per step id
            speculative_plan = context.speculative_plans.pop(step.step_id, None)

        if speculative_plan:
            # Reuse the scrape/prompt (and possibly the LLM response) prepared ahead of time.
            step.is_speculative = False
            scraped_page = speculative_plan.scraped_page
            extract_action_prompt = speculative_plan.extract_action_prompt
            use_caching = speculative_plan.use_caching
            json_response = speculative_plan.llm_json_response
            reuse_speculative_llm_response = json_response is not None
            speculative_llm_metadata = speculative_plan.llm_metadata
            prompt_name = speculative_plan.prompt_name
            await self._persist_scrape_artifacts(
                task=task,
                step=step,
                scraped_page=scraped_page,
                context=context,
            )
        else:
            (
                scraped_page,
                extract_action_prompt,
                use_caching,
                prompt_name,
            ) = await self.build_and_record_step_prompt(
                task,
                step,
                browser_state,
                engine,
            )
            json_response = None

        detailed_agent_step_output.scraped_page = scraped_page
        detailed_agent_step_output.extract_action_prompt = extract_action_prompt
        actions: list[Action]

        # Engine-specific action generation
        if engine == RunEngine.openai_cua:
            actions, new_cua_response = await self._generate_cua_actions(
                task=task,
                step=step,
                scraped_page=scraped_page,
                previous_response=cua_response,
                engine=engine,
            )
            detailed_agent_step_output.cua_response = new_cua_response
        elif engine == RunEngine.anthropic_cua:
            assert llm_caller is not None
            actions = await self._generate_anthropic_actions(
                task=task,
                step=step,
                scraped_page=scraped_page,
                llm_caller=llm_caller,
            )
        elif engine == RunEngine.ui_tars and not await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
            "DISABLE_UI_TARS_CUA",
            task.workflow_run_id or task.task_id,
            properties={"organization_id": task.organization_id},
        ):
            assert llm_caller is not None
            actions = await self._generate_ui_tars_actions(
                task=task,
                step=step,
                scraped_page=scraped_page,
                llm_caller=llm_caller,
            )

        else:
            if not task.navigation_goal and not isinstance(task_block, ValidationBlock):
                # No navigation goal (and not a validation block): only extract data.
                actions = [await self.create_extract_action(task, step, scraped_page)]
            else:
                llm_key_override = task.llm_key
                # FIXME: Redundant engine check?
                if engine in CUA_ENGINES:
                    self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm)
                    llm_key_override = None

                llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
                    llm_key_override, default=app.LLM_API_HANDLER
                )
                # Add caching flag to context for monitoring
                if use_caching:
                    context = skyvern_context.current()
                    if context:
                        context.use_prompt_caching = True

                if not reuse_speculative_llm_response:
                    json_response = await llm_api_handler(
                        prompt=extract_action_prompt,
                        prompt_name=prompt_name,
                        step=step,
                        screenshots=scraped_page.screenshots,
                    )
                else:
                    LOG.debug(
                        "Using speculative extract-actions response",
                        step_id=step.step_id,
                    )
                if json_response is None:
                    raise MissingExtractActionsResponse()
                try:
                    if pdf_embed_src := scraped_page.check_pdf_viewer_embed():
                        # PDF viewer page: bypass the LLM actions and download the PDF directly.
                        LOG.info("Generate DownloadFileAction for PDF viewer page", step_id=step.step_id)
                        pdf_bytes: bytes | None = None
                        download_url: str | None = None

                        # Check if the embed src is a data URI with base64 encoded PDF
                        # Format: data:application/pdf[;charset=...];base64,<base64_data>
                        if pdf_embed_src.startswith("data:application/pdf"):
                            # Use more precise regex to extract base64 data after the base64, prefix
                            # This pattern matches: data:application/pdf[;optional_params];base64,<data>
                            m = re.search(r"data:application/pdf[^;]*;base64,(.+)", pdf_embed_src, re.S)
                            if not m:
                                raise PDFEmbedBase64DecodeError(
                                    pdf_embed_src=pdf_embed_src,
                                    reason="Failed to extract base64 data from PDF embed src. Expected format: data:application/pdf[;charset=...];base64,<data>",
                                )

                            base64_data = m.group(1)
                            LOG.info(
                                "Found base64 data in PDF embed src",
                                step_id=step.step_id,
                                base64_data_length=len(base64_data),
                            )

                            # Decode base64 data with error handling
                            try:
                                pdf_bytes = base64.b64decode(base64_data, validate=True)
                            except Exception as e:
                                raise PDFEmbedBase64DecodeError(
                                    pdf_embed_src=pdf_embed_src,
                                    reason=f"Failed to decode base64 data: {str(e)}",
                                ) from e
                        else:
                            # If not a data URI, treat it as a URL
                            LOG.info(
                                "Found PDF embed src as URL (not base64 data)",
                                step_id=step.step_id,
                                download_url=pdf_embed_src,
                            )
                            download_url = pdf_embed_src

                        # Exactly one of pdf_bytes / download_url is set at this point.
                        actions = [
                            DownloadFileAction(
                                reasoning="Downloading the file from the PDF viewer.",
                                organization_id=task.organization_id,
                                workflow_run_id=task.workflow_run_id,
                                task_id=task.task_id,
                                step_id=step.step_id,
                                step_order=step.order,
                                action_order=0,
                                file_name=f"{uuid.uuid4()}.pdf",
                                byte=pdf_bytes,
                                download_url=download_url,
                                download=True,
                            )
                        ]
                    else:
                        otp_json_response, otp_actions = await self.handle_potential_OTP_actions(
                            task, step, scraped_page, browser_state, json_response
                        )
                        if otp_actions:
                            detailed_agent_step_output.llm_response = otp_json_response
                            actions = otp_actions
                        else:
                            actions = parse_actions(
                                task, step.step_id, step.order, scraped_page, json_response["actions"]
                            )

                    if context:
                        context.pop_totp_code(task.task_id)
                except NoTOTPVerificationCodeFound:
                    # Replace the plan with a single terminate action carrying a user-defined error.
                    actions = [
                        TerminateAction(
                            organization_id=task.organization_id,
                            workflow_run_id=task.workflow_run_id,
                            task_id=task.task_id,
                            step_id=step.step_id,
                            step_order=step.order,
                            action_order=0,
                            reasoning="No TOTP verification code found. Going to terminate.",
                            intention="No TOTP verification code found. Going to terminate.",
                            errors=[TimeoutGetTOTPVerificationCodeError().to_user_defined_error()],
                        )
                    ]
                except FailedToGetTOTPVerificationCode as e:
                    actions = [
                        TerminateAction(
                            reasoning=f"Failed to get TOTP verification code. Going to terminate. Reason: {e.reason}",
                            intention=f"Failed to get TOTP verification code. Going to terminate. Reason: {e.reason}",
                            organization_id=task.organization_id,
                            workflow_run_id=task.workflow_run_id,
                            task_id=task.task_id,
                            step_id=step.step_id,
                            step_order=step.order,
                            action_order=0,
                            errors=[GetTOTPVerificationCodeError(reason=e.reason).to_user_defined_error()],
                        )
                    ]

        if reuse_speculative_llm_response and speculative_llm_metadata:
            await self._persist_speculative_llm_metadata(
                step,
                speculative_llm_metadata,
                screenshots=scraped_page.screenshots,
            )
            speculative_llm_metadata = None

        detailed_agent_step_output.actions = actions
        if len(actions) == 0:
            LOG.info(
                "No actions to execute, marking step as failed",
                step_order=step.order,
                step_retry=step.retry_index,
            )
            step = await self.update_step(
                step=step,
                status=StepStatus.failed,
                output=detailed_agent_step_output.to_agent_step_output(),
            )
            return step, detailed_agent_step_output

        # Execute the actions
        LOG.info(
            "Executing actions",
            step_order=step.order,
            step_retry=step.retry_index,
            actions=actions,
        )
        action_results: list[ActionResult] = []
        # Same list object is shared with the output, so later extend() calls are reflected there.
        detailed_agent_step_output.action_results = action_results
        # filter out wait action if there are other actions in the list
        # we do this because WAIT action is considered as a failure
        # which will block following actions if we don't remove it from the list
        # if the list only contains WAIT action, we will execute WAIT action(s)
        if len(actions) > 1:
            wait_actions_to_skip = [action for action in actions if action.action_type == ActionType.WAIT]
            wait_actions_len = len(wait_actions_to_skip)
            # if there are wait actions and there are other actions in the list, skip wait actions
            # if we are using cached action plan, we don't skip wait actions
            if wait_actions_len > 0 and wait_actions_len < len(actions):
                actions = [action for action in actions if action.action_type != ActionType.WAIT]
                LOG.info(
                    "Skipping wait actions",
                    wait_actions_to_skip=wait_actions_to_skip,
                    actions=actions,
                )

        # initialize list of tuples and set actions as the first element of each tuple so that in the case
        # of an exception, we can still see all the actions
        detailed_agent_step_output.actions_and_results = [(action, []) for action in actions]

        # build a linked action chain by the action_idx
        # (each WebAction node points to the next action that targets the same element_id)
        action_linked_list: list[ActionLinkedNode] = []
        element_id_to_action_index: dict[str, int] = dict()
        for action_idx, action in enumerate(actions):
            node = ActionLinkedNode(action=action)
            action_linked_list.append(node)

            if not isinstance(action, WebAction):
                continue

            previous_action_idx = element_id_to_action_index.get(action.element_id)
            if previous_action_idx is not None:
                previous_node = action_linked_list[previous_action_idx]
                previous_node.next = node

            element_id_to_action_index[action.element_id] = action_idx

        element_id_to_last_action: dict[str, int] = dict()
        for action_idx, action_node in enumerate(action_linked_list):
            context = skyvern_context.ensure_context()
            if context.refresh_working_page:
                LOG.warning(
                    "Detected the signal to reload the page, going to reload and skip the rest of the actions",
                    step_order=step.order,
                )
                await browser_state.reload_page()
                context.refresh_working_page = False
                action_result = ActionSuccess()
                action_result.step_order = step.order
                action_result.step_retry_number = step.retry_index
                # The planned action at this index is replaced by a synthetic, already-completed reload action.
                action = ReloadPageAction(
                    reasoning="Something wrong with the current page, reload to continue",
                    status=ActionStatus.completed,
                    organization_id=task.organization_id,
                    workflow_run_id=task.workflow_run_id,
                    task_id=task.task_id,
                    step_id=step.step_id,
                    step_order=step.order,
                    action_order=action_idx,
                )
                detailed_agent_step_output.actions_and_results[action_idx] = (action, [action_result])
                action.action_id = (await app.DATABASE.create_action(action=action)).action_id
                await self.record_artifacts_after_action(task, step, browser_state, engine, action)
                break

            action = action_node.action
            if isinstance(action, WebAction):
                previous_action_idx = element_id_to_last_action.get(action.element_id)
                if previous_action_idx is not None:
                    # Only logged: the duplicate action is executed in both cases below.
                    LOG.warning(
                        "Duplicate action element id.",
                        step_order=step.order,
                        action=action,
                    )

                    previous_action, previous_result = detailed_agent_step_output.actions_and_results[
                        previous_action_idx
                    ]
                    if len(previous_result) > 0 and previous_result[-1].success:
                        LOG.info(
                            "Previous action succeeded, but we'll still continue.",
                            step_order=step.order,
                            previous_action=previous_action,
                            previous_result=previous_result,
                        )
                    else:
                        LOG.warning(
                            "Previous action failed, so handle the next action.",
                            step_order=step.order,
                            previous_action=previous_action,
                            previous_result=previous_result,
                        )

                element_id_to_last_action[action.element_id] = action_idx

            if engine != RunEngine.openai_cua:
                self.async_operation_pool.run_operation(task.task_id, AgentPhase.action)
            current_page = await browser_state.must_get_working_page()
            if isinstance(action, CompleteAction) and not complete_verification:
                # Do not verify the complete action when complete_verification is False
                # set verified to True will skip the completion verification
                action.verified = True

            # Pass TOTP secret to handler for multi-field TOTP sequences
            # Handler will generate TOTP at execution time
            if (
                action.action_type == ActionType.INPUT_TEXT
                and self._is_multi_field_totp_sequence(actions)
                and (totp_secret := skyvern_context.ensure_context().totp_codes.get(f"{task.task_id}_secret"))
            ):
                # Pass TOTP secret to handler for execution-time generation
                action.totp_timing_info = {
                    "is_totp_sequence": True,
                    "action_index": action_idx,
                    "totp_secret": totp_secret,
                    "is_retry": step.retry_index > 0,
                }

            results = await ActionHandler.handle_action(
                scraped_page=scraped_page,
                task=task,
                step=step,
                page=current_page,
                action=action,
            )
            await app.AGENT_FUNCTION.post_action_execution(action)
            detailed_agent_step_output.actions_and_results[action_idx] = (
                action,
                results,
            )

            # Determine wait time between actions
            wait_time = random.uniform(0.5, 1.0)

            # For multi-field TOTP sequences, use zero delay between all digits for fast execution
            if action.action_type == ActionType.INPUT_TEXT and self._is_multi_field_totp_sequence(actions):
                current_text = action.text if hasattr(action, "text") else None

                if current_text and len(current_text) == 1 and current_text.isdigit():
                    # Zero delay between all TOTP digits for fast execution
                    wait_time = 0.0
                    LOG.debug(
                        "TOTP: zero delay for digit",
                        task_id=task.task_id,
                        action_idx=action_idx,
                        digit=current_text,
                    )

            await asyncio.sleep(wait_time)
            await self.record_artifacts_after_action(task, step, browser_state, engine, action)
            for result in results:
                result.step_retry_number = step.retry_index
                result.step_order = step.order
            step.output = detailed_agent_step_output.to_agent_step_output()
            action_results.extend(results)
            # Check the last result for this action. If that succeeded, assume the entire action is successful
            if results and results[-1].success:
                LOG.info(
                    "Action succeeded",
                    step_order=step.order,
                    step_retry=step.retry_index,
                    action_idx=action_idx,
                    action=action,
                    action_result=results,
                )
                if results[-1].skip_remaining_actions:
                    LOG.warning(
                        "Going to stop executing the remaining actions",
                        step_order=step.order,
                        step_retry=step.retry_index,
                        action_idx=action_idx,
                        action=action,
                        action_result=results,
                    )
                    break

            elif results and isinstance(action, DecisiveAction):
                LOG.warning(
                    "DecisiveAction failed, but not stopping execution and not retrying the step",
                    step_order=step.order,
                    step_retry=step.retry_index,
                    action_idx=action_idx,
                    action=action,
                    action_result=results,
                )
            elif results and not results[-1].success and not results[-1].stop_execution_on_failure:
                LOG.warning(
                    "Action failed, but not stopping execution",
                    step_order=step.order,
                    step_retry=step.retry_index,
                    action_idx=action_idx,
                    action=action,
                    action_result=results,
                )
            else:
                # Reached when there are no results, or the last result failed and requests a stop.
                if action_node.next is not None:
                    LOG.warning(
                        "Action failed, but have duplicated element id in the action list. Continue excuting.",
                        step_order=step.order,
                        step_retry=step.retry_index,
                        action_idx=action_idx,
                        action=action,
                        next_action=action_node.next.action,
                        action_result=results,
                    )
                    continue

                LOG.warning(
                    "Action failed, marking step as failed",
                    step_order=step.order,
                    step_retry=step.retry_index,
                    action_idx=action_idx,
                    action=action,
                    action_result=results,
                    actions_and_results=detailed_agent_step_output.actions_and_results,
                )
                # if the action failed, don't execute the rest of the actions, mark the step as failed, and retry
                failed_step = await self.update_step(
                    step=step,
                    status=StepStatus.failed,
                    output=detailed_agent_step_output.to_agent_step_output(),
                )
                return failed_step, detailed_agent_step_output.get_clean_detailed_output()

        LOG.info(
            "Actions executed successfully, marking step as completed",
            step_order=step.order,
            step_retry=step.retry_index,
            action_results=action_results,
        )

        # Clean up TOTP cache after multi-field TOTP sequence completion
        if self._is_multi_field_totp_sequence(actions):
            context = skyvern_context.ensure_context()
            cache_key = f"{task.task_id}_totp_cache"
            if cache_key in context.totp_codes:
                context.totp_codes.pop(cache_key)
                LOG.debug(
                    "Cleaned up TOTP cache after multi-field sequence completion",
                    task_id=task.task_id,
                )

            secret_key = f"{task.task_id}_secret"
            if secret_key in context.totp_codes:
                context.totp_codes.pop(secret_key)

        # Check if Skyvern already returned a complete action, if so, don't run user goal check
        has_decisive_action = False
        if detailed_agent_step_output and detailed_agent_step_output.actions_and_results:
            for action, results in detailed_agent_step_output.actions_and_results:
                if isinstance(action, DecisiveAction):
                    has_decisive_action = True
                    break

        task_completes_on_download = task_block and task_block.complete_on_download and task.workflow_run_id
        enable_parallel_verification = False
        if (
            not has_decisive_action
            and not task_completes_on_download
            and not isinstance(task_block, ActionBlock)
            and complete_verification
            and (task.navigation_goal or task.complete_criterion)
        ):
            disable_user_goal_check = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
                "DISABLE_USER_GOAL_CHECK",
                task.task_id,
                properties={"task_url": task.url, "organization_id": task.organization_id},
            )

            # Parallel verification is always enabled (user goal check deferred to handle_completed_step)
            enable_parallel_verification = not disable_user_goal_check

        # if the last action is complete and is successful, check if there's a data extraction goal
        # if task has navigation goal and extraction goal at the same time, handle ExtractAction before marking step as completed
        if (
            task.navigation_goal
            and task.data_extraction_goal
            and self.step_has_completed_goal(detailed_agent_step_output)
        ):
            working_page = await browser_state.must_get_working_page()
            # refresh task in case the extracted information is updated previously
            refreshed_task = await app.DATABASE.get_task(task.task_id, task.organization_id)
            assert refreshed_task is not None
            task = refreshed_task
            extract_action = await self.create_extract_action(task, step, scraped_page)
            extract_results = await ActionHandler.handle_action(
                scraped_page, task, step, working_page, extract_action
            )
            await app.AGENT_FUNCTION.post_action_execution(extract_action)
            detailed_agent_step_output.actions_and_results.append((extract_action, extract_results))

        # If no action errors return the agent state and output
        completed_step = await self.update_step(
            step=step,
            status=StepStatus.completed,
            output=detailed_agent_step_output.to_agent_step_output(),
        )
        if enable_parallel_verification:
            completed_step.speculative_original_status = StepStatus.completed
        return completed_step, detailed_agent_step_output.get_clean_detailed_output()
    except CancelledError:
        # Cancellation is not propagated here: the step is persisted as failed and returned.
        LOG.exception(
            "CancelledError in agent_step, marking step as failed",
            step_order=step.order,
            step_retry=step.retry_index,
        )
        detailed_agent_step_output.step_exception = "CancelledError"
        failed_step = await self.update_step(
            step=step,
            status=StepStatus.failed,
            output=detailed_agent_step_output.to_agent_step_output(),
        )
        return failed_step, detailed_agent_step_output.get_clean_detailed_output()
    except (
        UnsupportedActionType,
        UnsupportedTaskType,
        FailedToParseActionInstruction,
        ScrapingFailed,
        MissingBrowserStatePage,
    ):
        # These are left for the caller to handle; the step status is not touched here.
        raise

    except Exception as e:
        LOG.exception(
            "Unexpected exception in agent_step, marking step as failed",
            step_order=step.order,
            step_retry=step.retry_index,
        )
        # Only the exception class name is stored on the output.
        detailed_agent_step_output.step_exception = e.__class__.__name__
        failed_step = await self.update_step(
            step=step,
            status=StepStatus.failed,
            output=detailed_agent_step_output.to_agent_step_output(),
        )
        return failed_step, detailed_agent_step_output.get_clean_detailed_output()
|
|
|
|
async def _generate_cua_actions(
    self,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    previous_response: OpenAIResponse | None = None,
    engine: RunEngine = RunEngine.openai_cua,
) -> tuple[list[Action], OpenAIResponse | None]:
    """Ask the OpenAI computer-use model for the next actions.

    When there is no ``previous_response`` the conversation is started with the
    task's navigation goal. The latest response is then answered either with a
    screenshot (when it contains a ``computer_call``) or with a text reply
    (a pending verification code from the context, or an LLM-generated answer
    to the assistant's question). Token usage of every OpenAI response is
    recorded on the step.

    Args:
        task: Task providing the navigation goal and identifiers.
        step: Step the usage and resulting actions are attributed to.
        scraped_page: Supplies the screenshots sent back to the model.
        previous_response: Last CUA response of this conversation, if any.
        engine: Not used by this method; kept for interface compatibility.

    Returns:
        The parsed actions and the response they were parsed from. When the
        scraped page has no screenshots, an empty action list and the
        (possibly freshly created) previous response.
    """
    # Identical tool description for every request of the conversation.
    computer_tool = {
        "type": "computer_use_preview",
        "display_width": settings.BROWSER_WIDTH,
        "display_height": settings.BROWSER_HEIGHT,
        "environment": "browser",
    }

    if not previous_response:
        # this is the first step
        first_response: OpenAIResponse = await app.OPENAI_CLIENT.responses.create(
            model="computer-use-preview",
            tools=[computer_tool],
            input=[
                {
                    "role": "user",
                    "content": task.navigation_goal,
                }
            ],
            reasoning={
                "generate_summary": "concise",
            },
            truncation="auto",
            temperature=0,
        )
        previous_response = first_response
        await self._record_openai_cua_usage(task, step, first_response)
    if not scraped_page.screenshots:
        return [], previous_response

    computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
    reasonings = [item for item in previous_response.output if item.type == "reasoning"]
    assistant_messages = [
        item for item in previous_response.output if item.type == "message" and item.role == "assistant"
    ]

    if not computer_calls:
        # The model did not request a computer action; reply with text instead of a screenshot.
        current_context = skyvern_context.ensure_context()
        resp_content = None
        if task.task_id in current_context.totp_codes:
            verification_code = current_context.totp_codes.pop(task.task_id)
            LOG.info(
                "Using verification code from context",
                task_id=task.task_id,
                verification_code=verification_code,
            )
            resp_content = f"Here is the verification code: {verification_code}"
        else:
            # try address the conversation with the context we have
            reasoning = reasonings[0].summary[0].text if reasonings and reasonings[0].summary else None
            assistant_message = assistant_messages[0].content[0].text if assistant_messages else None
            skyvern_response_prompt = load_prompt_with_elements(
                element_tree_builder=scraped_page,
                prompt_engine=prompt_engine,
                template_name="cua-answer-question",
                navigation_goal=task.navigation_goal,
                assistant_reasoning=reasoning,
                assistant_message=assistant_message,
            )
            skyvern_response = await app.LLM_API_HANDLER(
                prompt=skyvern_response_prompt,
                prompt_name="cua-answer-question",
                step=step,
                screenshots=scraped_page.screenshots,
            )
            LOG.info("Skyvern response to CUA question", skyvern_response=skyvern_response)
            resp_content = skyvern_response.get("answer")
        if not resp_content:
            resp_content = "I don't know. Can you help me make the best decision to achieve the goal?"
        current_response = await app.OPENAI_CLIENT.responses.create(
            model="computer-use-preview",
            previous_response_id=previous_response.id,
            tools=[computer_tool],
            input=[
                {"role": "user", "content": resp_content},
            ],
            reasoning={"generate_summary": "concise"},
            truncation="auto",
            temperature=0,
        )
    else:
        # Answer the most recent computer call with the current screenshot.
        last_computer_call = computer_calls[-1]
        screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
        computer_call_input = {
            "call_id": last_computer_call.call_id,
            "type": "computer_call_output",
            "output": {
                "type": "input_image",
                "image_url": f"data:image/png;base64,{screenshot_base64}",
            },
        }
        if last_computer_call.pending_safety_checks:
            pending_checks = [check.model_dump() for check in last_computer_call.pending_safety_checks]
            computer_call_input["acknowledged_safety_checks"] = pending_checks

        current_response = await app.OPENAI_CLIENT.responses.create(
            model="computer-use-preview",
            previous_response_id=previous_response.id,
            tools=[computer_tool],
            input=[computer_call_input],
            reasoning={
                "generate_summary": "concise",
            },
            truncation="auto",
            temperature=0,
        )
    await self._record_openai_cua_usage(task, step, current_response)

    return await parse_cua_actions(task, step, current_response), current_response

async def _record_openai_cua_usage(self, task: Task, step: Step, response: OpenAIResponse) -> None:
    """Add the token usage and estimated cost of one OpenAI CUA response to the step."""
    usage = response.usage
    if usage is None:
        # `usage` is optional on the SDK's Response model; without it there is nothing to record.
        LOG.warning(
            "OpenAI CUA response has no usage data, skipping token and cost tracking",
            task_id=task.task_id,
            step_id=step.step_id,
        )
        return

    input_tokens = usage.input_tokens or 0
    output_tokens = usage.output_tokens or 0
    cached_tokens = usage.input_tokens_details.cached_tokens or 0
    reasoning_tokens = usage.output_tokens_details.reasoning_tokens or 0
    # Flat per-million-token rates: 3.0 for input, 12.0 for output.
    llm_cost = (3.0 / 1000000) * input_tokens + (12.0 / 1000000) * output_tokens
    await app.DATABASE.update_step(
        task_id=task.task_id,
        step_id=step.step_id,
        organization_id=task.organization_id,
        incremental_cost=llm_cost,
        incremental_input_tokens=input_tokens if input_tokens > 0 else None,
        incremental_output_tokens=output_tokens if output_tokens > 0 else None,
        incremental_reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None,
        incremental_cached_tokens=cached_tokens if cached_tokens > 0 else None,
    )
|
|
|
|
async def _generate_anthropic_actions(
    self,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    llm_caller: LLMCaller,
) -> list[Action]:
    """Ask the Anthropic computer-use model for the next actions.

    Pending tool results on ``llm_caller`` are first appended to its message
    history as a user message. The model is then called with the navigation
    goal (empty history) or with a follow-up prompt that carries a pending
    verification code from the context, if there is one. The assistant reply
    is appended to the message history and parsed into actions.

    Args:
        task: Task providing the navigation goal and identifiers.
        step: Step the LLM call and actions are attributed to.
        scraped_page: Supplies the screenshots and window dimension.
        llm_caller: Caller holding the conversation history and tool results.

    Returns:
        The actions parsed from the assistant's response content.
    """
    LOG.info(
        "Anthropic CU call starts",
        tool_results=llm_caller.current_tool_results,
        message_length=len(llm_caller.message_history),
    )
    if llm_caller.current_tool_results:
        # Keep a reference to what is appended so the log below reports it,
        # rather than whatever clear_tool_results() leaves on the caller.
        appended_tool_results = llm_caller.current_tool_results
        llm_caller.message_history.append({"role": "user", "content": appended_tool_results})
        llm_caller.clear_tool_results()
        LOG.info(
            "Anthropic CU call - appended tool result message to message history and cleared cached tool results",
            message=appended_tool_results,
            message_length=len(llm_caller.message_history),
        )
    tools = [
        {
            "type": "computer_20250124",
            "name": "computer",
            "display_height_px": settings.BROWSER_HEIGHT,
            "display_width_px": settings.BROWSER_WIDTH,
        }
    ]
    thinking = {"type": "enabled", "budget_tokens": 1024}
    betas = ["computer-use-2025-01-24"]
    window_dimension = cast(Resolution, scraped_page.window_dimension) if scraped_page.window_dimension else None

    if not llm_caller.message_history:
        # Start of the conversation: the prompt is the navigation goal itself.
        prompt = task.navigation_goal
    else:
        # Follow-up turn: only a pending verification code is sent as text, otherwise no prompt.
        prompt = None
        current_context = skyvern_context.ensure_context()
        if task.task_id in current_context.totp_codes:
            verification_code = current_context.totp_codes.pop(task.task_id)
            LOG.info(
                "Using verification code from context for anthropic CU call",
                task_id=task.task_id,
                verification_code=verification_code,
            )
            prompt = f"Here is the verification code: {verification_code}"

    llm_response = await llm_caller.call(
        prompt=prompt,
        step=step,
        screenshots=scraped_page.screenshots,
        use_message_history=True,
        tools=tools,
        raw_response=True,
        betas=betas,
        thinking=thinking,
        window_dimension=window_dimension,
    )
    assistant_content = llm_response["content"]
    llm_caller.message_history.append({"role": "assistant", "content": assistant_content})

    actions = await parse_anthropic_actions(
        task,
        step,
        assistant_content,
        window_dimension or llm_caller.browser_window_dimension,
        llm_caller.get_screenshot_resize_target_dimension(window_dimension),
    )
    return actions
|
|
|
|
async def _generate_ui_tars_actions(
    self,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    llm_caller: LLMCaller,
) -> list[Action]:
    """Produce the next actions with the UI-TARS (Seed1.5-VL) model via its LLMCaller.

    Raises:
        ValueError: if ``llm_caller`` is not a ``UITarsLLMCaller`` or the
            scraped page carries no screenshots.
    """
    LOG.info("UI-TARS action generation starts", step_order=step.order)

    # Only the UI-TARS specific caller exposes the conversation helpers used below.
    if not isinstance(llm_caller, UITarsLLMCaller):
        raise ValueError(f"Expected UITarsLLMCaller, got {type(llm_caller)}")

    # The first screenshot becomes the next turn of the conversation.
    if not scraped_page.screenshots:
        LOG.error("No screenshots found, skipping UI-TARS action generation")
        raise ValueError("No screenshots found, skipping UI-TARS action generation")
    llm_caller.add_screenshot(scraped_page.screenshots[0])

    raw_reply = await llm_caller.generate_ui_tars_response(step)
    LOG.info(f"UI-TARS raw response: {raw_reply}")

    # Fall back to a 1920x1080 resolution when the page reports no window dimension.
    if scraped_page.window_dimension:
        browser_dimension = cast(Resolution, scraped_page.window_dimension)
    else:
        browser_dimension = Resolution(width=1920, height=1080)
    LOG.info(f"UI-TARS browser window dimension: {browser_dimension}")

    parsed_actions = await parse_ui_tars_actions(task, step, raw_reply, browser_dimension)
    LOG.info("UI-TARS action generation completed", actions_count=len(parsed_actions))
    return parsed_actions
|
|
|
|
async def _speculate_next_step_plan(
    self,
    task: Task,
    current_step: Step,
    next_step: Step,
    browser_state: BrowserState,
    engine: RunEngine,
) -> SpeculativePlan | None:
    """Speculatively run extract-actions for ``next_step`` and package the result.

    ``next_step`` is flagged as speculative while the prompt is built
    (without persisting artifacts) and the LLM is called. Any LLM metadata
    left on ``next_step.speculative_llm_metadata`` is copied into the
    returned plan and cleared from the step.

    Returns:
        A ``SpeculativePlan`` on success, or ``None`` when the engine is a
        CUA engine, the page is a PDF viewer embed, or anything fails.

    The speculative flag is reset in a ``finally`` clause so it is also
    cleared when the coroutine is cancelled (``CancelledError`` is not an
    ``Exception`` and would otherwise skip the reset).
    """
    if engine in CUA_ENGINES:
        LOG.info(
            "Skipping speculative extract-actions for CUA engine",
            step_id=current_step.step_id,
            task_id=task.task_id,
        )
        return None

    try:
        next_step.is_speculative = True

        scraped_page, extract_action_prompt, use_caching, prompt_name = await self.build_and_record_step_prompt(
            task,
            next_step,
            browser_state,
            engine,
            persist_artifacts=False,
        )

        if scraped_page.check_pdf_viewer_embed():
            LOG.info("Skipping speculative extract-actions for PDF viewer page", step_id=current_step.step_id)
            return None

        llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
            task.llm_key,
            default=app.LLM_API_HANDLER,
        )

        llm_json_response = await llm_api_handler(
            prompt=extract_action_prompt,
            prompt_name=prompt_name,
            step=next_step,
            screenshots=scraped_page.screenshots,
        )

        LOG.info(
            "Speculative extract-actions completed",
            current_step_id=current_step.step_id,
            synthetic_step_id=next_step.step_id,
        )

        # Detach the metadata from the step so the plan owns its own copy.
        metadata_copy = None
        if next_step.speculative_llm_metadata is not None:
            metadata_copy = next_step.speculative_llm_metadata.model_copy()
            next_step.speculative_llm_metadata = None

        return SpeculativePlan(
            scraped_page=scraped_page,
            extract_action_prompt=extract_action_prompt,
            use_caching=use_caching,
            llm_json_response=llm_json_response,
            llm_metadata=metadata_copy,
            prompt_name=prompt_name,
        )
    except Exception:
        LOG.warning(
            "Failed to run speculative extract-actions",
            step_id=current_step.step_id,
            exc_info=True,
        )
        return None
    finally:
        # Runs on success, failure and cancellation alike.
        next_step.is_speculative = False
|
|
|
|
async def _persist_speculative_llm_metadata(
    self,
    step: Step,
    metadata: SpeculativeLLMMetadata,
    *,
    screenshots: list[bytes] | None = None,
) -> None:
    """Store the artifacts and usage numbers captured during a speculative LLM call.

    Each non-empty text field of ``metadata`` (prompt, request, response,
    parsed response, rendered response) is prepared as an LLM artifact for
    ``step`` and the batch is created in one bulk call. Positive cost and
    token counts are then added to the step, both in the database and on
    the in-memory ``step`` object. Finally ``step.speculative_llm_metadata``
    is cleared.

    Args:
        step: step the artifacts and usage are attributed to.
        metadata: captured LLM metadata; nothing happens if it is falsy.
        screenshots: optional screenshots, passed along with the prompt
            artifact only.
    """
    if not metadata:
        return

    LOG.debug("Persisting speculative LLM metadata")

    # (payload text, artifact type, extra keyword arguments) in creation order.
    sources = (
        (metadata.prompt, ArtifactType.LLM_PROMPT, {"screenshots": screenshots}),
        (metadata.llm_request_json, ArtifactType.LLM_REQUEST, {}),
        (metadata.llm_response_json, ArtifactType.LLM_RESPONSE, {}),
        (metadata.parsed_response_json, ArtifactType.LLM_RESPONSE_PARSED, {}),
        (metadata.rendered_response_json, ArtifactType.LLM_RESPONSE_RENDERED, {}),
    )

    artifacts = []
    for text, artifact_type, extra_kwargs in sources:
        if not text:
            continue
        artifacts.append(
            await app.ARTIFACT_MANAGER.prepare_llm_artifact(
                data=text.encode("utf-8"),
                artifact_type=artifact_type,
                step=step,
                **extra_kwargs,
            )
        )

    if artifacts:
        await app.ARTIFACT_MANAGER.bulk_create_artifacts(artifacts)

    def _positive_or_none(value):
        # Only strictly positive amounts count as an increment.
        return value if value and value > 0 else None

    usage = {
        "incremental_cost": _positive_or_none(metadata.llm_cost),
        "incremental_input_tokens": _positive_or_none(metadata.input_tokens),
        "incremental_output_tokens": _positive_or_none(metadata.output_tokens),
        "incremental_reasoning_tokens": _positive_or_none(metadata.reasoning_tokens),
        "incremental_cached_tokens": _positive_or_none(metadata.cached_tokens),
    }

    if any(amount is not None for amount in usage.values()):
        await app.DATABASE.update_step(
            task_id=step.task_id,
            step_id=step.step_id,
            organization_id=step.organization_id,
            **usage,
        )

        # Mirror the database increments on the in-memory step.
        if usage["incremental_input_tokens"]:
            step.input_token_count += usage["incremental_input_tokens"]
        if usage["incremental_output_tokens"]:
            step.output_token_count += usage["incremental_output_tokens"]
        if usage["incremental_reasoning_tokens"]:
            step.reasoning_token_count = (step.reasoning_token_count or 0) + usage["incremental_reasoning_tokens"]
        if usage["incremental_cached_tokens"]:
            step.cached_token_count = (step.cached_token_count or 0) + usage["incremental_cached_tokens"]
        if usage["incremental_cost"]:
            step.step_cost += usage["incremental_cost"]

    step.speculative_llm_metadata = None
|
|
|
|
async def _persist_speculative_metadata_for_discarded_plan(
    self,
    step: Step,
    speculative_task: asyncio.Future[SpeculativePlan | None],
    *,
    cancel_step: bool = False,
) -> None:
    """Record LLM usage of a speculative plan that will not be adopted.

    Waits (shielded) for ``speculative_task``; if it yields a plan carrying
    LLM metadata, that metadata is persisted against ``step``. Cancellation
    or failure of the speculative task is logged and swallowed.

    Whatever happens, ``step.is_speculative`` is reset and, when
    ``cancel_step`` is true, the step is cancelled. Previously a failure
    while persisting the metadata skipped this cleanup, leaving the step
    flagged speculative and not cancelled.

    Args:
        step: the speculative step the plan was produced for.
        speculative_task: future resolving to the plan, or ``None``.
        cancel_step: also mark the step as canceled once done.
    """
    plan: SpeculativePlan | None = None
    try:
        # shield() keeps the speculative task running even if this waiter is cancelled.
        plan = await asyncio.shield(speculative_task)
    except CancelledError:
        LOG.debug(
            "Speculative extract-actions cancelled before metadata persistence",
            step_id=step.step_id,
        )
    except Exception:
        LOG.debug(
            "Speculative extract-actions failed before metadata persistence",
            step_id=step.step_id,
            exc_info=True,
        )
    else:
        if plan and plan.llm_metadata:
            try:
                await self._persist_speculative_llm_metadata(
                    step,
                    plan.llm_metadata,
                )
            except Exception:
                LOG.warning(
                    "Failed to persist speculative llm metadata for discarded plan",
                    step_id=step.step_id,
                    exc_info=True,
                )

    # Cleanup shared by every outcome above.
    step.is_speculative = False
    if cancel_step:
        await self._cancel_speculative_step(step)
|
|
|
|
async def _cancel_speculative_step(self, step: Step) -> None:
    """Mark ``step`` as canceled unless it already is.

    On success the in-memory step takes the status returned by
    ``update_step`` and its speculative flag is cleared. Failures are
    logged as warnings and not re-raised.
    """
    already_canceled = step.status == StepStatus.canceled
    if already_canceled:
        return

    try:
        canceled_step = await self.update_step(step, status=StepStatus.canceled)
        step.status = canceled_step.status
        step.is_speculative = False
    except Exception:
        LOG.warning(
            "Failed to cancel speculative step",
            step_id=step.step_id,
            exc_info=True,
        )
|
|
|
|
async def complete_verify(
    self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> CompleteVerifyResult:
    """Re-scrape the page and ask an LLM whether the user goal has been achieved.

    Two feature flags are consulted; a failure to read either falls back to
    the legacy behaviour:

    * ``USE_TERMINATION_AWARE_COMPLETE_VERIFICATION`` selects the
      ``check-user-goal-with-termination`` prompt instead of
      ``check-user-goal``.
    * ``USE_CHECK_USER_GOAL_HANDLER_FOR_VERIFICATION`` selects the dedicated
      check-user-goal LLM handler instead of the primary one as the default.

    For CUA tasks the refresh does not scroll and the task's ``llm_key`` is
    not used as an override. ``page`` is accepted but not read here.

    Returns:
        The LLM response validated as a ``CompleteVerifyResult``.
    """
    LOG.info(
        "Checking if user goal is achieved after re-scraping the page",
        workflow_run_id=task.workflow_run_id,
    )

    scroll, llm_key_override = True, task.llm_key
    if await service_utils.is_cua_task(task=task):
        scroll, llm_key_override = False, None

    scraped_page_refreshed = await scraped_page.refresh(draw_boxes=False, scroll=scroll)

    actions_and_results_str = ""
    if task.include_action_history_in_verification:
        actions_and_results_str = await self._get_action_results(task, current_step=step)

    # Experiment: termination-aware verification prompt.
    use_termination_prompt = False
    try:
        distinct_id = task.workflow_run_id or task.task_id
        use_termination_prompt = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
            "USE_TERMINATION_AWARE_COMPLETE_VERIFICATION",
            distinct_id,
            properties={"organization_id": task.organization_id, "task_url": task.url},
        )
        if use_termination_prompt:
            LOG.info(
                "Experiment enabled: using termination-aware complete verification prompt for file download block",
                task_id=task.task_id,
                workflow_run_id=task.workflow_run_id,
                organization_id=task.organization_id,
                block_type="file_download",
            )
    except Exception as e:
        LOG.warning(
            "Failed to check USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment; using legacy behavior",
            task_id=task.task_id,
            workflow_run_id=task.workflow_run_id,
            error=str(e),
        )

    # The template and the prompt name reported to the handler are the same string.
    if use_termination_prompt:
        prompt_name = "check-user-goal-with-termination"
    else:
        prompt_name = "check-user-goal"
    template_name = prompt_name

    verification_prompt = load_prompt_with_elements(
        element_tree_builder=scraped_page_refreshed,
        prompt_engine=prompt_engine,
        template_name=template_name,
        navigation_goal=task.navigation_goal,
        navigation_payload=task.navigation_payload,
        complete_criterion=task.complete_criterion,
        terminate_criterion=task.terminate_criterion,
        action_history=actions_and_results_str,
        local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
    )

    # Experiment: dedicated check-user-goal handler instead of the primary
    # LLM handler (the primary handler remains the legacy default).
    use_check_user_goal_handler = False
    try:
        distinct_id = task.workflow_run_id or task.task_id
        use_check_user_goal_handler = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
            "USE_CHECK_USER_GOAL_HANDLER_FOR_VERIFICATION",
            distinct_id,
            properties={"organization_id": task.organization_id},
        )
        if use_check_user_goal_handler:
            LOG.info(
                "Experiment enabled: using CHECK_USER_GOAL_LLM_API_HANDLER for complete verification",
                task_id=task.task_id,
                workflow_run_id=task.workflow_run_id,
                organization_id=task.organization_id,
            )
    except Exception as e:
        LOG.warning(
            "Failed to check USE_CHECK_USER_GOAL_HANDLER_FOR_VERIFICATION experiment; using legacy behavior",
            task_id=task.task_id,
            workflow_run_id=task.workflow_run_id,
            error=str(e),
        )

    default_handler = app.CHECK_USER_GOAL_LLM_API_HANDLER if use_check_user_goal_handler else app.LLM_API_HANDLER
    llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(llm_key_override, default=default_handler)

    verification_result = await llm_api_handler(
        prompt=verification_prompt,
        step=step,
        screenshots=scraped_page_refreshed.screenshots,
        prompt_name=prompt_name,
    )
    return CompleteVerifyResult.model_validate(verification_result)
|
|
|
|
async def check_user_goal_complete(
    self, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> CompleteAction | TerminateAction | None:
    """Turn the result of ``complete_verify`` into an action, if any.

    Returns:
        * ``TerminateAction`` when the verification says the task should
          terminate (only expected with the termination-aware experiment);
        * a verified ``CompleteAction`` when the goal is complete;
        * ``None`` when the goal is not yet achieved, or when verification
          raised (the error is logged and swallowed).
    """
    try:
        verdict = await self.complete_verify(
            page=page,
            scraped_page=scraped_page,
            task=task,
            step=step,
        )

        # Termination takes precedence over completion.
        # Note: This requires the USE_TERMINATION_AWARE_COMPLETE_VERIFICATION experiment to be enabled
        if verdict.is_terminate:
            LOG.warning(
                "Periodic verification determined task should terminate (termination-aware experiment)",
                workflow_run_id=task.workflow_run_id,
                thoughts=verdict.thoughts,
                status=verdict.status or "legacy",
            )
            return TerminateAction(
                reasoning=verdict.thoughts,
            )

        # This runs at every step, so only report completion when it is actually reached.
        if verdict.is_complete:
            return CompleteAction(
                reasoning=verdict.thoughts,
                data_extraction_goal=task.data_extraction_goal,
                verified=True,
            )
        return None

    except Exception:
        LOG.exception(
            "Failed to check user goal complete, skipping",
            workflow_run_id=task.workflow_run_id,
        )
        return None
|
|
|
|
async def record_artifacts_after_action(
    self,
    task: Task,
    step: Step,
    browser_state: BrowserState,
    engine: RunEngine,
    action: Action,
) -> None:
    """Capture screenshot, HTML and video artifacts after an action has run.

    Each capture is best-effort: failures are logged and the remaining
    captures still run. After the artifacts are bulk-created successfully,
    the action is linked to its screenshot artifact when the action has
    both an id and an organization id.

    The page scroll position read before the screenshot is restored in a
    ``finally`` clause, so it is also restored when taking the screenshot
    raises (previously the page was left wherever the capture stopped).

    Raises:
        MissingBrowserStatePage: if the browser state has no working page.
    """
    working_page = await browser_state.get_working_page()
    if not working_page:
        raise MissingBrowserStatePage()

    skyvern_frame: SkyvernFrame | None = None
    try:
        skyvern_frame = await SkyvernFrame.create_instance(frame=working_page)
        await skyvern_frame.safe_wait_for_animation_end()
    except Exception:
        LOG.info("Failed to wait for animation end, ignore it", exc_info=True)

    context = skyvern_context.ensure_context()
    scrolling_number = context.max_screenshot_scrolls
    if scrolling_number is None:
        scrolling_number = DEFAULT_MAX_SCREENSHOT_SCROLLS

    # CUA engines get a screenshot with no scrolling.
    if engine in CUA_ENGINES:
        scrolling_number = 0

    artifacts: list[BulkArtifactCreationRequest | None] = []
    screenshot_artifact_id: str | None = None
    try:
        # get current x, y position of the page
        x: int | None = None
        y: int | None = None
        try:
            x, y = await skyvern_frame.get_scroll_x_y() if skyvern_frame else (None, None)
            LOG.debug("Current x, y position of the page before taking screenshot", x=x, y=y)
        except Exception:
            LOG.warning("Failed to get current x, y position of the page", exc_info=True)

        try:
            screenshot = await browser_state.take_post_action_screenshot(
                scrolling_number=scrolling_number,
            )
        finally:
            # scroll back to the original x, y position of the page, even if the capture failed
            if skyvern_frame and x is not None and y is not None:
                await skyvern_frame.safe_scroll_to_x_y(x, y)
                LOG.debug("Scrolled back to the original x, y position of the page after taking screenshot", x=x, y=y)

        screenshot_request = await app.ARTIFACT_MANAGER.prepare_llm_artifact(
            data=screenshot,
            artifact_type=ArtifactType.SCREENSHOT_ACTION,
            step=step,
        )
        if screenshot_request:
            artifacts.append(screenshot_request)
            # Remember the id of the screenshot artifact so the action can reference it.
            for artifact_data in screenshot_request.artifacts:
                if artifact_data.artifact_model.artifact_type == ArtifactType.SCREENSHOT_ACTION:
                    screenshot_artifact_id = artifact_data.artifact_model.artifact_id
                    break
    except Exception:
        LOG.error(
            "Failed to record screenshot after action",
            exc_info=True,
        )

    try:
        skyvern_frame = await SkyvernFrame.create_instance(frame=working_page)
        html = await skyvern_frame.get_content()
        artifacts.append(
            await app.ARTIFACT_MANAGER.prepare_llm_artifact(
                data=html.encode(),
                artifact_type=ArtifactType.HTML_ACTION,
                step=step,
            )
        )
    except Exception:
        LOG.exception("Failed to record html after action")

    if artifacts:
        try:
            await app.ARTIFACT_MANAGER.bulk_create_artifacts(artifacts)
        except Exception:
            LOG.warning("Failed to bulk create artifacts after action", exc_info=True)
        else:
            # Only link the screenshot once the artifacts were created successfully.
            if screenshot_artifact_id and action.action_id and action.organization_id:
                try:
                    # TODO: consider batching screenshot artifact updates to reduce per-action DB writes.
                    await app.DATABASE.update_action_screenshot_artifact_id(
                        organization_id=action.organization_id,
                        action_id=action.action_id,
                        screenshot_artifact_id=screenshot_artifact_id,
                    )
                    action.screenshot_artifact_id = screenshot_artifact_id
                except Exception:
                    LOG.warning(
                        "Failed to update action with screenshot artifact id",
                        action_id=action.action_id,
                        screenshot_artifact_id=screenshot_artifact_id,
                        exc_info=True,
                    )

    try:
        video_artifacts = await app.BROWSER_MANAGER.get_video_artifacts(
            task_id=task.task_id, browser_state=browser_state
        )
        for video_artifact in video_artifacts:
            await app.ARTIFACT_MANAGER.update_artifact_data(
                artifact_id=video_artifact.video_artifact_id,
                organization_id=task.organization_id,
                data=video_artifact.video_data,
            )
    except Exception:
        LOG.exception("Failed to record video after action")
|
|
|
|
async def initialize_execution_state(
    self,
    task: Task,
    step: Step,
    workflow_run: WorkflowRun | None = None,
    browser_session_id: str | None = None,
) -> tuple[Step, BrowserState, DetailedAgentStepOutput]:
    """Obtain the browser state for the run and prepare an empty step output.

    The browser state is fetched or created for the workflow run when one
    is given, otherwise for the task. When the browser state has browser
    artifacts, every video artifact that does not yet have an artifact id
    gets a ``RECORDING`` artifact created for it, and the list is then
    registered with the browser manager for the task.

    Returns:
        ``(step, browser_state, detailed_output)`` where ``detailed_output``
        has every field set to ``None``.
    """
    if workflow_run:
        browser_state = await app.BROWSER_MANAGER.get_or_create_for_workflow_run(
            workflow_run=workflow_run,
            url=task.url,
            browser_session_id=browser_session_id,
            browser_profile_id=workflow_run.browser_profile_id,
        )
    else:
        browser_state = await app.BROWSER_MANAGER.get_or_create_for_task(
            task=task,
            browser_session_id=browser_session_id,
        )

    # Initialize video artifact for the task here, afterwards it'll only get updated
    if browser_state and browser_state.browser_artifacts:
        recordings = await app.BROWSER_MANAGER.get_video_artifacts(
            task_id=task.task_id, browser_state=browser_state
        )
        for recording in recordings:
            if recording.video_artifact_id:
                # Already registered.
                continue
            recording.video_artifact_id = await app.ARTIFACT_MANAGER.create_artifact(
                step=step,
                artifact_type=ArtifactType.RECORDING,
                data=recording.video_data,
            )
        app.BROWSER_MANAGER.set_video_artifact_for_task(task, recordings)

    empty_output = DetailedAgentStepOutput(
        scraped_page=None,
        extract_action_prompt=None,
        llm_response=None,
        actions=None,
        action_results=None,
        actions_and_results=None,
        step_exception=None,
    )
    return step, browser_state, empty_output
|
|
|
|
async def _scrape_with_type(
    self,
    task: Task,
    step: Step,
    browser_state: BrowserState,
    scrape_type: ScrapeType,
    engine: RunEngine,
) -> ScrapedPage:
    """Scrape the task URL after the page preparation implied by ``scrape_type``.

    ``STOPLOADING`` stops page loading first and ``RELOAD`` reloads the
    page first; ``NORMAL`` scrapes as-is. For CUA engines the scrape takes
    a single screenshot, draws no boxes and does not scroll.
    """
    # ScrapeType.NORMAL needs no preparation.
    if scrape_type == ScrapeType.STOPLOADING:
        LOG.info("Try to stop loading the page before scraping")
        await browser_state.stop_page_loading()
    elif scrape_type == ScrapeType.RELOAD:
        LOG.info("Try to reload the page before scraping")
        await browser_state.reload_page()

    is_cua_engine = engine in CUA_ENGINES
    screenshot_limit = 1 if is_cua_engine else settings.MAX_NUM_SCREENSHOTS

    return await browser_state.scrape_website(
        url=task.url,
        cleanup_element_tree=app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step),
        scrape_exclude=app.scrape_exclude,
        max_screenshot_number=screenshot_limit,
        draw_boxes=not is_cua_engine,
        scroll=not is_cua_engine,
    )
|
|
|
|
async def build_and_record_step_prompt(
    self,
    task: Task,
    step: Step,
    browser_state: BrowserState,
    engine: RunEngine,
    *,
    persist_artifacts: bool = True,
) -> tuple[ScrapedPage, str, bool, str]:
    """Scrape the page for ``step`` and build its extract-action prompt.

    Pre-scraped data stored on the context for this exact step id is
    reused when present. Otherwise the page is scraped, trying each entry
    of ``SCRAPE_TYPE_ORDER`` in turn when a screenshot or scraping error
    occurs. For CUA engines no prompt is built and the prompt stays empty.

    Args:
        persist_artifacts: when false, the scrape artifacts are not stored.

    Returns:
        ``(scraped_page, extract_action_prompt, use_caching, prompt_name)``.

    Raises:
        FailedToTakeScreenshot, ScrapingFailed: when the last scrape attempt fails.
        EmptyScrapePage: when no scraped page could be obtained.
    """
    # Check if we have pre-scraped data from parallel verification optimization
    context = skyvern_context.current()
    scraped_page: ScrapedPage | None = None

    if (
        context
        and context.next_step_pre_scraped_data
        and context.next_step_pre_scraped_data.get("step_id") == step.step_id
    ):
        pre_scraped = context.next_step_pre_scraped_data
        scraped_page = pre_scraped.get("scraped_page")
        if scraped_page:
            timestamp = pre_scraped.get("timestamp")
            age_seconds = (datetime.now(UTC) - timestamp).total_seconds() if timestamp else 0
            LOG.info(
                "Using pre-scraped data from parallel verification optimization",
                step_id=step.step_id,
                num_elements=len(scraped_page.elements),
                age_seconds=age_seconds,
            )
        # Clear the cached data
        context.next_step_pre_scraped_data = None

    # If we don't have pre-scraped data, scrape normally
    if scraped_page is None:
        # Decide on speed optimizations BEFORE scraping and keep the decision
        # on the context, so that the SVG conversion skip (agent_functions.py
        # cleanup) and the tree selection (economy vs regular) stay coordinated.
        if context:
            try:
                distinct_id = task.workflow_run_id or task.task_id
                context.enable_speed_optimizations = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
                    "ENABLE_SPEED_OPTIMIZATIONS",
                    distinct_id,
                    properties={"organization_id": task.organization_id},
                )
            except Exception:
                LOG.warning(
                    "Failed to check ENABLE_SPEED_OPTIMIZATIONS feature flag",
                    exc_info=True,
                    task_id=task.task_id,
                )
                context.enable_speed_optimizations = False

        # start the async tasks while running scrape_website
        if engine not in CUA_ENGINES:
            self.async_operation_pool.run_operation(task.task_id, AgentPhase.scrape)

        # HACK: retry the scrape to cope with screenshot timeouts, following
        # SCRAPE_TYPE_ORDER (normal scrape, normal scrape again, then reload
        # the page before scraping). Stopping page loading before scraping
        # barely helped and caused problems.
        last_attempt = len(SCRAPE_TYPE_ORDER) - 1
        for idx, scrape_type in enumerate(SCRAPE_TYPE_ORDER):
            try:
                scraped_page = await self._scrape_with_type(
                    task=task,
                    step=step,
                    browser_state=browser_state,
                    scrape_type=scrape_type,
                    engine=engine,
                )
                break
            except (FailedToTakeScreenshot, ScrapingFailed) as e:
                if idx < last_attempt:
                    continue
                LOG.exception(f"{e.__class__.__name__} happened in two normal attempts and reload-page retry")
                raise

    if scraped_page is None:
        raise EmptyScrapePage()

    if persist_artifacts:
        await self._persist_scrape_artifacts(
            task=task,
            step=step,
            scraped_page=scraped_page,
            context=context,
        )
    LOG.info(
        "Scraped website",
        step_order=step.order,
        step_retry=step.retry_index,
        num_elements=len(scraped_page.elements),
        url=task.url,
    )

    # Defaults for CUA engines; overwritten below for every other engine.
    extract_action_prompt = ""
    use_caching = False
    prompt_name = EXTRACT_ACTION_PROMPT_NAME
    if engine not in CUA_ENGINES:
        extract_action_prompt, use_caching, prompt_name = await self._build_extract_action_prompt(
            task,
            step,
            browser_state,
            scraped_page,
            verification_code_check=bool(task.totp_verification_url or task.totp_identifier),
            expire_verification_code=True,
        )

    return scraped_page, extract_action_prompt, use_caching, prompt_name
|
|
|
|
async def _persist_scrape_artifacts(
    self,
    *,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    context: SkyvernContext | None,
) -> None:
    """
    Store the scrape artifacts of a step: the scraped HTML, the id-to-CSS and
    id-to-frame maps, the element tree (full and trimmed) and the element tree
    as it is rendered for the prompt.
    Shared by regular runs and by the adoption of a speculative plan.
    """

    async def _store(artifact_type, payload: bytes) -> None:
        # One artifact per call, always attached to the same step.
        await app.ARTIFACT_MANAGER.create_artifact(
            step=step,
            artifact_type=artifact_type,
            data=payload,
        )

    await _store(ArtifactType.HTML_SCRAPE, scraped_page.html.encode())

    prompt_tree = self._build_element_tree_for_prompt(
        scraped_page=scraped_page,
        step=step,
        task=task,
        context=context,
        element_tree_format=ElementTreeFormat.HTML,
    )

    await _store(
        ArtifactType.VISIBLE_ELEMENTS_ID_CSS_MAP,
        json.dumps(scraped_page.id_to_css_dict, indent=2).encode(),
    )
    await _store(
        ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP,
        json.dumps(scraped_page.id_to_frame_dict, indent=2).encode(),
    )
    await _store(
        ArtifactType.VISIBLE_ELEMENTS_TREE,
        json.dumps(scraped_page.element_tree, indent=2).encode(),
    )
    await _store(
        ArtifactType.VISIBLE_ELEMENTS_TREE_TRIMMED,
        json.dumps(scraped_page.element_tree_trimmed, indent=2).encode(),
    )
    await _store(ArtifactType.VISIBLE_ELEMENTS_TREE_IN_PROMPT, prompt_tree.encode())
|
|
|
|
def _build_element_tree_for_prompt(
    self,
    *,
    scraped_page: ScrapedPage,
    step: Step,
    task: Task,
    context: SkyvernContext | None,
    element_tree_format: ElementTreeFormat,
) -> str:
    """
    Pick the element tree representation used for the prompt and its artifact.

    Without speed optimizations (or without a context) the regular tree is built.
    With them, the economy tree is used on the first attempt of a step
    (``retry_index == 0``) and the regular tree on retries.
    """
    speed_optimizations_on = context.enable_speed_optimizations if context else False
    if not speed_optimizations_on:
        return scraped_page.build_element_tree(element_tree_format)

    if step.retry_index == 0:
        tree = scraped_page.build_economy_elements_tree(element_tree_format)
        message = "Speed optimization: Using economy element tree (skipping SVGs)"
    else:
        tree = scraped_page.build_element_tree(element_tree_format)
        message = "Speed optimization: Using regular tree on retry (SVGs from global cache)"

    LOG.info(
        message,
        step_order=step.order,
        step_retry=step.retry_index,
        task_id=task.task_id,
        workflow_run_id=task.workflow_run_id,
    )
    return tree
|
|
|
|
@staticmethod
|
|
def _build_extract_action_cache_variant(
|
|
verification_code_check: bool,
|
|
has_magic_link_page: bool,
|
|
complete_criterion: str | None,
|
|
) -> str:
|
|
"""
|
|
Build a short-but-unique cache variant identifier so extract-action prompts that
|
|
differ meaningfully (OTP, magic link flows, complete criteria) do not reuse the
|
|
same Vertex cache object.
|
|
"""
|
|
variant_parts: list[str] = []
|
|
if verification_code_check:
|
|
variant_parts.append("vc")
|
|
if has_magic_link_page:
|
|
variant_parts.append("ml")
|
|
if complete_criterion:
|
|
normalized = " ".join(complete_criterion.split())
|
|
digest = hashlib.sha1(normalized.encode("utf-8")).hexdigest()[:6]
|
|
variant_parts.append(f"cc{digest}")
|
|
return "-".join(variant_parts) if variant_parts else "std"
|
|
|
|
async def _create_vertex_cache_for_task(
    self,
    task: Task,
    static_prompt: str,
    context: SkyvernContext,
    llm_key_override: str | None,
    prompt_variant: str | None = None,
) -> None:
    """
    Create a Vertex AI cache for the task's static prompt.

    The cache key is built from the extract-action prefix, the variant and the
    resolved llm_key, so it does not depend on the task itself.

    The model name passed to the cache manager is resolved in this order: the
    config's ``model_name``, the ``/models/`` segment of the config's
    ``api_base``, then an inference from the llm_key; ``gemini-2.5-flash`` is
    used when the config lookup fails. Any failure is logged and swallowed,
    leaving the context untouched.

    Args:
        task: The task to create cache for
        static_prompt: The static prompt content to cache
        context: The Skyvern context that receives the cache name, key and variant
        llm_key_override: Optional override when we explicitly pick an LLM key
        prompt_variant: Cache variant identifier (std/vc/ml/etc.); defaults to "std"
    """
    resolved_llm_key = llm_key_override or task.llm_key
    if not resolved_llm_key:
        LOG.warning(
            "Cannot create Vertex AI cache without llm_key, skipping cache creation",
            task_id=task.task_id,
        )
        return

    cache_variant = prompt_variant or "std"

    try:
        LOG.info(
            "Attempting Vertex AI cache creation",
            task_id=task.task_id,
            llm_key=resolved_llm_key,
            cache_variant=cache_variant,
        )
        cache_manager = get_cache_manager()

        variant_suffix = f"-{cache_variant}" if cache_variant else ""
        cache_key = f"{EXTRACT_ACTION_CACHE_KEY_PREFIX}{variant_suffix}-{resolved_llm_key}"

        # Take the model name from the LLM config so the format is right
        # (e.g., "gemini-2.5-flash" with a decimal point, not "gemini-2-5-flash").
        model_name = "gemini-2.5-flash"  # Default

        try:
            llm_config = LLMConfigRegistry.get_config(resolved_llm_key)
            resolved_name = None

            # 1) The config's model_name, when it names a Vertex/Gemini model.
            if hasattr(llm_config, "model_name") and isinstance(llm_config.model_name, str):
                configured_name = llm_config.model_name
                if "vertex_ai/" in configured_name:
                    # "vertex_ai/gemini-2.5-flash" -> "gemini-2.5-flash"
                    resolved_name = configured_name.split("/")[-1]
                elif configured_name.startswith("gemini-"):
                    resolved_name = configured_name

            # 2) The api_base URL: .../models/gemini-2.5-flash -> "gemini-2.5-flash"
            if not resolved_name and hasattr(llm_config, "litellm_params") and llm_config.litellm_params:
                api_base = getattr(llm_config.litellm_params, "api_base", None)
                if api_base and isinstance(api_base, str) and "/models/" in api_base:
                    resolved_name = api_base.split("/models/")[-1]

            # 3) The llm_key itself, e.g. VERTEX_GEMINI_1_5_FLASH -> version "1.5".
            if not resolved_name:
                # GEMINI_{version}_{flavor}; the version may use dots, underscores or dashes.
                version_match = re.search(r"GEMINI[_-](\d+[._-]\d+)", resolved_llm_key, re.IGNORECASE)
                if version_match:
                    version = version_match.group(1).replace("_", ".").replace("-", ".")
                else:
                    version = "2.5"

                flavor = "flash"  # Default flavor
                for infix, suffix, candidate in (
                    ("_PRO_", "_PRO", "pro"),
                    ("_FLASH_LITE_", "_FLASH_LITE", "flash-lite"),
                ):
                    if infix in resolved_llm_key or resolved_llm_key.endswith(suffix):
                        flavor = candidate
                        break
                resolved_name = f"gemini-{version}-{flavor}"

            if resolved_name:
                model_name = resolved_name
        except Exception as e:
            LOG.debug("Failed to extract model name from config, using default", error=str(e))

        # Reduce to the canonical Vertex identifier (e.g., gemini-2.5-pro) while
        # keeping a "-preview" suffix (e.g., gemini-3-flash-preview).
        canonical = re.search(
            r"(gemini-\d+(?:\.\d+)?-(?:flash-lite|flash|pro)(?:-preview)?)", model_name, re.IGNORECASE
        )
        if canonical:
            model_name = canonical.group(1).lower()

        # create_cache issues a blocking HTTP request (requests.post); run it in
        # a worker thread so the event loop is not frozen meanwhile.
        cache_data = await asyncio.to_thread(
            cache_manager.create_cache,
            model_name=model_name,
            static_content=static_prompt,
            cache_key=cache_key,
            ttl_seconds=3600,  # 1 hour
        )
        cache_name = cache_data["name"]

        # Store cache metadata in context
        context.vertex_cache_name = cache_name
        context.vertex_cache_key = cache_key
        context.vertex_cache_variant = cache_variant

        LOG.info(
            "Created Vertex AI cache for task",
            task_id=task.task_id,
            cache_key=cache_key,
            cache_name=cache_name,
            model_name=model_name,
            cache_variant=cache_variant,
        )
    except Exception as e:
        LOG.warning(
            "Failed to create Vertex AI cache, proceeding without caching",
            task_id=task.task_id,
            error=str(e),
            exc_info=True,
        )
|
|
|
|
async def _build_extract_action_prompt(
    self,
    task: Task,
    step: Step,
    browser_state: BrowserState,
    scraped_page: ScrapedPage,
    verification_code_check: bool = False,
    expire_verification_code: bool = False,
) -> tuple[str, bool, str]:
    """
    Build the prompt used to ask the LLM for the next action(s) of a task step.

    The template is selected from the task type. For ``TaskType.action`` tasks an extra
    "infer-action-type" LLM call decides which single-action template applies. When prompt
    caching is enabled for the extract-action template, the prompt is assembled from the
    ``-static`` and ``-dynamic`` template halves and the static half is stored on the
    context; if that path fails, or caching is off, the full template is rendered instead.

    Args:
        task: Task whose goal, payload and criteria are rendered into the prompt.
        step: Current step; passed to the LLM handler and the navigation payload builder.
        browser_state: Used to read the current URL from the working page.
        scraped_page: Source of the element tree embedded in the prompt.
        verification_code_check: Forwarded to the template and to the cache variant.
        expire_verification_code: Forwarded to ``_build_navigation_payload``.

    Returns:
        ``(prompt, use_caching, prompt_name)``.

    Raises:
        FailedToParseActionInstruction: the infer-action-type response reports an error,
            contains no inferred actions, or names an unknown action type.
        UnsupportedActionType: the inferred action type has no single-action template.
        UnsupportedTaskType: no template exists for the task type.
    """
    actions_and_results_str = await self._get_action_results(task)

    # Generate the extract action prompt
    navigation_goal = task.navigation_goal
    starting_url = task.url
    page = await browser_state.get_working_page()
    current_url = (
        await SkyvernFrame.evaluate(frame=page, expression="() => document.location.href") if page else starting_url
    )
    final_navigation_payload = self._build_navigation_payload(
        task, expire_verification_code=expire_verification_code, step=step, scraped_page=scraped_page
    )
    navigation_payload_str = json.dumps(final_navigation_payload)

    task_type = task.task_type if task.task_type else TaskType.general
    template = ""
    if task_type == TaskType.general:
        template = EXTRACT_ACTION_TEMPLATE
    elif task_type == TaskType.validation:
        template = "decisive-criterion-validate"
    elif task_type == TaskType.action:
        prompt = prompt_engine.load_prompt(
            "infer-action-type", navigation_goal=navigation_goal, prompt_name="infer-action-type"
        )
        llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
            task.llm_key, default=app.LLM_API_HANDLER
        )
        json_response = await llm_api_handler(prompt=prompt, step=step, prompt_name="infer-action-type")
        if json_response.get("error"):
            raise FailedToParseActionInstruction(
                reason=json_response.get("thought"), error_type=json_response.get("error")
            )

        inferred_actions: list[dict[str, Any]] = json_response.get("inferred_actions", [])
        if not inferred_actions:
            raise FailedToParseActionInstruction(reason=json_response.get("thought"), error_type="EMPTY_ACTION")

        raw_action_type = inferred_actions[0].get("action_type") or ""
        try:
            action_type = ActionType[raw_action_type.upper()]
        except (KeyError, AttributeError) as e:
            # The LLM returned a name that is not an ActionType member (or not a string);
            # report it as a parse failure instead of leaking a bare KeyError.
            raise FailedToParseActionInstruction(
                reason=json_response.get("thought"), error_type="UNSUPPORTED_ACTION_TYPE"
            ) from e

        if action_type == ActionType.CLICK:
            template = "single-click-action"
        elif action_type == ActionType.INPUT_TEXT:
            template = "single-input-action"
        elif action_type == ActionType.UPLOAD_FILE:
            template = "single-upload-action"
        elif action_type == ActionType.SELECT_OPTION:
            template = "single-select-action"
        else:
            raise UnsupportedActionType(action_type=action_type)

    if not template:
        raise UnsupportedTaskType(task_type=task_type)

    context = skyvern_context.ensure_context()

    # Reset cached prompt and cache reference by default; we will set them below if caching is enabled.
    # This prevents extract-action cache from being attached to other prompts like decisive-criterion-validate.
    context.cached_static_prompt = None
    context.vertex_cache_name = None

    # Check if prompt caching is enabled for extract-action
    use_caching = False
    prompt_caching_settings = await self._get_prompt_caching_settings(context)
    effective_llm_key = task.llm_key
    if not effective_llm_key:
        handler_for_key = LLMAPIHandlerFactory.get_override_llm_api_handler(
            task.llm_key, default=app.LLM_API_HANDLER
        )
        effective_llm_key = getattr(handler_for_key, "llm_key", None)
    cache_enabled = prompt_caching_settings.get(EXTRACT_ACTION_PROMPT_NAME) or prompt_caching_settings.get(
        EXTRACT_ACTION_TEMPLATE
    )
    LOG.info(
        "Extract-action prompt caching evaluation",
        template=template,
        cache_enabled=cache_enabled,
        prompt_caching_settings=prompt_caching_settings,
        task_llm_key=task.llm_key,
        effective_llm_key=effective_llm_key,
    )

    # Element tree for the split (cached) prompt; the full-prompt fallback lets
    # load_prompt_with_elements build its own tree from scraped_page.
    element_tree_format = ElementTreeFormat.HTML
    if context.enable_speed_optimizations and step.retry_index == 0:
        elements_for_prompt = scraped_page.build_economy_elements_tree(element_tree_format)
    else:
        elements_for_prompt = scraped_page.build_element_tree(element_tree_format)

    # Template variables shared by the split (cached) and the full prompt.
    complete_criterion = task.complete_criterion.strip() if task.complete_criterion else None
    has_magic_link_page = context.has_magic_link_page(task.task_id)
    prompt_kwargs = {
        "navigation_goal": navigation_goal,
        "navigation_payload_str": navigation_payload_str,
        "starting_url": starting_url,
        "current_url": current_url,
        "data_extraction_goal": task.data_extraction_goal,
        "action_history": actions_and_results_str,
        "error_code_mapping_str": (json.dumps(task.error_code_mapping) if task.error_code_mapping else None),
        "local_datetime": datetime.now(context.tz_info).isoformat(),
        "verification_code_check": verification_code_check,
        "complete_criterion": complete_criterion,
        "terminate_criterion": task.terminate_criterion.strip() if task.terminate_criterion else None,
        "parse_select_feature_enabled": context.enable_parse_select_in_extract,
        "has_magic_link_page": has_magic_link_page,
    }

    # Map template to prompt_name for logging/caching guards
    prompt_name = EXTRACT_ACTION_PROMPT_NAME if template == EXTRACT_ACTION_TEMPLATE else template

    if template == EXTRACT_ACTION_TEMPLATE and cache_enabled:
        try:
            # Try to load split templates for caching
            cache_variant = self._build_extract_action_cache_variant(
                verification_code_check=verification_code_check,
                has_magic_link_page=has_magic_link_page,
                complete_criterion=complete_criterion,
            )
            static_prompt = prompt_engine.load_prompt(f"{template}-static", **prompt_kwargs)
            dynamic_prompt = prompt_engine.load_prompt(
                f"{template}-dynamic",
                elements=elements_for_prompt,
                **prompt_kwargs,
            )

            # Store static prompt for caching and continue sending it alongside the dynamic section.
            # Vertex explicit caching expects the static content to still be present in the request so the
            # first call succeeds even if the cache is cold. The cached reference simply lets the service
            # reuse the static portion internally.
            context.cached_static_prompt = static_prompt
            context.use_prompt_caching = True
            use_caching = True

            # Create Vertex AI cache for Gemini models
            if effective_llm_key and "GEMINI" in effective_llm_key:
                await self._create_vertex_cache_for_task(
                    task,
                    static_prompt,
                    context,
                    effective_llm_key,
                    prompt_variant=cache_variant,
                )

            combined_prompt = f"{static_prompt.rstrip()}\n\n{dynamic_prompt.lstrip()}"

            LOG.info(
                "Using cached prompt",
                task_id=task.task_id,
                prompt_name=EXTRACT_ACTION_PROMPT_NAME,
                cache_variant=cache_variant,
            )
            return combined_prompt, use_caching, prompt_name

        except Exception as e:
            LOG.warning("Failed to load cached prompt templates, falling back to original", error=str(e))
            # The fallback sends the full, unsplit prompt, so restore the "no caching"
            # baseline set above; otherwise a failure after the static prompt was stored
            # would report use_caching=True for a prompt that was never split.
            use_caching = False
            context.cached_static_prompt = None
            context.vertex_cache_name = None

    # Original behavior - load full prompt
    full_prompt = load_prompt_with_elements(
        element_tree_builder=scraped_page,
        prompt_engine=prompt_engine,
        template_name=template,
        **prompt_kwargs,
    )
    return full_prompt, use_caching, prompt_name
|
|
|
|
async def _get_prompt_caching_settings(self, context: SkyvernContext) -> dict[str, bool]:
    """
    Return the prompt caching settings that apply to the current run.

    Resolution order:
      1. An override stored on ``LLMAPIHandlerFactory._prompt_caching_settings``.
      2. Settings already memoized on ``context.prompt_caching_settings``.
      3. The "PROMPT_CACHING_OPTIMIZATION" experiment, evaluated once and memoized on
         the context. Missing identifiers or an evaluation error leave the memoized
         value as an empty dict (caching disabled).
    """
    factory_override = LLMAPIHandlerFactory._prompt_caching_settings
    if factory_override is not None:
        return factory_override

    memoized = context.prompt_caching_settings
    if memoized is not None:
        return memoized

    distinct_id = context.run_id or context.workflow_run_id or context.task_id
    organization_id = context.organization_id

    # Memoize "disabled" up front so every exit path below leaves a value on the context.
    context.prompt_caching_settings = {}

    if not (distinct_id and organization_id):
        return context.prompt_caching_settings

    try:
        experiment_enabled = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
            "PROMPT_CACHING_OPTIMIZATION",
            distinct_id,
            properties={"organization_id": organization_id},
        )
    except Exception as exc:
        LOG.warning(
            "Failed to evaluate prompt caching experiment; defaulting to disabled",
            distinct_id=distinct_id,
            organization_id=organization_id,
            error=str(exc),
        )
    else:
        if experiment_enabled:
            context.prompt_caching_settings = {
                prompt_key: True for prompt_key in (EXTRACT_ACTION_PROMPT_NAME, EXTRACT_ACTION_TEMPLATE)
            }
            LOG.info(
                "Prompt caching optimization enabled",
                distinct_id=distinct_id,
                organization_id=organization_id,
            )

    return context.prompt_caching_settings
|
|
|
|
def _should_process_totp(self, scraped_page: ScrapedPage | None) -> bool:
    """Detect TOTP pages by checking for multiple input fields or verification keywords.

    A page is treated as a TOTP page when any of the following holds:
      * it has 4+ text/number/tel inputs and at least one has ``maxlength="1"`` or a
        numeric-looking ``type``/``pattern``;
      * its HTML contains 2+ TOTP keywords and it has 6+ such inputs;
      * its HTML contains a strong keyword and it has 3+ such inputs.

    Elements with a missing or null ``tagName``/``attributes``/``type`` are tolerated
    individually, so one malformed element does not disable detection for the whole page.

    Args:
        scraped_page: Object exposing ``elements`` (iterable of dicts) and ``html``.

    Returns:
        True when the page looks like a TOTP entry page; False otherwise, including when
        ``scraped_page`` is falsy or inspection fails unexpectedly.
    """
    if not scraped_page:
        return False

    def _attributes(element: dict) -> dict:
        # "attributes" may be absent or explicitly null on a scraped element.
        attributes = element.get("attributes")
        return attributes if isinstance(attributes, dict) else {}

    def _as_text(value: object) -> str:
        return value if isinstance(value, str) else ""

    def _looks_numeric(element: dict) -> bool:
        attributes = _attributes(element)
        pattern = _as_text(attributes.get("pattern"))
        return attributes.get("type") == "number" or pattern.isdigit() or "digit" in pattern.lower()

    try:
        # Count input fields that could be for TOTP (more flexible than maxlength="1")
        input_fields = []
        for element in scraped_page.elements or []:
            if not isinstance(element, dict):
                continue
            if _as_text(element.get("tagName")).lower() != "input":
                continue
            input_type = _attributes(element).get("type")
            # A missing/null type is treated like the previous default of "text".
            normalized_type = "text" if input_type is None else str(input_type).lower()
            if normalized_type in ("text", "number", "tel"):
                input_fields.append(element)

        # Check for multiple input fields (potential multi-field TOTP)
        if len(input_fields) >= 4:
            # str() so a numeric maxlength value compares the same as "1"
            has_maxlength_1 = any(str(_attributes(elem).get("maxlength")) == "1" for elem in input_fields)
            # Input fields with numeric patterns (type="number", pattern for digits)
            has_numeric_patterns = any(_looks_numeric(elem) for elem in input_fields)
            if has_maxlength_1 or has_numeric_patterns:
                return True

        # Check for TOTP-related keywords in page content
        page_text = scraped_page.html.lower() if scraped_page.html else ""
        totp_keywords = [
            "verification code",
            "authentication code",
            "security code",
            "2fa",
            "two-factor",
            "totp",
            "authenticator",
            "verification",
            "enter code",
            "verification number",
            "security number",
        ]

        keyword_matches = sum(1 for keyword in totp_keywords if keyword in page_text)

        # If we have multiple TOTP keywords and multiple input fields, likely TOTP
        if keyword_matches >= 2 and len(input_fields) >= 6:
            return True

        # Strong single keyword match with multiple inputs
        strong_keywords = ["verification code", "authentication code", "2fa", "two-factor"]
        if any(keyword in page_text for keyword in strong_keywords) and len(input_fields) >= 3:
            return True

        return False

    except Exception:
        # Deliberate best-effort: detection must never break payload building.
        return False
|
|
|
|
def _is_multi_field_totp_sequence(self, actions: list) -> bool:
    """
    Tell whether an action list looks like a TOTP code typed into one-digit fields.

    The list qualifies when it contains a run of at least 4 back-to-back INPUT_TEXT
    actions whose text is exactly one digit. Any other action breaks the run.

    Args:
        actions: Actions to inspect, in execution order.

    Returns:
        bool: True when such a run of 4 or more exists.
    """
    # Fewer than 4 actions can never contain a long enough run.
    if len(actions) < 4:
        return False

    longest_run = 0
    current_run = 0
    for candidate in actions:
        is_single_digit = False
        if candidate.action_type == ActionType.INPUT_TEXT:
            typed_text = getattr(candidate, "text", None)
            is_single_digit = bool(typed_text) and len(typed_text) == 1 and typed_text.isdigit()

        if not is_single_digit:
            # Anything else interrupts the streak.
            current_run = 0
            continue

        current_run += 1
        if current_run > longest_run:
            longest_run = current_run

    # A streak (rather than a total count) avoids flagging a page that merely has a few
    # unrelated one-digit inputs; 4 is the minimum streak treated as a split TOTP field.
    if longest_run < 4:
        return False

    LOG.debug(
        "Detected multi-field TOTP sequence",
        max_consecutive=longest_run,
        total_actions=len(actions),
    )
    return True
|
|
|
|
def _build_navigation_payload(
    self,
    task: Task,
    expire_verification_code: bool = False,
    step: Step | None = None,
    scraped_page: ScrapedPage | None = None,
) -> dict[str, Any] | list | str | None:
    """
    Build the navigation payload that is rendered into the action prompt.

    Starts from ``task.navigation_payload`` and, when the task is configured for TOTP
    (``totp_verification_url`` or ``totp_identifier``) and the context holds a code for
    the task, adds that code under ``SPECIAL_FIELD_VERIFICATION_CODE``. For workflow
    tasks on a page that ``_should_process_totp`` recognizes, TOTP secrets referenced by
    the payload are stored on the context and replaced by a placeholder code.

    NOTE: dict and list payloads are modified in place, i.e. ``task.navigation_payload``
    itself is updated; only the str and None cases produce a new object.

    Args:
        task: Task providing the base payload and TOTP configuration.
        expire_verification_code: Remove the code from the context after adding it.
        step: Current step; the TOTP-secret handling only runs when one is given.
        scraped_page: Page inspected to decide whether TOTP secrets are processed.

    Returns:
        The payload to serialize into the prompt.
    """
    final_navigation_payload = task.navigation_payload

    current_context = skyvern_context.ensure_context()
    verification_code = current_context.totp_codes.get(task.task_id)
    if (task.totp_verification_url or task.totp_identifier) and verification_code:
        if (
            isinstance(final_navigation_payload, dict)
            and SPECIAL_FIELD_VERIFICATION_CODE not in final_navigation_payload
        ):
            final_navigation_payload[SPECIAL_FIELD_VERIFICATION_CODE] = verification_code
        elif (
            isinstance(final_navigation_payload, str)
            and SPECIAL_FIELD_VERIFICATION_CODE not in final_navigation_payload
        ):
            final_navigation_payload = (
                final_navigation_payload + "\n" + str({SPECIAL_FIELD_VERIFICATION_CODE: verification_code})
            )
        elif isinstance(final_navigation_payload, list):
            verification_code_dict = str({SPECIAL_FIELD_VERIFICATION_CODE: verification_code})
            if verification_code_dict not in final_navigation_payload:
                final_navigation_payload.append(verification_code_dict)
            else:
                LOG.warning(
                    "Verification code already exists in navigation payload",
                    final_navigation_payload=final_navigation_payload,
                )
        elif final_navigation_payload is None:
            final_navigation_payload = {SPECIAL_FIELD_VERIFICATION_CODE: verification_code}
        else:
            LOG.warning(
                "Didn't add verification code to navigation payload",
                final_navigation_payload=final_navigation_payload,
            )
        if expire_verification_code:
            # Default keeps expiry from ever raising if the entry is already gone.
            current_context.totp_codes.pop(task.task_id, None)

    # Store TOTP secrets and provide placeholder TOTP for LLM to see format
    # Only when on a TOTP page to avoid premature processing
    if (
        task.workflow_run_id
        and step
        and isinstance(final_navigation_payload, dict)
        and self._should_process_totp(scraped_page)
    ):
        workflow_run_context = app.WORKFLOW_CONTEXT_MANAGER.get_workflow_run_context(task.workflow_run_id)

        for value in list(final_navigation_payload.values()):
            if not (isinstance(value, dict) and "totp" in value):
                continue
            totp_placeholder = value.get("totp")
            if not (totp_placeholder and isinstance(totp_placeholder, str)):
                continue

            totp_secret_key = workflow_run_context.totp_secret_value_key(totp_placeholder)
            totp_secret = workflow_run_context.get_original_secret_value_or_none(totp_secret_key)
            if not totp_secret:
                continue

            # Store TOTP secret for handler to use during execution
            current_context.totp_codes[f"{task.task_id}_secret"] = totp_secret

            # Send a placeholder TOTP for the LLM to see the format
            value["totp"] = "123456"

    return final_navigation_payload
|
|
|
|
async def _get_action_results(self, task: Task, current_step: Step | None = None) -> str:
    """Return the result of ``get_action_history`` for the task, serialized as JSON."""
    action_history = await get_action_history(task=task, current_step=current_step)
    return json.dumps(action_history)
|
|
|
|
async def get_extracted_information_for_task(self, task: Task) -> dict[str, Any] | list | str | None:
    """
    Return the data of a successful EXTRACT action from the task's most recent step.

    Completed steps are scanned from newest to oldest; within a step, the first
    successful result of an EXTRACT action wins. Returns None when nothing is found,
    logging a warning if the task had a data extraction goal.
    """
    # TODO: make sure we can get extracted information with the ExtractAction change
    task_steps = await app.DATABASE.get_task_steps(
        task_id=task.task_id,
        organization_id=task.organization_id,
    )
    for candidate_step in reversed(task_steps):
        if candidate_step.status != StepStatus.completed:
            continue
        step_output = candidate_step.output
        if not (step_output and step_output.actions_and_results):
            continue

        for performed_action, outcomes in step_output.actions_and_results:
            if performed_action.action_type == ActionType.EXTRACT:
                for outcome in outcomes:
                    if not outcome.success:
                        continue
                    LOG.info(
                        "Extracted information for task",
                        extracted_information=outcome.data,
                    )
                    return outcome.data

    if task.data_extraction_goal:
        LOG.warning(
            "Failed to find extracted information for task",
            task_id=task.task_id,
        )
    return None
|
|
|
|
async def get_failure_reason_for_task(self, task: Task) -> str | None:
    """
    Return the reasoning of the most recent TERMINATE action recorded for the task.

    Completed steps are scanned from newest to oldest. When no TERMINATE action is
    found, an error is logged and None is returned.
    # TODO (kerem): Also return meaningful exceptions when we add them [WYV-311]
    """
    task_steps = await app.DATABASE.get_task_steps(
        task_id=task.task_id,
        organization_id=task.organization_id,
    )
    for candidate_step in reversed(task_steps):
        if candidate_step.status != StepStatus.completed or not candidate_step.output:
            continue

        recorded = candidate_step.output.actions_and_results
        if not recorded:
            continue

        for performed_action, _outcomes in recorded:
            if performed_action.action_type == ActionType.TERMINATE:
                return performed_action.reasoning

    LOG.error(
        "Failed to find failure reasoning for task",
        task_id=task.task_id,
    )
    return None
|
|
|
|
async def clean_up_task(
    self,
    task: Task,
    last_step: Step,
    api_key: str | None = None,
    need_call_webhook: bool = True,
    close_browser_on_completion: bool = True,
    need_final_screenshot: bool = True,
    browser_session_id: str | None = None,
) -> None:
    """
    Finalize a task: capture final artifacts, release the browser and send the webhook.

    Steps, in order: reload the task from the database, record analytics/trace tags,
    optionally take a final screenshot, save downloaded files, and (for tasks that are
    not part of a workflow run) remove the task from the async operation pool, clean up
    the browser and its artifacts, wait for artifact uploads and call the webhook.

    Args:
        task: Task to clean up; replaced by the freshly loaded database row.
        last_step: Step the final screenshot and browser artifacts are attached to.
        api_key: Forwarded to ``execute_task_webhook``.
        need_call_webhook: Whether to send the webhook at the end.
        close_browser_on_completion: Forwarded to the browser cleanup.
        need_final_screenshot: Whether to attempt a final full-page screenshot.
        browser_session_id: Forwarded to the browser cleanup.

    Raises:
        TaskNotFound: the task cannot be loaded from the database.
    """
    # refresh the task from the db to get the latest status
    try:
        refreshed_task = await app.DATABASE.get_task(task_id=task.task_id, organization_id=task.organization_id)
    except Exception as e:
        LOG.exception(
            "Failed to get task from db when clean up task",
            task_id=task.task_id,
        )
        raise TaskNotFound(task_id=task.task_id) from e
    # Checked outside the try so a missing row is logged once and TaskNotFound is not
    # caught and re-wrapped by the handler above.
    if not refreshed_task:
        LOG.error("Failed to get task from db when clean up task", task_id=task.task_id)
        raise TaskNotFound(task_id=task.task_id)
    task = refreshed_task

    # Caches expire based on TTL (1 hour) or can be cleaned up via scheduled job
    # This allows multiple tasks with the same llm_key to share the same cache

    # log the task status as an event
    analytics.capture("skyvern-oss-agent-task-status", {"status": task.status})

    # Add task completion tag to trace
    TraceManager.add_task_completion_tag(task.status.value)
    if need_final_screenshot:
        # Take one last screenshot and create an artifact before closing the browser to see the final state
        # We don't need the artifacts and send the webhook response directly only when there is an issue with the browser
        # initialization. In this case, we don't have any artifacts to send and we can't take final screenshots etc.
        # since the browser is not initialized properly or the proxy is not working.

        browser_state = app.BROWSER_MANAGER.get_for_task(task.task_id)
        if browser_state is not None and await browser_state.get_working_page() is not None:
            try:
                screenshot = await browser_state.take_fullpage_screenshot()
                await app.ARTIFACT_MANAGER.create_artifact(
                    step=last_step,
                    artifact_type=ArtifactType.SCREENSHOT_FINAL,
                    data=screenshot,
                )
            except TargetClosedError:
                LOG.warning(
                    "Failed to take screenshot before sending task response, page is closed",
                )
            except Exception:
                LOG.exception("Failed to take screenshot before sending task response")

    if task.organization_id:
        # Best-effort: a timeout or failure here must not block the rest of the cleanup.
        try:
            async with asyncio.timeout(SAVE_DOWNLOADED_FILES_TIMEOUT):
                context = skyvern_context.current()
                await app.STORAGE.save_downloaded_files(
                    organization_id=task.organization_id,
                    run_id=context.run_id if context and context.run_id else task.workflow_run_id or task.task_id,
                )
        except asyncio.TimeoutError:
            LOG.warning(
                "Timeout to save downloaded files",
                task_id=task.task_id,
                workflow_run_id=task.workflow_run_id,
            )
        except Exception:
            LOG.warning(
                "Failed to save downloaded files",
                exc_info=True,
                task_id=task.task_id,
                workflow_run_id=task.workflow_run_id,
            )

    # if it's a task block from workflow run,
    # we don't need to close the browser, save browser artifacts, or call webhook
    if task.workflow_run_id:
        LOG.info(
            "Task is part of a workflow run, not sending a webhook response",
            task_id=task.task_id,
            workflow_run_id=task.workflow_run_id,
        )
        return

    await self.async_operation_pool.remove_task(task.task_id)

    await self.cleanup_browser_and_create_artifacts(
        close_browser_on_completion, last_step, task, browser_session_id=browser_session_id
    )

    # Wait for all tasks to complete before generating the links for the artifacts
    await app.ARTIFACT_MANAGER.wait_for_upload_aiotasks([task.task_id])

    if need_call_webhook:
        await self.execute_task_webhook(task=task, api_key=api_key)
|
|
|
|
async def execute_task_webhook(
    self,
    task: Task,
    api_key: str | None,
) -> None:
    """
    POST the signed task response to the task's webhook callback URL.

    Nothing is sent when ``api_key`` or ``task.webhook_callback_url`` is missing. The
    payload is the task response (without "request"), updated with the run response
    (without "run_request") when one is available. The outcome is stored on the task as
    ``webhook_failure_reason`` (empty string on a 2xx response).

    Raises:
        FailedToSendWebhook: any error while building, signing or sending the payload.
    """
    if not api_key:
        LOG.warning(
            "Request has no api key. Not sending task response",
            task_id=task.task_id,
        )
        return

    if not task.webhook_callback_url:
        LOG.warning(
            "Task has no webhook callback url. Not sending task response",
            task_id=task.task_id,
        )
        return

    latest_step = await app.DATABASE.get_latest_step(task.task_id, organization_id=task.organization_id)
    task_response = await self.build_task_response(task=task, last_step=latest_step)

    # try to build the new TaskRunResponse for backward compatibility
    task_run_response_json: str | None = None
    try:
        run_response = await run_service.get_run_response(
            run_id=task.task_id,
            organization_id=task.organization_id,
        )
        if run_response is not None:
            task_run_response_json = run_response.model_dump_json(exclude={"run_request"})

        # Merge both response shapes into one dict before signing.
        payload_dict = json.loads(task_response.model_dump_json(exclude={"request"}))
        if task_run_response_json:
            payload_dict.update(json.loads(task_run_response_json))

        signed_data = generate_skyvern_webhook_signature(payload=payload_dict, api_key=api_key)

        LOG.info(
            "Sending task response to webhook callback url",
            task_id=task.task_id,
            webhook_callback_url=task.webhook_callback_url,
            payload=signed_data.signed_payload,
            headers=signed_data.headers,
        )

        async with httpx.AsyncClient() as client:
            resp = await client.post(
                task.webhook_callback_url,
                data=signed_data.signed_payload,
                headers=signed_data.headers,
                timeout=httpx.Timeout(30.0),
            )
            if 200 <= resp.status_code < 300:
                LOG.info(
                    "Webhook sent successfully",
                    task_id=task.task_id,
                    resp_code=resp.status_code,
                    resp_text=resp.text,
                )
                # An empty reason marks the delivery as successful.
                webhook_failure_reason = ""
            else:
                LOG.info(
                    "Webhook failed",
                    task_id=task.task_id,
                    resp=resp,
                    resp_code=resp.status_code,
                    resp_text=resp.text,
                )
                webhook_failure_reason = (
                    f"Webhook failed with status code {resp.status_code}, error message: {resp.text}"
                )
            await app.DATABASE.update_task(
                task_id=task.task_id,
                organization_id=task.organization_id,
                webhook_failure_reason=webhook_failure_reason,
            )
    except Exception as e:
        raise FailedToSendWebhook(task_id=task.task_id) from e
|
|
|
|
async def build_task_response(
    self,
    task: Task,
    last_step: Step | None = None,
    failure_reason: str | None = None,
    need_browser_log: bool = False,
    step_count: int | None = None,
) -> TaskResponse:
    """
    Assemble the TaskResponse for a task, including share links to its artifacts.

    Collected, in order: final screenshot, recording (browser session first, then the
    first step's recording artifact), latest action screenshots, downloaded files and,
    when ``need_browser_log`` is set, the browser console log. The task is reloaded
    from the database before the response is produced.

    Raises:
        TaskNotFound: the task cannot be reloaded from the database.
    """
    # no last step means the task didn't start, so we don't have any other artifacts
    if last_step is None:
        return task.to_task_response(
            failure_reason=failure_reason,
            step_count=step_count,
        )

    task_id = task.task_id
    organization_id = task.organization_id

    # final screenshot
    final_screenshot_url = None
    final_screenshot_artifact = await app.DATABASE.get_artifact(
        task_id=task_id,
        step_id=last_step.step_id,
        artifact_type=ArtifactType.SCREENSHOT_FINAL,
        organization_id=organization_id,
    )
    if final_screenshot_artifact:
        final_screenshot_url = await app.ARTIFACT_MANAGER.get_share_link(final_screenshot_artifact)

    # Get recording url from browser session first,
    # if not found, get the recording url from the first step
    session_recording_url = None
    if task.browser_session_id:
        try:
            async with asyncio.timeout(GET_DOWNLOADED_FILES_TIMEOUT):
                session_recordings = await app.STORAGE.get_shared_recordings_in_browser_session(
                    organization_id=organization_id,
                    browser_session_id=task.browser_session_id,
                )
                # FIXME: we only support one recording for now
                session_recording_url = session_recordings[0].url if session_recordings else None
        except asyncio.TimeoutError:
            LOG.warning("Timeout getting recordings", browser_session_id=task.browser_session_id)

    if session_recording_url is None:
        earliest_step = await app.DATABASE.get_first_step(task_id=task_id, organization_id=organization_id)
        if earliest_step:
            step_recording_artifact = await app.DATABASE.get_artifact(
                task_id=task_id,
                step_id=earliest_step.step_id,
                artifact_type=ArtifactType.RECORDING,
                organization_id=organization_id,
            )
            if step_recording_artifact:
                session_recording_url = await app.ARTIFACT_MANAGER.get_share_link(step_recording_artifact)

    # share links for the last TASK_RESPONSE_ACTION_SCREENSHOT_COUNT action screenshots
    action_screenshot_urls: list[str] | None = None
    recent_action_screenshots = await app.DATABASE.get_latest_n_artifacts(
        task_id=task_id,
        organization_id=organization_id,
        artifact_types=[ArtifactType.SCREENSHOT_ACTION],
        n=settings.TASK_RESPONSE_ACTION_SCREENSHOT_COUNT,
    )
    if recent_action_screenshots:
        action_screenshot_urls = await app.ARTIFACT_MANAGER.get_share_links(recent_action_screenshots)

    # downloaded files are best-effort: a timeout or failure leaves the field unset
    files: list[FileInfo] | None = None
    if organization_id:
        try:
            async with asyncio.timeout(GET_DOWNLOADED_FILES_TIMEOUT):
                active_context = skyvern_context.current()
                if active_context and active_context.run_id:
                    files_run_id = active_context.run_id
                else:
                    files_run_id = task.workflow_run_id or task_id
                files = await app.STORAGE.get_downloaded_files(
                    organization_id=organization_id,
                    run_id=files_run_id,
                )
        except asyncio.TimeoutError:
            LOG.warning(
                "Timeout to get downloaded files",
                task_id=task.task_id,
                workflow_run_id=task.workflow_run_id,
            )
        except Exception:
            LOG.warning(
                "Failed to get downloaded files",
                exc_info=True,
                task_id=task.task_id,
                workflow_run_id=task.workflow_run_id,
            )

    console_log_url: str | None = None
    if need_browser_log:
        console_log_artifact = await app.DATABASE.get_latest_artifact(
            task_id=task_id,
            artifact_types=[ArtifactType.BROWSER_CONSOLE_LOG],
            organization_id=organization_id,
        )
        if console_log_artifact:
            console_log_url = await app.ARTIFACT_MANAGER.get_share_link(console_log_artifact)

    # get the latest task from the db to get the latest status, extracted_information, and failure_reason
    latest_task = await app.DATABASE.get_task(task_id=task_id, organization_id=organization_id)
    if not latest_task:
        LOG.error("Failed to get task from db when sending task response")
        raise TaskNotFound(task_id=task_id)

    return latest_task.to_task_response(
        action_screenshot_urls=action_screenshot_urls,
        screenshot_url=final_screenshot_url,
        recording_url=session_recording_url,
        browser_console_log_url=console_log_url,
        downloaded_files=files,
        failure_reason=failure_reason,
        step_count=step_count,
    )
|
|
|
|
async def cleanup_browser_and_create_artifacts(
    self,
    close_browser_on_completion: bool,
    last_step: Step,
    task: Task,
    browser_session_id: str | None = None,
) -> None:
    """
    Release the task's browser and persist the artifacts it produced.

    After the browser manager has cleaned up the task, video artifacts are updated and
    HAR, console log and trace artifacts are created on ``last_step``. When no browser
    state is returned, only a warning is logged.

    Developer notes: we should not expect any exception to be raised here.
    This function should handle exceptions gracefully.
    If errors are raised and not caught inside this function, please catch and handle them.
    """
    # We need to close the browser even if there is no webhook callback url or api key
    released_state = await app.BROWSER_MANAGER.cleanup_for_task(
        task.task_id,
        close_browser_on_completion,
        browser_session_id,
        task.organization_id,
    )
    if not released_state:
        LOG.warning(
            "BrowserState is missing before sending response to webhook_callback_url",
            web_hook_url=task.webhook_callback_url,
        )
        return

    # Update recording artifact after closing the browser, so we can get an accurate recording
    recordings = await app.BROWSER_MANAGER.get_video_artifacts(task_id=task.task_id, browser_state=released_state)
    for recording in recordings:
        await app.ARTIFACT_MANAGER.update_artifact_data(
            artifact_id=recording.video_artifact_id,
            organization_id=task.organization_id,
            data=recording.video_data,
        )

    har_payload = await app.BROWSER_MANAGER.get_har_data(task_id=task.task_id, browser_state=released_state)
    if har_payload:
        await app.ARTIFACT_MANAGER.create_artifact(
            step=last_step,
            artifact_type=ArtifactType.HAR,
            data=har_payload,
        )

    console_log = await app.BROWSER_MANAGER.get_browser_console_log(
        task_id=task.task_id, browser_state=released_state
    )
    if console_log:
        await app.ARTIFACT_MANAGER.create_artifact(
            step=last_step,
            artifact_type=ArtifactType.BROWSER_CONSOLE_LOG,
            data=console_log,
        )

    # The trace archive is looked up by task id inside the configured traces directory.
    if released_state.browser_context and released_state.browser_artifacts.traces_dir:
        await app.ARTIFACT_MANAGER.create_artifact(
            step=last_step,
            artifact_type=ArtifactType.TRACE,
            path=f"{released_state.browser_artifacts.traces_dir}/{task.task_id}.zip",
        )
|
|
|
|
async def update_step(
    self,
    step: Step,
    status: StepStatus | None = None,
    output: AgentStepOutput | None = None,
    is_last: bool | None = None,
    retry_index: int | None = None,
) -> Step:
    """
    Validate and persist a partial update to a step.

    Only the arguments that are not None are written. The requested change is
    checked with ``step.validate_update``, a diff of the changed fields is logged
    (``output`` is left out of the diff), a duration metric is logged when the new
    status is completed or failed, the step logs are saved, and finally the update
    is sent to the database.

    Returns the step returned by ``app.DATABASE.update_step``.
    """
    step.validate_update(status, output, is_last)

    # Insertion order matters only for the logged diff; None means "leave unchanged".
    requested_fields = {
        "status": status,
        "output": output,
        "is_last": is_last,
        "retry_index": retry_index,
    }
    updates: dict[str, Any] = {name: value for name, value in requested_fields.items() if value is not None}

    changed_fields: dict[str, dict[str, Any]] = {}
    for field_name, new_value in updates.items():
        old_value = getattr(step, field_name)
        if old_value != new_value and field_name != "output":
            changed_fields[field_name] = {"old": old_value, "new": new_value}
    LOG.debug(
        "Updating step in db",
        diff=changed_fields,
    )

    # Track step duration when step is completed or failed
    if status in (StepStatus.completed, StepStatus.failed):
        now = datetime.now(UTC)
        elapsed = now - step.created_at.replace(tzinfo=UTC)
        LOG.info(
            "Step duration metrics",
            duration_seconds=elapsed.total_seconds(),
            step_status=status,
            organization_id=step.organization_id,
        )

    await save_step_logs(step.step_id)

    persisted_step = await app.DATABASE.update_step(
        task_id=step.task_id,
        step_id=step.step_id,
        organization_id=step.organization_id,
        **updates,
    )
    return persisted_step
|
|
|
|
async def update_task(
    self,
    task: Task,
    status: TaskStatus,
    extracted_information: dict[str, Any] | list | str | None = None,
    failure_reason: str | None = None,
    webhook_failure_reason: str | None = None,
    errors: list[dict[str, Any]] | None = None,
) -> Task:
    """
    Validate and persist a status update (plus optional result fields) for a task.

    The task is re-read from the database first so validation runs against the
    latest stored state. Only arguments that are not None are written. A duration
    metric is logged when the new status is completed, failed or terminated, the
    task logs are saved, and the update is sent to the database.

    NOTE(review): ``webhook_failure_reason`` is accepted but never added to the
    update payload here — confirm whether it should be forwarded.

    Returns the task returned by ``app.DATABASE.update_task``.
    """
    # refresh task from db to get the latest status
    latest_task = await app.DATABASE.get_task(task_id=task.task_id, organization_id=task.organization_id)
    if latest_task:
        task = latest_task

    task.validate_update(status, extracted_information, failure_reason)

    requested_fields = {
        "status": status,
        "extracted_information": extracted_information,
        "failure_reason": failure_reason,
        "errors": errors,
    }
    updates: dict[str, Any] = {name: value for name, value in requested_fields.items() if value is not None}

    changed_fields: dict[str, dict[str, Any]] = {}
    for field_name, new_value in updates.items():
        old_value = getattr(task, field_name)
        if old_value != new_value:
            changed_fields[field_name] = {"old": old_value, "new": new_value}

    # Track task duration when task is completed, failed, or terminated
    if status in (TaskStatus.completed, TaskStatus.failed, TaskStatus.terminated):
        # Fall back to the creation time when the task never recorded a start time.
        if task.started_at:
            start_time = task.started_at.replace(tzinfo=UTC)
        else:
            start_time = task.created_at.replace(tzinfo=UTC)
        queued_for = start_time - task.created_at.replace(tzinfo=UTC)
        ran_for = datetime.now(UTC) - start_time
        LOG.info(
            "Task duration metrics",
            task_id=task.task_id,
            workflow_run_id=task.workflow_run_id,
            duration_seconds=ran_for.total_seconds(),
            queued_seconds=queued_for.total_seconds(),
            task_status=status,
            organization_id=task.organization_id,
            failure_reason=failure_reason,
        )

    await save_task_logs(task.task_id)
    LOG.info("Updating task in db", task_id=task.task_id, diff=changed_fields)
    persisted_task = await app.DATABASE.update_task(
        task.task_id,
        organization_id=task.organization_id,
        **updates,
    )
    return persisted_task
|
|
|
|
async def _handle_completed_step_with_parallel_verification(
    self,
    organization: Organization,
    task: Task,
    step: Step,
    page: Page | None,
    browser_state: BrowserState,
    scraped_page: ScrapedPage,
    engine: RunEngine,
    task_block: BaseTaskBlock | None = None,
) -> tuple[bool | None, Step | None, Step | None]:
    """
    Finish a completed step while verifying the user goal and speculating the next step concurrently.

    Two asyncio tasks are started:
    1. check_user_goal_complete for the current step
    2. _speculate_next_step_plan for a freshly created next step

    If verification returns an action, the speculative plan is handed to
    _persist_speculative_metadata_for_discarded_plan, the action is executed and
    recorded on the current step, and the task is marked terminated or completed.
    If verification returns nothing (or raises), the speculative plan is awaited
    and, unless the max step count is reached, stored in the context for the next step.

    Note: This should only be called when verification is needed (i.e., when
    the standard flow would have called check_user_goal_complete in agent_step).

    Returns a 3-tuple:
    - (True, last_step, None) when verification produced an action and the task was
      marked completed or terminated
    - (False, last_step, None) when the max step count was reached and the task was marked failed
    - (None, None, next_step) when execution should continue with the next step
    """

    async def _finalize_current_step() -> Step:
        # Restore the status saved in speculative_original_status (completed when unset)
        # and persist the current step as the last step of the task.
        resolved_status = step.speculative_original_status or StepStatus.completed
        step.speculative_original_status = None
        step.status = resolved_status
        return await self.update_step(
            step,
            status=resolved_status,
            output=step.output,
            is_last=True,
        )

    LOG.info(
        "Starting parallel user goal verification with speculative extract-actions",
        step_id=step.step_id,
        task_id=task.task_id,
    )

    goal_check = self.check_user_goal_complete(
        page=page,
        scraped_page=scraped_page,
        task=task,
        step=step,
    )
    verification_task = asyncio.create_task(goal_check, name=f"verify_goal_{step.step_id}")

    next_step = await app.DATABASE.create_step(
        task_id=task.task_id,
        order=step.order + 1,
        retry_index=0,
        organization_id=task.organization_id,
    )

    LOG.debug(
        "Waiting before launching speculative plan",
        step_id=step.step_id,
        task_id=task.task_id,
    )
    await asyncio.sleep(1.0)

    speculation = self._speculate_next_step_plan(
        task=task,
        current_step=step,
        next_step=next_step,
        browser_state=browser_state,
        engine=engine,
    )
    speculative_task = asyncio.create_task(speculation, name=f"speculate_next_step_{step.step_id}")

    complete_action = None
    try:
        complete_action = await verification_task
    except Exception:
        LOG.warning(
            "User goal verification failed in parallel mode, will continue with next step",
            step_id=step.step_id,
            exc_info=True,
        )

    if complete_action is not None:
        # The speculative plan is no longer needed; its bookkeeping runs in the background.
        # NOTE(review): the returned task is not referenced anywhere — confirm it cannot be
        # garbage-collected before it finishes.
        asyncio.create_task(
            self._persist_speculative_metadata_for_discarded_plan(
                next_step,
                speculative_task,
                cancel_step=True,
            )
        )

        working_page = page or await browser_state.must_get_working_page()

        # Make sure the step has an output with list fields we can append to.
        if step.output is None:
            step.output = AgentStepOutput(action_results=[], actions_and_results=[], errors=[])
        if step.output.action_results is None:
            step.output.action_results = []
        if step.output.actions_and_results is None:
            step.output.actions_and_results = []

        persisted_action = cast(Action, complete_action)
        if isinstance(persisted_action, (CompleteAction, TerminateAction)):
            persisted_action.organization_id = task.organization_id
            persisted_action.workflow_run_id = task.workflow_run_id
            persisted_action.task_id = task.task_id
            persisted_action.step_id = step.step_id
            persisted_action.step_order = step.order
            persisted_action.action_order = len(step.output.actions_and_results)

        action_results = await ActionHandler.handle_action(scraped_page, task, step, working_page, persisted_action)
        await self.record_artifacts_after_action(task, step, browser_state, engine, persisted_action)
        step.output.action_results.extend(action_results)
        step.output.actions_and_results.append((persisted_action, action_results))
        if isinstance(persisted_action, DecisiveAction) and persisted_action.errors:
            step.output.errors.extend(persisted_action.errors)

        if isinstance(persisted_action, TerminateAction):
            LOG.warning(
                "Parallel verification: termination required, marking task as terminated",
                step_id=step.step_id,
                task_id=task.task_id,
                reasoning=persisted_action.reasoning,
            )
            last_step = await _finalize_current_step()

            # When the action carries errors they take precedence over its own reasoning.
            if persisted_action.errors:
                task_errors = [error.model_dump() for error in persisted_action.errors]
                failure_reason = "; ".join(error.reasoning for error in persisted_action.errors)
            else:
                task_errors = None
                failure_reason = persisted_action.reasoning
            await self.update_task(
                task,
                status=TaskStatus.terminated,
                failure_reason=failure_reason,
                errors=task_errors,
            )
            return True, last_step, None

        needs_extraction = (
            isinstance(persisted_action, CompleteAction) and task.navigation_goal and task.data_extraction_goal
        )
        if needs_extraction:
            task = await self._run_data_extraction_after_complete_action(
                task=task,
                step=step,
                scraped_page=scraped_page,
                working_page=working_page,
            )

        LOG.info(
            "Parallel verification: goal achieved, marking task as completed",
            step_id=step.step_id,
            task_id=task.task_id,
        )
        last_step = await _finalize_current_step()
        extracted_information = await self.get_extracted_information_for_task(task)
        await self.update_task(
            task,
            status=TaskStatus.completed,
            extracted_information=extracted_information,
        )
        return True, last_step, None

    LOG.info(
        "Parallel verification: goal not achieved, awaiting speculative extract-actions",
        step_id=step.step_id,
        task_id=task.task_id,
    )

    speculative_plan = None
    try:
        speculative_plan = await speculative_task
    except CancelledError:
        LOG.debug("Speculative extract-actions cancelled after verification finished", step_id=step.step_id)
    except Exception:
        LOG.warning(
            "Speculative extract-actions failed, next step will run sequentially",
            step_id=step.step_id,
            exc_info=True,
        )

    # Resolve the step limit: context override, then task, then organization, then global setting.
    active_context = skyvern_context.current()
    override_max_steps_per_run = active_context.max_steps_override if active_context else None
    max_steps_per_run = (
        override_max_steps_per_run
        or task.max_steps_per_run
        or organization.max_steps_per_run
        or settings.MAX_STEPS_PER_RUN
    )

    if step.order + 1 >= max_steps_per_run:
        LOG.info(
            "Step completed but max steps reached, marking task as failed",
            step_order=step.order,
            step_retry=step.retry_index,
            max_steps=max_steps_per_run,
        )
        last_step = await _finalize_current_step()

        generated_failure_reason = await self.summary_failure_reason_for_max_steps(
            organization=organization,
            task=task,
            step=step,
            page=page,
        )
        failure_reason = f"Reached the maximum steps ({max_steps_per_run}). Possible failure reasons: {generated_failure_reason.reasoning}"
        errors = [ReachMaxStepsError().model_dump()]
        errors = errors + [error.model_dump() for error in generated_failure_reason.errors]

        # The next step was created up front for speculation and will never run.
        await self._cancel_speculative_step(next_step)

        await self.update_task(
            task,
            status=TaskStatus.failed,
            failure_reason=failure_reason,
            errors=errors,
        )
        return False, last_step, None

    if speculative_plan:
        plan_context = skyvern_context.ensure_context()
        plan_context.speculative_plans[next_step.step_id] = speculative_plan
        LOG.info(
            "Stored speculative extract-actions plan for next step",
            current_step_id=step.step_id,
            next_step_id=next_step.step_id,
        )

    # Only the in-memory step is updated on this path; nothing is written to the database here.
    step.status = step.speculative_original_status or StepStatus.completed
    step.speculative_original_status = None

    return None, None, next_step
|
|
|
|
async def handle_failed_step(self, organization: Organization, task: Task, step: Step) -> Step | None:
    """
    Decide what happens after a step failed: retry it or fail the task.

    The retry limit is the organization's ``max_retries_per_step`` when it is set
    (0 included), otherwise ``settings.MAX_RETRIES_PER_STEP``.

    Returns the newly created retry step (same order, ``retry_index + 1``), or None
    when the limit is reached and the task has been marked as failed.
    """
    # we need to check by None because 0 is a valid value for max_retries_per_step
    if organization.max_retries_per_step is not None:
        max_retries_per_step = organization.max_retries_per_step
    else:
        max_retries_per_step = settings.MAX_RETRIES_PER_STEP

    if step.retry_index < max_retries_per_step:
        LOG.warning(
            "Step failed, retrying",
            step_order=step.order,
            step_retry=step.retry_index,
        )
        return await app.DATABASE.create_step(
            task_id=task.task_id,
            organization_id=task.organization_id,
            order=step.order,
            retry_index=step.retry_index + 1,
        )

    LOG.warning(
        "Step failed after max retries, marking task as failed",
        step_order=step.order,
        step_retry=step.retry_index,
        # Report the limit that was actually applied, not the global default.
        max_retries=max_retries_per_step,
    )

    # A page is optional: the summary helper accepts None when no browser state exists.
    browser_state = app.BROWSER_MANAGER.get_for_task(task_id=task.task_id, workflow_run_id=task.workflow_run_id)
    page = None
    if browser_state is not None:
        page = await browser_state.get_working_page()

    failure_reason = await self.summary_failure_reason_for_max_retries(
        organization=organization,
        task=task,
        step=step,
        page=page,
        max_retries=max_retries_per_step,
    )

    await self.update_task(
        task,
        TaskStatus.failed,
        failure_reason=(
            f"Max retries per step ({max_retries_per_step}) exceeded. Possible failure reasons: {failure_reason}"
        ),
        errors=[ReachMaxRetriesError().model_dump()],
    )
    return None
|
|
|
|
async def summary_failure_reason_for_max_steps(
    self,
    organization: Organization,
    task: Task,
    step: Step,
    page: Page | None,
) -> MaxStepsReasonResponse:
    """
    Build an explanation for why the task reached its maximum number of steps.

    Resolution order:
    1. The first stored step whose output carries errors: those errors are returned directly.
    2. LLM provider errors found in action results: a fixed explanation is returned
       without calling the LLM.
    3. Otherwise the "summarize-max-steps-reason" prompt is sent to the LLM, with
       screenshots of ``page`` when one is given.

    This method does not raise for ordinary failures: any Exception is logged and a
    best-effort response is built from whatever was collected before the failure.

    ``step`` is the step the LLM call is attributed to.
    """
    steps_results = []
    llm_errors: list[str] = []

    try:
        steps = await app.DATABASE.get_task_steps(
            task_id=task.task_id, organization_id=organization.organization_id
        )
        # Use a dedicated loop variable: rebinding `step` here would make the LLM call
        # below use whichever step the loop ended on instead of the caller's step.
        for step_cnt, cur_step in enumerate(steps):
            if cur_step.output is None:
                continue

            if cur_step.output.errors:
                failure_reason = ";".join([repr(err) for err in cur_step.output.errors])
                return MaxStepsReasonResponse(
                    page_info="",
                    reasoning=failure_reason,
                    errors=cur_step.output.errors,
                )

            if cur_step.output.actions_and_results is None:
                continue

            action_result_summary: list[str] = []
            step_result: dict[str, Any] = {
                "order": step_cnt,
            }
            for action, action_results in cur_step.output.actions_and_results:
                if not action_results:
                    continue
                # Only the final result of each action is summarized.
                last_result = action_results[-1]

                # Check if this is an LLM provider error
                if not last_result.success:
                    exception_type = last_result.exception_type or ""
                    exception_message = last_result.exception_message or ""
                    if (
                        exception_type in (LLM_PROVIDER_ERROR_TYPE, LLM_PROVIDER_ERROR_RETRYABLE_TASK_TYPE)
                        or "LLMProvider" in exception_message
                    ):
                        llm_errors.append(f"Step {step_cnt}: {exception_message}")

                action_result_summary.append(
                    f"{action.reasoning}(action_type={action.action_type}, result={'success' if last_result.success else 'failed'})"
                )
            step_result["actions_result"] = action_result_summary
            steps_results.append(step_result)

        # If we detected LLM errors, return a clear message without calling the LLM
        if llm_errors:
            llm_error_details = "; ".join(llm_errors)
            return MaxStepsReasonResponse(
                page_info="",
                reasoning=(
                    f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. "
                    f"This is typically caused by rate limiting, service outages, or resource exhaustion from the LLM provider. "
                    f"Error details: {llm_error_details}"
                ),
                errors=[],
            )

        # Scrolling is turned off for CUA tasks when taking the screenshots.
        scroll = True
        if await service_utils.is_cua_task(task=task):
            scroll = False

        screenshots: list[bytes] = []
        if page is not None:
            screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url, scroll=scroll)

        prompt = prompt_engine.load_prompt(
            "summarize-max-steps-reason",
            step_count=len(steps),
            navigation_goal=task.navigation_goal,
            navigation_payload=task.navigation_payload,
            steps=steps_results,
            error_code_mapping_str=(json.dumps(task.error_code_mapping) if task.error_code_mapping else None),
            local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
        )
        json_response = await app.LLM_API_HANDLER(
            prompt=prompt, screenshots=screenshots, step=step, prompt_name="summarize-max-steps-reason"
        )
        return MaxStepsReasonResponse.model_validate(json_response)
    except Exception:
        # Best effort: keep the cause in the logs, then fall back to what was collected.
        LOG.warning("Failed to summary the failure reason", exc_info=True)
        # Check if we have LLM errors even if the summarization failed
        if llm_errors:
            llm_error_details = "; ".join(llm_errors)
            return MaxStepsReasonResponse(
                page_info="",
                reasoning=(
                    f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. "
                    f"Error details: {llm_error_details}"
                ),
                errors=[],
            )
        if steps_results:
            last_step_result = steps_results[-1]
            return MaxStepsReasonResponse(
                page_info="",
                reasoning=f"Step {last_step_result['order']}: {last_step_result['actions_result']}",
                errors=[],
            )
        return MaxStepsReasonResponse(
            page_info="",
            reasoning="",
            errors=[],
        )
|
|
|
|
async def summary_failure_reason_for_max_retries(
    self,
    organization: Organization,
    task: Task,
    step: Step,
    page: Page | None,
    max_retries: int,
) -> str:
    """
    Build an explanation for why a step exhausted its retries.

    Resolution order:
    1. LLM provider errors found in failed action results: a fixed explanation is
       returned without calling the LLM.
    2. At least one failed step without actions, and at least ``max_retries`` of them:
       a fixed explanation pointing at action extraction is returned.
    3. Otherwise the "summarize-max-retries-reason" prompt is sent to the secondary
       LLM, with the HTML and screenshots of ``page`` when one is given.

    This method does not raise for ordinary failures: any Exception is logged and a
    best-effort string is built from whatever was collected before the failure.

    ``step`` is the step the LLM call is attributed to.
    """
    html = ""
    screenshots: list[bytes] = []
    steps_results = []
    llm_errors: list[str] = []
    steps_without_actions = 0

    try:
        steps = await app.DATABASE.get_task_steps(
            task_id=task.task_id, organization_id=organization.organization_id
        )

        # Check for LLM provider errors in the failed steps
        for step_cnt, cur_step in enumerate(steps[-max_retries:]):
            has_actions = bool(cur_step.output and cur_step.output.actions_and_results)

            # If step failed with no actions, it might be an LLM error during action extraction
            if cur_step.status == StepStatus.failed and not has_actions:
                steps_without_actions += 1

            # NOTE(review): action results are summarized for every step in the window,
            # not only failed ones — confirm this matches the intended nesting.
            if not has_actions:
                continue

            action_result_summary: list[str] = []
            step_result: dict[str, Any] = {
                "order": step_cnt,
            }
            for action, action_results in cur_step.output.actions_and_results:
                if not action_results:
                    continue
                # Only failed final results are of interest here.
                last_result = action_results[-1]
                if last_result.success:
                    continue
                reason = last_result.exception_message or ""

                # Check if this is an LLM provider error
                exception_type = last_result.exception_type or ""
                if (
                    exception_type in (LLM_PROVIDER_ERROR_TYPE, LLM_PROVIDER_ERROR_RETRYABLE_TASK_TYPE)
                    or "LLMProvider" in reason
                ):
                    llm_errors.append(f"Step {step_cnt}: {reason}")

                action_result_summary.append(
                    f"{action.reasoning}(action_type={action.action_type}, result=failed, reason={reason})"
                )
            step_result["actions_result"] = action_result_summary
            steps_results.append(step_result)

        # If we detected LLM errors, return a clear message without calling the LLM
        if llm_errors:
            llm_error_details = "; ".join(llm_errors)
            return (
                f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. "
                f"This is typically caused by rate limiting, service outages, or resource exhaustion from the LLM provider. "
                f"Error details: {llm_error_details}"
            )

        # If multiple steps failed without producing any actions, it's likely an LLM error during action extraction.
        # Require at least one such step: with max_retries == 0 a bare `>=` comparison is always
        # true and would hide the real failure summary behind this message.
        if steps_without_actions > 0 and steps_without_actions >= max_retries:
            return (
                f"The task failed because all {max_retries} retry attempts failed to generate actions. "
                f"This is typically caused by LLM service errors during action extraction, such as rate limiting, "
                f"service outages, or resource exhaustion from the LLM provider. Please check the LLM service status and try again."
            )

        if page is not None:
            skyvern_frame = await SkyvernFrame.create_instance(frame=page)
            html = await skyvern_frame.get_content()
            screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url)

        prompt = prompt_engine.load_prompt(
            "summarize-max-retries-reason",
            navigation_goal=task.navigation_goal,
            navigation_payload=task.navigation_payload,
            steps=steps_results,
            page_html=html,
            max_retries=max_retries,
            local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
        )
        json_response = await app.SECONDARY_LLM_API_HANDLER(
            prompt=prompt,
            screenshots=screenshots,
            step=step,
            prompt_name="summarize-max-retries-reason",
        )
        return json_response.get("reasoning", "")
    except Exception:
        # Best effort: keep the cause in the logs, then fall back to what was collected.
        LOG.warning("Failed to summarize the failure reason for max retries", exc_info=True)
        # Check if we have LLM errors even if the summarization failed
        if llm_errors:
            llm_error_details = "; ".join(llm_errors)
            return (
                f"The task failed due to LLM service errors. The LLM provider encountered errors and was unable to process the requests. "
                f"Error details: {llm_error_details}"
            )
        # If multiple steps failed without actions during summarization failure, still report it
        if steps_without_actions > 0 and steps_without_actions >= max_retries:
            return (
                f"The task failed because all {max_retries} retry attempts failed to generate actions. "
                f"This is typically caused by LLM service errors during action extraction."
            )
        if steps_results:
            last_step_result = steps_results[-1]
            return f"Retry Step {last_step_result['order']}: {last_step_result['actions_result']}"
        return ""
|
|
|
|
async def handle_completed_step(
    self,
    organization: Organization,
    task: Task,
    step: Step,
    page: Page | None,
    task_block: BaseTaskBlock | None = None,
    browser_state: BrowserState | None = None,
    scraped_page: ScrapedPage | None = None,
    engine: RunEngine = RunEngine.skyvern_v1,
    complete_verification: bool = True,
) -> tuple[bool | None, Step | None, Step | None]:
    """
    Decide what happens after a step completed.

    When verification applies, a browser state and scraped page are available, and the
    "DISABLE_USER_GOAL_CHECK" feature flag is off, the work is delegated to
    _handle_completed_step_with_parallel_verification. Otherwise the step is
    evaluated in this order: goal achieved, terminated, successful action block,
    max steps reached, and finally "create the next step".

    Returns a 3-tuple:
    - (True, last_step, None) when the task was marked completed
    - (False, last_step, None) when the task was marked terminated or failed
    - (None, None, next_step) when execution should continue with the next step
    """
    # Check if parallel verification should be used
    # Only use it when we have the required data AND when verification would normally happen
    task_completes_on_download = task_block and task_block.complete_on_download and task.workflow_run_id
    should_verify = (
        complete_verification
        and not step.is_goal_achieved()
        and not step.is_terminated()
        and not isinstance(task_block, ActionBlock)
        and not task_completes_on_download
        and (task.navigation_goal or task.complete_criterion)
    )

    if should_verify and browser_state and scraped_page:
        disable_user_goal_check = await app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
            "DISABLE_USER_GOAL_CHECK",
            task.workflow_run_id or task.task_id,
            properties={"task_url": task.url, "organization_id": task.organization_id},
        )

        if not disable_user_goal_check:
            return await self._handle_completed_step_with_parallel_verification(
                organization=organization,
                task=task,
                step=step,
                page=page,
                browser_state=browser_state,
                scraped_page=scraped_page,
                engine=engine,
                task_block=task_block,
            )

        LOG.info(
            "User goal verification disabled via feature flag",
            step_id=step.step_id,
            task_id=task.task_id,
        )

    if step.is_goal_achieved():
        LOG.info(
            "Step completed and goal achieved, marking task as completed",
            step_order=step.order,
            step_retry=step.retry_index,
            output=step.output,
        )
        last_step = await self.update_step(step, is_last=True)
        extracted_information = await self.get_extracted_information_for_task(task)
        await self.update_task(
            task,
            status=TaskStatus.completed,
            extracted_information=extracted_information,
        )
        return True, last_step, None

    if step.is_terminated():
        LOG.info(
            "Step completed and terminated by the agent, marking task as terminated",
            step_order=step.order,
            step_retry=step.retry_index,
            output=step.output,
        )
        last_step = await self.update_step(step, is_last=True)
        failure_reason = await self.get_failure_reason_for_task(task)
        await self.update_task(task, status=TaskStatus.terminated, failure_reason=failure_reason)
        return False, last_step, None

    # HACK: action block only have one step to execute without complete action, so we consider the task is completed as long as the step is completed
    if isinstance(task_block, ActionBlock) and step.is_success():
        LOG.info(
            "Step completed for the action block, marking task as completed",
            step_order=step.order,
            step_retry=step.retry_index,
            output=step.output,
        )
        last_step = await self.update_step(step, is_last=True)
        await self.update_task(
            task,
            status=TaskStatus.completed,
        )
        return True, last_step, None

    # If the max steps are exceeded, mark the current step as the last step and conclude the task.
    # Limit resolution: context override, then task, then organization, then global setting.
    active_context = skyvern_context.current()
    override_max_steps_per_run = active_context.max_steps_override if active_context else None
    max_steps_per_run = (
        override_max_steps_per_run
        or task.max_steps_per_run
        or organization.max_steps_per_run
        or settings.MAX_STEPS_PER_RUN
    )

    if step.order + 1 >= max_steps_per_run:
        LOG.info(
            "Step completed but max steps reached, marking task as failed",
            step_order=step.order,
            step_retry=step.retry_index,
            max_steps=max_steps_per_run,
        )
        last_step = await self.update_step(step, is_last=True)

        generated_failure_reason = await self.summary_failure_reason_for_max_steps(
            organization=organization,
            task=task,
            step=step,
            page=page,
        )
        failure_reason = f"Reached the maximum steps ({max_steps_per_run}). Possible failure reasons: {generated_failure_reason.reasoning}"
        errors = [ReachMaxStepsError().model_dump()]
        errors = errors + [error.model_dump() for error in generated_failure_reason.errors]

        await self.update_task(
            task,
            status=TaskStatus.failed,
            failure_reason=failure_reason,
            errors=errors,
        )
        return False, last_step, None

    LOG.info(
        "Step completed, creating next step",
        step_order=step.order,
        step_retry=step.retry_index,
    )
    next_step = await app.DATABASE.create_step(
        task_id=task.task_id,
        order=step.order + 1,
        retry_index=0,
        organization_id=task.organization_id,
    )

    # Emit a single warning when the task reaches the configured fraction of its step budget.
    warning_order = int(max_steps_per_run * settings.LONG_RUNNING_TASK_WARNING_RATIO - 1)
    if step.order == warning_order:
        LOG.info(
            "Long running task warning",
            order=step.order,
            max_steps=max_steps_per_run,
            warning_ratio=settings.LONG_RUNNING_TASK_WARNING_RATIO,
        )
    return None, None, next_step
|
|
|
|
async def handle_potential_OTP_actions(
    self,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    browser_state: BrowserState,
    json_response: dict[str, Any],
) -> tuple[dict[str, Any], list[Action]]:
    """
    Handle the OTP hints found in an LLM response.

    Nothing is done unless the task has an organization id and at least one of
    ``totp_verification_url`` / ``totp_identifier``. When the response both names a
    place to enter a verification code and asks to enter one, the verification-code
    flow runs and the actions are parsed from the response it returns. Otherwise,
    when the response asks for magic-link verification, the magic-link flow runs.

    Returns the (possibly replaced) response together with the resulting actions;
    the action list is empty when no OTP handling applied.
    """
    has_totp_source = bool(task.totp_verification_url or task.totp_identifier)
    if not task.organization_id or not has_totp_source:
        return json_response, []

    wants_magic_link = json_response.get("should_verify_by_magic_link")
    code_input_location = json_response.get("place_to_enter_verification_code")
    wants_code_entry = json_response.get("should_enter_verification_code")

    if not (wants_magic_link or code_input_location or wants_code_entry):
        return json_response, []

    # Entering a code takes precedence over following a magic link.
    if code_input_location and wants_code_entry:
        json_response = await self.handle_potential_verification_code(
            task, step, scraped_page, browser_state, json_response
        )
        parsed_actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"])
        return json_response, parsed_actions

    if wants_magic_link:
        magic_link_actions = await self.handle_potential_magic_link(
            task, step, scraped_page, browser_state, json_response
        )
        return json_response, magic_link_actions

    return json_response, []
|
|
|
|
async def handle_potential_magic_link(
    self,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    browser_state: BrowserState,
    json_response: dict[str, Any],
) -> list[Action]:
    """
    Turn a magic-link verification request into a navigation action.

    When the response asks for magic-link verification, an OTP value is polled for
    the task. If a magic-link value comes back, a new page is opened, registered on
    the context via ``add_magic_link_page``, and a single GotoUrlAction pointing at
    the link is returned.

    Returns an empty list when the response does not ask for a magic link or when
    no magic-link value was obtained.
    """
    if not json_response.get("should_verify_by_magic_link"):
        return []

    LOG.info("Handling magic link verification", task_id=task.task_id)
    otp_value = await poll_otp_value(
        organization_id=task.organization_id,
        task_id=task.task_id,
        workflow_run_id=task.workflow_run_id,
        totp_verification_url=task.totp_verification_url,
        totp_identifier=task.totp_identifier,
    )
    if not otp_value or otp_value.get_otp_type() != OTPType.MAGIC_LINK:
        return []

    # always open a new tab to navigate to the magic link
    magic_link_page = await browser_state.new_page()
    skyvern_context.ensure_context().add_magic_link_page(task.task_id, magic_link_page)

    navigation_reason = "Navigating to the magic link URL to verify the login"
    goto_magic_link = GotoUrlAction(
        reasoning=navigation_reason,
        intention=navigation_reason,
        url=otp_value.value,
        organization_id=task.organization_id,
        workflow_run_id=task.workflow_run_id,
        task_id=task.task_id,
        step_id=step.step_id,
        step_order=step.order,
        action_order=0,
        is_magic_link=True,
    )
    return [goto_magic_link]
|
|
|
|
async def handle_potential_verification_code(
    self,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    browser_state: BrowserState,
    json_response: dict[str, Any],
) -> dict[str, Any]:
    """
    Fetch a TOTP code when the LLM response asks for one and re-run action extraction.

    The flow only runs when the response names a place to enter the code and asks to
    enter it, and the task has an organization id plus a ``totp_verification_url`` or
    ``totp_identifier``. The polled code is stored in the context's ``totp_codes``
    under the task id, the extract-action prompt is rebuilt with
    ``verification_code_check=False``, and the LLM is called again.

    Returns the new LLM response, or the original ``json_response`` unchanged when
    the flow does not apply or no TOTP value was obtained.
    """
    code_input_location = json_response.get("place_to_enter_verification_code")
    wants_code_entry = json_response.get("should_enter_verification_code")
    needs_code = (
        code_input_location
        and wants_code_entry
        and (task.totp_verification_url or task.totp_identifier)
        and task.organization_id
    )
    if not needs_code:
        return json_response

    LOG.info("Need verification code")

    # Workflow identifiers are only available when the task runs inside a workflow run.
    workflow_id = None
    workflow_permanent_id = None
    if task.workflow_run_id:
        workflow_run = await app.DATABASE.get_workflow_run(task.workflow_run_id)
        if workflow_run:
            workflow_id = workflow_run.workflow_id
            workflow_permanent_id = workflow_run.workflow_permanent_id

    otp_value = await poll_otp_value(
        organization_id=task.organization_id,
        task_id=task.task_id,
        workflow_id=workflow_id,
        workflow_run_id=task.workflow_run_id,
        workflow_permanent_id=workflow_permanent_id,
        totp_verification_url=task.totp_verification_url,
        totp_identifier=task.totp_identifier,
    )
    if not otp_value or otp_value.get_otp_type() != OTPType.TOTP:
        return json_response

    skyvern_context.ensure_context().totp_codes[task.task_id] = otp_value.value

    extract_action_prompt, use_caching, prompt_name = await self._build_extract_action_prompt(
        task,
        step,
        browser_state,
        scraped_page,
        verification_code_check=False,
    )

    # CUA tasks do not use the task-level LLM key override.
    if await service_utils.is_cua_task(task=task):
        llm_key_override = None
    else:
        llm_key_override = task.llm_key
    llm_api_handler = LLMAPIHandlerFactory.get_override_llm_api_handler(
        llm_key_override, default=app.LLM_API_HANDLER
    )

    # Add caching flag to context for monitoring
    if use_caching:
        monitoring_context = skyvern_context.current()
        if monitoring_context:
            monitoring_context.use_prompt_caching = True

    return await llm_api_handler(
        prompt=extract_action_prompt,
        step=step,
        screenshots=scraped_page.screenshots,
        prompt_name=prompt_name,
    )
|
|
|
|
@staticmethod
async def get_task_errors(task: Task) -> list[UserDefinedError]:
    """
    Collect the errors recorded on the outputs of all steps of a task.

    Steps are read from the database; steps without an output or without errors
    are skipped. Errors keep the order in which the steps are returned.
    """
    task_steps = await app.DATABASE.get_task_steps(task_id=task.task_id, organization_id=task.organization_id)
    return [
        step_error
        for task_step in task_steps
        if task_step.output and task_step.output.errors
        for step_error in task_step.output.errors
    ]
|
|
|
|
@staticmethod
async def update_task_errors_from_detailed_output(
    task: Task, detailed_step_output: DetailedAgentStepOutput
) -> Task:
    """
    Append the errors extracted from a detailed step output to the task and persist them.

    The errors are dumped with ``model_dump`` and added to ``task.errors`` in place,
    so the passed-in task object is mutated as well.

    Returns the task returned by ``app.DATABASE.update_task``.
    """
    accumulated_errors = task.errors
    extracted_errors = detailed_step_output.extract_errors() or []
    dumped_errors = [extracted.model_dump() for extracted in extracted_errors]
    accumulated_errors.extend(dumped_errors)

    persisted_task = await app.DATABASE.update_task(
        task_id=task.task_id,
        organization_id=task.organization_id,
        errors=accumulated_errors,
    )
    return persisted_task
|
|
|
|
@staticmethod
async def create_extract_action(task: Task, step: Step, scraped_page: ScrapedPage) -> ExtractAction:
    """
    Build the ExtractAction used to pull the task's requested data from the page.

    The "data-extraction-summary" prompt is rendered from the task's extraction
    goal and schema, the scraped page URL and the current time in the context's
    timezone. It is sent to ``app.EXTRACTION_LLM_API_HANDLER``, and the
    "summary" field of the response becomes the action's reasoning. A fixed
    fallback sentence is used when the response has no "summary" key.
    """
    summary_prompt_name = "data-extraction-summary"
    current_context = skyvern_context.ensure_context()
    local_timestamp = datetime.now(current_context.tz_info).isoformat()

    # Ask the LLM to briefly reason about what data should be extracted.
    summary_prompt = prompt_engine.load_prompt(
        summary_prompt_name,
        data_extraction_goal=task.data_extraction_goal,
        data_extraction_schema=task.extracted_information_schema,
        current_url=scraped_page.url,
        local_datetime=local_timestamp,
    )
    summary_response = await app.EXTRACTION_LLM_API_HANDLER(
        prompt=summary_prompt,
        step=step,
        prompt_name=summary_prompt_name,
    )
    reasoning_text = summary_response.get("summary", "Extracting information from the page")

    return ExtractAction(
        reasoning=reasoning_text,
        data_extraction_goal=task.data_extraction_goal,
        data_extraction_schema=task.extracted_information_schema,
        organization_id=task.organization_id,
        task_id=task.task_id,
        workflow_run_id=task.workflow_run_id,
        step_id=step.step_id,
        step_order=step.order,
        action_order=0,
        confidence_float=1.0,
    )
|
|
|
|
@staticmethod
def step_has_completed_goal(detailed_agent_step_output: DetailedAgentStepOutput) -> bool:
    """
    Tell whether the step ended with a successful COMPLETE or EXTRACT action.

    Returns False when the step recorded no actions, or when its last action is
    of any other type. Otherwise returns True if at least one result of that
    last action reports success.
    """
    recorded_pairs = detailed_agent_step_output.actions_and_results
    if not recorded_pairs:
        return False

    # Only the final (action, results) pair decides the outcome.
    final_action, final_results = recorded_pairs[-1]
    if final_action.action_type in (ActionType.COMPLETE, ActionType.EXTRACT):
        return any(result.success for result in final_results)
    return False
|
|
|
|
async def _run_data_extraction_after_complete_action(
    self,
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
    working_page: Page,
) -> Task:
    """
    Run the extraction flow when a task with a data extraction goal completes during parallel verification.

    The task is re-read from the database first, and the passed-in task is kept
    if that lookup returns nothing. An extract action is then created, executed
    through ``ActionHandler.handle_action`` and reported to
    ``app.AGENT_FUNCTION.post_action_execution``. Its results are recorded on
    ``step.output``, which is created or back-filled with empty lists if needed.

    Returns the (possibly refreshed) task.
    """
    latest_task = await app.DATABASE.get_task(task.task_id, task.organization_id)
    task = latest_task or task

    extraction_action = await self.create_extract_action(task, step, scraped_page)
    extraction_results = await ActionHandler.handle_action(
        scraped_page, task, step, working_page, extraction_action
    )
    await app.AGENT_FUNCTION.post_action_execution(extraction_action)

    # Make sure there is an output object with list containers to record into.
    if step.output is None:
        step.output = AgentStepOutput(action_results=[], actions_and_results=[], errors=[])
    # Re-read from the step after the assignment so we mutate what the step holds.
    step_output = step.output
    if step_output.action_results is None:
        step_output.action_results = []
    if step_output.actions_and_results is None:
        step_output.actions_and_results = []

    step_output.action_results.extend(extraction_results)
    step_output.actions_and_results.append((extraction_action, extraction_results))

    return task
|