from typing import Any, Dict import structlog from openai.types.responses.response import Response as OpenAIResponse from pydantic import ValidationError from skyvern.exceptions import UnsupportedActionType from skyvern.forge.sdk.models import Step from skyvern.forge.sdk.schemas.tasks import Task from skyvern.webeye.actions.actions import ( Action, ActionType, CheckboxAction, ClickAction, CompleteAction, DownloadFileAction, InputTextAction, KeypressAction, NullAction, ScrollAction, SelectOption, SelectOptionAction, SolveCaptchaAction, TerminateAction, UploadFileAction, WaitAction, ) from skyvern.webeye.scraper.scraper import ScrapedPage LOG = structlog.get_logger() def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extraction_goal: str | None = None) -> Action: if "id" in action: element_id = action["id"] elif "element_id" in action: element_id = action["element_id"] else: element_id = None skyvern_element_hash = scraped_page.id_to_element_hash.get(element_id) if element_id else None skyvern_element_data = scraped_page.id_to_element_dict.get(element_id) if element_id else None reasoning = action["reasoning"] if "reasoning" in action else None confidence_float = action["confidence_float"] if "confidence_float" in action else None # TODO: currently action intention and response are only used for Q&A actions, like input_text # When we start supporting click action, intention will be the reasoning for the click action (why take the action) intention = action["user_detail_query"] if "user_detail_query" in action else None response = action["user_detail_answer"] if "user_detail_answer" in action else None base_action_dict = { "element_id": element_id, "skyvern_element_hash": skyvern_element_hash, "skyvern_element_data": skyvern_element_data, "reasoning": reasoning, "confidence_float": confidence_float, "intention": intention, "response": response, } if "action_type" not in action or action["action_type"] is None: return NullAction(**base_action_dict) # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK") action_type = ActionType[action["action_type"].upper()] if not action_type.is_web_action(): # LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc. # That can sometimes cause cached action plan to be invalidated. This way we're making sure the element id is not # set for non-web actions. base_action_dict["element_id"] = None if action_type == ActionType.TERMINATE: return TerminateAction(**base_action_dict, errors=action["errors"] if "errors" in action else []) if action_type == ActionType.CLICK: file_url = action["file_url"] if "file_url" in action else None return ClickAction(**base_action_dict, file_url=file_url, download=action.get("download", False)) if action_type == ActionType.INPUT_TEXT: return InputTextAction(**base_action_dict, text=action["text"]) if action_type == ActionType.UPLOAD_FILE: # TODO: see if the element is a file input element. if it's not, convert this action into a click action return UploadFileAction( **base_action_dict, file_url=action["file_url"], ) # This action is not used in the current implementation. Click actions are used instead. if action_type == ActionType.DOWNLOAD_FILE: return DownloadFileAction(**base_action_dict, file_name=action["file_name"]) if action_type == ActionType.SELECT_OPTION: option = action["option"] if option is None: raise ValueError("SelectOptionAction requires an 'option' field") label = option.get("label") value = option.get("value") index = option.get("index") if label is None and value is None and index is None: raise ValueError("At least one of 'label', 'value', or 'index' must be provided for a SelectOption") return SelectOptionAction( **base_action_dict, option=SelectOption( label=label, value=value, index=index, ), ) if action_type == ActionType.CHECKBOX: return CheckboxAction( **base_action_dict, is_checked=action["is_checked"], ) if action_type == ActionType.WAIT: return WaitAction(**base_action_dict) if action_type == ActionType.COMPLETE: return CompleteAction( **base_action_dict, data_extraction_goal=data_extraction_goal, errors=action["errors"] if "errors" in action else [], ) if action_type == "null": return NullAction(**base_action_dict) if action_type == ActionType.SOLVE_CAPTCHA: return SolveCaptchaAction(**base_action_dict) raise UnsupportedActionType(action_type=action_type) def parse_actions( task: Task, step_id: str, step_order: int, scraped_page: ScrapedPage, json_response: list[Dict[str, Any]] ) -> list[Action]: actions: list[Action] = [] for idx, action in enumerate(json_response): try: action_instance = parse_action( action=action, scraped_page=scraped_page, data_extraction_goal=task.data_extraction_goal ) action_instance.organization_id = task.organization_id action_instance.workflow_run_id = task.workflow_run_id action_instance.task_id = task.task_id action_instance.step_id = step_id action_instance.step_order = step_order action_instance.action_order = idx if isinstance(action_instance, TerminateAction): LOG.warning( "Agent decided to terminate", task_id=task.task_id, llm_response=json_response, reasoning=action_instance.reasoning, actions=actions, ) actions.append(action_instance) except UnsupportedActionType: LOG.error( "Unsupported action type when parsing actions", task_id=task.task_id, raw_action=action, exc_info=True, ) except (ValidationError, ValueError): LOG.warning( "Invalid action", task_id=task.task_id, raw_action=action, exc_info=True, ) except Exception: LOG.error( "Failed to marshal action", task_id=task.task_id, raw_action=action, exc_info=True, ) ############################ This part of code might not be needed ############################ # Reason #1. validation can be done in action handler but not in parser # Reason #2. no need to validate whether the element_id has a hash. # If there's no hash, we can fall back to normal operation all_element_ids = [action.element_id for action in actions if action.element_id] missing_element_ids = [ element_id for element_id in all_element_ids if element_id not in scraped_page.id_to_element_hash ] if missing_element_ids: LOG.warning( "Missing elements in scraped page", task_id=task.task_id, missing_element_ids=missing_element_ids, all_element_ids=all_element_ids, ) ############################ This part of code might not be needed ############################ return actions def parse_cua_actions( task: Task, step: Step, response: OpenAIResponse, ) -> list[Action]: computer_calls = [item for item in response.output if item.type == "computer_call"] reasonings = [item for item in response.output if item.type == "reasoning"] actions: list[Action] = [] for idx, computer_call in enumerate(computer_calls): cua_action = computer_call.action action_type = cua_action.type try: reasoning = None if idx < len(reasonings): try: reasoning = reasonings[idx].summary[0].text except Exception: LOG.exception( "Failed to parse reasoning", task_id=task.task_id, step_id=step.step_id, step_order=step.order, action_order=idx, ) match action_type: case "click": button = cua_action.button if button != "left" and button != "right": button = "left" reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})" action = ClickAction( element_id="", x=cua_action.x, y=cua_action.y, button=button, reasoning=reasoning, intention=reasoning, response=f"Click at: ({cua_action.x}, {cua_action.y})", ) case "scroll": reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})" action = ScrollAction( element_id="", x=cua_action.x, y=cua_action.y, scroll_x=cua_action.scroll_x, scroll_y=cua_action.scroll_y, reasoning=reasoning, intention=reasoning, response=f"Scroll by: ({cua_action.x}, {cua_action.y})", ) case "keypress": reasoning_str = f"Press keys: {cua_action.keys}" if len(cua_action.keys) == 1: reasoning_str = f"Press the '{cua_action.keys[0]}' key" reasoning = reasoning or reasoning_str action = KeypressAction( element_id="", keys=cua_action.keys, reasoning=reasoning, intention=reasoning, response=str(cua_action.keys), ) case "type": action = InputTextAction( element_id="", text=cua_action.text, reasoning=reasoning, intention=reasoning, response=cua_action.text, ) case "wait": action = WaitAction( seconds=5, reasoning=reasoning, intention=reasoning, ) case _: raise ValueError(f"Unsupported action type: {action_type}") action.organization_id = task.organization_id action.workflow_run_id = task.workflow_run_id action.task_id = task.task_id action.step_id = step.step_id action.step_order = step.order action.action_order = idx actions.append(action) except Exception: LOG.exception( "Failed to parse action", task_id=task.task_id, step_id=step.step_id, step_order=step.order, action_order=idx, ) break if not actions: LOG.info( "Empty action returned by CUA", task_id=task.task_id, step_id=step.step_id, organization_id=task.organization_id, workflow_run_id=task.workflow_run_id, response=response.dict(), ) complete_action = CompleteAction( reasoning="No more actions to take", verified=True, data_extraction_goal=task.data_extraction_goal, organization_id=task.organization_id, workflow_run_id=task.workflow_run_id, task_id=task.task_id, step_id=step.step_id, step_order=step.order, action_order=0, ) return [complete_action] return actions