diff --git a/skyvern/config.py b/skyvern/config.py index db1079d6..223ab153 100644 --- a/skyvern/config.py +++ b/skyvern/config.py @@ -116,6 +116,7 @@ class Settings(BaseSettings): ENABLE_AZURE_O3_MINI: bool = False ENABLE_BEDROCK: bool = False ENABLE_GEMINI: bool = False + ENABLE_AZURE_CUA: bool = False # OPENAI OPENAI_API_KEY: str | None = None # ANTHROPIC @@ -125,6 +126,10 @@ class Settings(BaseSettings): AZURE_API_KEY: str | None = None AZURE_API_BASE: str | None = None AZURE_API_VERSION: str | None = None + AZURE_CUA_API_KEY: str | None = None + AZURE_CUA_ENDPOINT: str | None = None + AZURE_CUA_DEPLOYMENT: str | None = "computer-use-preview" + AZURE_CUA_API_VERSION: str | None = "2025-03-01-preview" # AZURE GPT-4o mini AZURE_GPT4O_MINI_DEPLOYMENT: str | None = None diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index d6e8e066..a5bff072 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1,4 +1,5 @@ import asyncio +import base64 import json import os import random @@ -10,6 +11,7 @@ from typing import Any, Tuple import httpx import structlog +from openai.types.responses.response import Response as OpenAIResponse from playwright._impl._errors import TargetClosedError from playwright.async_api import Page @@ -68,6 +70,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest, TaskResponse, Tas from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus +from skyvern.schemas.runs import RunEngine, RunType from skyvern.utils.prompt_engine import load_prompt_with_elements from skyvern.webeye.actions.actions import ( Action, @@ -85,7 +88,7 @@ from skyvern.webeye.actions.actions import ( from skyvern.webeye.actions.caching import retrieve_action_plan from skyvern.webeye.actions.handler import ActionHandler, poll_verification_code from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput -from skyvern.webeye.actions.parse_actions import parse_actions +from skyvern.webeye.actions.parse_actions import parse_actions, parse_cua_actions from skyvern.webeye.actions.responses import ActionResult, ActionSuccess from skyvern.webeye.browser_factory import BrowserState from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website @@ -248,6 +251,8 @@ class ForgeAgent: task_block: BaseTaskBlock | None = None, browser_session_id: str | None = None, complete_verification: bool = True, + engine: RunEngine = RunEngine.skyvern_v1, + cua_response: OpenAIResponse | None = None, ) -> Tuple[Step, DetailedAgentStepOutput | None, Step | None]: workflow_run: WorkflowRun | None = None if task.workflow_run_id: @@ -380,6 +385,8 @@ class ForgeAgent: organization=organization, task_block=task_block, complete_verification=complete_verification, + engine=engine, + cua_response=cua_response, ) await app.AGENT_FUNCTION.post_step_execution(task, step) task = await self.update_task_errors_from_detailed_output(task, detailed_output) @@ -506,6 +513,10 @@ class ForgeAgent: step_status=step.status, ) + cua_response_param = detailed_output.cua_response if detailed_output else None + if not cua_response_param and cua_response: + cua_response_param = cua_response + if retry and next_step: return await self.execute_step( organization, @@ -516,6 +527,8 @@ class ForgeAgent: browser_session_id=browser_session_id, task_block=task_block, complete_verification=complete_verification, + engine=engine, + cua_response=cua_response_param, ) elif settings.execute_all_steps() and next_step: return await self.execute_step( @@ -527,6 +540,8 @@ class ForgeAgent: browser_session_id=browser_session_id, task_block=task_block, complete_verification=complete_verification, + engine=engine, + cua_response=cua_response_param, ) else: LOG.info( @@ -757,9 +772,11 @@ class ForgeAgent: task: Task, step: Step, browser_state: BrowserState, + engine: RunEngine = RunEngine.skyvern_v1, organization: Organization | None = None, task_block: BaseTaskBlock | None = None, complete_verification: bool = True, + cua_response: OpenAIResponse | None = None, ) -> tuple[Step, DetailedAgentStepOutput]: detailed_agent_step_output = DetailedAgentStepOutput( scraped_page=None, @@ -768,6 +785,7 @@ class ForgeAgent: actions=None, action_results=None, actions_and_results=None, + cua_response=None, ) try: LOG.info( @@ -789,52 +807,62 @@ class ForgeAgent: task, step, browser_state, + engine, ) detailed_agent_step_output.scraped_page = scraped_page detailed_agent_step_output.extract_action_prompt = extract_action_prompt json_response = None actions: list[Action] - using_cached_action_plan = False - if not task.navigation_goal and not isinstance(task_block, ValidationBlock): - actions = [await self.create_extract_action(task, step, scraped_page)] - elif ( - task_block - and task_block.cache_actions - and (actions := await retrieve_action_plan(task, step, scraped_page)) - ): - using_cached_action_plan = True - else: - self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm) - json_response = await app.LLM_API_HANDLER( - prompt=extract_action_prompt, - prompt_name="extract-actions", + if engine == RunEngine.openai_cua: + actions, new_cua_response = await self._generate_cua_actions( + task=task, step=step, - screenshots=scraped_page.screenshots, + scraped_page=scraped_page, + previous_response=cua_response, ) - try: - json_response = await self.handle_potential_verification_code( - task, - step, - scraped_page, - browser_state, - json_response, + detailed_agent_step_output.cua_response = new_cua_response + else: + using_cached_action_plan = False + if not task.navigation_goal and not isinstance(task_block, ValidationBlock): + actions = [await self.create_extract_action(task, step, scraped_page)] + elif ( + task_block + and task_block.cache_actions + and (actions := await retrieve_action_plan(task, step, scraped_page)) + ): + using_cached_action_plan = True + else: + self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm) + json_response = await app.LLM_API_HANDLER( + prompt=extract_action_prompt, + prompt_name="extract-actions", + step=step, + screenshots=scraped_page.screenshots, ) - detailed_agent_step_output.llm_response = json_response - actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"]) - except NoTOTPVerificationCodeFound: - actions = [ - TerminateAction( - organization_id=task.organization_id, - workflow_run_id=task.workflow_run_id, - task_id=task.task_id, - step_id=step.step_id, - step_order=step.order, - action_order=0, - reasoning="No TOTP verification code found. Going to terminate.", - intention="No TOTP verification code found. Going to terminate.", + try: + json_response = await self.handle_potential_verification_code( + task, + step, + scraped_page, + browser_state, + json_response, ) - ] + detailed_agent_step_output.llm_response = json_response + actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"]) + except NoTOTPVerificationCodeFound: + actions = [ + TerminateAction( + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=0, + reasoning="No TOTP verification code found. Going to terminate.", + intention="No TOTP verification code found. Going to terminate.", + ) + ] detailed_agent_step_output.actions = actions if len(actions) == 0: @@ -1187,6 +1215,77 @@ class ForgeAgent: ) return failed_step, detailed_agent_step_output.get_clean_detailed_output() + async def _generate_cua_actions( + self, + task: Task, + step: Step, + scraped_page: ScrapedPage, + previous_response: OpenAIResponse | None = None, + ) -> tuple[list[Action], OpenAIResponse]: + if not previous_response: + # this is the first step + first_response: OpenAIResponse = await app.OPENAI_CLIENT.responses.create( + model="computer-use-preview", + tools=[ + { + "type": "computer_use_preview", + "display_width": settings.BROWSER_WIDTH, + "display_height": settings.BROWSER_HEIGHT, + "environment": "browser", + } + ], + input=[ + { + "role": "user", + "content": task.navigation_goal, + } + ], + reasoning={ + "generate_summary": "concise", + }, + truncation="auto", + ) + previous_response = first_response + + computer_calls = [item for item in previous_response.output if item.type == "computer_call"] + if not computer_calls: + return [], previous_response + + if not scraped_page.screenshots: + return [], previous_response + + last_call_id = computer_calls[-1].call_id + screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8") + + current_response = await app.OPENAI_CLIENT.responses.create( + model="computer-use-preview", + previous_response_id=previous_response.id, + tools=[ + { + "type": "computer_use_preview", + "display_width": settings.BROWSER_WIDTH, + "display_height": settings.BROWSER_HEIGHT, + "environment": "browser", + } + ], + input=[ + { + "call_id": last_call_id, + "type": "computer_call_output", + "output": { + "type": "input_image", + "image_url": f"data:image/png;base64,{screenshot_base64}", + }, + } + ], + reasoning={ + "generate_summary": "concise", + }, + truncation="auto", + ) + + return parse_cua_actions(task, step, current_response), current_response + @staticmethod async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult: LOG.info( @@ -1195,7 +1294,12 @@ class ForgeAgent: step_id=step.step_id, workflow_run_id=task.workflow_run_id, ) - scraped_page_refreshed = await scraped_page.refresh(draw_boxes=False) + run_obj = await app.DATABASE.get_run(run_id=task.task_id, organization_id=task.organization_id) + scroll = True + if run_obj and run_obj.task_run_type == RunType.openai_cua: + scroll = False + + scraped_page_refreshed = await scraped_page.refresh(draw_boxes=False, scroll=scroll) verification_prompt = load_prompt_with_elements( scraped_page=scraped_page_refreshed, @@ -1351,6 +1455,7 @@ class ForgeAgent: step: Step, browser_state: BrowserState, scrape_type: ScrapeType, + engine: RunEngine, ) -> ScrapedPage: if scrape_type == ScrapeType.NORMAL: pass @@ -1370,11 +1475,21 @@ class ForgeAgent: ) await browser_state.reload_page() + max_screenshot_number = settings.MAX_NUM_SCREENSHOTS + draw_boxes = True + scroll = True + if engine == RunEngine.openai_cua: + max_screenshot_number = 1 + draw_boxes = False + scroll = False return await scrape_website( browser_state, task.url, app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step), scrape_exclude=app.scrape_exclude, + max_screenshot_number=max_screenshot_number, + draw_boxes=draw_boxes, + scroll=scroll, ) async def build_and_record_step_prompt( @@ -1382,6 +1497,7 @@ class ForgeAgent: task: Task, step: Step, browser_state: BrowserState, + engine: RunEngine, ) -> tuple[ScrapedPage, str]: # start the async tasks while running scrape_website self.async_operation_pool.run_operation(task.task_id, AgentPhase.scrape) @@ -1399,6 +1515,7 @@ class ForgeAgent: step=step, browser_state=browser_state, scrape_type=scrape_type, + engine=engine, ) break except (FailedToTakeScreenshot, ScrapingFailed) as e: @@ -1431,14 +1548,16 @@ class ForgeAgent: # TODO: we only use HTML element for now, introduce a way to switch in the future element_tree_format = ElementTreeFormat.HTML element_tree_in_prompt: str = scraped_page.build_element_tree(element_tree_format) - extract_action_prompt = await self._build_extract_action_prompt( - task, - step, - browser_state, - scraped_page, - verification_code_check=bool(task.totp_verification_url or task.totp_identifier), - expire_verification_code=True, - ) + extract_action_prompt = "" + if engine != RunEngine.openai_cua: + extract_action_prompt = await self._build_extract_action_prompt( + task, + step, + browser_state, + scraped_page, + verification_code_check=bool(task.totp_verification_url or task.totp_identifier), + expire_verification_code=True, + ) await app.ARTIFACT_MANAGER.create_artifact( step=step, @@ -2146,9 +2265,14 @@ class ForgeAgent: step_result["actions_result"] = action_result_summary steps_results.append(step_result) + run_obj = await app.DATABASE.get_run(run_id=task.task_id, organization_id=task.organization_id) + scroll = True + if run_obj and run_obj.task_run_type == RunType.openai_cua: + scroll = False + screenshots: list[bytes] = [] if page is not None: - screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url) + screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url, scroll=scroll) prompt = prompt_engine.load_prompt( "summarize-max-steps-reason", diff --git a/skyvern/forge/app.py b/skyvern/forge/app.py index 9ce12a7c..34fcb155 100644 --- a/skyvern/forge/app.py +++ b/skyvern/forge/app.py @@ -1,6 +1,7 @@ from typing import Awaitable, Callable from fastapi import FastAPI +from openai import AsyncAzureOpenAI, AsyncOpenAI from skyvern.forge.agent import ForgeAgent from skyvern.forge.agent_functions import AgentFunction @@ -32,6 +33,15 @@ ARTIFACT_MANAGER = ArtifactManager() BROWSER_MANAGER = BrowserManager() EXPERIMENTATION_PROVIDER: BaseExperimentationProvider = NoOpExperimentationProvider() LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(SettingsManager.get_settings().LLM_KEY) +OPENAI_CLIENT = AsyncOpenAI(api_key=SettingsManager.get_settings().OPENAI_API_KEY) +if SettingsManager.get_settings().ENABLE_AZURE_CUA: + OPENAI_CLIENT = AsyncAzureOpenAI( + api_key=SettingsManager.get_settings().AZURE_CUA_API_KEY, + api_version=SettingsManager.get_settings().AZURE_CUA_API_VERSION, + azure_endpoint=SettingsManager.get_settings().AZURE_CUA_ENDPOINT, + azure_deployment=SettingsManager.get_settings().AZURE_CUA_DEPLOYMENT, + ) + SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler( SETTINGS_MANAGER.SECONDARY_LLM_KEY if SETTINGS_MANAGER.SECONDARY_LLM_KEY else SETTINGS_MANAGER.LLM_KEY ) diff --git a/skyvern/forge/sdk/executor/async_executor.py b/skyvern/forge/sdk/executor/async_executor.py index 9cdc7302..2bc36c15 100644 --- a/skyvern/forge/sdk/executor/async_executor.py +++ b/skyvern/forge/sdk/executor/async_executor.py @@ -10,6 +10,7 @@ from skyvern.forge.sdk.core.skyvern_context import SkyvernContext from skyvern.forge.sdk.schemas.task_v2 import TaskV2Status from skyvern.forge.sdk.schemas.tasks import TaskStatus from skyvern.forge.sdk.workflow.models.workflow import WorkflowRunStatus +from skyvern.schemas.runs import RunEngine, RunType from skyvern.services import task_v2_service LOG = structlog.get_logger() @@ -91,6 +92,10 @@ class BackgroundTaskExecutor(AsyncExecutor): status=TaskStatus.running, organization_id=organization_id, ) + run_obj = await app.DATABASE.get_run(run_id=task_id, organization_id=organization_id) + engine = RunEngine.skyvern_v1 + if run_obj and run_obj.task_run_type == RunType.openai_cua: + engine = RunEngine.openai_cua context: SkyvernContext = skyvern_context.ensure_context() context.task_id = task.task_id @@ -106,6 +111,7 @@ class BackgroundTaskExecutor(AsyncExecutor): api_key, close_browser_on_completion=close_browser_on_completion, browser_session_id=browser_session_id, + engine=engine, ) async def execute_workflow( diff --git a/skyvern/forge/sdk/routes/agent_protocol.py b/skyvern/forge/sdk/routes/agent_protocol.py index 14149dd0..42a8ccf8 100644 --- a/skyvern/forge/sdk/routes/agent_protocol.py +++ b/skyvern/forge/sdk/routes/agent_protocol.py @@ -1465,7 +1465,7 @@ async def run_task( analytics.capture("skyvern-oss-run-task", data={"url": run_request.url}) await PermissionCheckerFactory.get_instance().check(current_org, browser_session_id=run_request.browser_session_id) - if run_request.engine == RunEngine.skyvern_v1: + if run_request.engine in [RunEngine.skyvern_v1, RunEngine.openai_cua]: # create task v1 # if there's no url, call task generation first to generate the url, data schema if any url = run_request.url @@ -1497,6 +1497,7 @@ async def run_task( task_v1_response = await task_v1_service.run_task( task=task_v1_request, organization=current_org, + engine=run_request.engine, x_max_steps_override=run_request.max_steps, x_api_key=x_api_key, request=request, @@ -1577,6 +1578,8 @@ async def run_task( publish_workflow=run_request.publish_workflow, ), ) + if run_request.engine == RunEngine.openai_cua: + pass raise HTTPException(status_code=400, detail=f"Invalid agent engine: {run_request.engine}") diff --git a/skyvern/schemas/runs.py b/skyvern/schemas/runs.py index 9151a676..4492006a 100644 --- a/skyvern/schemas/runs.py +++ b/skyvern/schemas/runs.py @@ -92,11 +92,13 @@ class RunType(StrEnum): task_v1 = "task_v1" task_v2 = "task_v2" workflow_run = "workflow_run" + openai_cua = "openai_cua" class RunEngine(StrEnum): skyvern_v1 = "skyvern-1.0" skyvern_v2 = "skyvern-2.0" + openai_cua = "openai-cua" class RunStatus(StrEnum): diff --git a/skyvern/services/task_v1_service.py b/skyvern/services/task_v1_service.py index 41e06bb8..5a319e4c 100644 --- a/skyvern/services/task_v1_service.py +++ b/skyvern/services/task_v1_service.py @@ -13,7 +13,7 @@ from skyvern.forge.sdk.executor.factory import AsyncExecutorFactory from skyvern.forge.sdk.schemas.organizations import Organization from skyvern.forge.sdk.schemas.task_generations import TaskGeneration, TaskGenerationBase from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest -from skyvern.schemas.runs import RunType +from skyvern.schemas.runs import RunEngine, RunType LOG = structlog.get_logger() @@ -76,6 +76,7 @@ async def generate_task(user_prompt: str, organization: Organization) -> TaskGen async def run_task( task: TaskRequest, organization: Organization, + engine: RunEngine = RunEngine.skyvern_v1, x_max_steps_override: int | None = None, x_api_key: str | None = None, request: Request | None = None, @@ -83,8 +84,11 @@ async def run_task( ) -> Task: created_task = await app.agent.create_task(task, organization.organization_id) url_hash = generate_url_hash(task.url) + run_type = RunType.task_v1 + if engine == RunEngine.openai_cua: + run_type = RunType.openai_cua await app.DATABASE.create_task_run( - task_run_type=RunType.task_v1, + task_run_type=run_type, organization_id=organization.organization_id, run_id=created_task.task_id, title=task.title, diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index 1717dd03..0d360475 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -28,6 +28,9 @@ class ActionType(StrEnum): RELOAD_PAGE = "reload_page" EXTRACT = "extract" + SCROLL = "scroll" + KEYPRESS = "keypress" + TYPE = "type" def is_web_action(self) -> bool: return self in [ @@ -177,6 +180,9 @@ class ClickAction(WebAction): action_type: ActionType = ActionType.CLICK file_url: str | None = None download: bool = False + x: int | None = None + y: int | None = None + button: str = "left" def __repr__(self) -> str: return f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download})" @@ -240,6 +246,7 @@ class CheckboxAction(WebAction): class WaitAction(Action): action_type: ActionType = ActionType.WAIT + seconds: int = 20 class TerminateAction(DecisiveAction): @@ -258,6 +265,19 @@ class ExtractAction(Action): data_extraction_schema: dict[str, Any] | None = None +class ScrollAction(Action): + action_type: ActionType = ActionType.SCROLL + x: int + y: int + scroll_x: int + scroll_y: int + + +class KeypressAction(Action): + action_type: ActionType = ActionType.KEYPRESS + keys: list[str] = [] + + class ScrapeResult(BaseModel): """ Scraped response from a webpage, including: diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 7a50db89..b7cdf049 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -77,6 +77,7 @@ from skyvern.webeye.actions.actions import ( CheckboxAction, ClickAction, InputOrSelectContext, + InputTextAction, ScrapeResult, SelectOption, SelectOptionAction, @@ -392,6 +393,12 @@ def check_for_invalid_web_action( task: Task, step: Step, ) -> list[ActionResult]: + if isinstance(action, ClickAction) and action.x is not None and action.y is not None: + return [] + + if isinstance(action, InputTextAction) and not action.element_id: + return [] + if isinstance(action, WebAction) and action.element_id not in scraped_page.id_to_element_dict: return [ActionFailure(MissingElement(element_id=action.element_id), stop_execution_on_failure=False)] @@ -420,6 +427,36 @@ async def handle_click_action( task: Task, step: Step, ) -> list[ActionResult]: + if action.x is not None and action.y is not None: + # Find the element at the clicked location using JavaScript evaluation + element_id = await page.evaluate( + """data => { + const element = document.elementFromPoint(data.x, data.y); + if (!element) return null; + + // Function to get the unique_id attribute of an element + function getElementUniqueId(element) { + if (element && element.nodeType === 1) { + // Check if the element has the unique_id attribute + if (element.hasAttribute('unique_id')) { + return element.getAttribute('unique_id'); + } + + // If no unique_id attribute is found, return null + return null; + } + return null; + } + + return getElementUniqueId(element); + }""", + {"x": action.x, "y": action.y}, + ) + LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button) + + await page.mouse.click(x=action.x, y=action.y, button=action.button) + return [ActionSuccess()] + dom = DomUtil(scraped_page=scraped_page, page=page) skyvern_element = await dom.get_skyvern_element_by_id(action.element_id) await asyncio.sleep(0.3) @@ -591,6 +628,11 @@ async def handle_input_text_action( task: Task, step: Step, ) -> list[ActionResult]: + if not action.element_id: + # This is a CUA type action + await page.keyboard.type(action.text) + return [ActionSuccess()] + dom = DomUtil(scraped_page, page) skyvern_element = await dom.get_skyvern_element_by_id(action.element_id) skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame()) @@ -1348,7 +1390,7 @@ async def handle_wait_action( task: Task, step: Step, ) -> list[ActionResult]: - await asyncio.sleep(20) + await asyncio.sleep(action.seconds) return [ActionFailure(exception=Exception("Wait action is treated as a failure"))] @@ -1422,6 +1464,35 @@ async def handle_extract_action( return [ActionFailure(exception=Exception("No data extraction goal"))] +async def handle_scroll_action( + action: actions.ScrollAction, + page: Page, + scraped_page: ScrapedPage, + task: Task, + step: Step, +) -> list[ActionResult]: + await page.mouse.move(action.x, action.y) + await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})") + return [ActionSuccess()] + + +async def handle_keypress_action( + action: actions.KeypressAction, + page: Page, + scraped_page: ScrapedPage, + task: Task, + step: Step, +) -> list[ActionResult]: + for key in action.keys: + if key.lower() == "enter": + await page.keyboard.press("Enter") + elif key.lower() == "space": + await page.keyboard.press(" ") + else: + await page.keyboard.press(key) + return [ActionSuccess()] + + ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action) ActionHandler.register_action_type(ActionType.CLICK, handle_click_action) ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action) @@ -1433,6 +1504,8 @@ ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action) ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action) ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action) ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action) +ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action) +ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action) async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any: diff --git a/skyvern/webeye/actions/models.py b/skyvern/webeye/actions/models.py index 33a877e6..6976d608 100644 --- a/skyvern/webeye/actions/models.py +++ b/skyvern/webeye/actions/models.py @@ -2,6 +2,7 @@ from __future__ import annotations from typing import Any +from openai.types.responses.response import Response as OpenAIResponse from pydantic import BaseModel from skyvern.config import settings @@ -40,6 +41,7 @@ class DetailedAgentStepOutput(BaseModel): action_results: list[ActionResult] | None actions_and_results: list[tuple[Action, list[ActionResult]]] | None step_exception: str | None = None + cua_response: OpenAIResponse | None = None class Config: exclude = ["scraped_page", "extract_action_prompt"] @@ -72,6 +74,7 @@ class DetailedAgentStepOutput(BaseModel): if self.actions_and_results is None else [(action, result) for action, result in self.actions_and_results if result], step_exception=self.step_exception, + cua_response=self.cua_response, ) def to_agent_step_output(self) -> AgentStepOutput: diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 37c953e5..b379b35b 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -1,9 +1,11 @@ from typing import Any, Dict import structlog +from openai.types.responses.response import Response as OpenAIResponse from pydantic import ValidationError from skyvern.exceptions import UnsupportedActionType +from skyvern.forge.sdk.models import Step from skyvern.forge.sdk.schemas.tasks import Task from skyvern.webeye.actions.actions import ( Action, @@ -13,7 +15,9 @@ from skyvern.webeye.actions.actions import ( CompleteAction, DownloadFileAction, InputTextAction, + KeypressAction, NullAction, + ScrollAction, SelectOption, SelectOptionAction, SolveCaptchaAction, @@ -194,3 +198,104 @@ def parse_actions( ) ############################ This part of code might not be needed ############################ return actions + + +def parse_cua_actions( + task: Task, + step: Step, + response: OpenAIResponse, +) -> list[Action]: + computer_calls = [item for item in response.output if item.type == "computer_call"] + reasonings = [item for item in response.output if item.type == "reasoning"] + actions: list[Action] = [] + for idx, computer_call in enumerate(computer_calls): + cua_action = computer_call.action + action_type = cua_action.type + try: + reasoning = None + if idx < len(reasonings): + try: + reasoning = reasonings[idx].summary[0].text + except Exception: + LOG.exception( + "Failed to parse reasoning", + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + ) + + match action_type: + case "click": + button = cua_action.button + if button != "left" and button != "right": + button = "left" + reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})" + action = ClickAction( + element_id="", + x=cua_action.x, + y=cua_action.y, + button=button, + reasoning=reasoning, + intention=reasoning, + response=f"Click at: ({cua_action.x}, {cua_action.y})", + ) + case "scroll": + reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})" + action = ScrollAction( + element_id="", + x=cua_action.x, + y=cua_action.y, + scroll_x=cua_action.scroll_x, + scroll_y=cua_action.scroll_y, + reasoning=reasoning, + intention=reasoning, + response=f"Scroll by: ({cua_action.x}, {cua_action.y})", + ) + case "keypress": + reasoning_str = f"Press keys: {cua_action.keys}" + if len(cua_action.keys) == 1: + reasoning_str = f"Press the '{cua_action.keys[0]}' key" + reasoning = reasoning or reasoning_str + action = KeypressAction( + element_id="", + keys=cua_action.keys, + reasoning=reasoning, + intention=reasoning, + response=str(cua_action.keys), + ) + case "type": + action = InputTextAction( + element_id="", + text=cua_action.text, + reasoning=reasoning, + intention=reasoning, + response=cua_action.text, + ) + case "wait": + action = WaitAction( + seconds=5, + reasoning=reasoning, + intention=reasoning, + ) + case _: + raise ValueError(f"Unsupported action type: {action_type}") + action.organization_id = task.organization_id + action.workflow_run_id = task.workflow_run_id + action.task_id = task.task_id + action.step_id = step.step_id + action.step_order = step.order + action.action_order = idx + actions.append(action) + except Exception: + LOG.exception( + "Failed to parse action", + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + ) + break + if not actions: + return [CompleteAction(reasoning="No actions generated", verified=True)] + return actions diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 0cee02ca..a1d95d75 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -326,13 +326,14 @@ class ScrapedPage(BaseModel): element["children"] = new_children return element - async def refresh(self, draw_boxes: bool = True) -> Self: + async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self: refreshed_page = await scrape_website( browser_state=self._browser_state, url=self.url, cleanup_element_tree=self._clean_up_func, scrape_exclude=self._scrape_exclude, draw_boxes=draw_boxes, + scroll=scroll, ) self.elements = refreshed_page.elements self.id_to_css_dict = refreshed_page.id_to_css_dict @@ -366,6 +367,8 @@ async def scrape_website( scrape_exclude: ScrapeExcludeFunc | None = None, take_screenshots: bool = True, draw_boxes: bool = True, + max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS, + scroll: bool = True, ) -> ScrapedPage: """ ************************************************************************************************ @@ -397,6 +400,8 @@ async def scrape_website( scrape_exclude=scrape_exclude, take_screenshots=take_screenshots, draw_boxes=draw_boxes, + max_screenshot_number=max_screenshot_number, + scroll=scroll, ) except Exception as e: # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production @@ -420,6 +425,8 @@ async def scrape_website( scrape_exclude=scrape_exclude, take_screenshots=take_screenshots, draw_boxes=draw_boxes, + max_screenshot_number=max_screenshot_number, + scroll=scroll, ) @@ -469,6 +476,8 @@ async def scrape_web_unsafe( scrape_exclude: ScrapeExcludeFunc | None = None, take_screenshots: bool = True, draw_boxes: bool = True, + max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS, + scroll: bool = True, ) -> ScrapedPage: """ Asynchronous function that performs web scraping without any built-in error handling. This function is intended @@ -503,7 +512,6 @@ async def scrape_web_unsafe( json_to_html(element, need_skyvern_attrs=False) for element in element_tree_trimmed ) token_count = count_tokens(element_tree_trimmed_html_str) - max_screenshot_number = settings.MAX_NUM_SCREENSHOTS if token_count > DEFAULT_MAX_TOKENS: max_screenshot_number = min(max_screenshot_number, 1) @@ -512,6 +520,7 @@ async def scrape_web_unsafe( url=url, draw_boxes=draw_boxes, max_number=max_screenshot_number, + scroll=scroll, ) id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids = build_element_dict( elements diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 0fc1b777..87b7f422 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -98,8 +98,11 @@ class SkyvernFrame: url: str, draw_boxes: bool = False, max_number: int = settings.MAX_NUM_SCREENSHOTS, + scroll: bool = True, ) -> List[bytes]: skyvern_page = await SkyvernFrame.create_instance(frame=page) + if not scroll: + return [await SkyvernFrame.take_screenshot(page=skyvern_page.frame, full_page=False)] # page is the main frame and the index must be 0 assert isinstance(skyvern_page.frame, Page)