integration with CUA (#2126)

2025-04-11 11:18:53 -07:00
parent 2ac65c4a9b
commit f883b91180
13 changed files with 420 additions and 53 deletions
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -28,6 +28,9 @@ class ActionType(StrEnum):
    RELOAD_PAGE = "reload_page"

    EXTRACT = "extract"
+    SCROLL = "scroll"
+    KEYPRESS = "keypress"
+    TYPE = "type"

    def is_web_action(self) -> bool:
        return self in [
@@ -177,6 +180,9 @@ class ClickAction(WebAction):
    action_type: ActionType = ActionType.CLICK
    file_url: str | None = None
    download: bool = False
+    x: int | None = None
+    y: int | None = None
+    button: str = "left"

    def __repr__(self) -> str:
        return f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download})"
@@ -240,6 +246,7 @@ class CheckboxAction(WebAction):

 class WaitAction(Action):
    action_type: ActionType = ActionType.WAIT
+    # Number of seconds the wait handler sleeps for; the default of 20 matches the
+    # delay that was previously hard-coded in handle_wait_action.
+    seconds: int = 20


 class TerminateAction(DecisiveAction):
@@ -258,6 +265,19 @@ class ExtractAction(Action):
    data_extraction_schema: dict[str, Any] | None = None


+class ScrollAction(Action):
+    # Scroll request: the handler moves the mouse to a point, then scrolls by an offset.
+    action_type: ActionType = ActionType.SCROLL
+    # Coordinates the mouse is moved to before scrolling.
+    x: int
+    y: int
+    # Horizontal / vertical amounts handed to window.scrollBy by the scroll handler.
+    scroll_x: int
+    scroll_y: int
+
+
+class KeypressAction(Action):
+    # Keyboard request: the keypress handler presses each entry of `keys` in list order.
+    action_type: ActionType = ActionType.KEYPRESS
+    # NOTE(review): a bare [] default is only safe if Action is a pydantic model
+    # (pydantic copies mutable defaults per instance) — confirm against Action's base class.
+    keys: list[str] = []
+
+
 class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage, including:
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -77,6 +77,7 @@ from skyvern.webeye.actions.actions import (
    CheckboxAction,
    ClickAction,
    InputOrSelectContext,
+    InputTextAction,
    ScrapeResult,
    SelectOption,
    SelectOptionAction,
@@ -392,6 +393,12 @@ def check_for_invalid_web_action(
    task: Task,
    step: Step,
 ) -> list[ActionResult]:
+    if isinstance(action, ClickAction) and action.x is not None and action.y is not None:
+        return []
+
+    if isinstance(action, InputTextAction) and not action.element_id:
+        return []
+
    if isinstance(action, WebAction) and action.element_id not in scraped_page.id_to_element_dict:
        return [ActionFailure(MissingElement(element_id=action.element_id), stop_execution_on_failure=False)]

@@ -420,6 +427,36 @@ async def handle_click_action(
    task: Task,
    step: Step,
 ) -> list[ActionResult]:
+    if action.x is not None and action.y is not None:
+        # Find the element at the clicked location using JavaScript evaluation
+        element_id = await page.evaluate(
+            """data => {
+            const element = document.elementFromPoint(data.x, data.y);
+            if (!element) return null;
+
+            // Function to get the unique_id attribute of an element
+            function getElementUniqueId(element) {
+                if (element && element.nodeType === 1) {
+                    // Check if the element has the unique_id attribute
+                    if (element.hasAttribute('unique_id')) {
+                        return element.getAttribute('unique_id');
+                    }
+                    
+                    // If no unique_id attribute is found, return null
+                    return null;
+                }
+                return null;
+            }
+
+            return getElementUniqueId(element);
+        }""",
+            {"x": action.x, "y": action.y},
+        )
+        LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button)
+
+        await page.mouse.click(x=action.x, y=action.y, button=action.button)
+        return [ActionSuccess()]
+
    dom = DomUtil(scraped_page=scraped_page, page=page)
    skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
    await asyncio.sleep(0.3)
@@ -591,6 +628,11 @@ async def handle_input_text_action(
    task: Task,
    step: Step,
 ) -> list[ActionResult]:
+    if not action.element_id:
+        # This is a CUA type action
+        await page.keyboard.type(action.text)
+        return [ActionSuccess()]
+
    dom = DomUtil(scraped_page, page)
    skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
    skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
@@ -1348,7 +1390,7 @@ async def handle_wait_action(
    task: Task,
    step: Step,
 ) -> list[ActionResult]:
-    await asyncio.sleep(20)
+    await asyncio.sleep(action.seconds)
    return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]


@@ -1422,6 +1464,35 @@ async def handle_extract_action(
        return [ActionFailure(exception=Exception("No data extraction goal"))]


+async def handle_scroll_action(
+    action: actions.ScrollAction,
+    page: Page,
+    scraped_page: ScrapedPage,
+    task: Task,
+    step: Step,
+) -> list[ActionResult]:
+    """
+    Move the mouse to the action's (x, y) point and scroll the window by its offsets.
+
+    The scroll is applied through ``window.scrollBy`` evaluated in the page, so it
+    acts on the window itself; the mouse move only positions the pointer beforehand.
+    ``scraped_page``, ``task`` and ``step`` are accepted for handler-signature
+    compatibility and are not used.
+
+    Returns a single ``ActionSuccess``.
+    """
+    pointer_x, pointer_y = action.x, action.y
+    delta_x, delta_y = action.scroll_x, action.scroll_y
+
+    await page.mouse.move(pointer_x, pointer_y)
+
+    scroll_script = f"window.scrollBy({delta_x}, {delta_y})"
+    await page.evaluate(scroll_script)
+
+    result: list[ActionResult] = [ActionSuccess()]
+    return result
+
+
+# Lower-cased key names as they may arrive from the CUA model, mapped to the key names
+# accepted by Playwright's keyboard API. Keys not listed here are passed through as-is.
+_CUA_KEY_TO_PLAYWRIGHT_KEY: dict[str, str] = {
+    "alt": "Alt",
+    "option": "Alt",
+    "arrowdown": "ArrowDown",
+    "arrowleft": "ArrowLeft",
+    "arrowright": "ArrowRight",
+    "arrowup": "ArrowUp",
+    "backspace": "Backspace",
+    "capslock": "CapsLock",
+    "cmd": "Meta",
+    "meta": "Meta",
+    "super": "Meta",
+    "win": "Meta",
+    "ctrl": "Control",
+    "control": "Control",
+    "delete": "Delete",
+    "end": "End",
+    "enter": "Enter",
+    "esc": "Escape",
+    "escape": "Escape",
+    "home": "Home",
+    "insert": "Insert",
+    "pagedown": "PageDown",
+    "pageup": "PageUp",
+    "shift": "Shift",
+    "space": " ",
+    "tab": "Tab",
+}
+
+
+async def handle_keypress_action(
+    action: actions.KeypressAction,
+    page: Page,
+    scraped_page: ScrapedPage,
+    task: Task,
+    step: Step,
+) -> list[ActionResult]:
+    """
+    Press every key listed in the action, one after another, on the page keyboard.
+
+    Key names are normalised case-insensitively through ``_CUA_KEY_TO_PLAYWRIGHT_KEY``
+    before being pressed, so spellings such as "ENTER", "SPACE", "CTRL" or "ESC" are
+    translated to the names Playwright expects. Names without a mapping are pressed
+    unchanged. ``scraped_page``, ``task`` and ``step`` are accepted for
+    handler-signature compatibility and are not used.
+
+    Returns a single ``ActionSuccess``.
+    """
+    # NOTE(review): keys are pressed sequentially, not held together, so a combination
+    # such as ["CTRL", "A"] is sent as two separate presses — confirm whether chords
+    # should instead be held down together.
+    for key in action.keys:
+        playwright_key = _CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key)
+        await page.keyboard.press(playwright_key)
+    return [ActionSuccess()]
+
+
 ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
 ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
 ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
@@ -1433,6 +1504,8 @@ ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
 ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
 ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
 ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
+ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action)
+ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)


 async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
--- a/skyvern/webeye/actions/models.py
+++ b/skyvern/webeye/actions/models.py
@@ -2,6 +2,7 @@ from __future__ import annotations

 from typing import Any

+from openai.types.responses.response import Response as OpenAIResponse
 from pydantic import BaseModel

 from skyvern.config import settings
@@ -40,6 +41,7 @@ class DetailedAgentStepOutput(BaseModel):
    action_results: list[ActionResult] | None
    actions_and_results: list[tuple[Action, list[ActionResult]]] | None
    step_exception: str | None = None
+    cua_response: OpenAIResponse | None = None

    class Config:
        exclude = ["scraped_page", "extract_action_prompt"]
@@ -72,6 +74,7 @@ class DetailedAgentStepOutput(BaseModel):
            if self.actions_and_results is None
            else [(action, result) for action, result in self.actions_and_results if result],
            step_exception=self.step_exception,
+            cua_response=self.cua_response,
        )

    def to_agent_step_output(self) -> AgentStepOutput:
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -1,9 +1,11 @@
 from typing import Any, Dict

 import structlog
+from openai.types.responses.response import Response as OpenAIResponse
 from pydantic import ValidationError

 from skyvern.exceptions import UnsupportedActionType
+from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.tasks import Task
 from skyvern.webeye.actions.actions import (
    Action,
@@ -13,7 +15,9 @@ from skyvern.webeye.actions.actions import (
    CompleteAction,
    DownloadFileAction,
    InputTextAction,
+    KeypressAction,
    NullAction,
+    ScrollAction,
    SelectOption,
    SelectOptionAction,
    SolveCaptchaAction,
@@ -194,3 +198,104 @@ def parse_actions(
        )
    ############################ This part of code might not be needed ############################
    return actions
+
+
+def parse_cua_actions(
+    task: Task,
+    step: Step,
+    response: OpenAIResponse,
+) -> list[Action]:
+    """
+    Convert the computer-use calls of an OpenAI response into Skyvern actions.
+
+    Every ``computer_call`` item in ``response.output`` is mapped by its action type
+    ("click", "scroll", "keypress", "type" or "wait") to the matching action object,
+    then stamped with the task/step identifiers and its index in the call list.
+
+    Reasoning text is read from the ``reasoning`` output item at the same index as the
+    call, when one exists. Click, scroll and keypress fall back to a generated
+    description when no reasoning text is available; type and wait keep ``None``.
+
+    Parsing stops at the first call that cannot be converted (this includes
+    unsupported action types); actions parsed before it are still returned.
+
+    :param task: task the actions belong to; supplies organization, workflow run and task ids.
+    :param step: step the actions belong to; supplies the step id and order.
+    :param response: OpenAI response whose ``output`` items are inspected.
+    :return: the parsed actions in call order, or a single verified ``CompleteAction``
+        when no action could be produced.
+    """
+    computer_calls = [item for item in response.output if item.type == "computer_call"]
+    reasonings = [item for item in response.output if item.type == "reasoning"]
+    actions: list[Action] = []
+    for idx, computer_call in enumerate(computer_calls):
+        cua_action = computer_call.action
+        action_type = cua_action.type
+        try:
+            # NOTE(review): reasoning items are paired with calls purely by position —
+            # confirm the response always emits them in matching order.
+            reasoning = None
+            if idx < len(reasonings):
+                try:
+                    reasoning = reasonings[idx].summary[0].text
+                except Exception:
+                    # A missing/empty summary is not fatal; the action is still parsed.
+                    LOG.exception(
+                        "Failed to parse reasoning",
+                        task_id=task.task_id,
+                        step_id=step.step_id,
+                        step_order=step.order,
+                        action_order=idx,
+                    )
+
+            match action_type:
+                case "click":
+                    # Only left/right are forwarded; any other button is treated as left.
+                    button = cua_action.button
+                    if button != "left" and button != "right":
+                        button = "left"
+                    click_summary = f"Click at: ({cua_action.x}, {cua_action.y})"
+                    reasoning = reasoning or click_summary
+                    action = ClickAction(
+                        element_id="",
+                        x=cua_action.x,
+                        y=cua_action.y,
+                        button=button,
+                        reasoning=reasoning,
+                        intention=reasoning,
+                        response=click_summary,
+                    )
+                case "scroll":
+                    # Describe the scroll by its offsets (scroll_x/scroll_y); x/y are only
+                    # the pointer position the scroll is issued from.
+                    scroll_summary = f"Scroll by: ({cua_action.scroll_x}, {cua_action.scroll_y})"
+                    reasoning = reasoning or scroll_summary
+                    action = ScrollAction(
+                        element_id="",
+                        x=cua_action.x,
+                        y=cua_action.y,
+                        scroll_x=cua_action.scroll_x,
+                        scroll_y=cua_action.scroll_y,
+                        reasoning=reasoning,
+                        intention=reasoning,
+                        response=scroll_summary,
+                    )
+                case "keypress":
+                    reasoning_str = f"Press keys: {cua_action.keys}"
+                    if len(cua_action.keys) == 1:
+                        reasoning_str = f"Press the '{cua_action.keys[0]}' key"
+                    reasoning = reasoning or reasoning_str
+                    action = KeypressAction(
+                        element_id="",
+                        keys=cua_action.keys,
+                        reasoning=reasoning,
+                        intention=reasoning,
+                        response=str(cua_action.keys),
+                    )
+                case "type":
+                    # Empty element_id marks this as a CUA "type" action for the handler.
+                    action = InputTextAction(
+                        element_id="",
+                        text=cua_action.text,
+                        reasoning=reasoning,
+                        intention=reasoning,
+                        response=cua_action.text,
+                    )
+                case "wait":
+                    action = WaitAction(
+                        seconds=5,
+                        reasoning=reasoning,
+                        intention=reasoning,
+                    )
+                case _:
+                    raise ValueError(f"Unsupported action type: {action_type}")
+            action.organization_id = task.organization_id
+            action.workflow_run_id = task.workflow_run_id
+            action.task_id = task.task_id
+            action.step_id = step.step_id
+            action.step_order = step.order
+            action.action_order = idx
+            actions.append(action)
+        except Exception:
+            LOG.exception(
+                "Failed to parse action",
+                task_id=task.task_id,
+                step_id=step.step_id,
+                step_order=step.order,
+                action_order=idx,
+            )
+            # Stop at the first unparseable call; later calls are dropped.
+            break
+    if not actions:
+        # NOTE(review): this branch is also reached when the first call failed to parse,
+        # in which case the step is reported as a verified completion — confirm intended.
+        return [CompleteAction(reasoning="No actions generated", verified=True)]
+    return actions
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -326,13 +326,14 @@ class ScrapedPage(BaseModel):
            element["children"] = new_children
        return element

-    async def refresh(self, draw_boxes: bool = True) -> Self:
+    async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self:
        refreshed_page = await scrape_website(
            browser_state=self._browser_state,
            url=self.url,
            cleanup_element_tree=self._clean_up_func,
            scrape_exclude=self._scrape_exclude,
            draw_boxes=draw_boxes,
+            scroll=scroll,
        )
        self.elements = refreshed_page.elements
        self.id_to_css_dict = refreshed_page.id_to_css_dict
@@ -366,6 +367,8 @@ async def scrape_website(
    scrape_exclude: ScrapeExcludeFunc | None = None,
    take_screenshots: bool = True,
    draw_boxes: bool = True,
+    max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
+    scroll: bool = True,
 ) -> ScrapedPage:
    """
    ************************************************************************************************
@@ -397,6 +400,8 @@ async def scrape_website(
            scrape_exclude=scrape_exclude,
            take_screenshots=take_screenshots,
            draw_boxes=draw_boxes,
+            max_screenshot_number=max_screenshot_number,
+            scroll=scroll,
        )
    except Exception as e:
        # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
@@ -420,6 +425,8 @@ async def scrape_website(
            scrape_exclude=scrape_exclude,
            take_screenshots=take_screenshots,
            draw_boxes=draw_boxes,
+            max_screenshot_number=max_screenshot_number,
+            scroll=scroll,
        )


@@ -469,6 +476,8 @@ async def scrape_web_unsafe(
    scrape_exclude: ScrapeExcludeFunc | None = None,
    take_screenshots: bool = True,
    draw_boxes: bool = True,
+    max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
+    scroll: bool = True,
 ) -> ScrapedPage:
    """
    Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -503,7 +512,6 @@ async def scrape_web_unsafe(
            json_to_html(element, need_skyvern_attrs=False) for element in element_tree_trimmed
        )
        token_count = count_tokens(element_tree_trimmed_html_str)
-        max_screenshot_number = settings.MAX_NUM_SCREENSHOTS
        if token_count > DEFAULT_MAX_TOKENS:
            max_screenshot_number = min(max_screenshot_number, 1)

@@ -512,6 +520,7 @@ async def scrape_web_unsafe(
            url=url,
            draw_boxes=draw_boxes,
            max_number=max_screenshot_number,
+            scroll=scroll,
        )
    id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids = build_element_dict(
        elements
--- a/skyvern/webeye/utils/page.py
+++ b/skyvern/webeye/utils/page.py
@@ -98,8 +98,11 @@ class SkyvernFrame:
        url: str,
        draw_boxes: bool = False,
        max_number: int = settings.MAX_NUM_SCREENSHOTS,
+        scroll: bool = True,
    ) -> List[bytes]:
        skyvern_page = await SkyvernFrame.create_instance(frame=page)
+        if not scroll:
+            return [await SkyvernFrame.take_screenshot(page=skyvern_page.frame, full_page=False)]

        # page is the main frame and the index must be 0
        assert isinstance(skyvern_page.frame, Page)