From eacf9d5388963749474a523d7d2e539621814654 Mon Sep 17 00:00:00 2001
From: Shuchang Zheng <wintonzheng0325@gmail.com>
Date: Wed, 30 Apr 2025 16:13:36 +0800
Subject: [PATCH] resize screenshots to make it compatible with anthropic CUA
 (#2255)

---
 skyvern/forge/agent.py                        | 10 +++-
 .../forge/sdk/api/llm/api_handler_factory.py  | 16 ++++-
 skyvern/utils/image_resizer.py                | 59 +++++++++++++++++++
 skyvern/webeye/actions/parse_actions.py       | 14 ++++-
 4 files changed, 94 insertions(+), 5 deletions(-)
 create mode 100644 skyvern/utils/image_resizer.py

diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py
index 93259c52..6d919297 100644
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -385,7 +385,7 @@ class ForgeAgent:
                 # llm_caller = LLMCaller(llm_key="BEDROCK_ANTHROPIC_CLAUDE3.5_SONNET_INFERENCE_PROFILE")
                 llm_caller = LLMCallerManager.get_llm_caller(task.task_id)
                 if not llm_caller:
-                    llm_caller = LLMCaller(llm_key=settings.ANTHROPIC_CUA_LLM_KEY)
+                    llm_caller = LLMCaller(llm_key=settings.ANTHROPIC_CUA_LLM_KEY, screenshot_scaling_enabled=True)
                     LLMCallerManager.set_llm_caller(task.task_id, llm_caller)
             step, detailed_output = await self.agent_step(
                 task,
@@ -1450,7 +1450,13 @@ class ForgeAgent:
         assistant_content = llm_response["content"]
         llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
 
-        actions = await parse_anthropic_actions(task, step, assistant_content)
+        actions = await parse_anthropic_actions(
+            task,
+            step,
+            assistant_content,
+            llm_caller.browser_window_dimension,
+            llm_caller.screenshot_resize_target_dimension,
+        )
         return actions
 
     @staticmethod
diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py
index 234ed698..0f4b260e 100644
--- a/skyvern/forge/sdk/api/llm/api_handler_factory.py
+++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py
@@ -28,6 +28,7 @@ from skyvern.forge.sdk.core import skyvern_context
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion
 from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought
+from skyvern.utils.image_resizer import Resolution, get_resize_target_dimension, resize_screenshots
 
 LOG = structlog.get_logger()
 
@@ -454,12 +455,33 @@ class LLMCaller:
     An LLMCaller instance defines the LLM configs and keeps the chat history if needed.
     """
 
-    def __init__(self, llm_key: str, base_parameters: dict[str, Any] | None = None):
+    def __init__(
+        self,
+        llm_key: str,
+        base_parameters: dict[str, Any] | None = None,
+        screenshot_scaling_enabled: bool = False,
+    ):
+        """
+        Args:
+            llm_key: Key used to look up the LLM config in LLMConfigRegistry.
+            base_parameters: Optional parameters, stored on the instance as given.
+            screenshot_scaling_enabled: When True, screenshots are resized to
+                screenshot_resize_target_dimension (see get_resize_target_dimension).
+                Declared after base_parameters so that pre-existing positional
+                calls of the form LLMCaller(llm_key, base_parameters) keep working.
+        """
         self.llm_key = llm_key
         self.llm_config = LLMConfigRegistry.get_config(llm_key)
         self.base_parameters = base_parameters
         self.message_history: list[dict[str, Any]] = []
         self.current_tool_results: list[dict[str, Any]] = []
+        self.screenshot_scaling_enabled = screenshot_scaling_enabled
+        # Browser window size from settings; parsed action coordinates are scaled into this space.
+        self.browser_window_dimension = Resolution(width=settings.BROWSER_WIDTH, height=settings.BROWSER_HEIGHT)
+        # Size screenshots are resized to; stays equal to the window size when scaling is disabled.
+        self.screenshot_resize_target_dimension = self.browser_window_dimension
+        if screenshot_scaling_enabled:
+            self.screenshot_resize_target_dimension = get_resize_target_dimension(self.browser_window_dimension)
 
     def add_tool_result(self, tool_result: dict[str, Any]) -> None:
         self.current_tool_results.append(tool_result)
@@ -504,6 +515,9 @@ class LLMCaller:
                 ai_suggestion=ai_suggestion,
             )
 
+        if screenshots and self.screenshot_scaling_enabled:
+            screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension)
+
         await app.ARTIFACT_MANAGER.create_llm_artifact(
             data=prompt.encode("utf-8") if prompt else b"",
             artifact_type=ArtifactType.LLM_PROMPT,
diff --git a/skyvern/utils/image_resizer.py b/skyvern/utils/image_resizer.py
new file mode 100644
index 00000000..da23a9a7
--- /dev/null
+++ b/skyvern/utils/image_resizer.py
@@ -0,0 +1,81 @@
+import io
+from typing import TypedDict
+
+from PIL import Image
+
+
+class Resolution(TypedDict):
+    """Pixel dimensions of a browser window or a screenshot."""
+
+    width: int
+    height: int
+
+
+# Candidate resize targets for screenshots sent to Anthropic CUA, one per common aspect ratio.
+MAX_SCALING_TARGETS_ANTHROPIC_CUA: dict[str, Resolution] = {
+    "XGA": Resolution(width=1024, height=768),  # 4:3
+    "WXGA": Resolution(width=1280, height=800),  # 16:10
+    "FWXGA": Resolution(width=1366, height=768),  # ~16:9
+}
+
+# Window aspect ratios are rarely exact, so allow a small error when matching a target.
+ASPECT_RATIO_TOLERANCE = 0.02
+
+
+def get_resize_target_dimension(
+    window_size: Resolution, max_scaling_targets: dict[str, Resolution] = MAX_SCALING_TARGETS_ANTHROPIC_CUA
+) -> Resolution:
+    """
+    Pick the resolution that screenshots of a window should be resized to.
+
+    Returns the first entry of max_scaling_targets whose aspect ratio is within
+    ASPECT_RATIO_TOLERANCE of the window's and which is narrower than the window.
+    The targets are upper bounds, so a window that is no wider than its matching
+    target is never scaled up; in that case, or when no aspect ratio matches,
+    window_size is returned unchanged.
+    """
+    ratio = window_size["width"] / window_size["height"]
+    for dimension in max_scaling_targets.values():
+        ratio_matches = abs(dimension["width"] / dimension["height"] - ratio) < ASPECT_RATIO_TOLERANCE
+        if ratio_matches and dimension["width"] < window_size["width"]:
+            return dimension
+    return window_size
+
+
+def resize_screenshots(screenshots: list[bytes], target_dimension: Resolution) -> list[bytes]:
+    """
+    Resize every screenshot to target_dimension and return them re-encoded as PNG bytes.
+
+    The image scaling logic originates from anthropic's quickstart guide:
+    https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L49-L60
+    """
+    target_size = (target_dimension["width"], target_dimension["height"])
+    new_screenshots = []
+    for screenshot in screenshots:
+        # Decode inside a context manager so the source image is released once resized.
+        with Image.open(io.BytesIO(screenshot)) as img:
+            resized_img = img.resize(target_size, Image.Resampling.LANCZOS)
+
+        # Convert back to bytes
+        img_byte_arr = io.BytesIO()
+        resized_img.save(img_byte_arr, format="PNG")
+        new_screenshots.append(img_byte_arr.getvalue())
+    return new_screenshots
+
+
+def scale_coordinates(
+    current_coordinates: tuple[int, int],
+    current_dimension: Resolution,
+    target_dimension: Resolution,
+) -> tuple[int, int]:
+    """
+    Map an (x, y) point from current_dimension space into target_dimension space.
+
+    Each axis is scaled independently and rounded to the nearest pixel; truncating
+    instead would bias every scaled point towards the origin.
+    """
+    x, y = current_coordinates
+    return (
+        round(x * target_dimension["width"] / current_dimension["width"]),
+        round(y * target_dimension["height"] / current_dimension["height"]),
+    )
diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py
index 715a3567..e508a15a 100644
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -9,6 +9,7 @@ from skyvern.forge import app
 from skyvern.forge.prompts import prompt_engine
 from skyvern.forge.sdk.models import Step
 from skyvern.forge.sdk.schemas.tasks import Task
+from skyvern.utils.image_resizer import Resolution, scale_coordinates
 from skyvern.webeye.actions.actions import (
     Action,
     ActionType,
@@ -454,6 +455,8 @@ async def parse_anthropic_actions(
     task: Task,
     step: Step,
     assistant_content: list[dict[str, Any]],
+    browser_window_dimension: Resolution,
+    screenshot_resize_target_dimension: Resolution,
 ) -> list[Action]:
     tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
     idx = 0
@@ -468,7 +471,11 @@ async def parse_anthropic_actions(
             continue
         action = tool_call_input["action"]
         if action == "mouse_move":
-            x, y = tool_call_input["coordinate"]
+            original_x, original_y = tool_call_input["coordinate"]
+            # (original_x, original_y) is in resized-screenshot space; scale it back to the browser window size.
+            x, y = scale_coordinates(
+                (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
+            )
             actions.append(
                 MoveAction(
                     x=x,
@@ -497,7 +504,10 @@ async def parse_anthropic_actions(
                 )
                 idx += 1
                 continue
-            x, y = coordinate
+            original_x, original_y = coordinate
+            x, y = scale_coordinates(
+                (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension
+            )
             actions.append(
                 ClickAction(
                     element_id="",