From eacf9d5388963749474a523d7d2e539621814654 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Wed, 30 Apr 2025 16:13:36 +0800 Subject: [PATCH] resize screenshots to make it compatible with anthropic CUA (#2255) --- skyvern/forge/agent.py | 10 +++- .../forge/sdk/api/llm/api_handler_factory.py | 16 ++++- skyvern/utils/image_resizer.py | 59 +++++++++++++++++++ skyvern/webeye/actions/parse_actions.py | 14 ++++- 4 files changed, 94 insertions(+), 5 deletions(-) create mode 100644 skyvern/utils/image_resizer.py diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 93259c52..6d919297 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -385,7 +385,7 @@ class ForgeAgent: # llm_caller = LLMCaller(llm_key="BEDROCK_ANTHROPIC_CLAUDE3.5_SONNET_INFERENCE_PROFILE") llm_caller = LLMCallerManager.get_llm_caller(task.task_id) if not llm_caller: - llm_caller = LLMCaller(llm_key=settings.ANTHROPIC_CUA_LLM_KEY) + llm_caller = LLMCaller(llm_key=settings.ANTHROPIC_CUA_LLM_KEY, screenshot_scaling_enabled=True) LLMCallerManager.set_llm_caller(task.task_id, llm_caller) step, detailed_output = await self.agent_step( task, @@ -1450,7 +1450,13 @@ class ForgeAgent: assistant_content = llm_response["content"] llm_caller.message_history.append({"role": "assistant", "content": assistant_content}) - actions = await parse_anthropic_actions(task, step, assistant_content) + actions = await parse_anthropic_actions( + task, + step, + assistant_content, + llm_caller.browser_window_dimension, + llm_caller.screenshot_resize_target_dimension, + ) return actions @staticmethod diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index 234ed698..0f4b260e 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -28,6 +28,7 @@ from skyvern.forge.sdk.core import skyvern_context from skyvern.forge.sdk.models import Step from skyvern.forge.sdk.schemas.ai_suggestions import AISuggestion from skyvern.forge.sdk.schemas.task_v2 import TaskV2, Thought +from skyvern.utils.image_resizer import Resolution, get_resize_target_dimension, resize_screenshots LOG = structlog.get_logger() @@ -454,12 +455,22 @@ class LLMCaller: An LLMCaller instance defines the LLM configs and keeps the chat history if needed. """ - def __init__(self, llm_key: str, base_parameters: dict[str, Any] | None = None): + def __init__( + self, + llm_key: str, + screenshot_scaling_enabled: bool = False, + base_parameters: dict[str, Any] | None = None, + ): self.llm_key = llm_key self.llm_config = LLMConfigRegistry.get_config(llm_key) self.base_parameters = base_parameters self.message_history: list[dict[str, Any]] = [] self.current_tool_results: list[dict[str, Any]] = [] + self.screenshot_scaling_enabled = screenshot_scaling_enabled + self.browser_window_dimension = Resolution(width=settings.BROWSER_WIDTH, height=settings.BROWSER_HEIGHT) + self.screenshot_resize_target_dimension = self.browser_window_dimension + if screenshot_scaling_enabled: + self.screenshot_resize_target_dimension = get_resize_target_dimension(self.browser_window_dimension) def add_tool_result(self, tool_result: dict[str, Any]) -> None: self.current_tool_results.append(tool_result) @@ -504,6 +515,9 @@ class LLMCaller: ai_suggestion=ai_suggestion, ) + if screenshots and self.screenshot_scaling_enabled: + screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension) + await app.ARTIFACT_MANAGER.create_llm_artifact( data=prompt.encode("utf-8") if prompt else b"", artifact_type=ArtifactType.LLM_PROMPT, diff --git a/skyvern/utils/image_resizer.py b/skyvern/utils/image_resizer.py new file mode 100644 index 00000000..da23a9a7 --- /dev/null +++ b/skyvern/utils/image_resizer.py @@ -0,0 +1,59 @@ +import io +from typing import TypedDict + +from PIL import Image + + +class Resolution(TypedDict): + width: int + height: int + + +MAX_SCALING_TARGETS_ANTHROPIC_CUA: dict[str, Resolution] = { + "XGA": Resolution(width=1024, height=768), # 4:3 + "WXGA": Resolution(width=1280, height=800), # 16:10 + "FWXGA": Resolution(width=1366, height=768), # ~16:9 +} + + +def get_resize_target_dimension( + window_size: Resolution, max_scaling_targets: dict[str, Resolution] = MAX_SCALING_TARGETS_ANTHROPIC_CUA +) -> Resolution: + ratio = window_size["width"] / window_size["height"] + for dimension in max_scaling_targets.values(): + if abs(dimension["width"] / dimension["height"] - ratio) < 0.02: + return dimension + return window_size + + +def resize_screenshots(screenshots: list[bytes], target_dimension: Resolution) -> list[bytes]: + """ + The image scaling logic is originated from anthropic's quickstart guide: + https://github.com/anthropics/anthropic-quickstarts/blob/81c4085944abb1734db411f05290b538fdc46dcd/computer-use-demo/computer_use_demo/tools/computer.py#L49-L60 + """ + new_screenshots = [] + for screenshot in screenshots: + # Convert bytes to PIL Image + img = Image.open(io.BytesIO(screenshot)) + + # Resize image to target dimensions + resized_img = img.resize((target_dimension["width"], target_dimension["height"]), Image.Resampling.LANCZOS) + + # Convert back to bytes + img_byte_arr = io.BytesIO() + resized_img.save(img_byte_arr, format="PNG") + img_byte = img_byte_arr.getvalue() + + new_screenshots.append(img_byte) + return new_screenshots + + +def scale_coordinates( + current_coordinates: tuple[int, int], + current_dimension: Resolution, + target_dimension: Resolution, +) -> tuple[int, int]: + return ( + int(current_coordinates[0] * target_dimension["width"] / current_dimension["width"]), + int(current_coordinates[1] * target_dimension["height"] / current_dimension["height"]), + ) diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 715a3567..e508a15a 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -9,6 +9,7 @@ from skyvern.forge import app from skyvern.forge.prompts import prompt_engine from skyvern.forge.sdk.models import Step from skyvern.forge.sdk.schemas.tasks import Task +from skyvern.utils.image_resizer import Resolution, scale_coordinates from skyvern.webeye.actions.actions import ( Action, ActionType, @@ -454,6 +455,8 @@ async def parse_anthropic_actions( task: Task, step: Step, assistant_content: list[dict[str, Any]], + browser_window_dimension: Resolution, + screenshot_resize_target_dimension: Resolution, ) -> list[Action]: tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"] idx = 0 @@ -468,7 +471,11 @@ async def parse_anthropic_actions( continue action = tool_call_input["action"] if action == "mouse_move": - x, y = tool_call_input["coordinate"] + original_x, original_y = tool_call_input["coordinate"] + # (x, y) is the coordinate in resized screenshots. We need to scale it to the browser window dimension. + x, y = scale_coordinates( + (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension + ) actions.append( MoveAction( x=x, @@ -497,7 +504,10 @@ async def parse_anthropic_actions( ) idx += 1 continue - x, y = coordinate + original_x, original_y = coordinate + x, y = scale_coordinates( + (original_x, original_y), screenshot_resize_target_dimension, browser_window_dimension + ) actions.append( ClickAction( element_id="",