diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 517d6826..2848d776 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -7,7 +7,7 @@ import string from asyncio.exceptions import CancelledError from datetime import UTC, datetime from pathlib import Path -from typing import Any, Tuple +from typing import Any, Tuple, cast import httpx import structlog @@ -72,6 +72,7 @@ from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus from skyvern.schemas.runs import CUA_ENGINES, CUA_RUN_TYPES, RunEngine +from skyvern.utils.image_resizer import Resolution from skyvern.utils.prompt_engine import load_prompt_with_elements from skyvern.webeye.actions.actions import ( Action, @@ -1428,6 +1429,7 @@ class ForgeAgent: ] thinking = {"type": "enabled", "budget_tokens": 1024} betas = ["computer-use-2025-01-24"] + window_dimension = cast(Resolution, scraped_page.window_dimension) if scraped_page.window_dimension else None if not llm_caller.message_history: llm_response = await llm_caller.call( prompt=task.navigation_goal, @@ -1437,6 +1439,7 @@ class ForgeAgent: raw_response=True, betas=betas, thinking=thinking, + window_dimension=window_dimension, ) else: llm_response = await llm_caller.call( @@ -1446,6 +1449,7 @@ class ForgeAgent: raw_response=True, betas=betas, thinking=thinking, + window_dimension=window_dimension, ) assistant_content = llm_response["content"] llm_caller.message_history.append({"role": "assistant", "content": assistant_content}) @@ -1454,8 +1458,8 @@ class ForgeAgent: task, step, assistant_content, - llm_caller.browser_window_dimension, - llm_caller.screenshot_resize_target_dimension, + window_dimension or llm_caller.browser_window_dimension, + llm_caller.get_screenshot_resize_target_dimension(window_dimension), ) return actions diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index 0f4b260e..f546b062 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -491,6 +491,7 @@ class LLMCaller: tools: list | None = None, use_message_history: bool = False, raw_response: bool = False, + window_dimension: Resolution | None = None, **extra_parameters: Any, ) -> dict[str, Any]: start_time = time.perf_counter() @@ -516,7 +517,21 @@ class LLMCaller: ) if screenshots and self.screenshot_scaling_enabled: - screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension) + target_dimension = self.get_screenshot_resize_target_dimension(window_dimension) + if window_dimension and window_dimension != self.browser_window_dimension and tools: + # THIS situation only applies to Anthropic CUA + LOG.info( + "Window dimension is different from the default browser window dimension when making LLM call", + window_dimension=window_dimension, + browser_window_dimension=self.browser_window_dimension, + ) + # update the tools to use the new target dimension + for tool in tools: + if "display_height_px" in tool: + tool["display_height_px"] = target_dimension["height"] + if "display_width_px" in tool: + tool["display_width_px"] = target_dimension["width"] + screenshots = resize_screenshots(screenshots, target_dimension) await app.ARTIFACT_MANAGER.create_llm_artifact( data=prompt.encode("utf-8") if prompt else b"", @@ -683,6 +698,11 @@ class LLMCaller: return parsed_response + def get_screenshot_resize_target_dimension(self, window_dimension: Resolution | None) -> Resolution: + if window_dimension and window_dimension != self.browser_window_dimension: + return get_resize_target_dimension(window_dimension) + return self.screenshot_resize_target_dimension + async def _dispatch_llm_call( self, messages: list[dict[str, Any]], diff --git a/skyvern/utils/image_resizer.py b/skyvern/utils/image_resizer.py index da23a9a7..5f2848ee 100644 --- a/skyvern/utils/image_resizer.py +++ b/skyvern/utils/image_resizer.py @@ -22,7 +22,9 @@ def get_resize_target_dimension( ratio = window_size["width"] / window_size["height"] for dimension in max_scaling_targets.values(): if abs(dimension["width"] / dimension["height"] - ratio) < 0.02: - return dimension + if dimension["width"] < window_size["width"]: + # we only return the dimension if it's smaller than the window size + return dimension return window_size diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 524b1246..a74fd21d 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -15,6 +15,7 @@ from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, DEFAULT_MAX_TOKE from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat from skyvern.forge.sdk.api.crypto import calculate_sha256 from skyvern.forge.sdk.core import skyvern_context +from skyvern.utils.image_resizer import Resolution from skyvern.utils.token_counter import count_tokens from skyvern.webeye.browser_factory import BrowserState from skyvern.webeye.utils.page import SkyvernFrame @@ -237,7 +238,7 @@ class ScrapedPage(BaseModel): url: str html: str extracted_text: str | None = None - + window_dimension: dict[str, int] | None = None _browser_state: BrowserState = PrivateAttr() _clean_up_func: CleanupElementTreeFunc = PrivateAttr() _scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None) @@ -540,9 +541,11 @@ async def scrape_web_unsafe( text_content = await get_frame_text(page.main_frame) html = "" + window_dimension = None try: skyvern_frame = await SkyvernFrame.create_instance(frame=page) html = await skyvern_frame.get_content() + window_dimension = Resolution(width=page.viewport_size["width"], height=page.viewport_size["height"]) except Exception: LOG.error( "Failed out to get HTML content", @@ -563,6 +566,7 @@ async def scrape_web_unsafe( url=page.url, html=html, extracted_text=text_content, + window_dimension=window_dimension, _browser_state=browser_state, _clean_up_func=cleanup_element_tree, _scrape_exclude=scrape_exclude,