anthropic support - dynamic window size / window popup (#2284)

This commit is contained in:
Shuchang Zheng
2025-05-04 00:40:16 -07:00
committed by GitHub
parent a851e8fdd8
commit 8b834436b5
4 changed files with 36 additions and 6 deletions

View File

@@ -7,7 +7,7 @@ import string
from asyncio.exceptions import CancelledError from asyncio.exceptions import CancelledError
from datetime import UTC, datetime from datetime import UTC, datetime
from pathlib import Path from pathlib import Path
from typing import Any, Tuple from typing import Any, Tuple, cast
import httpx import httpx
import structlog import structlog
@@ -72,6 +72,7 @@ from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
from skyvern.schemas.runs import CUA_ENGINES, CUA_RUN_TYPES, RunEngine from skyvern.schemas.runs import CUA_ENGINES, CUA_RUN_TYPES, RunEngine
from skyvern.utils.image_resizer import Resolution
from skyvern.utils.prompt_engine import load_prompt_with_elements from skyvern.utils.prompt_engine import load_prompt_with_elements
from skyvern.webeye.actions.actions import ( from skyvern.webeye.actions.actions import (
Action, Action,
@@ -1428,6 +1429,7 @@ class ForgeAgent:
] ]
thinking = {"type": "enabled", "budget_tokens": 1024} thinking = {"type": "enabled", "budget_tokens": 1024}
betas = ["computer-use-2025-01-24"] betas = ["computer-use-2025-01-24"]
window_dimension = cast(Resolution, scraped_page.window_dimension) if scraped_page.window_dimension else None
if not llm_caller.message_history: if not llm_caller.message_history:
llm_response = await llm_caller.call( llm_response = await llm_caller.call(
prompt=task.navigation_goal, prompt=task.navigation_goal,
@@ -1437,6 +1439,7 @@ class ForgeAgent:
raw_response=True, raw_response=True,
betas=betas, betas=betas,
thinking=thinking, thinking=thinking,
window_dimension=window_dimension,
) )
else: else:
llm_response = await llm_caller.call( llm_response = await llm_caller.call(
@@ -1446,6 +1449,7 @@ class ForgeAgent:
raw_response=True, raw_response=True,
betas=betas, betas=betas,
thinking=thinking, thinking=thinking,
window_dimension=window_dimension,
) )
assistant_content = llm_response["content"] assistant_content = llm_response["content"]
llm_caller.message_history.append({"role": "assistant", "content": assistant_content}) llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
@@ -1454,8 +1458,8 @@ class ForgeAgent:
task, task,
step, step,
assistant_content, assistant_content,
llm_caller.browser_window_dimension, window_dimension or llm_caller.browser_window_dimension,
llm_caller.screenshot_resize_target_dimension, llm_caller.get_screenshot_resize_target_dimension(window_dimension),
) )
return actions return actions

View File

@@ -491,6 +491,7 @@ class LLMCaller:
tools: list | None = None, tools: list | None = None,
use_message_history: bool = False, use_message_history: bool = False,
raw_response: bool = False, raw_response: bool = False,
window_dimension: Resolution | None = None,
**extra_parameters: Any, **extra_parameters: Any,
) -> dict[str, Any]: ) -> dict[str, Any]:
start_time = time.perf_counter() start_time = time.perf_counter()
@@ -516,7 +517,21 @@ class LLMCaller:
) )
if screenshots and self.screenshot_scaling_enabled: if screenshots and self.screenshot_scaling_enabled:
screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension) target_dimension = self.get_screenshot_resize_target_dimension(window_dimension)
if window_dimension and window_dimension != self.browser_window_dimension and tools:
# THIS situation only applies to Anthropic CUA
LOG.info(
"Window dimension is different from the default browser window dimension when making LLM call",
window_dimension=window_dimension,
browser_window_dimension=self.browser_window_dimension,
)
# update the tools to use the new target dimension
for tool in tools:
if "display_height_px" in tool:
tool["display_height_px"] = target_dimension["height"]
if "display_width_px" in tool:
tool["display_width_px"] = target_dimension["width"]
screenshots = resize_screenshots(screenshots, target_dimension)
await app.ARTIFACT_MANAGER.create_llm_artifact( await app.ARTIFACT_MANAGER.create_llm_artifact(
data=prompt.encode("utf-8") if prompt else b"", data=prompt.encode("utf-8") if prompt else b"",
@@ -683,6 +698,11 @@ class LLMCaller:
return parsed_response return parsed_response
def get_screenshot_resize_target_dimension(self, window_dimension: Resolution | None) -> Resolution:
    """Choose the dimension that screenshots should be resized to.

    Args:
        window_dimension: The window size observed for the current call,
            or ``None`` when no per-call size is available.

    Returns:
        ``self.screenshot_resize_target_dimension`` when no window dimension
        is supplied or when it equals ``self.browser_window_dimension``;
        otherwise the result of ``get_resize_target_dimension`` for the
        supplied window dimension.
    """
    # Guard clause: nothing to override, so reuse the instance-level target.
    if not window_dimension or window_dimension == self.browser_window_dimension:
        return self.screenshot_resize_target_dimension

    # The window differs from the default one, so derive a target for it.
    return get_resize_target_dimension(window_dimension)
async def _dispatch_llm_call( async def _dispatch_llm_call(
self, self,
messages: list[dict[str, Any]], messages: list[dict[str, Any]],

View File

@@ -22,7 +22,9 @@ def get_resize_target_dimension(
ratio = window_size["width"] / window_size["height"] ratio = window_size["width"] / window_size["height"]
for dimension in max_scaling_targets.values(): for dimension in max_scaling_targets.values():
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02: if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
return dimension if dimension["width"] < window_size["width"]:
# we only return the dimension if it's smaller than the window size
return dimension
return window_size return window_size

View File

@@ -15,6 +15,7 @@ from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, DEFAULT_MAX_TOKE
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat
from skyvern.forge.sdk.api.crypto import calculate_sha256 from skyvern.forge.sdk.api.crypto import calculate_sha256
from skyvern.forge.sdk.core import skyvern_context from skyvern.forge.sdk.core import skyvern_context
from skyvern.utils.image_resizer import Resolution
from skyvern.utils.token_counter import count_tokens from skyvern.utils.token_counter import count_tokens
from skyvern.webeye.browser_factory import BrowserState from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.utils.page import SkyvernFrame from skyvern.webeye.utils.page import SkyvernFrame
@@ -237,7 +238,7 @@ class ScrapedPage(BaseModel):
url: str url: str
html: str html: str
extracted_text: str | None = None extracted_text: str | None = None
window_dimension: dict[str, int] | None = None
_browser_state: BrowserState = PrivateAttr() _browser_state: BrowserState = PrivateAttr()
_clean_up_func: CleanupElementTreeFunc = PrivateAttr() _clean_up_func: CleanupElementTreeFunc = PrivateAttr()
_scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None) _scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None)
@@ -540,9 +541,11 @@ async def scrape_web_unsafe(
text_content = await get_frame_text(page.main_frame) text_content = await get_frame_text(page.main_frame)
html = "" html = ""
window_dimension = None
try: try:
skyvern_frame = await SkyvernFrame.create_instance(frame=page) skyvern_frame = await SkyvernFrame.create_instance(frame=page)
html = await skyvern_frame.get_content() html = await skyvern_frame.get_content()
window_dimension = Resolution(width=page.viewport_size["width"], height=page.viewport_size["height"])
except Exception: except Exception:
LOG.error( LOG.error(
"Failed out to get HTML content", "Failed out to get HTML content",
@@ -563,6 +566,7 @@ async def scrape_web_unsafe(
url=page.url, url=page.url,
html=html, html=html,
extracted_text=text_content, extracted_text=text_content,
window_dimension=window_dimension,
_browser_state=browser_state, _browser_state=browser_state,
_clean_up_func=cleanup_element_tree, _clean_up_func=cleanup_element_tree,
_scrape_exclude=scrape_exclude, _scrape_exclude=scrape_exclude,