anthropic support - dynamic window size / window popup (#2284)

This commit is contained in:
Shuchang Zheng
2025-05-04 00:40:16 -07:00
committed by GitHub
parent a851e8fdd8
commit 8b834436b5
4 changed files with 36 additions and 6 deletions

View File

@@ -7,7 +7,7 @@ import string
from asyncio.exceptions import CancelledError
from datetime import UTC, datetime
from pathlib import Path
from typing import Any, Tuple
from typing import Any, Tuple, cast
import httpx
import structlog
@@ -72,6 +72,7 @@ from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
from skyvern.schemas.runs import CUA_ENGINES, CUA_RUN_TYPES, RunEngine
from skyvern.utils.image_resizer import Resolution
from skyvern.utils.prompt_engine import load_prompt_with_elements
from skyvern.webeye.actions.actions import (
Action,
@@ -1428,6 +1429,7 @@ class ForgeAgent:
]
thinking = {"type": "enabled", "budget_tokens": 1024}
betas = ["computer-use-2025-01-24"]
window_dimension = cast(Resolution, scraped_page.window_dimension) if scraped_page.window_dimension else None
if not llm_caller.message_history:
llm_response = await llm_caller.call(
prompt=task.navigation_goal,
@@ -1437,6 +1439,7 @@ class ForgeAgent:
raw_response=True,
betas=betas,
thinking=thinking,
window_dimension=window_dimension,
)
else:
llm_response = await llm_caller.call(
@@ -1446,6 +1449,7 @@ class ForgeAgent:
raw_response=True,
betas=betas,
thinking=thinking,
window_dimension=window_dimension,
)
assistant_content = llm_response["content"]
llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
@@ -1454,8 +1458,8 @@ class ForgeAgent:
task,
step,
assistant_content,
llm_caller.browser_window_dimension,
llm_caller.screenshot_resize_target_dimension,
window_dimension or llm_caller.browser_window_dimension,
llm_caller.get_screenshot_resize_target_dimension(window_dimension),
)
return actions

View File

@@ -491,6 +491,7 @@ class LLMCaller:
tools: list | None = None,
use_message_history: bool = False,
raw_response: bool = False,
window_dimension: Resolution | None = None,
**extra_parameters: Any,
) -> dict[str, Any]:
start_time = time.perf_counter()
@@ -516,7 +517,21 @@ class LLMCaller:
)
if screenshots and self.screenshot_scaling_enabled:
screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension)
target_dimension = self.get_screenshot_resize_target_dimension(window_dimension)
if window_dimension and window_dimension != self.browser_window_dimension and tools:
# THIS situation only applies to Anthropic CUA
LOG.info(
"Window dimension is different from the default browser window dimension when making LLM call",
window_dimension=window_dimension,
browser_window_dimension=self.browser_window_dimension,
)
# update the tools to use the new target dimension
for tool in tools:
if "display_height_px" in tool:
tool["display_height_px"] = target_dimension["height"]
if "display_width_px" in tool:
tool["display_width_px"] = target_dimension["width"]
screenshots = resize_screenshots(screenshots, target_dimension)
await app.ARTIFACT_MANAGER.create_llm_artifact(
data=prompt.encode("utf-8") if prompt else b"",
@@ -683,6 +698,11 @@ class LLMCaller:
return parsed_response
def get_screenshot_resize_target_dimension(self, window_dimension: Resolution | None) -> Resolution:
    """Choose the resolution that screenshots should be resized to.

    Args:
        window_dimension: The window size observed for the current page, or
            ``None`` when no size is available.

    Returns:
        ``self.screenshot_resize_target_dimension`` when ``window_dimension`` is
        falsy or equal to ``self.browser_window_dimension``; otherwise the value
        produced by ``get_resize_target_dimension(window_dimension)``.
    """
    # No window size supplied: fall back to the caller-level resize target.
    if not window_dimension:
        return self.screenshot_resize_target_dimension

    # Only a window that differs from the configured browser window needs a
    # freshly derived target; an identical one reuses the existing target.
    if window_dimension != self.browser_window_dimension:
        return get_resize_target_dimension(window_dimension)

    return self.screenshot_resize_target_dimension
async def _dispatch_llm_call(
self,
messages: list[dict[str, Any]],

View File

@@ -22,7 +22,9 @@ def get_resize_target_dimension(
ratio = window_size["width"] / window_size["height"]
for dimension in max_scaling_targets.values():
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
return dimension
if dimension["width"] < window_size["width"]:
# we only return the dimension if it's smaller than the window size
return dimension
return window_size

View File

@@ -15,6 +15,7 @@ from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, DEFAULT_MAX_TOKE
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat
from skyvern.forge.sdk.api.crypto import calculate_sha256
from skyvern.forge.sdk.core import skyvern_context
from skyvern.utils.image_resizer import Resolution
from skyvern.utils.token_counter import count_tokens
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.utils.page import SkyvernFrame
@@ -237,7 +238,7 @@ class ScrapedPage(BaseModel):
url: str
html: str
extracted_text: str | None = None
window_dimension: dict[str, int] | None = None
_browser_state: BrowserState = PrivateAttr()
_clean_up_func: CleanupElementTreeFunc = PrivateAttr()
_scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None)
@@ -540,9 +541,11 @@ async def scrape_web_unsafe(
text_content = await get_frame_text(page.main_frame)
html = ""
window_dimension = None
try:
skyvern_frame = await SkyvernFrame.create_instance(frame=page)
html = await skyvern_frame.get_content()
window_dimension = Resolution(width=page.viewport_size["width"], height=page.viewport_size["height"])
except Exception:
LOG.error(
"Failed out to get HTML content",
@@ -563,6 +566,7 @@ async def scrape_web_unsafe(
url=page.url,
html=html,
extracted_text=text_content,
window_dimension=window_dimension,
_browser_state=browser_state,
_clean_up_func=cleanup_element_tree,
_scrape_exclude=scrape_exclude,