anthropic support - dynamic window size / window popup (#2284)
This commit is contained in:
@@ -7,7 +7,7 @@ import string
|
||||
from asyncio.exceptions import CancelledError
|
||||
from datetime import UTC, datetime
|
||||
from pathlib import Path
|
||||
from typing import Any, Tuple
|
||||
from typing import Any, Tuple, cast
|
||||
|
||||
import httpx
|
||||
import structlog
|
||||
@@ -72,6 +72,7 @@ from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
|
||||
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
|
||||
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
|
||||
from skyvern.schemas.runs import CUA_ENGINES, CUA_RUN_TYPES, RunEngine
|
||||
from skyvern.utils.image_resizer import Resolution
|
||||
from skyvern.utils.prompt_engine import load_prompt_with_elements
|
||||
from skyvern.webeye.actions.actions import (
|
||||
Action,
|
||||
@@ -1428,6 +1429,7 @@ class ForgeAgent:
|
||||
]
|
||||
thinking = {"type": "enabled", "budget_tokens": 1024}
|
||||
betas = ["computer-use-2025-01-24"]
|
||||
window_dimension = cast(Resolution, scraped_page.window_dimension) if scraped_page.window_dimension else None
|
||||
if not llm_caller.message_history:
|
||||
llm_response = await llm_caller.call(
|
||||
prompt=task.navigation_goal,
|
||||
@@ -1437,6 +1439,7 @@ class ForgeAgent:
|
||||
raw_response=True,
|
||||
betas=betas,
|
||||
thinking=thinking,
|
||||
window_dimension=window_dimension,
|
||||
)
|
||||
else:
|
||||
llm_response = await llm_caller.call(
|
||||
@@ -1446,6 +1449,7 @@ class ForgeAgent:
|
||||
raw_response=True,
|
||||
betas=betas,
|
||||
thinking=thinking,
|
||||
window_dimension=window_dimension,
|
||||
)
|
||||
assistant_content = llm_response["content"]
|
||||
llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
|
||||
@@ -1454,8 +1458,8 @@ class ForgeAgent:
|
||||
task,
|
||||
step,
|
||||
assistant_content,
|
||||
llm_caller.browser_window_dimension,
|
||||
llm_caller.screenshot_resize_target_dimension,
|
||||
window_dimension or llm_caller.browser_window_dimension,
|
||||
llm_caller.get_screenshot_resize_target_dimension(window_dimension),
|
||||
)
|
||||
return actions
|
||||
|
||||
|
||||
@@ -491,6 +491,7 @@ class LLMCaller:
|
||||
tools: list | None = None,
|
||||
use_message_history: bool = False,
|
||||
raw_response: bool = False,
|
||||
window_dimension: Resolution | None = None,
|
||||
**extra_parameters: Any,
|
||||
) -> dict[str, Any]:
|
||||
start_time = time.perf_counter()
|
||||
@@ -516,7 +517,21 @@ class LLMCaller:
|
||||
)
|
||||
|
||||
if screenshots and self.screenshot_scaling_enabled:
|
||||
screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension)
|
||||
target_dimension = self.get_screenshot_resize_target_dimension(window_dimension)
|
||||
if window_dimension and window_dimension != self.browser_window_dimension and tools:
|
||||
# THIS situation only applies to Anthropic CUA
|
||||
LOG.info(
|
||||
"Window dimension is different from the default browser window dimension when making LLM call",
|
||||
window_dimension=window_dimension,
|
||||
browser_window_dimension=self.browser_window_dimension,
|
||||
)
|
||||
# update the tools to use the new target dimension
|
||||
for tool in tools:
|
||||
if "display_height_px" in tool:
|
||||
tool["display_height_px"] = target_dimension["height"]
|
||||
if "display_width_px" in tool:
|
||||
tool["display_width_px"] = target_dimension["width"]
|
||||
screenshots = resize_screenshots(screenshots, target_dimension)
|
||||
|
||||
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||
data=prompt.encode("utf-8") if prompt else b"",
|
||||
@@ -683,6 +698,11 @@ class LLMCaller:
|
||||
|
||||
return parsed_response
|
||||
|
||||
def get_screenshot_resize_target_dimension(self, window_dimension: Resolution | None) -> Resolution:
    """Pick the resolution that screenshots should be resized to.

    Args:
        window_dimension: The dimension of the window the screenshot was
            taken from, or ``None`` when it is not known.

    Returns:
        ``self.screenshot_resize_target_dimension`` when no window dimension
        is supplied or when it equals ``self.browser_window_dimension``;
        otherwise the result of ``get_resize_target_dimension`` for the
        supplied window dimension.
    """
    # Guard clause: nothing differs from the default browser window, so the
    # precomputed target stored on the caller is reused as-is.
    uses_default_window = not window_dimension or window_dimension == self.browser_window_dimension
    if uses_default_window:
        return self.screenshot_resize_target_dimension

    # The window differs from the default one; derive a target for it.
    return get_resize_target_dimension(window_dimension)
|
||||
|
||||
async def _dispatch_llm_call(
|
||||
self,
|
||||
messages: list[dict[str, Any]],
|
||||
|
||||
@@ -22,7 +22,9 @@ def get_resize_target_dimension(
|
||||
ratio = window_size["width"] / window_size["height"]
|
||||
for dimension in max_scaling_targets.values():
|
||||
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
|
||||
return dimension
|
||||
if dimension["width"] < window_size["width"]:
|
||||
# we only return the dimension if it's smaller than the window size
|
||||
return dimension
|
||||
return window_size
|
||||
|
||||
|
||||
|
||||
@@ -15,6 +15,7 @@ from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, DEFAULT_MAX_TOKE
|
||||
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat
|
||||
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
||||
from skyvern.forge.sdk.core import skyvern_context
|
||||
from skyvern.utils.image_resizer import Resolution
|
||||
from skyvern.utils.token_counter import count_tokens
|
||||
from skyvern.webeye.browser_factory import BrowserState
|
||||
from skyvern.webeye.utils.page import SkyvernFrame
|
||||
@@ -237,7 +238,7 @@ class ScrapedPage(BaseModel):
|
||||
url: str
|
||||
html: str
|
||||
extracted_text: str | None = None
|
||||
|
||||
window_dimension: dict[str, int] | None = None
|
||||
_browser_state: BrowserState = PrivateAttr()
|
||||
_clean_up_func: CleanupElementTreeFunc = PrivateAttr()
|
||||
_scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None)
|
||||
@@ -540,9 +541,11 @@ async def scrape_web_unsafe(
|
||||
text_content = await get_frame_text(page.main_frame)
|
||||
|
||||
html = ""
|
||||
window_dimension = None
|
||||
try:
|
||||
skyvern_frame = await SkyvernFrame.create_instance(frame=page)
|
||||
html = await skyvern_frame.get_content()
|
||||
window_dimension = Resolution(width=page.viewport_size["width"], height=page.viewport_size["height"])
|
||||
except Exception:
|
||||
LOG.error(
|
||||
"Failed out to get HTML content",
|
||||
@@ -563,6 +566,7 @@ async def scrape_web_unsafe(
|
||||
url=page.url,
|
||||
html=html,
|
||||
extracted_text=text_content,
|
||||
window_dimension=window_dimension,
|
||||
_browser_state=browser_state,
|
||||
_clean_up_func=cleanup_element_tree,
|
||||
_scrape_exclude=scrape_exclude,
|
||||
|
||||
Reference in New Issue
Block a user