anthropic support - dynamic window size / window popup (#2284)
This commit is contained in:
@@ -7,7 +7,7 @@ import string
|
|||||||
from asyncio.exceptions import CancelledError
|
from asyncio.exceptions import CancelledError
|
||||||
from datetime import UTC, datetime
|
from datetime import UTC, datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from typing import Any, Tuple
|
from typing import Any, Tuple, cast
|
||||||
|
|
||||||
import httpx
|
import httpx
|
||||||
import structlog
|
import structlog
|
||||||
@@ -72,6 +72,7 @@ from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
|
|||||||
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
|
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
|
||||||
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
|
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
|
||||||
from skyvern.schemas.runs import CUA_ENGINES, CUA_RUN_TYPES, RunEngine
|
from skyvern.schemas.runs import CUA_ENGINES, CUA_RUN_TYPES, RunEngine
|
||||||
|
from skyvern.utils.image_resizer import Resolution
|
||||||
from skyvern.utils.prompt_engine import load_prompt_with_elements
|
from skyvern.utils.prompt_engine import load_prompt_with_elements
|
||||||
from skyvern.webeye.actions.actions import (
|
from skyvern.webeye.actions.actions import (
|
||||||
Action,
|
Action,
|
||||||
@@ -1428,6 +1429,7 @@ class ForgeAgent:
|
|||||||
]
|
]
|
||||||
thinking = {"type": "enabled", "budget_tokens": 1024}
|
thinking = {"type": "enabled", "budget_tokens": 1024}
|
||||||
betas = ["computer-use-2025-01-24"]
|
betas = ["computer-use-2025-01-24"]
|
||||||
|
window_dimension = cast(Resolution, scraped_page.window_dimension) if scraped_page.window_dimension else None
|
||||||
if not llm_caller.message_history:
|
if not llm_caller.message_history:
|
||||||
llm_response = await llm_caller.call(
|
llm_response = await llm_caller.call(
|
||||||
prompt=task.navigation_goal,
|
prompt=task.navigation_goal,
|
||||||
@@ -1437,6 +1439,7 @@ class ForgeAgent:
|
|||||||
raw_response=True,
|
raw_response=True,
|
||||||
betas=betas,
|
betas=betas,
|
||||||
thinking=thinking,
|
thinking=thinking,
|
||||||
|
window_dimension=window_dimension,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
llm_response = await llm_caller.call(
|
llm_response = await llm_caller.call(
|
||||||
@@ -1446,6 +1449,7 @@ class ForgeAgent:
|
|||||||
raw_response=True,
|
raw_response=True,
|
||||||
betas=betas,
|
betas=betas,
|
||||||
thinking=thinking,
|
thinking=thinking,
|
||||||
|
window_dimension=window_dimension,
|
||||||
)
|
)
|
||||||
assistant_content = llm_response["content"]
|
assistant_content = llm_response["content"]
|
||||||
llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
|
llm_caller.message_history.append({"role": "assistant", "content": assistant_content})
|
||||||
@@ -1454,8 +1458,8 @@ class ForgeAgent:
|
|||||||
task,
|
task,
|
||||||
step,
|
step,
|
||||||
assistant_content,
|
assistant_content,
|
||||||
llm_caller.browser_window_dimension,
|
window_dimension or llm_caller.browser_window_dimension,
|
||||||
llm_caller.screenshot_resize_target_dimension,
|
llm_caller.get_screenshot_resize_target_dimension(window_dimension),
|
||||||
)
|
)
|
||||||
return actions
|
return actions
|
||||||
|
|
||||||
|
|||||||
@@ -491,6 +491,7 @@ class LLMCaller:
|
|||||||
tools: list | None = None,
|
tools: list | None = None,
|
||||||
use_message_history: bool = False,
|
use_message_history: bool = False,
|
||||||
raw_response: bool = False,
|
raw_response: bool = False,
|
||||||
|
window_dimension: Resolution | None = None,
|
||||||
**extra_parameters: Any,
|
**extra_parameters: Any,
|
||||||
) -> dict[str, Any]:
|
) -> dict[str, Any]:
|
||||||
start_time = time.perf_counter()
|
start_time = time.perf_counter()
|
||||||
@@ -516,7 +517,21 @@ class LLMCaller:
|
|||||||
)
|
)
|
||||||
|
|
||||||
if screenshots and self.screenshot_scaling_enabled:
|
if screenshots and self.screenshot_scaling_enabled:
|
||||||
screenshots = resize_screenshots(screenshots, self.screenshot_resize_target_dimension)
|
target_dimension = self.get_screenshot_resize_target_dimension(window_dimension)
|
||||||
|
if window_dimension and window_dimension != self.browser_window_dimension and tools:
|
||||||
|
# THIS situation only applies to Anthropic CUA
|
||||||
|
LOG.info(
|
||||||
|
"Window dimension is different from the default browser window dimension when making LLM call",
|
||||||
|
window_dimension=window_dimension,
|
||||||
|
browser_window_dimension=self.browser_window_dimension,
|
||||||
|
)
|
||||||
|
# update the tools to use the new target dimension
|
||||||
|
for tool in tools:
|
||||||
|
if "display_height_px" in tool:
|
||||||
|
tool["display_height_px"] = target_dimension["height"]
|
||||||
|
if "display_width_px" in tool:
|
||||||
|
tool["display_width_px"] = target_dimension["width"]
|
||||||
|
screenshots = resize_screenshots(screenshots, target_dimension)
|
||||||
|
|
||||||
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
data=prompt.encode("utf-8") if prompt else b"",
|
data=prompt.encode("utf-8") if prompt else b"",
|
||||||
@@ -683,6 +698,11 @@ class LLMCaller:
|
|||||||
|
|
||||||
return parsed_response
|
return parsed_response
|
||||||
|
|
||||||
|
def get_screenshot_resize_target_dimension(self, window_dimension: Resolution | None) -> Resolution:
|
||||||
|
if window_dimension and window_dimension != self.browser_window_dimension:
|
||||||
|
return get_resize_target_dimension(window_dimension)
|
||||||
|
return self.screenshot_resize_target_dimension
|
||||||
|
|
||||||
async def _dispatch_llm_call(
|
async def _dispatch_llm_call(
|
||||||
self,
|
self,
|
||||||
messages: list[dict[str, Any]],
|
messages: list[dict[str, Any]],
|
||||||
|
|||||||
@@ -22,7 +22,9 @@ def get_resize_target_dimension(
|
|||||||
ratio = window_size["width"] / window_size["height"]
|
ratio = window_size["width"] / window_size["height"]
|
||||||
for dimension in max_scaling_targets.values():
|
for dimension in max_scaling_targets.values():
|
||||||
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
|
if abs(dimension["width"] / dimension["height"] - ratio) < 0.02:
|
||||||
return dimension
|
if dimension["width"] < window_size["width"]:
|
||||||
|
# we only return the dimension if it's smaller than the window size
|
||||||
|
return dimension
|
||||||
return window_size
|
return window_size
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, DEFAULT_MAX_TOKE
|
|||||||
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat
|
from skyvern.exceptions import FailedToTakeScreenshot, ScrapingFailed, UnknownElementTreeFormat
|
||||||
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
||||||
from skyvern.forge.sdk.core import skyvern_context
|
from skyvern.forge.sdk.core import skyvern_context
|
||||||
|
from skyvern.utils.image_resizer import Resolution
|
||||||
from skyvern.utils.token_counter import count_tokens
|
from skyvern.utils.token_counter import count_tokens
|
||||||
from skyvern.webeye.browser_factory import BrowserState
|
from skyvern.webeye.browser_factory import BrowserState
|
||||||
from skyvern.webeye.utils.page import SkyvernFrame
|
from skyvern.webeye.utils.page import SkyvernFrame
|
||||||
@@ -237,7 +238,7 @@ class ScrapedPage(BaseModel):
|
|||||||
url: str
|
url: str
|
||||||
html: str
|
html: str
|
||||||
extracted_text: str | None = None
|
extracted_text: str | None = None
|
||||||
|
window_dimension: dict[str, int] | None = None
|
||||||
_browser_state: BrowserState = PrivateAttr()
|
_browser_state: BrowserState = PrivateAttr()
|
||||||
_clean_up_func: CleanupElementTreeFunc = PrivateAttr()
|
_clean_up_func: CleanupElementTreeFunc = PrivateAttr()
|
||||||
_scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None)
|
_scrape_exclude: ScrapeExcludeFunc | None = PrivateAttr(default=None)
|
||||||
@@ -540,9 +541,11 @@ async def scrape_web_unsafe(
|
|||||||
text_content = await get_frame_text(page.main_frame)
|
text_content = await get_frame_text(page.main_frame)
|
||||||
|
|
||||||
html = ""
|
html = ""
|
||||||
|
window_dimension = None
|
||||||
try:
|
try:
|
||||||
skyvern_frame = await SkyvernFrame.create_instance(frame=page)
|
skyvern_frame = await SkyvernFrame.create_instance(frame=page)
|
||||||
html = await skyvern_frame.get_content()
|
html = await skyvern_frame.get_content()
|
||||||
|
window_dimension = Resolution(width=page.viewport_size["width"], height=page.viewport_size["height"])
|
||||||
except Exception:
|
except Exception:
|
||||||
LOG.error(
|
LOG.error(
|
||||||
"Failed out to get HTML content",
|
"Failed out to get HTML content",
|
||||||
@@ -563,6 +566,7 @@ async def scrape_web_unsafe(
|
|||||||
url=page.url,
|
url=page.url,
|
||||||
html=html,
|
html=html,
|
||||||
extracted_text=text_content,
|
extracted_text=text_content,
|
||||||
|
window_dimension=window_dimension,
|
||||||
_browser_state=browser_state,
|
_browser_state=browser_state,
|
||||||
_clean_up_func=cleanup_element_tree,
|
_clean_up_func=cleanup_element_tree,
|
||||||
_scrape_exclude=scrape_exclude,
|
_scrape_exclude=scrape_exclude,
|
||||||
|
|||||||
Reference in New Issue
Block a user