diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 37e37032..79586bee 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1433,7 +1433,6 @@ class ForgeAgent: raw_response=True, betas=["computer-use-2025-01-24"], ) - LOG.info("Anthropic response", llm_response=llm_response) assistant_content = llm_response["content"] llm_caller.message_history.append({"role": "assistant", "content": assistant_content}) diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index 1386d295..d704efaa 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -6,7 +6,7 @@ from typing import Any import litellm import structlog -from anthropic.types.message import Message as AnthropicMessage +from anthropic.types.beta.beta_message import BetaMessage as AnthropicMessage from jinja2 import Template from litellm.utils import CustomStreamWrapper, ModelResponse @@ -678,7 +678,7 @@ class LLMCaller: ) -> AnthropicMessage: max_tokens = active_parameters.get("max_completion_tokens") or active_parameters.get("max_tokens") or 4096 model_name = self.llm_config.model_name.replace("bedrock/", "").replace("anthropic/", "") - return await app.ANTHROPIC_CLIENT.messages.create( + response = await app.ANTHROPIC_CLIENT.beta.messages.create( max_tokens=max_tokens, messages=messages, model=model_name, @@ -686,6 +686,8 @@ class LLMCaller: timeout=timeout, betas=active_parameters.get("betas", None), ) + LOG.info("Anthropic response", response=response) + return response class LLMCallerManager: diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 8e0cd40d..325705d5 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -482,7 +482,6 @@ async def parse_anthropic_actions( tool_call_id=tool_call_id, ) ) - idx += 1 elif action == "left_click": if idx - 1 >= 0: prev_tool_call = tool_calls[idx - 1] @@ -494,12 +493,12 @@ async def parse_anthropic_actions( else: coordinate = parsed_args.get("coordinate") - idx += 1 if not coordinate: LOG.warning( "Left click action has no coordinate and it doesn't have mouse_move before it", tool_call=tool_call, ) + idx += 1 continue x, y = coordinate actions.append( @@ -519,12 +518,12 @@ async def parse_anthropic_actions( ) elif action == "type": text = parsed_args.get("text") - idx += 1 if not text: LOG.warning( "Type action has no text", tool_call=tool_call, ) + idx += 1 continue actions.append( InputTextAction( @@ -541,12 +540,12 @@ async def parse_anthropic_actions( ) elif action == "key": text = parsed_args.get("text") - idx += 1 if not text: LOG.warning( "Key action has no text", tool_call=tool_call, ) + idx += 1 continue actions.append( KeypressAction( @@ -561,12 +560,24 @@ async def parse_anthropic_actions( tool_call_id=tool_call_id, ) ) + elif action == "screenshot": + actions.append( + NullAction( + organization_id=task.organization_id, + workflow_run_id=task.workflow_run_id, + task_id=task.task_id, + step_id=step.step_id, + step_order=step.order, + action_order=idx, + tool_call_id=tool_call_id, + ) + ) else: LOG.error( "Unsupported action", tool_call=tool_call, ) - idx += 1 + idx += 1 return actions