Fix tool_call_input parsing with the Anthropic SDK (#2242)

This commit is contained in:
Shuchang Zheng
2025-04-29 02:57:56 +08:00
committed by GitHub
parent 1449720038
commit 7e4a193443
2 changed files with 30 additions and 31 deletions

View File

@@ -642,7 +642,7 @@ class LLMCaller:
organization_id=step.organization_id if step else (thought.organization_id if thought else None), organization_id=step.organization_id if step else (thought.organization_id if thought else None),
) )
if raw_response: if raw_response:
return response.model_dump() return response.model_dump(exclude_none=True)
parsed_response = parse_api_response(response, self.llm_config.add_assistant_prefix) parsed_response = parse_api_response(response, self.llm_config.add_assistant_prefix)
await app.ARTIFACT_MANAGER.create_llm_artifact( await app.ARTIFACT_MANAGER.create_llm_artifact(
@@ -694,7 +694,13 @@ class LLMCaller:
model_name = self.llm_config.model_name.replace("bedrock/", "").replace("anthropic/", "") model_name = self.llm_config.model_name.replace("bedrock/", "").replace("anthropic/", "")
betas = active_parameters.get("betas", NOT_GIVEN) betas = active_parameters.get("betas", NOT_GIVEN)
thinking = active_parameters.get("thinking", NOT_GIVEN) thinking = active_parameters.get("thinking", NOT_GIVEN)
LOG.info("Anthropic request", betas=betas, tools=tools, timeout=timeout) LOG.info(
"Anthropic request",
model_name=model_name,
betas=betas,
tools=tools,
timeout=timeout,
)
response = await app.ANTHROPIC_CLIENT.beta.messages.create( response = await app.ANTHROPIC_CLIENT.beta.messages.create(
max_tokens=max_tokens, max_tokens=max_tokens,
messages=messages, messages=messages,
@@ -704,7 +710,14 @@ class LLMCaller:
betas=betas, betas=betas,
thinking=thinking, thinking=thinking,
) )
LOG.info("Anthropic response", response=response, betas=betas, tools=tools, timeout=timeout) LOG.info(
"Anthropic response",
model_name=model_name,
response=response,
betas=betas,
tools=tools,
timeout=timeout,
)
return response return response

View File

@@ -1,4 +1,3 @@
import json
from typing import Any, Dict from typing import Any, Dict
import structlog import structlog
@@ -456,19 +455,20 @@ async def parse_anthropic_actions(
step: Step, step: Step,
assistant_content: list[dict[str, Any]], assistant_content: list[dict[str, Any]],
) -> list[Action]: ) -> list[Action]:
tool_calls = [block for block in assistant_content if block["type"] == "tool_use"] tool_calls = [block for block in assistant_content if block["type"] == "tool_use" and block["name"] == "computer"]
idx = 0 idx = 0
actions: list[Action] = [] actions: list[Action] = []
LOG.info("Anthropic tool calls", tool_calls=tool_calls, assistant_content=assistant_content)
while idx < len(tool_calls): while idx < len(tool_calls):
tool_call = tool_calls[idx] tool_call = tool_calls[idx]
tool_call_id = tool_call["id"] tool_call_id = tool_call["id"]
parsed_args = _parse_anthropic_computer_args(tool_call) tool_call_input = tool_call.get("input")
if not parsed_args: if not tool_call_input:
idx += 1 idx += 1
continue continue
action = parsed_args["action"] action = tool_call_input["action"]
if action == "mouse_move": if action == "mouse_move":
x, y = parsed_args["coordinate"] x, y = tool_call_input["coordinate"]
actions.append( actions.append(
MoveAction( MoveAction(
x=x, x=x,
@@ -483,15 +483,12 @@ async def parse_anthropic_actions(
) )
) )
elif action == "left_click": elif action == "left_click":
if idx - 1 >= 0: coordinate = tool_call_input.get("coordinate")
if not coordinate and idx - 1 >= 0:
prev_tool_call = tool_calls[idx - 1] prev_tool_call = tool_calls[idx - 1]
prev_parsed_args = _parse_anthropic_computer_args(prev_tool_call) prev_tool_call_input = prev_tool_call.get("input")
if prev_parsed_args and prev_parsed_args["action"] == "mouse_move": if prev_tool_call_input and prev_tool_call_input["action"] == "mouse_move":
coordinate = prev_parsed_args["coordinate"] coordinate = prev_tool_call_input.get("coordinate")
else:
coordinate = parsed_args.get("coordinate")
else:
coordinate = parsed_args.get("coordinate")
if not coordinate: if not coordinate:
LOG.warning( LOG.warning(
@@ -517,10 +514,10 @@ async def parse_anthropic_actions(
) )
) )
elif action == "type": elif action == "type":
text = parsed_args.get("text") text = tool_call_input.get("text")
if not text: if not text:
LOG.warning( LOG.warning(
"Type action has no text", "Anthropic type action has no text",
tool_call=tool_call, tool_call=tool_call,
) )
idx += 1 idx += 1
@@ -539,7 +536,7 @@ async def parse_anthropic_actions(
) )
) )
elif action == "key": elif action == "key":
text = parsed_args.get("text") text = tool_call_input.get("text")
if not text: if not text:
LOG.warning( LOG.warning(
"Key action has no text", "Key action has no text",
@@ -579,14 +576,3 @@ async def parse_anthropic_actions(
) )
idx += 1 idx += 1
return actions return actions
def _parse_anthropic_computer_args(tool_call: dict[str, Any]) -> dict[str, Any] | None:
tool_call_type = tool_call["type"]
if tool_call_type != "function":
return None
tool_call_name = tool_call["function"]["name"]
if tool_call_name != "computer":
return None
tool_call_arguments = tool_call["function"]["arguments"]
return json.loads(tool_call_arguments)