From 33bea92a5a91cfd85c6c1fe089da6f30d68cd759 Mon Sep 17 00:00:00 2001 From: pedrohsdb Date: Tue, 9 Dec 2025 09:37:30 -0800 Subject: [PATCH] fix: add speculative metadata support to non-router LLM handler (#4241) --- .../forge/sdk/api/llm/api_handler_factory.py | 54 +++++++++++++------ 1 file changed, 38 insertions(+), 16 deletions(-) diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index db3ece1e..dd22f317 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -988,7 +988,7 @@ class LLMAPIHandlerFactory: _log_vertex_cache_hit_if_needed(context, prompt_name, model_name, cached_tokens) - if step: + if step and not is_speculative_step: await app.DATABASE.update_step( task_id=step.task_id, step_id=step.step_id, @@ -1010,28 +1010,33 @@ class LLMAPIHandlerFactory: thought_cost=llm_cost, ) parsed_response = parse_api_response(response, llm_config.add_assistant_prefix, force_dict) - await app.ARTIFACT_MANAGER.create_llm_artifact( - data=json.dumps(parsed_response, indent=2).encode("utf-8"), - artifact_type=ArtifactType.LLM_RESPONSE_PARSED, - step=step, - task_v2=task_v2, - thought=thought, - ai_suggestion=ai_suggestion, - ) - - if context and len(context.hashed_href_map) > 0: - llm_content = json.dumps(parsed_response) - rendered_content = Template(llm_content).render(context.hashed_href_map) - parsed_response = json.loads(rendered_content) + parsed_response_json = json.dumps(parsed_response, indent=2) + if step and not is_speculative_step: await app.ARTIFACT_MANAGER.create_llm_artifact( - data=json.dumps(parsed_response, indent=2).encode("utf-8"), - artifact_type=ArtifactType.LLM_RESPONSE_RENDERED, + data=parsed_response_json.encode("utf-8"), + artifact_type=ArtifactType.LLM_RESPONSE_PARSED, step=step, task_v2=task_v2, thought=thought, ai_suggestion=ai_suggestion, ) + rendered_response_json = None + if context and len(context.hashed_href_map) > 0: + llm_content = json.dumps(parsed_response) + rendered_content = Template(llm_content).render(context.hashed_href_map) + parsed_response = json.loads(rendered_content) + rendered_response_json = json.dumps(parsed_response, indent=2) + if step and not is_speculative_step: + await app.ARTIFACT_MANAGER.create_llm_artifact( + data=rendered_response_json.encode("utf-8"), + artifact_type=ArtifactType.LLM_RESPONSE_RENDERED, + step=step, + task_v2=task_v2, + thought=thought, + ai_suggestion=ai_suggestion, + ) + # Track LLM API handler duration, token counts, and cost organization_id = organization_id or ( step.organization_id if step else (thought.organization_id if thought else None) @@ -1053,6 +1058,23 @@ class LLMAPIHandlerFactory: llm_cost=llm_cost if llm_cost > 0 else None, ) + if step and is_speculative_step: + step.speculative_llm_metadata = SpeculativeLLMMetadata( + prompt=llm_prompt_value, + llm_request_json=llm_request_json, + llm_response_json=llm_response_json, + parsed_response_json=parsed_response_json, + rendered_response_json=rendered_response_json, + llm_key=llm_key, + model=llm_config.model_name, + duration_seconds=duration_seconds, + input_tokens=prompt_tokens if prompt_tokens > 0 else None, + output_tokens=completion_tokens if completion_tokens > 0 else None, + reasoning_tokens=reasoning_tokens if reasoning_tokens > 0 else None, + cached_tokens=cached_tokens if cached_tokens > 0 else None, + llm_cost=llm_cost if llm_cost > 0 else None, + ) + return parsed_response llm_api_handler.llm_key = llm_key # type: ignore[attr-defined]