From 7c189818d90a55eaa1ae2de7da10ff423434fdb4 Mon Sep 17 00:00:00 2001
From: LawyZheng
Date: Sat, 22 Nov 2025 10:36:43 +0800
Subject: [PATCH] allow extract result to be non dict (#4069)

---
 skyvern/forge/sdk/api/llm/api_handler_factory.py | 15 +++++++++------
 skyvern/forge/sdk/api/llm/models.py              |  6 ++++--
 skyvern/forge/sdk/api/llm/utils.py               | 10 +++++++++-
 skyvern/forge/sdk/workflow/models/block.py       |  8 ++++++--
 skyvern/webeye/actions/handler.py                |  1 +
 5 files changed, 29 insertions(+), 11 deletions(-)

diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py
index 54c0329d..3bb94ffc 100644
--- a/skyvern/forge/sdk/api/llm/api_handler_factory.py
+++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py
@@ -251,7 +251,8 @@ class LLMAPIHandlerFactory:
             use_message_history: bool = False,
             raw_response: bool = False,
             window_dimension: Resolution | None = None,
-        ) -> dict[str, Any]:
+            force_dict: bool = True,
+        ) -> dict[str, Any] | Any:
             """
             Custom LLM API handler that utilizes the LiteLLM router and fallbacks to OpenAI GPT-4 Vision.
 
@@ -482,7 +483,7 @@ class LLMAPIHandlerFactory:
                 reasoning_token_count=reasoning_tokens if reasoning_tokens > 0 else None,
                 cached_token_count=cached_tokens if cached_tokens > 0 else None,
             )
-            parsed_response = parse_api_response(response, llm_config.add_assistant_prefix)
+            parsed_response = parse_api_response(response, llm_config.add_assistant_prefix, force_dict)
             parsed_response_json = json.dumps(parsed_response, indent=2)
             if step and not is_speculative_step:
                 await app.ARTIFACT_MANAGER.create_llm_artifact(
@@ -585,7 +586,8 @@ class LLMAPIHandlerFactory:
             use_message_history: bool = False,
             raw_response: bool = False,
             window_dimension: Resolution | None = None,
-        ) -> dict[str, Any]:
+            force_dict: bool = True,
+        ) -> dict[str, Any] | Any:
             start_time = time.time()
             active_parameters = base_parameters or {}
             if parameters is None:
@@ -816,7 +818,7 @@ class LLMAPIHandlerFactory:
                 cached_token_count=cached_tokens if cached_tokens > 0 else None,
                 thought_cost=llm_cost,
             )
-            parsed_response = parse_api_response(response, llm_config.add_assistant_prefix)
+            parsed_response = parse_api_response(response, llm_config.add_assistant_prefix, force_dict)
             await app.ARTIFACT_MANAGER.create_llm_artifact(
                 data=json.dumps(parsed_response, indent=2).encode("utf-8"),
                 artifact_type=ArtifactType.LLM_RESPONSE_PARSED,
@@ -957,8 +959,9 @@ class LLMCaller:
         use_message_history: bool = False,
         raw_response: bool = False,
         window_dimension: Resolution | None = None,
+        force_dict: bool = True,
         **extra_parameters: Any,
-    ) -> dict[str, Any]:
+    ) -> dict[str, Any] | Any:
         start_time = time.perf_counter()
         active_parameters = self.base_parameters or {}
         if parameters is None:
@@ -1140,7 +1143,7 @@ class LLMCaller:
 
         if raw_response:
             return response.model_dump(exclude_none=True)
-        parsed_response = parse_api_response(response, self.llm_config.add_assistant_prefix)
+        parsed_response = parse_api_response(response, self.llm_config.add_assistant_prefix, force_dict)
         parsed_response_json = json.dumps(parsed_response, indent=2)
         if step and not is_speculative_step:
             await app.ARTIFACT_MANAGER.create_llm_artifact(
diff --git a/skyvern/forge/sdk/api/llm/models.py b/skyvern/forge/sdk/api/llm/models.py
index 655a1e99..1e22554d 100644
--- a/skyvern/forge/sdk/api/llm/models.py
+++ b/skyvern/forge/sdk/api/llm/models.py
@@ -101,7 +101,8 @@ class LLMAPIHandler(Protocol):
         use_message_history: bool = False,
         raw_response: bool = False,
         window_dimension: Resolution | None = None,
-    ) -> Awaitable[dict[str, Any]]: ...
+        force_dict: bool = True,
+    ) -> Awaitable[dict[str, Any] | Any]: ...
 
 
 async def dummy_llm_api_handler(
@@ -118,5 +119,6 @@ async def dummy_llm_api_handler(
     use_message_history: bool = False,
     raw_response: bool = False,
     window_dimension: Resolution | None = None,
-) -> dict[str, Any]:
+    force_dict: bool = True,
+) -> dict[str, Any] | Any:
     raise NotImplementedError("Your LLM provider is not configured. Please configure it in the .env file.")
diff --git a/skyvern/forge/sdk/api/llm/utils.py b/skyvern/forge/sdk/api/llm/utils.py
index 62b9b848..df6f3aff 100644
--- a/skyvern/forge/sdk/api/llm/utils.py
+++ b/skyvern/forge/sdk/api/llm/utils.py
@@ -165,7 +165,9 @@ def _coerce_response_to_dict(response: Any) -> dict[str, Any]:
     raise InvalidLLMResponseType(type(response).__name__)
 
 
-def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bool = False) -> dict[str, Any]:
+def parse_api_response(
+    response: litellm.ModelResponse, add_assistant_prefix: bool = False, force_dict: bool = True
+) -> dict[str, Any] | Any:
     content = None
     try:
         content = response.choices[0].message.content
@@ -174,6 +176,8 @@ def parse_api_response(response: litellm.ModelResponse, add_assistant_prefix: bo
             content = "{" + content
 
         parsed = json_repair.loads(content)
+        if not force_dict:
+            return parsed
         return _coerce_response_to_dict(parsed)
 
     except Exception:
@@ -186,6 +190,8 @@
                 raise EmptyLLMResponseError(str(response))
             content = _try_to_extract_json_from_markdown_format(content)
             parsed = commentjson.loads(content)
+            if not force_dict:
+                return parsed
             return _coerce_response_to_dict(parsed)
         except Exception as e:
             if content:
@@ -196,6 +202,8 @@
                 )
                 try:
                     parsed = _fix_and_parse_json_string(content)
+                    if not force_dict:
+                        return parsed
                     return _coerce_response_to_dict(parsed)
                 except Exception as e2:
                     LOG.exception("Failed to auto-fix LLM response.", error=str(e2))
diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py
index d358fff5..7d83ce25 100644
--- a/skyvern/forge/sdk/workflow/models/block.py
+++ b/skyvern/forge/sdk/workflow/models/block.py
@@ -2873,7 +2873,9 @@ class FileParserBlock(Block):
             self.override_llm_key, default=app.LLM_API_HANDLER
         )
 
-        llm_response = await llm_api_handler(prompt=llm_prompt, prompt_name="extract-information-from-file-text")
+        llm_response = await llm_api_handler(
+            prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False
+        )
         return llm_response
 
     async def execute(
@@ -3088,7 +3090,9 @@ class PDFParserBlock(Block):
         llm_prompt = prompt_engine.load_prompt(
             "extract-information-from-file-text", extracted_text_content=extracted_text, json_schema=self.json_schema
         )
-        llm_response = await app.LLM_API_HANDLER(prompt=llm_prompt, prompt_name="extract-information-from-file-text")
+        llm_response = await app.LLM_API_HANDLER(
+            prompt=llm_prompt, prompt_name="extract-information-from-file-text", force_dict=False
+        )
         # Record the parsed data
         await self.record_output_parameter_value(workflow_run_context, workflow_run_id, llm_response)
         return await self.build_block_result(
diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py
index 8d57ef8c..f70fb6d6 100644
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -3787,6 +3787,7 @@ async def extract_information_for_navigation_goal(
         step=step,
         screenshots=scraped_page.screenshots,
         prompt_name="extract-information",
+        force_dict=False,
     )
     return ScrapeResult(
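
Usage note: force_dict defaults to True everywhere, so existing callers keep
receiving a dict; only the extraction paths above opt out with force_dict=False,
letting parse_api_response return the repaired JSON as-is (list, string, number,
or dict) instead of coercing it through _coerce_response_to_dict. A minimal
sketch of the resulting call pattern, assuming a configured Skyvern app context;
the extract_items() wrapper and its prompt argument are hypothetical, not part
of this patch:

    from typing import Any

    from skyvern.forge import app  # app context as referenced in the diff (app.LLM_API_HANDLER)

    async def extract_items(llm_prompt: str) -> Any:
        # With force_dict=False the parsed JSON is passed through unchanged,
        # so a schema whose top level is an array comes back as a Python list.
        llm_response = await app.LLM_API_HANDLER(
            prompt=llm_prompt,
            prompt_name="extract-information-from-file-text",
            force_dict=False,
        )
        if not isinstance(llm_response, dict):
            ...  # e.g. handle a top-level JSON array (new behavior)
        return llm_response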