diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 15425844..8b8c88ea 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -132,6 +132,10 @@ from skyvern.webeye.utils.page import SkyvernFrame LOG = structlog.get_logger() +EXTRACT_ACTION_TEMPLATE = "extract-action" +EXTRACT_ACTION_PROMPT_NAME = "extract-actions" +EXTRACT_ACTION_CACHE_KEY_PREFIX = f"{EXTRACT_ACTION_TEMPLATE}-static" + class ActionLinkedNode: def __init__(self, action: Action) -> None: @@ -2272,7 +2276,9 @@ class ForgeAgent: return scraped_page, extract_action_prompt, use_caching - async def _create_vertex_cache_for_task(self, task: Task, static_prompt: str, context: SkyvernContext) -> None: + async def _create_vertex_cache_for_task( + self, task: Task, static_prompt: str, context: SkyvernContext, llm_key_override: str | None + ) -> None: """ Create a Vertex AI cache for the task's static prompt. @@ -2285,7 +2291,9 @@ class ForgeAgent: """ # Early return if task doesn't have an llm_key # This should not happen given the guard at the call site, but being defensive - if not task.llm_key: + resolved_llm_key = llm_key_override or task.llm_key + + if not resolved_llm_key: LOG.warning( "Cannot create Vertex AI cache without llm_key, skipping cache creation", task_id=task.task_id, @@ -2293,18 +2301,23 @@ class ForgeAgent: return try: + LOG.info( + "Attempting Vertex AI cache creation", + task_id=task.task_id, + llm_key=resolved_llm_key, + ) cache_manager = get_cache_manager() # Use llm_key as cache_key so all tasks with the same model share the same cache # This maximizes cache reuse and reduces cache storage costs - cache_key = f"extract-action-static-{task.llm_key}" + cache_key = f"{EXTRACT_ACTION_CACHE_KEY_PREFIX}-{resolved_llm_key}" # Get the actual model name from LLM config to ensure correct format # (e.g., "gemini-2.5-flash" with decimal, not "gemini-2-5-flash") model_name = "gemini-2.5-flash" # Default try: - llm_config = LLMConfigRegistry.get_config(task.llm_key) + llm_config = LLMConfigRegistry.get_config(resolved_llm_key) extracted_name = None # Try to extract from model_name if it contains "vertex_ai/" or starts with "gemini-" @@ -2328,13 +2341,13 @@ class ForgeAgent: if not extracted_name: # Extract version from llm_key (e.g., VERTEX_GEMINI_1_5_FLASH -> "1_5" or VERTEX_GEMINI_2.5_FLASH -> "2.5") # Pattern: GEMINI_{version}_{flavor} where version can use dots, underscores, or dashes - version_match = re.search(r"GEMINI[_-](\d+[._-]\d+)", task.llm_key, re.IGNORECASE) + version_match = re.search(r"GEMINI[_-](\d+[._-]\d+)", resolved_llm_key, re.IGNORECASE) version = version_match.group(1).replace("_", ".").replace("-", ".") if version_match else "2.5" # Determine flavor - if "_PRO_" in task.llm_key or task.llm_key.endswith("_PRO"): + if "_PRO_" in resolved_llm_key or resolved_llm_key.endswith("_PRO"): extracted_name = f"gemini-{version}-pro" - elif "_FLASH_LITE_" in task.llm_key or task.llm_key.endswith("_FLASH_LITE"): + elif "_FLASH_LITE_" in resolved_llm_key or resolved_llm_key.endswith("_FLASH_LITE"): extracted_name = f"gemini-{version}-flash-lite" else: # Default to flash flavor @@ -2345,6 +2358,11 @@ class ForgeAgent: except Exception as e: LOG.debug("Failed to extract model name from config, using default", error=str(e)) + # Normalize model name to the canonical Vertex identifier (e.g., gemini-2.5-pro) + match = re.search(r"(gemini-\d+(?:\.\d+)?-(?:flash-lite|flash|pro))", model_name, re.IGNORECASE) + if match: + model_name = match.group(1).lower() + # Create cache for this task # Use asyncio.to_thread to offload blocking HTTP request (requests.post) # This prevents freezing the event loop during cache creation @@ -2395,11 +2413,12 @@ class ForgeAgent: final_navigation_payload = self._build_navigation_payload( task, expire_verification_code=expire_verification_code, step=step, scraped_page=scraped_page ) + navigation_payload_str = json.dumps(final_navigation_payload) task_type = task.task_type if task.task_type else TaskType.general template = "" if task_type == TaskType.general: - template = "extract-action" + template = EXTRACT_ACTION_TEMPLATE elif task_type == TaskType.validation: template = "decisive-criterion-validate" elif task_type == TaskType.action: @@ -2438,43 +2457,72 @@ class ForgeAgent: context = skyvern_context.ensure_context() + # Reset cached prompt by default; we will set it below if caching is enabled. + context.cached_static_prompt = None + # Check if prompt caching is enabled for extract-action use_caching = False - if ( - template == "extract-action" - and LLMAPIHandlerFactory._prompt_caching_settings - and LLMAPIHandlerFactory._prompt_caching_settings.get("extract-action", False) - ): + prompt_caching_settings = LLMAPIHandlerFactory._prompt_caching_settings or {} + effective_llm_key = task.llm_key + if not effective_llm_key: + handler_for_key = LLMAPIHandlerFactory.get_override_llm_api_handler( + task.llm_key, default=app.LLM_API_HANDLER + ) + effective_llm_key = getattr(handler_for_key, "llm_key", None) + cache_enabled = prompt_caching_settings.get(EXTRACT_ACTION_PROMPT_NAME) or prompt_caching_settings.get( + EXTRACT_ACTION_TEMPLATE + ) + LOG.info( + "Extract-action prompt caching evaluation", + template=template, + cache_enabled=cache_enabled, + prompt_caching_settings=prompt_caching_settings, + task_llm_key=task.llm_key, + effective_llm_key=effective_llm_key, + ) + if template == EXTRACT_ACTION_TEMPLATE and cache_enabled: try: # Try to load split templates for caching - static_prompt = prompt_engine.load_prompt(f"{template}-static") - dynamic_prompt = prompt_engine.load_prompt( - f"{template}-dynamic", - navigation_goal=navigation_goal, - navigation_payload_str=json.dumps(final_navigation_payload), - starting_url=starting_url, - current_url=current_url, - data_extraction_goal=task.data_extraction_goal, - action_history=actions_and_results_str, - error_code_mapping_str=(json.dumps(task.error_code_mapping) if task.error_code_mapping else None), - local_datetime=datetime.now(context.tz_info).isoformat(), - verification_code_check=verification_code_check, - complete_criterion=task.complete_criterion.strip() if task.complete_criterion else None, - terminate_criterion=task.terminate_criterion.strip() if task.terminate_criterion else None, - parse_select_feature_enabled=context.enable_parse_select_in_extract, - has_magic_link_page=context.has_magic_link_page(task.task_id), - ) + prompt_kwargs = { + "navigation_goal": navigation_goal, + "navigation_payload_str": navigation_payload_str, + "starting_url": starting_url, + "current_url": current_url, + "data_extraction_goal": task.data_extraction_goal, + "action_history": actions_and_results_str, + "error_code_mapping_str": ( + json.dumps(task.error_code_mapping) if task.error_code_mapping else None + ), + "local_datetime": datetime.now(context.tz_info).isoformat(), + "verification_code_check": verification_code_check, + "complete_criterion": task.complete_criterion.strip() if task.complete_criterion else None, + "terminate_criterion": task.terminate_criterion.strip() if task.terminate_criterion else None, + "parse_select_feature_enabled": context.enable_parse_select_in_extract, + "has_magic_link_page": context.has_magic_link_page(task.task_id), + } + static_prompt = prompt_engine.load_prompt(f"{template}-static", **prompt_kwargs) + dynamic_prompt = prompt_engine.load_prompt(f"{template}-dynamic", **prompt_kwargs) - # Store static prompt for caching and return dynamic prompt + # Store static prompt for caching and continue sending it alongside the dynamic section. + # Vertex explicit caching expects the static content to still be present in the request so the + # first call succeeds even if the cache is cold. The cached reference simply lets the service + # reuse the static portion internally. context.cached_static_prompt = static_prompt + context.use_prompt_caching = True use_caching = True # Create Vertex AI cache for Gemini models - if task.llm_key and "GEMINI" in task.llm_key: - await self._create_vertex_cache_for_task(task, static_prompt, context) + if effective_llm_key and "GEMINI" in effective_llm_key: + await self._create_vertex_cache_for_task(task, static_prompt, context, effective_llm_key) - LOG.info("Using cached prompt for extract-action", task_id=task.task_id) - return dynamic_prompt, use_caching + combined_prompt = f"{static_prompt.rstrip()}\n\n{dynamic_prompt.lstrip()}" + + LOG.info( + "Using cached prompt", + task_id=task.task_id, + prompt_name=EXTRACT_ACTION_PROMPT_NAME, + ) + return combined_prompt, use_caching except Exception as e: LOG.warning("Failed to load cached prompt templates, falling back to original", error=str(e)) @@ -2486,7 +2534,7 @@ class ForgeAgent: prompt_engine=prompt_engine, template_name=template, navigation_goal=navigation_goal, - navigation_payload_str=json.dumps(final_navigation_payload), + navigation_payload_str=navigation_payload_str, starting_url=starting_url, current_url=current_url, data_extraction_goal=task.data_extraction_goal, diff --git a/skyvern/forge/prompts/skyvern/extract-action-static.j2 b/skyvern/forge/prompts/skyvern/extract-action-static.j2 index c2a18afe..d41296d9 100644 --- a/skyvern/forge/prompts/skyvern/extract-action-static.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action-static.j2 @@ -18,7 +18,7 @@ Reply in JSON format with the following keys: "user_detail_query": str, // Think of this value as a Jeopardy question and the intention behind the action. Ask the user for the details you need for executing this action. Ask the question even if the details are disclosed in user goal or user details. If it's a text field, ask for the text. If it's a file upload, ask for the file. If it's a dropdown, ask for the relevant information. If you are clicking on something specific, ask about what the intention is behind the click and what to click on. If you're downloading a file and you have multiple options, ask the user which one to download. Examples are: "What product ID should I input into the search bar?", "What file should I upload?", "What is the previous insurance provider of the user?", "Which invoice should I download?", "Does the user have any pets?". If the action doesn't require any user details, describe the intention behind the action. "user_detail_answer": str, // The answer to the `user_detail_query`. The source of this answer can be user goal or user details. "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence - "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned. + "action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{ "complete criterion has been met" if complete_criterion else "user goal has been achieved" }} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. The id has to be one from the elements list "text": str, // Text for INPUT_TEXT action only "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. @@ -44,7 +44,8 @@ Reply in JSON format with the following keys: }],{% if verification_code_check %} "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet. "place_to_enter_verification_code": bool, // Whether there is a place on the current page to enter the verification code now. - "should_enter_verification_code": bool // Whether the user should proceed to enter the verification code {% endif %} + "should_enter_verification_code": bool, // Whether the user should proceed to enter the verification code. + "should_verify_by_magic_link": bool // Whether the page instructs the user to check their email for a magic link to verify the login.{% endif %} } Consider the action history from the last step and the screenshot together, if actions from the last step don't yield positive impact, try other actions or other action combinations. diff --git a/skyvern/forge/sdk/api/llm/api_handler_factory.py b/skyvern/forge/sdk/api/llm/api_handler_factory.py index 8860a2e2..0b5fe5bb 100644 --- a/skyvern/forge/sdk/api/llm/api_handler_factory.py +++ b/skyvern/forge/sdk/api/llm/api_handler_factory.py @@ -37,6 +37,8 @@ from skyvern.utils.image_resizer import Resolution, get_resize_target_dimension, LOG = structlog.get_logger() +EXTRACT_ACTION_PROMPT_NAME = "extract-actions" + class LLMCallStats(BaseModel): input_tokens: int | None = None @@ -313,12 +315,28 @@ class LLMAPIHandlerFactory: except Exception as e: LOG.warning("Failed to apply context caching system message", error=str(e), exc_info=True) + vertex_cache_attached = False + cache_resource_name = getattr(context, "vertex_cache_name", None) + if ( + cache_resource_name + and prompt_name == EXTRACT_ACTION_PROMPT_NAME + and getattr(context, "use_prompt_caching", False) + ): + parameters = {**parameters, "cached_content": cache_resource_name} + vertex_cache_attached = True + LOG.info( + "Adding Vertex AI cache reference to router request", + prompt_name=prompt_name, + cache_attached=True, + ) + await app.ARTIFACT_MANAGER.create_llm_artifact( data=json.dumps( { "model": llm_key, "messages": messages, **parameters, + "vertex_cache_attached": vertex_cache_attached, } ).encode("utf-8"), artifact_type=ArtifactType.LLM_REQUEST, @@ -473,6 +491,7 @@ class LLMAPIHandlerFactory: return parsed_response + llm_api_handler_with_router_and_fallback.llm_key = llm_key # type: ignore[attr-defined] return llm_api_handler_with_router_and_fallback @staticmethod @@ -592,10 +611,15 @@ class LLMAPIHandlerFactory: # Add Vertex AI cache reference only for the intended cached prompt vertex_cache_attached = False cache_resource_name = getattr(context, "vertex_cache_name", None) + LOG.info( + "Vertex cache attachment check", + cache_resource_name=cache_resource_name, + prompt_name=prompt_name, + use_prompt_caching=getattr(context, "use_prompt_caching", None) if context else None, + ) if ( cache_resource_name - and "vertex_ai/" in model_name - and prompt_name == "extract-actions" + and prompt_name == EXTRACT_ACTION_PROMPT_NAME and getattr(context, "use_prompt_caching", False) ): active_parameters["cached_content"] = cache_resource_name @@ -779,6 +803,7 @@ class LLMAPIHandlerFactory: return parsed_response + llm_api_handler.llm_key = llm_key # type: ignore[attr-defined] return llm_api_handler @staticmethod