diff --git a/skyvern/core/script_generations/real_skyvern_page_ai.py b/skyvern/core/script_generations/real_skyvern_page_ai.py index 3e7edd31..ff475f4b 100644 --- a/skyvern/core/script_generations/real_skyvern_page_ai.py +++ b/skyvern/core/script_generations/real_skyvern_page_ai.py @@ -2,7 +2,7 @@ from __future__ import annotations import json from datetime import datetime, timezone -from typing import Any +from typing import Any, cast import structlog from jinja2.sandbox import SandboxedEnvironment @@ -35,6 +35,9 @@ jinja_sandbox_env = SandboxedEnvironment() LOG = structlog.get_logger() +INPUT_GOAL = """- The intention to fill out an input: {intention}. +- The overall goal that the user wants to achieve: {prompt}.""" + SELECT_OPTION_GOAL = """- The intention to select an option: {intention}. - The overall goal that the user wants to achieve: {prompt}.""" @@ -183,7 +186,7 @@ class RealSkyvernPageAi(SkyvernPageAi): async def ai_input_text( self, - selector: str, + selector: str | None, value: str, intention: str, data: str | dict[str, Any] | None = None, @@ -193,19 +196,20 @@ class RealSkyvernPageAi(SkyvernPageAi): ) -> str: """Input text into an element using AI to determine the value.""" - context = skyvern_context.current() + context = skyvern_context.ensure_context() value = value or "" transformed_value = value - element_id: str | None = None - organization_id = context.organization_id if context else None - task_id = context.task_id if context else None - step_id = context.step_id if context else None - workflow_run_id = context.workflow_run_id if context else None + action: InputTextAction | None = None + organization_id = context.organization_id + task_id = context.task_id + step_id = context.step_id + workflow_run_id = context.workflow_run_id task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None + if intention: try: - prompt = context.prompt if context else None + prompt = context.prompt data = data or {} if (totp_identifier or totp_url) and context and organization_id and task_id: if totp_identifier: @@ -232,40 +236,72 @@ class RealSkyvernPageAi(SkyvernPageAi): refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots() self.scraped_page = refreshed_page - # get the element_id by the selector - element_id = await _get_element_id_by_selector(selector, self.page) - script_generation_input_text_prompt = prompt_engine.load_prompt( - template="script-generation-input-text-generatiion", - intention=intention, - goal=prompt, - data=data, - ) - json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER( - prompt=script_generation_input_text_prompt, - prompt_name="script-generation-input-text-generatiion", - step=step, - organization_id=organization_id, - ) - value = json_response.get("answer", value) + + # Try to get element_id from selector if selector is provided + element_id = await _get_element_id_by_selector(selector, self.page) if selector else None + + if element_id: + # The selector/element is valid, using a simpler/smaller prompt + script_generation_input_text_prompt = prompt_engine.load_prompt( + template="script-generation-input-text-generatiion", + intention=intention, + goal=prompt, + data=data, + ) + json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER( + prompt=script_generation_input_text_prompt, + prompt_name="script-generation-input-text-generatiion", + step=step, + organization_id=organization_id, + ) + value = json_response.get("answer", value) + + if context and context.workflow_run_id: + transformed_value = await _get_actual_value_of_parameter_if_secret( + context.workflow_run_id, str(value) + ) + action = InputTextAction( + element_id=element_id, + text=value, + status=ActionStatus.pending, + organization_id=organization_id, + workflow_run_id=workflow_run_id, + task_id=task_id, + step_id=context.step_id if context else None, + reasoning=intention, + intention=intention, + response=value, + ) + else: + # Use a heavier single-input-action when selector is not found + element_tree = refreshed_page.build_element_tree() + payload_str = _get_context_data(data) + merged_goal = INPUT_GOAL.format(intention=intention, prompt=prompt) + + single_input_prompt = prompt_engine.load_prompt( + template="single-input-action", + navigation_goal=merged_goal, + navigation_payload_str=payload_str, + current_url=self.page.url, + elements=element_tree, + local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(), + ) + json_response = await app.SINGLE_INPUT_AGENT_LLM_API_HANDLER( + prompt=single_input_prompt, + prompt_name="single-input-action", + step=step, + organization_id=organization_id, + ) + + actions_json = json_response.get("actions", []) + if actions_json and task and step: + actions = parse_actions(task, step.step_id, step.order, refreshed_page, actions_json) + if actions and isinstance(actions[0], InputTextAction): + action = cast(InputTextAction, actions[0]) except Exception: LOG.exception(f"Failed to adapt value for input text action on selector={selector}, value={value}") - if context and context.workflow_run_id: - transformed_value = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, str(value)) - - if element_id and organization_id and task and step: - action = InputTextAction( - element_id=element_id, - text=value, - status=ActionStatus.pending, - organization_id=organization_id, - workflow_run_id=workflow_run_id, - task_id=task_id, - step_id=context.step_id if context else None, - reasoning=intention, - intention=intention, - response=value, - ) + if action and organization_id and task and step: result = await handle_input_text_action(action, self.page, self.scraped_page, task, step) if result and result[-1].success is False: raise Exception(result[-1].exception_message) diff --git a/skyvern/core/script_generations/skyvern_page.py b/skyvern/core/script_generations/skyvern_page.py index b3512571..6d82650f 100644 --- a/skyvern/core/script_generations/skyvern_page.py +++ b/skyvern/core/script_generations/skyvern_page.py @@ -502,7 +502,7 @@ class SkyvernPage: @action_wrap(ActionType.INPUT_TEXT) async def fill( self, - selector: str, + selector: str | None, value: str, ai: str | None = "fallback", intention: str | None = None, @@ -525,7 +525,7 @@ class SkyvernPage: @action_wrap(ActionType.INPUT_TEXT) async def type( self, - selector: str, + selector: str | None, value: str, ai: str | None = "fallback", intention: str | None = None, @@ -547,7 +547,7 @@ class SkyvernPage: async def _input_text( self, - selector: str, + selector: str | None, value: str, ai: str | None = "fallback", intention: str | None = None, @@ -569,15 +569,17 @@ class SkyvernPage: context = skyvern_context.current() if context and context.ai_mode_override: ai = context.ai_mode_override + # format the text with the actual value of the parameter if it's a secret when running a workflow if ai == "fallback": error_to_raise = None - try: - locator = self.page.locator(selector) - await handler_utils.input_sequentially(locator, value, timeout=timeout) - return value - except Exception as e: - error_to_raise = e + if selector: + try: + locator = self.page.locator(selector) + await handler_utils.input_sequentially(locator, value, timeout=timeout) + return value + except Exception as e: + error_to_raise = e if intention: return await self._ai.ai_input_text( @@ -603,6 +605,10 @@ class SkyvernPage: totp_url=totp_url, timeout=timeout, ) + + if not selector: + raise ValueError("Selector is required but was not provided") + locator = self.page.locator(selector) await handler_utils.input_sequentially(locator, value, timeout=timeout) return value diff --git a/skyvern/core/script_generations/skyvern_page_ai.py b/skyvern/core/script_generations/skyvern_page_ai.py index 1b8c7c61..4a31a125 100644 --- a/skyvern/core/script_generations/skyvern_page_ai.py +++ b/skyvern/core/script_generations/skyvern_page_ai.py @@ -20,7 +20,7 @@ class SkyvernPageAi(Protocol): async def ai_input_text( self, - selector: str, + selector: str | None, value: str, intention: str, data: str | dict[str, Any] | None = None, diff --git a/skyvern/forge/sdk/schemas/sdk_actions.py b/skyvern/forge/sdk/schemas/sdk_actions.py index cdf3b302..3f835ef3 100644 --- a/skyvern/forge/sdk/schemas/sdk_actions.py +++ b/skyvern/forge/sdk/schemas/sdk_actions.py @@ -37,7 +37,7 @@ class InputTextAction(SdkActionBase): """Input text action parameters.""" type: Literal["ai_input_text"] = "ai_input_text" - selector: str = Field(default="", description="CSS selector for the element") + selector: str | None = Field(default="", description="CSS selector for the element") value: str = Field(default="", description="Value to input") intention: str = Field(default="", description="The intention or goal of the input") data: str | dict[str, Any] | None = Field(None, description="Additional context data") diff --git a/skyvern/library/SdkSkyvernPageAi.py b/skyvern/library/SdkSkyvernPageAi.py index 8c766b7c..faa8dc63 100644 --- a/skyvern/library/SdkSkyvernPageAi.py +++ b/skyvern/library/SdkSkyvernPageAi.py @@ -47,7 +47,7 @@ class SdkSkyvernPageAi(SkyvernPageAi): async def ai_input_text( self, - selector: str, + selector: str | None, value: str, intention: str, data: str | dict[str, Any] | None = None, diff --git a/skyvern/library/skyvern_browser_page.py b/skyvern/library/skyvern_browser_page.py index 0e516483..35667b2e 100644 --- a/skyvern/library/skyvern_browser_page.py +++ b/skyvern/library/skyvern_browser_page.py @@ -408,7 +408,7 @@ class SkyvernBrowserPage: ``` """ return await self._input_text( - selector=selector or "", + selector=selector, value=value or "", ai=ai, intention=prompt, @@ -698,7 +698,7 @@ class SkyvernBrowserPage: async def _input_text( self, - selector: str, + selector: str | None, value: str, ai: str | None = "fallback", intention: str | None = None, @@ -721,12 +721,13 @@ class SkyvernBrowserPage: # format the text with the actual value of the parameter if it's a secret when running a workflow if ai == "fallback": error_to_raise = None - try: - locator = self._page.locator(selector) - await handler_utils.input_sequentially(locator, value, timeout=timeout) - return value - except Exception as e: - error_to_raise = e + if selector: + try: + locator = self._page.locator(selector) + await handler_utils.input_sequentially(locator, value, timeout=timeout) + return value + except Exception as e: + error_to_raise = e if intention: return await self._ai.ai_input_text( @@ -752,6 +753,10 @@ class SkyvernBrowserPage: totp_url=totp_url, timeout=timeout, ) + + if not selector: + raise ValueError("Selector is required but was not provided") + locator = self._page.locator(selector) await handler_utils.input_sequentially(locator, value, timeout=timeout) return value