SkyvernPage actions: Support selector and xpath at the same time. Migrate generate script to use click(selector=f"xpath={xpath}") (#3729)

2025-10-15 19:50:44 -07:00
parent 3f7b2c2fdd
commit ea2e73f8cc
3 changed files with 318 additions and 106 deletions
--- a/skyvern/core/script_generations/generate_script.py
+++ b/skyvern/core/script_generations/generate_script.py
@@ -27,6 +27,7 @@ from skyvern.schemas.workflows import FileStorageType
 from skyvern.webeye.actions.action_types import ActionType
 LOG = structlog.get_logger(__name__)
 GENERATE_CODE_AI_MODE = "proactive"
 # --------------------------------------------------------------------- #
@@ -227,7 +228,7 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
    """
    Turn one Action dict into:
-        await page.<method>(xpath=..., intention=..., data=context.parameters)
+        await page.<method>(selector=..., intention=..., data=context.parameters)
    Or if assign_to_output is True for extract actions:
@@ -239,8 +240,8 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
    if method in ACTIONS_WITH_XPATH:
        args.append(
            cst.Arg(
-                keyword=cst.Name("xpath"),
+                keyword=cst.Name("selector"),
-                value=_value(act["xpath"]),
+                value=_value(f"xpath={act['xpath']}"),
                whitespace_after_arg=cst.ParenthesizedWhitespace(
                    indent=True,
                    last_line=cst.SimpleWhitespace(INDENT),
@@ -248,7 +249,18 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
            )
        )
-    if method in ["type", "fill"]:
+    if method == "click":
        args.append(
            cst.Arg(
                keyword=cst.Name("ai"),
                value=_value(GENERATE_CODE_AI_MODE),
                whitespace_after_arg=cst.ParenthesizedWhitespace(
                    indent=True,
                    last_line=cst.SimpleWhitespace(INDENT),
                ),
            )
        )
    elif method in ["type", "fill"]:
        # Use context.parameters if field_name is available, otherwise fallback to direct value
        if act.get("field_name"):
            text_value = cst.Subscript(
@@ -273,8 +285,8 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
        )
        args.append(
            cst.Arg(
-                keyword=cst.Name("ai_infer"),
+                keyword=cst.Name("ai"),
-                value=cst.Name("True"),
+                value=_value(GENERATE_CODE_AI_MODE),
                whitespace_after_arg=cst.ParenthesizedWhitespace(
                    indent=True,
                    last_line=cst.SimpleWhitespace(INDENT),
@@ -330,8 +342,8 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
            )
            args.append(
                cst.Arg(
-                    keyword=cst.Name("ai_infer"),
+                    keyword=cst.Name("ai"),
-                    value=cst.Name("True"),
+                    value=_value(GENERATE_CODE_AI_MODE),
                    whitespace_after_arg=cst.ParenthesizedWhitespace(
                        indent=True,
                        last_line=cst.SimpleWhitespace(INDENT),
@@ -361,8 +373,8 @@ def _action_to_stmt(act: dict[str, Any], task: dict[str, Any], assign_to_output:
        )
        args.append(
            cst.Arg(
-                keyword=cst.Name("ai_infer"),
+                keyword=cst.Name("ai"),
-                value=cst.Name("True"),
+                value=_value(GENERATE_CODE_AI_MODE),
                whitespace_after_arg=cst.ParenthesizedWhitespace(
                    indent=True,
                    last_line=cst.SimpleWhitespace(INDENT),
--- a/skyvern/core/script_generations/skyvern_page.py
+++ b/skyvern/core/script_generations/skyvern_page.py
@@ -36,6 +36,7 @@ from skyvern.webeye.actions.actions import (
 )
 from skyvern.webeye.actions.handler import (
    ActionHandler,
    handle_click_action,
    handle_complete_action,
    handle_input_text_action,
    handle_select_option_action,
@@ -72,8 +73,8 @@ class ActionCall:
    error: Exception | None = None  # populated if failed
-async def _get_element_id_by_xpath(xpath: str, page: Page) -> str | None:
+async def _get_element_id_by_selector(selector: str, page: Page) -> str | None:
-    locator = page.locator(f"xpath={xpath}")
+    locator = page.locator(selector)
    element_id = await locator.get_attribute("unique_id")
    return element_id
@@ -353,9 +354,12 @@ class SkyvernPage:
            # Create action record. TODO: store more action fields
            kwargs = kwargs or {}
            # we're using "value" instead of "text" for input text actions interface
-            xpath = kwargs.get("xpath")
+            xpath = None
            if action_type == ActionType.CLICK:
-                xpath = call_result or xpath
+                if isinstance(call_result, str) and "xpath=" in call_result:
                    xpath_split_list = call_result.split("xpath=")
                    if len(xpath_split_list) > 1:
                        xpath = xpath_split_list[1]
            text = None
            select_option = None
            response: str | None = kwargs.get("response")
@@ -470,59 +474,120 @@ class SkyvernPage:
            # If screenshot creation fails, don't block execution
            pass
    async def _ai_click(
        self,
        selector: str,
        intention: str,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
        """Click an element chosen by the ``single-click-action`` prompt.

        The current page is re-scraped, its element tree is sent to the
        single-click LLM handler together with ``intention`` and ``data``, and
        the first parsed action is executed through ``handle_click_action``.
        If any step of that flow raises, yields no usable action, or the
        task/step records cannot be loaded, the method falls back to a plain
        Playwright click on the originally supplied ``selector``.

        Args:
            selector: Selector used for the fallback click.
            intention: Navigation goal passed to the prompt.
            data: Extra payload rendered into the prompt via ``_get_context_data``.
            timeout: Timeout passed to the fallback ``locator.click`` call.

        Returns:
            ``"xpath=<xpath>"`` for the AI-clicked element when the parsed
            action exposes an xpath, otherwise the original ``selector``.

        Raises:
            Whatever the fallback ``locator.click`` raises; errors from the AI
            path are logged and swallowed.
        """
        try:
            # Build the element tree of the current page for the prompt
            context = skyvern_context.ensure_context()
            payload_str = _get_context_data(data)
            refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
            element_tree = refreshed_page.build_element_tree()
            single_click_prompt = prompt_engine.load_prompt(
                template="single-click-action",
                navigation_goal=intention,
                navigation_payload_str=payload_str,
                current_url=self.page.url,
                elements=element_tree,
                local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
                # user_context=getattr(context, "prompt", None),
            )
            json_response = await app.SINGLE_CLICK_AGENT_LLM_API_HANDLER(
                prompt=single_click_prompt,
                prompt_name="single-click-action",
                organization_id=context.organization_id,
            )
            actions_json = json_response.get("actions", [])
            if actions_json:
                # ``context`` is already dereferenced above, so it cannot be None here.
                organization_id = context.organization_id
                task_id = context.task_id
                step_id = context.step_id
                task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None
                step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None
                if organization_id and task and step:
                    # NOTE(review): the prompt is built from ``refreshed_page`` but actions are
                    # parsed/executed against ``self.scraped_page`` — confirm this is intended.
                    actions = parse_actions(task, step.step_id, step.order, self.scraped_page, actions_json)
                    if actions:
                        action = actions[0]
                        result = await handle_click_action(action, self.page, self.scraped_page, task, step)
                        if result and result[-1].success is False:
                            # Handled by the ``except`` below, which triggers the selector fallback.
                            raise Exception(result[-1].exception_message)
                        xpath = action.get_xpath()
                        return f"xpath={xpath}" if xpath else selector
                    LOG.warning(
                        "No click action could be parsed from the AI response. Falling back to original selector",
                        selector=selector,
                        intention=intention,
                    )
        except Exception:
            LOG.exception(
                f"Failed to do ai click. Falling back to original selector={selector}, intention={intention}, data={data}"
            )

        # Fallback: plain click on the caller-supplied selector.
        locator = self.page.locator(selector)
        await locator.click(timeout=timeout)
        return selector
    ######### Public Interfaces #########
    @action_wrap(ActionType.CLICK)
-    async def click(self, xpath: str, intention: str | None = None, data: str | dict[str, Any] | None = None) -> str:
+    async def click(
-        """Click an element identified by ``xpath``.
+        self,
        selector: str,
        intention: str | None = None,
        ai: str | None = "fallback",
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
        """Click an element identified by ``selector``.
        When ``intention`` and ``data`` are provided a new click action is
        generated via the ``single-click-action`` prompt.  The model returns a
-        fresh xpath based on the current DOM and the updated data for this run.
+        fresh "xpath=..." selector based on the current DOM and the updated data for this run.
-        The browser then clicks the element using this newly generated xpath.
+        The browser then clicks the element using this newly generated xpath selector.
        If the prompt generation or parsing fails for any reason we fall back to
-        clicking the originally supplied ``xpath``.
+        clicking the originally supplied ``selector``.
        """
-        new_xpath = xpath
+        if ai == "fallback":
-
+            # try to click the element with the original selector first
-        if intention:
+            error_to_raise = None
            try:
-                # Build the element tree of the current page for the prompt
+                locator = self.page.locator(selector)
-                context = skyvern_context.ensure_context()
+                await locator.click(timeout=timeout)
-                payload_str = _get_context_data(data)
+                return selector
-                refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
+            except Exception as e:
-                element_tree = refreshed_page.build_element_tree()
+                error_to_raise = e
                single_click_prompt = prompt_engine.load_prompt(
                    template="single-click-action",
                    navigation_goal=intention,
                    navigation_payload_str=payload_str,
                    current_url=self.page.url,
                    elements=element_tree,
                    local_datetime=datetime.now(context.tz_info or datetime.now().astimezone().tzinfo).isoformat(),
                    # user_context=getattr(context, "prompt", None),
                )
                json_response = await app.SINGLE_CLICK_AGENT_LLM_API_HANDLER(
                    prompt=single_click_prompt,
                    prompt_name="single-click-action",
                    organization_id=context.organization_id,
                )
                actions = json_response.get("actions", [])
                if actions:
                    new_xpath = actions[0].get("xpath", xpath) or xpath
            except Exception:
                # If anything goes wrong, fall back to the original xpath
                new_xpath = xpath
-        locator = self.page.locator(f"xpath={new_xpath}")
+            # if the original selector doesn't work, try to click the element with the ai generated selector
-        await locator.click(timeout=5000)
+            if intention:
-        return new_xpath
+                return await self._ai_click(
                    selector=selector,
                    intention=intention,
                    data=data,
                    timeout=timeout,
                )
            if error_to_raise:
                raise error_to_raise
            else:
                return selector
        elif ai == "proactive":
            if intention:
                return await self._ai_click(
                    selector=selector,
                    intention=intention,
                    data=data,
                    timeout=timeout,
                )
        locator = self.page.locator(selector)
        await locator.click(timeout=timeout)
        return selector
    @action_wrap(ActionType.INPUT_TEXT)
    async def fill(
        self,
-        xpath: str,
+        selector: str,
        value: str,
-        ai_infer: bool = False,
+        ai: str | None = "fallback",
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
@@ -530,9 +595,9 @@ class SkyvernPage:
        totp_url: str | None = None,
    ) -> str:
        return await self._input_text(
-            xpath=xpath,
+            selector=selector,
            value=value,
-            ai_infer=ai_infer,
+            ai=ai,
            intention=intention,
            data=data,
            timeout=timeout,
@@ -543,9 +608,9 @@ class SkyvernPage:
    @action_wrap(ActionType.INPUT_TEXT)
    async def type(
        self,
-        xpath: str,
+        selector: str,
        value: str,
-        ai_infer: bool = False,
+        ai: str | None = "fallback",
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
@@ -553,9 +618,9 @@ class SkyvernPage:
        totp_url: str | None = None,
    ) -> str:
        return await self._input_text(
-            xpath=xpath,
+            selector=selector,
            value=value,
-            ai_infer=ai_infer,
+            ai=ai,
            intention=intention,
            data=data,
            timeout=timeout,
@@ -563,28 +628,16 @@ class SkyvernPage:
            totp_url=totp_url,
        )
-    async def _input_text(
+    async def _ai_input_text(
        self,
-        xpath: str,
+        selector: str,
        value: str,
-        ai_infer: bool = False,
+        intention: str,
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
        totp_identifier: str | None = None,
        totp_url: str | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
        """Input text into an element identified by ``xpath``.
        When ``intention`` and ``data`` are provided a new input text action is
        generated via the `script-generation-input-text-generatiion` prompt.  The model returns a
        fresh text based on the current DOM and the updated data for this run.
        The browser then inputs the text using this newly generated text.
        If the prompt generation or parsing fails for any reason we fall back to
        inputting the originally supplied ``text``.
        """
        # format the text with the actual value of the parameter if it's a secret when running a workflow
        context = skyvern_context.current()
        value = value or ""
        transformed_value = value
@@ -595,7 +648,7 @@ class SkyvernPage:
        workflow_run_id = context.workflow_run_id if context else None
        task = await app.DATABASE.get_task(task_id, organization_id) if task_id and organization_id else None
        step = await app.DATABASE.get_step(step_id, organization_id) if step_id and organization_id else None
-        if ai_infer and intention:
+        if intention:
            try:
                prompt = context.prompt if context else None
                data = data or {}
@@ -624,8 +677,8 @@ class SkyvernPage:
                refreshed_page = await self.scraped_page.generate_scraped_page_without_screenshots()
                self.scraped_page = refreshed_page
-                # get the element_id by the xpath
+                # get the element_id by the selector
-                element_id = await _get_element_id_by_xpath(xpath, self.page)
+                element_id = await _get_element_id_by_selector(selector, self.page)
                script_generation_input_text_prompt = prompt_engine.load_prompt(
                    template="script-generation-input-text-generatiion",
                    intention=intention,
@@ -639,10 +692,10 @@ class SkyvernPage:
                )
                value = json_response.get("answer", value)
            except Exception:
-                LOG.exception(f"Failed to adapt value for input text action on xpath={xpath}, value={value}")
+                LOG.exception(f"Failed to adapt value for input text action on selector={selector}, value={value}")
        if context and context.workflow_run_id:
-            transformed_value = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, value)
+            transformed_value = await _get_actual_value_of_parameter_if_secret(context.workflow_run_id, str(value))
        if element_id and organization_id and task and step:
            action = InputTextAction(
@@ -661,20 +714,78 @@ class SkyvernPage:
            if result and result[-1].success is False:
                raise Exception(result[-1].exception_message)
        else:
-            locator = self.page.locator(f"xpath={xpath}")
+            locator = self.page.locator(selector)
            await handler_utils.input_sequentially(locator, transformed_value, timeout=timeout)
        return value
-    @action_wrap(ActionType.UPLOAD_FILE)
+    async def _input_text(
    async def upload_file(
        self,
-        xpath: str,
+        selector: str,
-        files: str,
+        value: str,
-        ai_infer: bool = False,
+        ai: str | None = "fallback",
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        totp_identifier: str | None = None,
        totp_url: str | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
-        if ai_infer and intention:
+        """Input text into an element identified by ``selector``.
        When ``intention`` and ``data`` are provided a new input text action is
        generated via the `script-generation-input-text-generation` prompt.  The model returns a
        fresh text based on the current DOM and the updated data for this run.
        The browser then inputs the text using this newly generated text.
        If the prompt generation or parsing fails for any reason we fall back to
        inputting the originally supplied ``text``.
        """
        # format the text with the actual value of the parameter if it's a secret when running a workflow
        if ai == "fallback":
            error_to_raise = None
            try:
                locator = self.page.locator(selector)
                await handler_utils.input_sequentially(locator, value, timeout=timeout)
            except Exception as e:
                error_to_raise = e
            if intention:
                return await self._ai_input_text(
                    selector=selector,
                    value=value,
                    intention=intention,
                    data=data,
                    totp_identifier=totp_identifier,
                    totp_url=totp_url,
                    timeout=timeout,
                )
            if error_to_raise:
                raise error_to_raise
            else:
                return value
        elif ai == "proactive" and intention:
            return await self._ai_input_text(
                selector=selector,
                value=value,
                intention=intention,
                data=data,
                totp_identifier=totp_identifier,
                totp_url=totp_url,
                timeout=timeout,
            )
        locator = self.page.locator(selector)
        await handler_utils.input_sequentially(locator, value, timeout=timeout)
        return value
    async def _ai_upload_file(
        self,
        selector: str,
        files: str,
        file_path: str,
        intention: str,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
        if intention:
            try:
                context = skyvern_context.current()
                prompt = context.prompt if context else None
@@ -692,28 +803,71 @@ class SkyvernPage:
                )
                files = json_response.get("answer", files)
            except Exception:
-                LOG.exception(f"Failed to adapt value for input text action on xpath={xpath}, file={files}")
+                LOG.exception(f"Failed to adapt value for input text action on selector={selector}, file={files}")
-        file_path = await download_file(files)
+        if not files:
-        locator = self.page.locator(f"xpath={xpath}")
+            raise ValueError("file url must be provided")
-        await locator.set_input_files(file_path)
+        locator = self.page.locator(selector)
        await locator.set_input_files(file_path, timeout=timeout)
        return files
-    @action_wrap(ActionType.SELECT_OPTION)
+    @action_wrap(ActionType.UPLOAD_FILE)
-    async def select_option(
+    async def upload_file(
        self,
-        xpath: str,
+        selector: str,
-        value: str,
+        files: str,
-        ai_infer: bool = False,
+        ai: str | None = "fallback",
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
        file_path = await download_file(files)
        if ai == "fallback":
            error_to_raise = None
            try:
                locator = self.page.locator(selector)
                await locator.set_input_files(file_path)
            except Exception as e:
                error_to_raise = e
            if intention:
                return await self._ai_upload_file(
                    selector=selector,
                    files=files,
                    file_path=file_path,
                    intention=intention,
                    data=data,
                    timeout=timeout,
                )
            if error_to_raise:
                raise error_to_raise
            else:
                return files
        elif ai == "proactive" and intention:
            return await self._ai_upload_file(
                selector=selector,
                files=files,
                file_path=file_path,
                intention=intention,
                data=data,
                timeout=timeout,
            )
        locator = self.page.locator(selector)
        await locator.set_input_files(file_path, timeout=timeout)
        return files
    async def _ai_select_option(
        self,
        selector: str,
        value: str,
        intention: str,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
        option_value = value or ""
        context = skyvern_context.current()
        if context and context.task_id and context.step_id and context.organization_id:
            task = await app.DATABASE.get_task(context.task_id, organization_id=context.organization_id)
            step = await app.DATABASE.get_step(context.step_id, organization_id=context.organization_id)
-            if ai_infer and intention and task and step:
+            if intention and task and step:
                try:
                    prompt = context.prompt if context else None
                    # data = _get_context_data(data)
@@ -735,7 +889,9 @@ class SkyvernPage:
                        prompt_name="single-select-action",
                        organization_id=context.organization_id if context else None,
                    )
-                    actions = parse_actions(task, step.step_id, step.order, self.scraped_page, json_response["actions"])
+                    actions = parse_actions(
                        task, step.step_id, step.order, self.scraped_page, json_response.get("actions", [])
                    )
                    if actions:
                        action = actions[0]
                        if not action.option:
@@ -750,15 +906,60 @@ class SkyvernPage:
                        )
                    else:
                        LOG.exception(
-                            f"Failed to parse actions for select option action on xpath={xpath}, value={value}"
+                            f"Failed to parse actions for select option action on selector={selector}, value={value}"
                        )
                except Exception:
-                    LOG.exception(f"Failed to adapt value for select option action on xpath={xpath}, value={value}")
+                    LOG.exception(
                        f"Failed to adapt value for select option action on selector={selector}, value={value}"
                    )
        else:
-            locator = self.page.locator(f"xpath={xpath}")
+            locator = self.page.locator(selector)
            await locator.select_option(option_value, timeout=timeout)
        return option_value
    @action_wrap(ActionType.SELECT_OPTION)
    async def select_option(
        self,
        selector: str,
        value: str | None = None,
        label: str | None = None,
        ai: str | None = "fallback",
        intention: str | None = None,
        data: str | dict[str, Any] | None = None,
        timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> str:
        """Select an option on the element identified by ``selector``.

        Modes, chosen by ``ai``:

        * ``"fallback"`` (default): try the direct Playwright ``select_option``
          first. On success return immediately. On failure, use
          ``_ai_select_option`` when ``intention`` is given, otherwise re-raise
          the original error.
        * ``"proactive"``: go straight to ``_ai_select_option`` when
          ``intention`` is given.
        * anything else (or ``"proactive"`` without ``intention``): direct
          Playwright ``select_option`` only.

        Args:
            selector: Selector of the target element.
            value: Option value to select; ``None`` is treated as ``""``.
            label: Accepted for interface compatibility; currently unused.
            ai: AI mode, see above.
            intention: Goal description forwarded to the AI path.
            data: Extra payload forwarded to the AI path.
            timeout: Timeout passed to the Playwright call / AI path.

        Returns:
            The selected value: ``value`` for the direct path, or whatever
            ``_ai_select_option`` returns for the AI path.
        """
        value = value or ""

        if ai == "fallback":
            try:
                locator = self.page.locator(selector)
                await locator.select_option(value, timeout=timeout)
            except Exception:
                if not intention:
                    # No AI fallback available: surface the original failure.
                    raise
                LOG.info(
                    "Direct select_option failed. Falling back to AI select option",
                    selector=selector,
                    exc_info=True,
                )
            else:
                # The original selector worked, so the AI fallback is not needed.
                return value
            return await self._ai_select_option(
                selector=selector,
                value=value,
                intention=intention,
                data=data,
                timeout=timeout,
            )

        if ai == "proactive" and intention:
            return await self._ai_select_option(
                selector=selector,
                value=value,
                intention=intention,
                data=data,
                timeout=timeout,
            )

        locator = self.page.locator(selector)
        await locator.select_option(value, timeout=timeout)
        return value
    @action_wrap(ActionType.WAIT)
    async def wait(
        self, seconds: float, intention: str | None = None, data: str | dict[str, Any] | None = None
@@ -873,9 +1074,7 @@ class SkyvernPage:
        return result
    @action_wrap(ActionType.VERIFICATION_CODE)
-    async def verification_code(
+    async def verification_code(self, intention: str | None = None, data: str | dict[str, Any] | None = None) -> None:
        self, xpath: str, intention: str | None = None, data: str | dict[str, Any] | None = None
    ) -> None:
        return
    @action_wrap(ActionType.SCROLL)
--- a/skyvern/services/script_service.py
+++ b/skyvern/services/script_service.py
@@ -37,6 +37,7 @@ from skyvern.forge.sdk.workflow.models.block import (
    ForLoopBlock,
    HttpRequestBlock,
    LoginBlock,
    NavigationBlock,
    PDFParserBlock,
    SendEmailBlock,
    TaskBlock,
@@ -1176,7 +1177,7 @@ async def run_task(
    if cache_key and cached_fn:
        # Auto-create workflow block run and task if workflow_run_id is available
        workflow_run_block_id, task_id, step_id = await _create_workflow_block_run_and_task(
-            block_type=BlockType.TASK,
+            block_type=BlockType.NAVIGATION,
            prompt=prompt,
            url=url,
            label=cache_key,
@@ -1201,7 +1202,7 @@ async def run_task(
        except Exception as e:
            LOG.exception("Failed to run task block. Falling back to AI run.")
            await _fallback_to_ai_run(
-                block_type=BlockType.TASK,
+                block_type=BlockType.NAVIGATION,
                cache_key=cache_key,
                prompt=prompt,
                url=url,
@@ -1216,7 +1217,7 @@ async def run_task(
            context.prompt = None
    else:
        block_validation_output = await _validate_and_get_output_parameter(label)
-        task_block = TaskBlock(
+        task_block = NavigationBlock(
            label=block_validation_output.label,
            output_parameter=block_validation_output.output_parameter,
            url=url,