SDK: Prompt-based locator (#4027)

2025-11-21 19:13:42 -07:00
parent 90f51bcacb
commit 8fb46ef1ca
19 changed files with 899 additions and 4 deletions
--- a/skyvern/forge/prompts/skyvern/single-locate-element.j2
+++ b/skyvern/forge/prompts/skyvern/single-locate-element.j2
@@ -0,0 +1,32 @@
+You are here to help the user locate a specific element on a web page and return its element ID. Use the user's description, the content of the elements parsed from the page, the screenshots of the page, and the current URL to identify the correct element.
+
+Each actionable element is tagged with an ID. Only select elements provided in the HTML elements list - do not imagine any new elements.
+
+MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
+
+Reply in JSON format with the following keys:
+{
+    "thoughts": str, // Think step by step. Explain what information and visual cues help you identify the correct element. Reference specific attributes, text content, position, or visual characteristics you see.
+    "element_id": str, // The ID of the element from the HTML elements list. This must be one of the IDs from the elements provided below, or the ID of the nearest parent element that has an ID and contains the target element.
+    "xpath": str, // A fallback XPath selector for the element. This will be used if the element_id cannot be found in the page data. Provide a complete, valid XPath (e.g., "//button[@id='submit']" or "//input[@name='username']").
+    "confidence_float": float // Your confidence that this is the correct element. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence.
+}
+
+User's element description (what element to locate):
+```
+{{ data_extraction_goal }}
+```
+
+The URL of the page you're on right now is `{{ current_url }}`.
+
+HTML elements from `{{ current_url }}`:
+```
+{{ elements }}
+```
+
+Text extracted from the webpage: {{ extracted_text }}
+
+Current datetime, ISO format:
+```
+{{ local_datetime }}
+```
--- a/skyvern/forge/sdk/routes/sdk.py
+++ b/skyvern/forge/sdk/routes/sdk.py
@@ -199,6 +199,11 @@ async def run_sdk_action(
                data=action.data,
            )
            result = extract_result
+        elif action.type == "locate_element":
+            xpath_result = await page_ai.ai_locate_element(
+                prompt=action.prompt,
+            )
+            result = xpath_result
    finally:
        skyvern_context.reset()

--- a/skyvern/forge/sdk/schemas/sdk_actions.py
+++ b/skyvern/forge/sdk/schemas/sdk_actions.py
@@ -15,6 +15,7 @@ class SdkActionType(str, Enum):
    AI_UPLOAD_FILE = "ai_upload_file"
    AI_ACT = "ai_act"
    EXTRACT = "extract"
+    LOCATE_ELEMENT = "locate_element"


 # Base action class
@@ -137,9 +138,30 @@ class ExtractAction(SdkActionBase):
        return self.data if isinstance(self.data, dict) else None


+class LocateElementAction(SdkActionBase):
+    """Locate element action parameters."""
+
+    type: Literal["locate_element"] = "locate_element"
+    prompt: str = Field(default="", description="Natural language prompt to locate an element")
+
+    def get_navigation_goal(self) -> str | None:
+        return self.prompt
+
+    def get_navigation_payload(self) -> dict[str, Any] | None:
+        return None
+
+
 # Discriminated union of all action types
 SdkAction = Annotated[
-    Union[ClickAction, InputTextAction, SelectOptionAction, UploadFileAction, ActAction, ExtractAction],
+    Union[
+        ClickAction,
+        InputTextAction,
+        SelectOptionAction,
+        UploadFileAction,
+        ActAction,
+        ExtractAction,
+        LocateElementAction,
+    ],
    Field(discriminator="type"),
 ]