feat: add hover action support (#3994)

Co-authored-by: LawyZheng <lawyzheng1106@gmail.com>
This commit is contained in:
Mohamed Khalil
2025-12-09 17:27:26 +02:00
committed by GitHub
parent 0e8d667959
commit f49b07f30d
22 changed files with 281 additions and 13 deletions

View File

@@ -411,6 +411,44 @@ function hasASPClientControl() {
return typeof ASPxClientControl !== "undefined";
}
/**
 * Heuristically decide whether an element is likely revealed only on hover
 * (e.g., hover-only buttons inside a card or list row).
 *
 * The decision is based purely on class-name substrings; no computed styles or
 * CSS rules are inspected, so false positives are possible.
 *
 * @param {Element} element - The element to test.
 * @returns {boolean} true when the element's own class or its direct parent's
 *   class contains "hover-" / "-hover", or when one of up to five ancestors
 *   (stopping at document.body) has a class containing "hover", "card" or
 *   "item"; false otherwise.
 */
function isHoverOnlyElement(element) {
  // Read the class list as a plain string. HTML elements expose `className` as
  // a string, but SVG elements expose an SVGAnimatedString whose toString()
  // yields "[object SVGAnimatedString]", so read its `baseVal` instead.
  const readClassName = (node) => {
    const raw = node?.className;
    if (typeof raw === "string") return raw;
    if (typeof raw?.baseVal === "string") return raw.baseVal;
    return raw?.toString() ?? "";
  };

  const hasHoverAffix = (value) =>
    value.includes("hover-") || value.includes("-hover");

  // Common hover-only class patterns on the element itself or its direct parent
  if (
    hasHoverAffix(readClassName(element)) ||
    hasHoverAffix(readClassName(element.parentElement))
  ) {
    return true;
  }

  // Look for an ancestor whose class suggests it reveals this element on hover.
  // Cap the walk to avoid traversing the entire tree and bloating prompts.
  const maxDepth = 5;
  const revealingHints = ["hover", "card", "item"];
  let ancestor = element.parentElement;
  for (
    let depth = 0;
    ancestor && ancestor !== document.body && depth < maxDepth;
    depth += 1
  ) {
    const ancestorClass = readClassName(ancestor);
    if (revealingHints.some((hint) => ancestorClass.includes(hint))) {
      // This element might be revealed on ancestor hover
      return true;
    }
    ancestor = ancestor.parentElement;
  }
  return false;
}
// from playwright: https://github.com/microsoft/playwright/blob/1b65f26f0287c0352e76673bc5f85bc36c934b55/packages/playwright-core/src/server/injected/domUtils.ts#L100-L119
// NOTE: According to this logic, some elements with aria-hidden won't be considered invisible, and the results show they are indeed interactable.
function isElementVisible(element) {
@@ -450,6 +488,10 @@ function isElementVisible(element) {
if (!isElementStyleVisibilityVisible(element, style)) return false;
const rect = element.getBoundingClientRect();
if (rect.width <= 0 || rect.height <= 0) {
// Check if this element might be visible on hover before marking as invisible
if (isHoverOnlyElement(element)) {
return true;
}
return false;
}
@@ -824,7 +866,12 @@ function isInteractable(element, hoverStylesMap) {
// https://developer.mozilla.org/en-US/docs/Web/CSS/pointer-events#none
const elementPointerEvent = getElementComputedStyle(element)?.pointerEvents;
if (elementPointerEvent === "none" && !element.disabled) {
return false;
// Some CTAs stay hidden until the parent is hovered
// When we can infer that the element is revealed on hover, keep it interactable so the agent
// has a chance to hover the parent before clicking.
if (!isHoverOnlyElement(element)) {
return false;
}
}
if (isInteractableInput(element, hoverStylesMap)) {
@@ -1569,6 +1616,7 @@ async function buildElementObject(
frame: frame,
frame_index: window.GlobalSkyvernFrameIndex,
interactable: interactable,
hoverOnly: isHoverOnlyElement(element),
tagName: elementTagNameLower,
attributes: attrs,
beforePseudoText: getPseudoContent(element, "::before"),

View File

@@ -2,6 +2,7 @@ import asyncio
import copy
import json
from collections import defaultdict
from typing import Any
import structlog
from playwright._impl._errors import TimeoutError
@@ -92,6 +93,14 @@ def load_js_script() -> str:
# Holds the string returned by load_js_script(); computed once when this statement runs.
JS_FUNCTION_DEFS = load_js_script()
# function to convert JSON element to HTML
def build_attribute(key: str, value: Any) -> str:
    """Render a single attribute as an HTML attribute fragment.

    Args:
        key: Attribute name.
        value: Attribute value of any type.

    Returns:
        ``key="value"`` for integers and booleans (lower-cased, so ``True``
        becomes ``"true"``) and for any other truthy value; the bare ``key``
        when the value is a falsy non-integer (``""``, ``None``, ``0.0``, ...).
    """
    # bool is a subclass of int, so a single isinstance check covers both.
    if isinstance(value, int):
        rendered = str(value).lower()
    elif value:
        rendered = str(value)
    else:
        # Falsy non-integer values produce a valueless (boolean-style) attribute.
        return key
    return f'{key}="{rendered}"'
def clean_element_before_hashing(element: dict) -> dict:
def clean_nested(element: dict) -> dict:
element_cleaned = {key: value for key, value in element.items() if key not in {"id", "rect", "frame_index"}}
@@ -125,7 +134,7 @@ def build_element_dict(
for element in elements:
element_id: str = element.get("id", "")
# get_interactable_element_tree marks each interactable element with a unique_id attribute
# get_interactable_element_tree marks each interactable element with a SKYVERN_ID_ATTR attribute
id_to_css_dict[element_id] = f"[{SKYVERN_ID_ATTR}='{element_id}']"
id_to_element_dict[element_id] = element
id_to_frame_dict[element_id] = element["frame"]
@@ -409,16 +418,18 @@ async def add_frame_interactable_elements(
# it will get stuck when we `frame.evaluate()` on an invisible iframe
if not await frame_element.is_visible():
return elements, element_tree
unique_id = await frame_element.get_attribute("unique_id")
if not unique_id:
skyvern_id = await frame_element.get_attribute(SKYVERN_ID_ATTR)
if not skyvern_id:
LOG.info(
"No unique_id found for frame, skipping",
"No Skyvern id found for frame, skipping",
frame_index=frame_index,
attr=SKYVERN_ID_ATTR,
)
return elements, element_tree
except Exception:
LOG.warning(
"Unable to get unique_id from frame_element",
"Unable to get Skyvern id from frame_element",
attr=SKYVERN_ID_ATTR,
exc_info=True,
)
return elements, element_tree
@@ -427,11 +438,11 @@ async def add_frame_interactable_elements(
await skyvern_frame.safe_wait_for_animation_end()
frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body(
frame_name=unique_id, frame_index=frame_index
frame_name=skyvern_id, frame_index=frame_index
)
for element in elements:
if element["id"] == unique_id:
if element["id"] == skyvern_id:
element["children"] = frame_element_tree
elements = elements + frame_elements
@@ -638,6 +649,9 @@ def _should_keep_unique_id(element: dict) -> bool:
# 1. no readonly attr and not disable attr and no interactable
# 2. readonly=false and disable=false and interactable=false
if element.get("hoverOnly"):
return True
attributes = element.get("attributes", {})
if (
"disabled" not in attributes