Add PRESS_ENTER action type to extract-action prompt (#SKY-7792) (#4669)

This commit is contained in:
pedrohsdb
2026-02-09 11:46:12 -08:00
committed by GitHub
parent 7e6dfcc6d1
commit 226288445d
5 changed files with 60 additions and 3 deletions

View File

@@ -52,4 +52,5 @@ POST_ACTION_EXECUTION_ACTION_TYPES = [
ActionType.WAIT,
ActionType.SOLVE_CAPTCHA,
ActionType.EXTRACT,
ActionType.KEYPRESS,
]

View File

@@ -87,7 +87,11 @@ def parse_action(
return NullAction(**base_action_dict)
# `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
action_type = ActionType[action["action_type"].upper()]
action_type_str = action["action_type"].upper()
# Backward compat: map PRESS_ENTER to KEYPRESS (old prompt used PRESS_ENTER)
if action_type_str == "PRESS_ENTER":
action_type_str = "KEYPRESS"
action_type = ActionType[action_type_str]
if not action_type.is_web_action():
# LLM sometimes hallucinates and returns element id for non-web actions such as WAIT, TERMINATE, COMPLETE etc.
@@ -187,6 +191,23 @@ def parse_action(
**base_action_dict, captcha_type=CaptchaType[captcha_type.upper()] if captcha_type else None
)
if action_type == ActionType.KEYPRESS:
# KEYPRESS is a global keyboard action, not element-targeted
base_action_dict["skyvern_element_hash"] = None
base_action_dict["skyvern_element_data"] = None
# Support both "key" (single key from prompt) and "keys" (list, from code/legacy)
# Limited to navigation/submission keys to prevent misuse on regular form fields
allowed_keys = {"Enter", "Tab", "Escape", "ArrowDown", "ArrowUp"}
key = action.get("key")
if key:
if key not in allowed_keys:
LOG.warning("KEYPRESS action has unsupported key, skipping action", key=key)
return NullAction(**base_action_dict)
keys = [key]
else:
keys = action.get("keys", ["Enter"])
return KeypressAction(**base_action_dict, keys=keys)
if action_type == ActionType.CLOSE_PAGE:
return ClosePageAction(**base_action_dict)