diff --git a/skyvern/constants.py b/skyvern/constants.py index a523ce9e..fb644d7c 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -16,6 +16,7 @@ SAVE_DOWNLOADED_FILES_TIMEOUT = 180 GET_DOWNLOADED_FILES_TIMEOUT = 30 NAVIGATION_MAX_RETRY_TIME = 5 AUTO_COMPLETION_POTENTIAL_VALUES_COUNT = 5 +DROPDOWN_MENU_MAX_DISTANCE = 100 # reserved fields for navigation payload SPECIAL_FIELD_VERIFICATION_CODE = "verification_code" diff --git a/skyvern/forge/agent_functions.py b/skyvern/forge/agent_functions.py index 5e2f1e1d..a0d8740e 100644 --- a/skyvern/forge/agent_functions.py +++ b/skyvern/forge/agent_functions.py @@ -267,6 +267,16 @@ async def _convert_css_shape_to_string( try: LOG.debug("call LLM to convert css shape to string shape", element_id=element_id) + if not await locater.is_visible(timeout=settings.BROWSER_ACTION_TIMEOUT_MS): + LOG.info( + "element is not visible on the page, going to abort conversion", + task_id=task_id, + step_id=step_id, + element_id=element_id, + key=shape_key, + ) + return None + screenshot = await locater.screenshot(timeout=settings.BROWSER_SCREENSHOT_TIMEOUT_MS) prompt = prompt_engine.load_prompt("css-shape-convert") diff --git a/skyvern/forge/prompts/skyvern/custom-select.j2 b/skyvern/forge/prompts/skyvern/custom-select.j2 index 98cadabb..f73f10a7 100644 --- a/skyvern/forge/prompts/skyvern/custom-select.j2 +++ b/skyvern/forge/prompts/skyvern/custom-select.j2 @@ -2,7 +2,7 @@ You are performing a {{ "multi-level selection" if select_history else "selectio You can identify the matching element based on the following guidelines: 1. Select the most suitable element based on the user goal, user details, and the context. - 2. If no option is a perfect match, and there is a fallback option such as "Others" or "None of the above" in the DOM elements, you can consider it a match. + 2. 
If none of the options perfectly match, and there is no search box for input, but there is a fallback option such as "Others" or "None of the above" in the DOM elements, you can consider it a match. 3. If a field is required, do not leave it blank. 4. If a field is required, do not select a placeholder value, such as "Please select", "-", or "Select...". 5. Exclude loading indicators like "loading more results" as valid options.{% if select_history %} diff --git a/skyvern/forge/prompts/skyvern/opened-dropdown-confirm.j2 b/skyvern/forge/prompts/skyvern/opened-dropdown-confirm.j2 index 9e147fb1..f90ecb4d 100644 --- a/skyvern/forge/prompts/skyvern/opened-dropdown-confirm.j2 +++ b/skyvern/forge/prompts/skyvern/opened-dropdown-confirm.j2 @@ -2,7 +2,7 @@ There is a screenshot from a part of a web HTML page. Help me confirm if it is a An open dropdown menu can be defined as: - At least one option is visible in the screenshot. - - A calendar view or date picker could be considered as an open dropdown menu. + - A calendar view could be considered as an open dropdown menu. But DO NOT consider a calendar icon as the dropdown menu. - Do not consider it an open dropdown menu if the only visible option displays a message like "No results" or "No match". - Do not consider it an open dropdown menu if the only visible element displays a placeholder like "Please select", "-", or "Select...". 
diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index e516c7be..ea0f4059 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -17,6 +17,7 @@ from skyvern.config import settings from skyvern.constants import ( AUTO_COMPLETION_POTENTIAL_VALUES_COUNT, BROWSER_DOWNLOAD_TIMEOUT, + DROPDOWN_MENU_MAX_DISTANCE, REPO_ROOT_DIR, SKYVERN_ID_ATTR, ) @@ -579,6 +580,16 @@ async def handle_input_text_action( # press arrowdown to watch if there's any options popping up await incremental_scraped.start_listen_dom_increment() + try: + await skyvern_element.input_clear() + except Exception: + LOG.info( + "Failed to clear up the input, but continue to input", + task_id=task.task_id, + step_id=step.step_id, + element_id=skyvern_element.get_id(), + ) + try: await skyvern_element.press_key("ArrowDown") except TimeoutError: @@ -613,6 +624,7 @@ async def handle_input_text_action( action=select_action, page=page, dom=dom, + skyvern_element=skyvern_element, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, step=step, @@ -641,14 +653,29 @@ async def handle_input_text_action( ) except Exception: - await skyvern_element.scroll_into_view() LOG.warning( "Failed to do custom selection transformed from input action, continue to input text", exc_info=True, task_id=task.task_id, step_id=step.step_id, ) + await skyvern_element.scroll_into_view() finally: + blocking_element, exist = await skyvern_element.find_blocking_element( + dom=dom, incremental_page=incremental_scraped + ) + if blocking_element and exist: + LOG.info( + "Find a blocking element to the current element, going to blur the blocking element first", + task_id=task.task_id, + step_id=step.step_id, + blocking_element=blocking_element.get_locator(), + ) + if await blocking_element.get_locator().count(): + await blocking_element.press_key("Escape") + if await blocking_element.get_locator().count(): + await blocking_element.blur() + await 
skyvern_element.press_key("Escape") await skyvern_element.blur() await incremental_scraped.stop_listen_dom_increment() @@ -672,6 +699,25 @@ async def handle_input_text_action( await skyvern_element.press_fill(text=text) return [ActionSuccess()] + # wait 2s for blocking element to show up + await asyncio.sleep(2) + try: + blocking_element, exist = await skyvern_element.find_blocking_element( + dom=dom, incremental_page=incremental_scraped + ) + if blocking_element and exist: + LOG.warning( + "Find a blocking element to the current element, going to input on the blocking element", + ) + skyvern_element = blocking_element + except Exception: + LOG.info( + "Failed to find the blocking element, continue with the orignal element", + exc_info=True, + task_id=task.task_id, + step_id=step.step_id, + ) + try: # TODO: not sure if this case will trigger auto-completion if tag_name not in COMMON_INPUT_TAGS: @@ -1032,6 +1078,7 @@ async def handle_select_option_action( action=action, page=page, dom=dom, + skyvern_element=skyvern_element, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, step=step, @@ -1087,6 +1134,7 @@ async def handle_select_option_action( value=suggested_value, page=page, dom=dom, + skyvern_element=skyvern_element, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, task=task, @@ -1765,6 +1813,7 @@ async def sequentially_select_from_dropdown( action: SelectOptionAction, page: Page, dom: DomUtil, + skyvern_element: SkyvernElement, skyvern_frame: SkyvernFrame, incremental_scraped: IncrementalScrapePage, step: Step, @@ -1812,6 +1861,7 @@ async def sequentially_select_from_dropdown( single_select_result = await select_from_dropdown( context=input_or_select_context, page=page, + skyvern_element=skyvern_element, skyvern_frame=skyvern_frame, incremental_scraped=incremental_scraped, check_exist_funcs=check_exist_funcs, @@ -1887,6 +1937,7 @@ def build_sequential_select_history(history_list: list[CustomSingleSelectResult] async def 
select_from_dropdown( context: InputOrSelectContext, page: Page, + skyvern_element: SkyvernElement, skyvern_frame: SkyvernFrame, incremental_scraped: IncrementalScrapePage, check_exist_funcs: list[CheckExistIDFunc], @@ -1911,6 +1962,7 @@ async def select_from_dropdown( if dropdown_menu_element is None: dropdown_menu_element = await locate_dropdown_menu( + current_anchor_element=skyvern_element, incremental_scraped=incremental_scraped, step=step, task=task, @@ -2059,6 +2111,7 @@ async def select_from_dropdown( async def select_from_dropdown_by_value( value: str, page: Page, + skyvern_element: SkyvernElement, skyvern_frame: SkyvernFrame, dom: DomUtil, incremental_scraped: IncrementalScrapePage, @@ -2078,6 +2131,7 @@ async def select_from_dropdown_by_value( if dropdown_menu_element is None: dropdown_menu_element = await locate_dropdown_menu( + current_anchor_element=skyvern_element, incremental_scraped=incremental_scraped, step=step, task=task, @@ -2131,6 +2185,7 @@ async def select_from_dropdown_by_value( async def locate_dropdown_menu( + current_anchor_element: SkyvernElement, incremental_scraped: IncrementalScrapePage, step: Step, task: Task, @@ -2164,6 +2219,30 @@ async def locate_dropdown_menu( ) continue + try: + if not await head_element.is_next_to_element( + target_locator=current_anchor_element.get_locator(), + max_x_distance=DROPDOWN_MENU_MAX_DISTANCE, + max_y_distance=DROPDOWN_MENU_MAX_DISTANCE, + ): + LOG.debug( + "Skip the element since it's too far away from the anchor element", + step_id=step.step_id, + task_id=task.task_id, + element_id=element_id, + ) + continue + + except Exception: + LOG.info( + "Failed to calculate the distance between the elements", + element_id=element_id, + step_id=step.step_id, + task_id=task.task_id, + exc_info=True, + ) + continue + if not await skyvern_frame.get_element_visible(await head_element.get_element_handler()): LOG.debug( "Skip the element since it's invisible", diff --git a/skyvern/webeye/scraper/domUtils.js 
/**
 * Detect a <div> that is marked up as an ARIA combobox dropdown trigger.
 *
 * The element qualifies only when all of the following hold:
 *   - its tag is <div>;
 *   - its `role` attribute is "combobox" (case-insensitive);
 *   - it carries an `aria-controls` attribute;
 *   - its `aria-haspopup` attribute is present, non-empty and not "false".
 *
 * @param {Element} element - the DOM element to inspect.
 * @returns {boolean} true when the element matches, false otherwise.
 */
const isDivComboboxDropdown = (element) => {
  if (element.tagName.toLowerCase() !== "div") {
    return false;
  }

  const role = (element.getAttribute("role") || "").toLowerCase();
  const haspopup = (element.getAttribute("aria-haspopup") || "").toLowerCase();
  const controls = element.hasAttribute("aria-controls");

  // aria-haspopup="false" explicitly declares that there is NO popup, so it
  // must not count as a dropdown even though the string itself is truthy.
  const opensPopup = haspopup !== "" && haspopup !== "false";

  // Always return a strict boolean so callers that OR this into a flag
  // (e.g. isSelectable) never receive the raw attribute string.
  return role === "combobox" && controls && opensPopup;
};
# NOTE: SOURCE is a unified diff; the definitions below are the complete
# methods visible in its scraper.py and dom.py hunks, written out as code.

# --- method of IncrementalScrapePage (skyvern/webeye/scraper/scraper.py) ---
def check_id_in_page(self, element_id: str) -> bool:
    """Tell whether ``element_id`` is known to this incremental page.

    Returns:
        True only when ``self.id_to_css_dict`` holds a non-empty CSS selector
        for the id; a missing id or an empty selector both yield False.
    """
    return bool(self.id_to_css_dict.get(element_id, ""))


# --- methods of SkyvernElement (skyvern/webeye/utils/dom.py) ---
async def find_blocking_element(
    self, dom: DomUtil, incremental_page: IncrementalScrapePage | None = None
) -> tuple[SkyvernElement | None, bool]:
    """Look up the element that the frame reports as blocking this one.

    The blocking element id is resolved first against ``dom`` and, when it is
    not found there, against ``incremental_page`` (if one is supplied).

    Args:
        dom: DOM helper used to resolve ids from the main scrape.
        incremental_page: optional incremental scrape used as a fallback
            source for ids that are not present in ``dom``.

    Returns:
        A ``(element, blocked)`` pair. ``blocked`` is passed through unchanged
        from ``SkyvernFrame.get_blocking_element_id``; ``element`` is None
        when no id was reported or the id could not be resolved in either
        source.
    """
    skyvern_frame = await SkyvernFrame.create_instance(self.get_frame())
    blocking_element_id, blocked = await skyvern_frame.get_blocking_element_id(await self.get_element_handler())
    if not blocking_element_id:
        return None, blocked

    if dom.check_id_in_dom(blocking_element_id):
        return await dom.get_skyvern_element_by_id(blocking_element_id), blocked

    # The blocker may only exist in the incrementally scraped elements.
    if incremental_page and incremental_page.check_id_in_page(blocking_element_id):
        return await SkyvernElement.create_from_incremental(incremental_page, blocking_element_id), blocked

    return None, blocked


async def _calculate_min_axis_distance_to(
    self,
    target_locator: Locator,
    timeout: float,
    start_key: str,
    size_key: str,
) -> float:
    """Shortest gap along one axis between this element and the target.

    Args:
        target_locator: locator of the element to measure against.
        timeout: timeout forwarded to both ``bounding_box`` calls.
        start_key: bounding-box key of the axis origin (``"x"`` or ``"y"``).
        size_key: bounding-box key of the axis extent (``"width"`` or
            ``"height"``).

    Returns:
        0 when the two boxes overlap or touch on that axis, otherwise the
        distance between their nearest edges.

    Raises:
        Exception: when either bounding box cannot be obtained.
    """
    self_rect = await self.get_locator().bounding_box(timeout=timeout)
    if self_rect is None:
        raise Exception("Can't get Skyvern element rect")

    target_rect = await target_locator.bounding_box(timeout=timeout)
    if target_rect is None:
        raise Exception("Can't get the target element rect")

    # far edge of self relative to the near edge of the target
    end_to_start = self_rect[start_key] + self_rect[size_key] - target_rect[start_key]
    # near edge of self relative to the far edge of the target
    start_to_end = self_rect[start_key] - (target_rect[start_key] + target_rect[size_key])

    # Opposite signs (or a zero) mean the two intervals overlap or touch.
    if end_to_start * start_to_end <= 0:
        return 0

    return min(abs(end_to_start), abs(start_to_end))


async def calculate_min_y_distance_to(
    self,
    target_locator: Locator,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> float:
    """Return the minimum vertical gap to ``target_locator`` (0 if overlapping).

    Raises:
        Exception: when either element's bounding box cannot be obtained.
    """
    return await self._calculate_min_axis_distance_to(target_locator, timeout, "y", "height")


async def calculate_min_x_distance_to(
    self,
    target_locator: Locator,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> float:
    """Return the minimum horizontal gap to ``target_locator`` (0 if overlapping).

    Raises:
        Exception: when either element's bounding box cannot be obtained.
    """
    return await self._calculate_min_axis_distance_to(target_locator, timeout, "x", "width")


async def is_next_to_element(
    self,
    target_locator: Locator,
    max_x_distance: float = 0,
    max_y_distance: float = 0,
    timeout: float = settings.BROWSER_ACTION_TIMEOUT_MS,
) -> bool:
    """Check that this element lies within the given distances of the target.

    Args:
        target_locator: locator of the element to compare against.
        max_x_distance: largest allowed horizontal gap; a value <= 0 disables
            the horizontal check.
        max_y_distance: largest allowed vertical gap; a value <= 0 disables
            the vertical check.
        timeout: timeout forwarded to the distance calculations.

    Returns:
        False as soon as an enabled axis exceeds its limit, True otherwise
        (including when both checks are disabled).
    """
    if max_x_distance > 0 and await self.calculate_min_x_distance_to(target_locator, timeout) > max_x_distance:
        return False

    if max_y_distance > 0 and await self.calculate_min_y_distance_to(target_locator, timeout) > max_y_distance:
        return False

    return True