general autocomplete solution (#713)

2024-08-21 10:54:32 +08:00
parent ef95dc6eca
commit 8baa8de032
9 changed files with 610 additions and 128 deletions
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -1,4 +1,5 @@
 import asyncio
+import copy
 import json
 import os
 import urllib.parse
@@ -9,13 +10,16 @@ from typing import Any, Awaitable, Callable, List
 import structlog
 from deprecation import deprecated
 from playwright.async_api import FileChooser, Locator, Page, TimeoutError
+from pydantic import BaseModel

-from skyvern.constants import REPO_ROOT_DIR, VERIFICATION_CODE_POLLING_TIMEOUT_MINS
+from skyvern.constants import REPO_ROOT_DIR, SKYVERN_ID_ATTR, VERIFICATION_CODE_POLLING_TIMEOUT_MINS
 from skyvern.exceptions import (
    EmptySelect,
+    ErrEmptyTweakValue,
    ErrFoundSelectableElement,
    FailedToFetchSecret,
    FailToClick,
+    FailToFindAutocompleteOption,
    FailToSelectByIndex,
    FailToSelectByLabel,
    FailToSelectByValue,
@@ -24,9 +28,12 @@ from skyvern.exceptions import (
    MissingElement,
    MissingFileUrl,
    MultipleElementsFound,
+    NoAutoCompleteOptionMeetCondition,
    NoElementMatchedForTargetOption,
+    NoIncrementalElementFoundForAutoCompletion,
    NoIncrementalElementFoundForCustomSelection,
    NoLabelOrValueForCustomSelection,
+    NoSuitableAutoCompleteOption,
    OptionIndexOutOfBound,
    WrongElementToUploadFile,
 )
@@ -59,7 +66,13 @@ from skyvern.webeye.actions.actions import (
 )
 from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
 from skyvern.webeye.browser_factory import BrowserState, get_download_dir
-from skyvern.webeye.scraper.scraper import ElementTreeFormat, IncrementalScrapePage, ScrapedPage
+from skyvern.webeye.scraper.scraper import (
+    ElementTreeFormat,
+    IncrementalScrapePage,
+    ScrapedPage,
+    json_to_html,
+    trim_element_tree,
+)
 from skyvern.webeye.utils.dom import DomUtil, InteractiveElement, SkyvernElement
 from skyvern.webeye.utils.page import SkyvernFrame

@@ -67,6 +80,12 @@ LOG = structlog.get_logger()
 COMMON_INPUT_TAGS = {"input", "textarea", "select"}


+class AutoCompletionResult(BaseModel):  # outcome of one auto-completion attempt
+    auto_completion_attempt: bool = False  # NOTE(review): never assigned in the visible hunks of this change
+    incremental_elements: list[dict] = []  # element tree collected after typing (new + still-present preserved)
+    action_result: ActionResult = ActionSuccess()  # success unless an attempt swaps in an ActionFailure
+
+
 class ActionHandler:
    _handled_action_types: dict[
        ActionType,
@@ -290,6 +309,7 @@ async def handle_input_text_action(
    dom = DomUtil(scraped_page, page)
    skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
    skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
+    incremental_scraped = IncrementalScrapePage(skyvern_frame=skyvern_frame)
    timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS

    current_text = await get_input_value(skyvern_element.get_tag_name(), skyvern_element.get_locator())
@@ -319,7 +339,6 @@ async def handle_input_text_action(
            return await handle_select_option_action(select_action, page, scraped_page, task, step)

        # press arrowdown to watch if there's any options popping up
-        incremental_scraped = IncrementalScrapePage(skyvern_frame=skyvern_frame)
        await incremental_scraped.start_listen_dom_increment()
        await skyvern_element.get_locator().focus(timeout=timeout)
        await skyvern_element.get_locator().press("ArrowDown", timeout=timeout)
@@ -376,12 +395,26 @@ async def handle_input_text_action(
        LOG.warning("Failed to clear the input field", action=action, exc_info=True)
        return [ActionFailure(InvalidElementForTextInput(element_id=action.element_id, tag_name=tag_name))]

+    # TODO: not sure if this case will trigger auto-completion
    if tag_name not in COMMON_INPUT_TAGS:
        await skyvern_element.input_fill(text)
        return [ActionSuccess()]

-    # If the input is a text input, we type the text character by character
-    # 3 times the time it takes to type the text so it has time to finish typing
+    if len(text) == 0:
+        return [ActionSuccess()]
+
+    if await skyvern_element.is_auto_completion_input():
+        result = await input_or_auto_complete_input(
+            action=action,
+            page=page,
+            dom=dom,
+            text=text,
+            skyvern_element=skyvern_element,
+            step=step,
+            task=task,
+        )
+        return [result]
+
    await skyvern_element.input_sequentially(text=text)
    return [ActionSuccess()]

@@ -848,6 +881,282 @@ async def chain_click(
            return [ActionFailure(WrongElementToUploadFile(action.element_id))]


+def remove_exist_elements(dom: DomUtil, element_tree: list[dict]) -> list[dict]:
+    """Drop elements `dom` already knows, hoisting their unseen descendants."""
+    pruned: list[dict] = []
+    for node in element_tree:
+        kids = node.get("children", [])
+        kids = remove_exist_elements(dom=dom, element_tree=kids) if len(kids) > 0 else kids
+        if not dom.check_id_in_dom(node.get("id", "")):
+            node["children"] = kids  # unseen node: keep it, with its pruned subtree
+            pruned.append(node)
+        else:
+            pruned.extend(kids)  # known node: only its unseen descendants survive
+    return pruned
+
+
+async def choose_auto_completion_dropdown(  # type `text`, then ask the LLM which popped-up option to click
+    action: actions.InputTextAction,
+    page: Page,  # not referenced inside this function body
+    dom: DomUtil,
+    text: str,
+    skyvern_element: SkyvernElement,
+    step: Step,
+    task: Task,
+    preserved_elements: list[dict] | None = None,  # elements from earlier attempts, re-checked on the page below
+    relevance_threshold: float = 0.8,  # minimum "relevance_float" from the LLM required before clicking
+) -> AutoCompletionResult:
+    preserved_elements = preserved_elements or []
+    clear_input = True  # stays True on every failure path, so `finally` clears the input
+    result = AutoCompletionResult()
+
+    current_frame = skyvern_element.get_frame()
+    skyvern_frame = await SkyvernFrame.create_instance(current_frame)
+    incremental_scraped = IncrementalScrapePage(skyvern_frame=skyvern_frame)
+    await incremental_scraped.start_listen_dom_increment()
+
+    try:
+        await skyvern_element.press_fill(text)
+        # wait for new elements to load (fixed 5 second pause)
+        await asyncio.sleep(5)
+        incremental_element = await incremental_scraped.get_incremental_element_tree(
+            app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step)
+        )
+        incremental_element = remove_exist_elements(dom=dom, element_tree=incremental_element)
+
+        # check if elements in the preserved list are still on the page; re-parse the ones that are
+        confirmed_preserved_list: list[dict] = []
+        for element in preserved_elements:
+            element_id = element.get("id")
+            if not element_id:
+                continue
+            locator = current_frame.locator(f'[{SKYVERN_ID_ATTR}="{element_id}"]')
+            cnt = await locator.count()
+            if cnt == 0:
+                continue
+
+            element_handler = await locator.element_handle(
+                timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+            )
+            if not element_handler:
+                continue
+
+            current_element = await skyvern_frame.parse_element_from_html(
+                skyvern_element.get_frame_id(), element_handler, skyvern_element.is_interactable()
+            )  # NOTE(review): passes the input's own interactable flag for every preserved element — confirm intended
+            confirmed_preserved_list.append(current_element)
+
+        if len(confirmed_preserved_list) > 0:
+            confirmed_preserved_list = await app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step)(
+                skyvern_frame.get_frame().url, copy.deepcopy(confirmed_preserved_list)
+            )
+            confirmed_preserved_list = trim_element_tree(copy.deepcopy(confirmed_preserved_list))
+
+        incremental_element.extend(confirmed_preserved_list)
+
+        result.incremental_elements = copy.deepcopy(incremental_element)  # stored before any raise below
+        if len(incremental_element) == 0:
+            raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text)
+
+        html = incremental_scraped.build_html_tree(incremental_element)
+        auto_completion_confirm_prompt = prompt_engine.load_prompt(
+            "auto-completion-choose-option",
+            context_reasoning=action.reasoning,
+            filled_value=text,
+            elements=html,
+        )
+        LOG.info(
+            "Confirm if it's an auto completion dropdown",
+            step_id=step.step_id,
+            task_id=task.task_id,
+        )
+        json_response = await app.SECONDARY_LLM_API_HANDLER(prompt=auto_completion_confirm_prompt, step=step)
+        element_id = json_response.get("id", "")
+        relevance_float = json_response.get("relevance_float", 0)  # a missing key counts as relevance 0
+        if not element_id:
+            reasoning = json_response.get("reasoning")
+            raise NoSuitableAutoCompleteOption(reasoning=reasoning, target_value=text)
+
+        if relevance_float < relevance_threshold:
+            LOG.info(
+                f"The closest option doesn't meet the condition(relevance_float>={relevance_threshold})",
+                element_id=element_id,
+                relevance_float=relevance_float,
+            )
+            reasoning = json_response.get("reasoning")
+            raise NoAutoCompleteOptionMeetCondition(
+                reasoning=reasoning,
+                required_relevance=relevance_threshold,
+                target_value=text,
+                closest_relevance=relevance_float,
+            )
+
+        LOG.info(
+            "Find a suitable option to choose",
+            element_id=element_id,
+            step_id=step.step_id,
+            task_id=task.task_id,
+        )
+
+        locator = current_frame.locator(f'[{SKYVERN_ID_ATTR}="{element_id}"]')
+        if await locator.count() == 0:
+            raise MissingElement(element_id=element_id)
+
+        await locator.click(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        clear_input = False  # an option was clicked: keep whatever the page put in the input
+        return result
+    except Exception as e:  # every failure above, including the explicit raises, ends up here
+        LOG.info(
+            "Failed to choose the auto completion dropdown",
+            exc_info=True,
+            input_value=text,
+            task_id=task.task_id,
+            step_id=step.step_id,
+        )
+        result.action_result = ActionFailure(exception=e)
+        return result
+    finally:
+        await incremental_scraped.stop_listen_dom_increment()
+        if clear_input:
+            await skyvern_element.input_clear()
+
+async def input_or_auto_complete_input(  # try `text`, LLM-suggested variants, then an LLM-tweaked value
+    action: actions.InputTextAction,
+    page: Page,
+    dom: DomUtil,
+    text: str,
+    skyvern_element: SkyvernElement,
+    step: Step,
+    task: Task,
+) -> ActionResult:
+    LOG.info(
+        "Trigger auto completion",
+        task_id=task.task_id,
+        step_id=step.step_id,
+        element_id=skyvern_element.get_id(),
+    )
+
+    # 1. press the original text to see if there's a match
+    # 2. call LLM to find 5 potential values based on the original text (NOTE(review): the log below says 10 — confirm)
+    # 3. try each potential value from #2
+    # 4. call LLM to tweak the original text according to the information from #3, then start #1 again
+
+    # FIXME: try the whole loop for twice now, to prevent too many LLM calls
+    MAX_AUTO_COMPLETE_ATTEMP = 2
+    current_attemp = 0
+    context_reasoning = action.reasoning
+    current_value = text
+    result = AutoCompletionResult()
+
+    while current_attemp < MAX_AUTO_COMPLETE_ATTEMP:
+        current_attemp += 1
+        whole_new_elements: list[dict] = []  # reset on every attempt
+        tried_values: list[str] = []  # reset on every attempt
+
+        LOG.info(
+            "Try the potential value for auto completion",
+            step_id=step.step_id,
+            task_id=task.task_id,
+            input_value=current_value,
+        )
+        result = await choose_auto_completion_dropdown(
+            action=action,
+            page=page,
+            dom=dom,
+            text=current_value,
+            preserved_elements=result.incremental_elements,  # elements collected by the previous try
+            skyvern_element=skyvern_element,
+            step=step,
+            task=task,
+        )
+        if isinstance(result.action_result, ActionSuccess):
+            return ActionSuccess()
+
+        tried_values.append(current_value)
+        whole_new_elements.extend(result.incremental_elements)
+
+        prompt = prompt_engine.load_prompt(
+            "auto-completion-potential-answers",
+            context_reasoning=context_reasoning,
+            current_value=current_value,
+        )
+
+        LOG.info(
+            "Ask LLM to give 10 potential values based on the current value",
+            current_value=current_value,
+            step_id=step.step_id,
+            task_id=task.task_id,
+        )
+        json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
+        values: list[dict] = json_respone.get("potential_values", [])  # missing key means no candidates
+
+        for each_value in values:
+            value: str = each_value.get("value", "")
+            if not value:
+                LOG.info(
+                    "Empty potential value, skip this attempt",
+                    step_id=step.step_id,
+                    task_id=task.task_id,
+                    value=each_value,
+                )
+                continue
+            LOG.info(
+                "Try the potential value for auto completion",
+                step_id=step.step_id,
+                task_id=task.task_id,
+                input_value=value,
+            )
+            result = await choose_auto_completion_dropdown(
+                action=action,
+                page=page,
+                dom=dom,
+                text=value,
+                preserved_elements=result.incremental_elements,
+                skyvern_element=skyvern_element,
+                step=step,
+                task=task,
+            )
+            if isinstance(result.action_result, ActionSuccess):
+                return ActionSuccess()
+
+            tried_values.append(value)
+            whole_new_elements.extend(result.incremental_elements)
+
+        if current_attemp < MAX_AUTO_COMPLETE_ATTEMP:  # no tweak call after the final attempt
+            LOG.info(
+                "Ask LLM to tweak the current value based on tried input values",
+                step_id=step.step_id,
+                task_id=task.task_id,
+                current_value=current_value,
+                current_attemp=current_attemp,
+            )
+            prompt = prompt_engine.load_prompt(
+                "auto-completion-tweak-value",
+                context_reasoning=context_reasoning,
+                current_value=current_value,
+                tried_values=json.dumps(tried_values),
+                popped_up_elements="".join([json_to_html(element) for element in whole_new_elements]),
+            )
+            json_respone = await app.SECONDARY_LLM_API_HANDLER(prompt=prompt, step=step)
+            context_reasoning = json_respone.get("reasoning")  # the next attempt reuses the LLM's reasoning
+            new_current_value = json_respone.get("tweaked_value", "")
+            if not new_current_value:
+                return ActionFailure(ErrEmptyTweakValue(reasoning=context_reasoning, current_value=current_value))
+            LOG.info(
+                "Ask LLM tweaked the current value with a new value",
+                step_id=step.step_id,
+                task_id=task.task_id,
+                reasoning=context_reasoning,
+                current_value=current_value,
+                new_value=new_current_value,
+            )
+            current_value = new_current_value
+
+    else:  # the loop has no `break`, so this runs once all attempts are used up
+        return ActionFailure(FailToFindAutocompleteOption(current_value=text))  # reports the original text
+
+
 async def select_from_dropdown(
    action: SelectOptionAction,
    page: Page,
--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -877,109 +877,105 @@ function uniqueId() {
  return result;
 }

-async function buildTreeFromBody(frame = "main.frame", open_select = false) {
+function buildElementObject(frame, element, interactable) { // describe one DOM element as a plain object
+  var element_id = element.getAttribute("unique_id") ?? uniqueId(); // reuse an existing id, else mint one
+  var elementTagNameLower = element.tagName.toLowerCase();
+  element.setAttribute("unique_id", element_id); // write the id back onto the DOM node
+
+  const attrs = {};
+  for (const attr of element.attributes) {
+    var attrValue = attr.value;
+    if (
+      attr.name === "required" ||
+      attr.name === "aria-required" ||
+      attr.name === "checked" ||
+      attr.name === "aria-checked" ||
+      attr.name === "selected" ||
+      attr.name === "aria-selected" ||
+      attr.name === "readonly" ||
+      attr.name === "aria-readonly"
+    ) {
+      if (attrValue && attrValue.toLowerCase() === "false") { // only a literal "false" (any case) becomes false
+        attrValue = false;
+      } else {
+        attrValue = true; // anything else, including an empty value, counts as true
+      }
+    }
+    attrs[attr.name] = attrValue;
+  }
+
+  if (
+    checkRequiredFromStyle(element) &&
+    !attrs["required"] &&
+    !attrs["aria-required"]
+  ) {
+    attrs["required"] = true; // set only when neither attribute above already marks it
+  }
+
+  if (elementTagNameLower === "input" || elementTagNameLower === "textarea") {
+    if (element.type === "radio") {
+      attrs["value"] = "" + element.checked + ""; // checked state as the string "true" or "false"
+    } else {
+      attrs["value"] = element.value; // the value property overrides the attribute collected above
+    }
+  }
+
+  let elementObj = {
+    id: element_id,
+    frame: frame,
+    interactable: interactable,
+    tagName: elementTagNameLower,
+    attributes: attrs,
+    text: getElementContent(element),
+    children: [],
+    rect: DomUtils.getVisibleClientRect(element, true),
+    // don't trim any attr of this element if keepAllAttr=True
+    keepAllAttr:
+      elementTagNameLower === "svg" || element.closest("svg") !== null,
+    isSelectable:
+      elementTagNameLower === "select" ||
+      isSelect2Dropdown(element) ||
+      isSelect2MultiChoice(element),
+    isScrollable: isScrollable(element),
+  };
+
+  let isInShadowRoot = element.getRootNode() instanceof ShadowRoot;
+  if (isInShadowRoot) {
+    let shadowHostEle = element.getRootNode().host;
+    let shadowHostId = shadowHostEle.getAttribute("unique_id");
+    // assign shadowHostId to the shadowHost element if it doesn't have unique_id
+    if (!shadowHostId) {
+      shadowHostId = uniqueId();
+      shadowHostEle.setAttribute("unique_id", shadowHostId);
+    }
+    elementObj.shadowHost = shadowHostId;
+  }
+
+  // get options for a <select> element (the only tag handled in this function)
+  let selectOptions = null;
+  let selectedValue = "";
+  if (elementTagNameLower === "select") {
+    [selectOptions, selectedValue] = getSelectOptions(element);
+  }
+
+  if (selectOptions) {
+    elementObj.options = selectOptions;
+  }
+  if (selectedValue) {
+    elementObj.attributes["selected"] = selectedValue; // replaces any "selected" attribute collected above
+  }
+
+  return elementObj;
+}
+
+function buildTreeFromBody(frame = "main.frame", open_select = false) {
  return buildElementTree(document.body, frame, open_select);
 }

-async function buildElementTree(
-  starter = document.body,
-  frame = "main.frame",
-  open_select = false,
-) {
+function buildElementTree(starter = document.body, frame = "main.frame") {
  var elements = [];
  var resultArray = [];

-  async function buildElementObject(element, interactable) {
-    var element_id = element.getAttribute("unique_id") ?? uniqueId();
-    var elementTagNameLower = element.tagName.toLowerCase();
-    element.setAttribute("unique_id", element_id);
-
-    const attrs = {};
-    for (const attr of element.attributes) {
-      var attrValue = attr.value;
-      if (
-        attr.name === "required" ||
-        attr.name === "aria-required" ||
-        attr.name === "checked" ||
-        attr.name === "aria-checked" ||
-        attr.name === "selected" ||
-        attr.name === "aria-selected" ||
-        attr.name === "readonly" ||
-        attr.name === "aria-readonly"
-      ) {
-        if (attrValue && attrValue.toLowerCase() === "false") {
-          attrValue = false;
-        } else {
-          attrValue = true;
-        }
-      }
-      attrs[attr.name] = attrValue;
-    }
-
-    if (
-      checkRequiredFromStyle(element) &&
-      !attrs["required"] &&
-      !attrs["aria-required"]
-    ) {
-      attrs["required"] = true;
-    }
-
-    if (elementTagNameLower === "input" || elementTagNameLower === "textarea") {
-      if (element.type === "radio") {
-        attrs["value"] = "" + element.checked + "";
-      } else {
-        attrs["value"] = element.value;
-      }
-    }
-
-    let elementObj = {
-      id: element_id,
-      frame: frame,
-      interactable: interactable,
-      tagName: elementTagNameLower,
-      attributes: attrs,
-      text: getElementContent(element),
-      children: [],
-      rect: DomUtils.getVisibleClientRect(element, true),
-      // don't trim any attr of this element if keepAllAttr=True
-      keepAllAttr:
-        elementTagNameLower === "svg" || element.closest("svg") !== null,
-      isSelectable:
-        elementTagNameLower === "select" ||
-        isSelect2Dropdown(element) ||
-        isSelect2MultiChoice(element),
-      isScrollable: isScrollable(element),
-    };
-
-    let isInShadowRoot = element.getRootNode() instanceof ShadowRoot;
-    if (isInShadowRoot) {
-      let shadowHostEle = element.getRootNode().host;
-      let shadowHostId = shadowHostEle.getAttribute("unique_id");
-      // assign shadowHostId to the shadowHost element if it doesn't have unique_id
-      if (!shadowHostId) {
-        shadowHostId = uniqueId();
-        shadowHostEle.setAttribute("unique_id", shadowHostId);
-      }
-      elementObj.shadowHost = shadowHostId;
-    }
-
-    // get options for select element or for listbox element
-    let selectOptions = null;
-    let selectedValue = "";
-    if (elementTagNameLower === "select") {
-      [selectOptions, selectedValue] = getSelectOptions(element);
-    }
-
-    if (selectOptions) {
-      elementObj.options = selectOptions;
-    }
-    if (selectedValue) {
-      elementObj.attributes["selected"] = selectedValue;
-    }
-
-    return elementObj;
-  }
-
  function getChildElements(element) {
    if (element.childElementCount !== 0) {
      return Array.from(element.children);
@@ -987,7 +983,7 @@ async function buildElementTree(
      return [];
    }
  }
-  async function processElement(element, parentId) {
+  function processElement(element, parentId) {
    if (element === null) {
      console.log("get a null element");
      return;
@@ -1008,7 +1004,7 @@ async function buildElementTree(

    // Check if the element is interactable
    if (isInteractable(element)) {
-      var elementObj = await buildElementObject(element, true);
+      var elementObj = buildElementObject(frame, element, true);
      elements.push(elementObj);
      // If the element is interactable but has no interactable parent,
      // then it starts a new tree, so add it to the result array
@@ -1029,24 +1025,24 @@ async function buildElementTree(
      const children = getChildElements(element);
      for (let i = 0; i < children.length; i++) {
        const childElement = children[i];
-        await processElement(childElement, elementObj.id);
+        processElement(childElement, elementObj.id);
      }
      return elementObj;
    } else if (element.tagName.toLowerCase() === "iframe") {
-      let iframeElementObject = await buildElementObject(element, false);
+      let iframeElementObject = buildElementObject(frame, element, false);

      elements.push(iframeElementObject);
      resultArray.push(iframeElementObject);
    } else if (element.shadowRoot) {
      // shadow host element
-      let shadowHostElement = await buildElementObject(element, false);
+      let shadowHostElement = buildElementObject(frame, element, false);
      elements.push(shadowHostElement);
      resultArray.push(shadowHostElement);

      const children = getChildElements(element.shadowRoot);
      for (let i = 0; i < children.length; i++) {
        const childElement = children[i];
-        await processElement(childElement, shadowHostElement.id);
+        processElement(childElement, shadowHostElement.id);
      }
    } else {
      // For a non-interactable element, if it has direct text, we also tagged
@@ -1063,14 +1059,14 @@ async function buildElementTree(
        let isParentSVG = element.closest("svg");
        if (element.tagName.toLowerCase() === "svg") {
          // if element is <svg> we save all attributes and its children
-          elementObj = await buildElementObject(element, false);
+          elementObj = buildElementObject(frame, element, false);
        } else if (isParentSVG && isParentSVG.getAttribute("unique_id")) {
          // if elemnet is the children of the <svg> with an unique_id
-          elementObj = await buildElementObject(element, false);
+          elementObj = buildElementObject(frame, element, false);
        } else if (isTableRelatedElement(element)) {
          // build all table related elements into skyvern element
          // we need these elements to preserve the DOM structure
-          elementObj = await buildElementObject(element, false);
+          elementObj = buildElementObject(frame, element, false);
        } else {
          // character length limit for non-interactable elements should be 5000
          // we don't use element context in HTML format,
@@ -1083,7 +1079,7 @@ async function buildElementTree(
            }
          }
          if (textContent && textContent.length <= 5000) {
-            elementObj = await buildElementObject(element, false);
+            elementObj = buildElementObject(frame, element, false);
          }
        }

@@ -1104,7 +1100,7 @@ async function buildElementTree(
      const children = getChildElements(element);
      for (let i = 0; i < children.length; i++) {
        const childElement = children[i];
-        await processElement(childElement, parentId);
+        processElement(childElement, parentId);
      }
    }
  }
@@ -1313,7 +1309,7 @@ async function buildElementTree(
  };

  // setup before parsing the dom
-  await processElement(starter, null);
+  processElement(starter, null);

  for (var element of elements) {
    if (
@@ -1545,17 +1541,17 @@ function removeBoundingBoxes() {
  }
 }

-async function scrollToTop(draw_boxes) {
+function scrollToTop(draw_boxes) {
  removeBoundingBoxes();
  window.scroll({ left: 0, top: 0, behavior: "instant" });
  if (draw_boxes) {
-    var elementsAndResultArray = await buildTreeFromBody();
+    var elementsAndResultArray = buildTreeFromBody();
    drawBoundingBoxes(elementsAndResultArray[0]);
  }
  return window.scrollY;
 }

-async function scrollToNextPage(draw_boxes) {
+function scrollToNextPage(draw_boxes) {
  // remove bounding boxes, scroll to next page with 200px overlap, then draw bounding boxes again
  // return true if there is a next page, false otherwise
  removeBoundingBoxes();
@@ -1565,7 +1561,7 @@ async function scrollToNextPage(draw_boxes) {
    behavior: "instant",
  });
  if (draw_boxes) {
-    var elementsAndResultArray = await buildTreeFromBody();
+    var elementsAndResultArray = buildTreeFromBody();
    drawBoundingBoxes(elementsAndResultArray[0]);
  }
  return window.scrollY;
@@ -1688,7 +1684,7 @@ function stopGlobalIncrementalObserver() {
  window.globalOneTimeIncrementElements = [];
 }

-async function getIncrementElements(frame) {
+function getIncrementElements(frame) {
  const domDepthMap = new Map();

  for (const element of window.globalOneTimeIncrementElements) {
@@ -1700,7 +1696,7 @@ async function getIncrementElements(frame) {
    }

    for (const child of element.newNodes) {
-      const [_, newNodeTree] = await buildElementTree(child, frame, false);
+      const [_, newNodeTree] = buildElementTree(child, frame, false);
      if (newNodeTree.length > 0) {
        newNodesTreeList.push(...newNodeTree);
      }
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -337,7 +337,7 @@ async def get_interactable_element_tree_in_frame(

        unique_id = await frame_element.get_attribute("unique_id")

-        frame_js_script = f"async () => await buildTreeFromBody('{unique_id}', true)"
+        frame_js_script = f"() => buildTreeFromBody('{unique_id}', true)"

        await frame.evaluate(JS_FUNCTION_DEFS)
        frame_elements, frame_element_tree = await frame.evaluate(frame_js_script)
@@ -373,7 +373,7 @@ async def get_interactable_element_tree(
    :return: Tuple containing the element tree and a map of element IDs to elements.
    """
    await page.evaluate(JS_FUNCTION_DEFS)
-    main_frame_js_script = "async () => await buildTreeFromBody('main.frame', true)"
+    main_frame_js_script = "() => buildTreeFromBody('main.frame', true)"
    elements, element_tree = await page.evaluate(main_frame_js_script)

    if len(page.main_frame.child_frames) > 0:
@@ -415,7 +415,7 @@ class IncrementalScrapePage:
                    exc_info=True,
                )

-        js_script = f"async () => await getIncrementElements('{frame_id}')"
+        js_script = f"() => getIncrementElements('{frame_id}')"
        incremental_elements, incremental_tree = await frame.evaluate(js_script)
        # we listen the incremental elements seperated by frames, so all elements will be in the same SkyvernFrame
        self.id_to_css_dict, self.id_to_element_dict, _ = build_element_dict(incremental_elements)
@@ -473,7 +473,8 @@ def trim_element_tree(elements: list[dict]) -> list[dict]:
            else:
                del queue_ele["attributes"]
        # remove the tag, don't need it in the HTML tree
-        del queue_ele["keepAllAttr"]
+        if "keepAllAttr" in queue_ele:
+            del queue_ele["keepAllAttr"]

        if "children" in queue_ele:
            queue.extend(queue_ele["children"])
--- a/skyvern/webeye/utils/dom.py
+++ b/skyvern/webeye/utils/dom.py
@@ -159,6 +159,22 @@ class SkyvernElement:
        haspopup = await self.get_attr("aria-haspopup")
        return tag_name == InteractiveElement.INPUT and role == "combobox" and haspopup == "listbox"

+    async def is_auto_completion_input(self) -> bool:
+        """Heuristic check for whether this element is an auto-completion input."""
+        if self.get_tag_name() != InteractiveElement.INPUT:
+            return False
+
+        # both ARIA attributes are fetched first, then both must be non-empty
+        popup_attr = await self.get_attr("aria-haspopup")
+        autocomplete_attr = await self.get_attr("aria-autocomplete")
+        if popup_attr and autocomplete_attr:
+            return True
+
+        # otherwise fall back to one hard-coded DOM id
+        # NOTE(review): "location-input" looks site-specific — confirm it belongs here
+        dom_id = await self.get_attr("id")
+        return dom_id == "location-input"
+
    async def is_checkbox(self) -> bool:
        tag_name = self.get_tag_name()
        if tag_name != "input":
@@ -181,6 +197,9 @@ class SkyvernElement:
    async def is_selectable(self) -> bool:
        return self.get_selectable() or self.get_tag_name() in SELECTABLE_ELEMENT

+    def get_element_dict(self) -> dict:  # raw element dict backing this object
+        return self.__static_element  # returned as-is, not copied
+
    def get_scrollable(self) -> bool:
        return self.__static_element.get("isScrollable", False)

@@ -193,6 +212,9 @@ class SkyvernElement:
    def get_id(self) -> str:
        return self.__static_element.get("id", "")

+    def get_frame_id(self) -> str:  # value of the element dict's "frame" key
+        return self.__static_element.get("frame", "")  # "" when the key is absent
+
    def get_attributes(self) -> typing.Dict:
        return self.__static_element.get("attributes", {})

@@ -314,10 +336,15 @@ class SkyvernElement:
        if length > TEXT_PRESS_MAX_LENGTH:
            # if the text is longer than TEXT_PRESS_MAX_LENGTH characters, we will locator.fill in initial texts until the last TEXT_PRESS_MAX_LENGTH characters
            # and then type the last TEXT_PRESS_MAX_LENGTH characters with locator.press_sequentially
-            await self.get_locator().fill(text[: length - TEXT_PRESS_MAX_LENGTH])
+            await self.input_fill(text[: length - TEXT_PRESS_MAX_LENGTH])
            text = text[length - TEXT_PRESS_MAX_LENGTH :]

-        await self.get_locator().press_sequentially(text, delay=TEXT_INPUT_DELAY, timeout=default_timeout)
+        await self.press_fill(text, timeout=default_timeout)
+
+    async def press_fill(  # type `text` into this element's locator one key press at a time
+        self, text: str, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+    ) -> None:  # NOTE: the default timeout is evaluated once, when this method is defined
+        await self.get_locator().press_sequentially(text, delay=TEXT_INPUT_DELAY, timeout=timeout)

    async def input_fill(
        self, text: str, timeout: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
@@ -377,6 +404,12 @@ class DomUtil:
        self.scraped_page = scraped_page
        self.page = page

+    def check_id_in_dom(self, element_id: str) -> bool:
+        """Return True when the scraped page maps `element_id` to a CSS selector."""
+        # an unknown id and an empty selector string both count as "not in DOM"
+        known_selector = self.scraped_page.id_to_css_dict.get(element_id, "")
+        return bool(known_selector)
+
    async def get_skyvern_element_by_id(self, element_id: str) -> SkyvernElement:
        element = self.scraped_page.id_to_element_dict.get(element_id)
        if not element:
--- a/skyvern/webeye/utils/page.py
+++ b/skyvern/webeye/utils/page.py
@@ -168,6 +168,10 @@ class SkyvernFrame:
        js_script = "async (element) => await getListboxOptions(element)"
        return await self.frame.evaluate(js_script, element)

+    async def parse_element_from_html(self, frame: str, element: ElementHandle, interactable: bool) -> Dict:
+        js_script = "([frame, element, interactable]) => buildElementObject(frame, element, interactable)"  # JS side
+        return await self.frame.evaluate(js_script, [frame, element, interactable])  # one list arg, destructured in JS
+
    async def scroll_to_top(self, draw_boxes: bool) -> float:
        """
        Scroll to the top of the page and take a screenshot.
@@ -175,7 +179,7 @@ class SkyvernFrame:
        :param page: Page instance to take the screenshot from.
        :return: Screenshot of the page.
        """
-        js_script = f"async () => await scrollToTop({str(draw_boxes).lower()})"
+        js_script = f"() => scrollToTop({str(draw_boxes).lower()})"
        scroll_y_px = await self.frame.evaluate(js_script)
        return scroll_y_px

@@ -186,7 +190,7 @@ class SkyvernFrame:
        :param page: Page instance to take the screenshot from.
        :return: Screenshot of the page.
        """
-        js_script = f"async () => await scrollToNextPage({str(draw_boxes).lower()})"
+        js_script = f"() => scrollToNextPage({str(draw_boxes).lower()})"
        scroll_y_px = await self.frame.evaluate(js_script)
        return scroll_y_px