iframes support (#405)

Co-authored-by: Aleksei Zarubin <12220926+alexzarbn@users.noreply.github.com>
This commit is contained in:
LawyZheng
2024-06-06 10:07:32 +08:00
committed by GitHub
parent 064c831524
commit 285419349c
7 changed files with 207 additions and 51 deletions

View File

@@ -818,6 +818,11 @@ class ForgeAgent:
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP,
data=json.dumps(scraped_page.id_to_xpath_dict, indent=2).encode(),
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP,
data=json.dumps(scraped_page.id_to_frame_dict, indent=2).encode(),
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.VISIBLE_ELEMENTS_TREE,

View File

@@ -22,6 +22,7 @@ class ArtifactType(StrEnum):
LLM_RESPONSE = "llm_response"
LLM_RESPONSE_PARSED = "llm_response_parsed"
VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map"
VISIBLE_ELEMENTS_TREE = "visible_elements_tree"
VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed"
VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt"

View File

@@ -14,6 +14,7 @@ FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
ArtifactType.LLM_RESPONSE: "json",
ArtifactType.LLM_RESPONSE_PARSED: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE_TRIMMED: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE_IN_PROMPT: "txt",

View File

@@ -6,10 +6,10 @@ from typing import Any, Awaitable, Callable, List
import structlog
from deprecation import deprecated
from playwright.async_api import Locator, Page
from playwright.async_api import FrameLocator, Locator, Page
from skyvern.constants import REPO_ROOT_DIR
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
from skyvern.constants import REPO_ROOT_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound, SkyvernException
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.files import (
@@ -175,16 +175,18 @@ async def handle_click_action(
num_downloaded_files_before=num_downloaded_files_before,
download_dir=download_dir,
)
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
await asyncio.sleep(0.3)
if action.download:
results = await handle_click_to_download_file_action(action, page, scraped_page)
else:
results = await chain_click(
task,
scraped_page,
page,
action,
xpath,
frame,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
@@ -208,10 +210,12 @@ async def handle_click_to_download_file_action(
page: Page,
scraped_page: ScrapedPage,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
try:
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
)
@@ -229,8 +233,9 @@ async def handle_input_text_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
locator = page.locator(f"xpath={xpath}")
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
current_text = await locator.input_value()
if current_text == action.text:
@@ -269,20 +274,28 @@ async def handle_upload_file_action(
file_url=action.file_url,
)
return [ActionFailure(ImaginaryFileUrl(action.file_url))]
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
file_path = await download_file(file_url)
locator = page.locator(f"xpath={xpath}")
locator = resolve_locator(scraped_page, page, frame, xpath)
is_file_input = await is_file_input_element(locator)
if is_file_input:
LOG.info("Taking UploadFileAction. Found file input tag", action=action)
if file_path:
await page.locator(f"xpath={xpath}").set_input_files(
locator = resolve_locator(scraped_page, page, frame, xpath)
await locator.set_input_files(
file_path,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
# Sleep for 10 seconds after uploading a file to let the page process it
await asyncio.sleep(10)
return [ActionSuccess()]
else:
return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]
@@ -292,9 +305,11 @@ async def handle_upload_file_action(
action.is_upload_file_tag = False
return await chain_click(
task,
scraped_page,
page,
action,
xpath,
frame,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
@@ -307,15 +322,17 @@ async def handle_download_file_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
file_name = f"{action.file_name or uuid.uuid4()}"
full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
try:
# Start waiting for the download
async with page.expect_download() as download_info:
await asyncio.sleep(0.3)
await page.click(
f"xpath={xpath}",
locator = resolve_locator(scraped_page, page, frame, xpath)
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
)
@@ -355,9 +372,10 @@ async def handle_select_option_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
locator = page.locator(f"xpath={xpath}")
tag_name = await get_tag_name_lowercase(locator)
element_dict = scraped_page.id_to_element_dict[action.element_id]
LOG.info(
@@ -400,7 +418,7 @@ async def handle_select_option_action(
child_anchor_xpath=child_anchor_xpath,
)
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, page, click_action, child_anchor_xpath)
return await chain_click(task, scraped_page, page, click_action, child_anchor_xpath, frame)
# handler the select action on <label>
select_element_id = get_select_id_in_label_children(scraped_page, action.element_id)
@@ -432,7 +450,7 @@ async def handle_select_option_action(
action=action,
)
click_action = ClickAction(element_id=action.element_id)
action_result = await chain_click(task, page, click_action, xpath)
action_result = await chain_click(task, scraped_page, page, click_action, xpath, frame)
return action_result
elif tag_name == "ul" or tag_name == "div" or tag_name == "li":
# if the role is listbox, find the option with the "label" or "value" and click that option element
@@ -464,7 +482,7 @@ async def handle_select_option_action(
)
# click the option element
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, page, click_action, xpath)
return await chain_click(task, scraped_page, page, click_action, xpath, frame)
else:
LOG.error(
"SelectOptionAction on a non-listbox element. Cannot handle this action",
@@ -481,19 +499,17 @@ async def handle_select_option_action(
current_text = await locator.input_value()
if current_text == action.option.label:
return [ActionSuccess()]
try:
# First click by label (if it matches)
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
await page.select_option(
xpath,
await locator.select_option(
label=action.option.label,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
await page.click(
f"xpath={xpath}",
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
return [ActionSuccess()]
@@ -536,7 +552,7 @@ async def handle_select_option_action(
async def handle_checkbox_action(
self: actions.CheckboxAction,
action: actions.CheckboxAction,
page: Page,
scraped_page: ScrapedPage,
task: Task,
@@ -549,11 +565,14 @@ async def handle_checkbox_action(
Treating checkbox actions as click actions seem to perform way more reliably
Developers who tried this and failed: 2 (Suchintan and Shu 😂)
"""
xpath = await validate_actions_in_dom(self, page, scraped_page)
if self.is_checked:
await page.check(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
if action.is_checked:
await locator.check(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
else:
await page.uncheck(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
await locator.uncheck(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
# TODO (suchintan): Why does checking the label work, but not the actual input element?
return [ActionSuccess()]
@@ -630,9 +649,11 @@ def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
return secret_value if secret_value is not None else parameter
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> tuple[str, str]:
xpath = scraped_page.id_to_xpath_dict[action.element_id]
locator = page.locator(xpath)
frame = scraped_page.id_to_frame_dict[action.element_id]
locator = resolve_locator(scraped_page, page, frame, xpath)
num_elements = await locator.count()
if num_elements < 1:
@@ -652,14 +673,16 @@ async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: S
else:
LOG.info("Validated action xpath in DOM", action=action)
return xpath
return xpath, frame
async def chain_click(
task: Task,
scraped_page: ScrapedPage,
page: Page,
action: ClickAction | UploadFileAction,
xpath: str,
frame: str,
timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> List[ActionResult]:
# Add a defensive page handler here in case a click action opens a file chooser.
@@ -689,9 +712,11 @@ async def chain_click(
Clicks on an element identified by the xpath and its parent if failed.
:param xpath: xpath of the element to click
"""
javascript_triggered = await is_javascript_triggered(page, xpath)
javascript_triggered = await is_javascript_triggered(scraped_page, page, frame, xpath)
try:
await page.click(f"xpath={xpath}", timeout=timeout)
locator = resolve_locator(scraped_page, page, frame, xpath)
await locator.click(timeout=timeout)
LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
return [
ActionSuccess(
@@ -718,10 +743,12 @@ async def chain_click(
parent_xpath = f"{xpath}/.."
try:
parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
parent_javascript_triggered = await is_javascript_triggered(scraped_page, page, frame, parent_xpath)
javascript_triggered = javascript_triggered or parent_javascript_triggered
parent_locator = page.locator(xpath).locator("..")
parent_locator = resolve_locator(scraped_page, page, frame, xpath).locator("..")
await parent_locator.click(timeout=timeout)
LOG.info(
"Chain click: successfully clicked parent element",
action=action,
@@ -806,9 +833,10 @@ def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: str
return None
async def is_javascript_triggered(page: Page, xpath: str) -> bool:
locator = page.locator(f"xpath={xpath}")
async def is_javascript_triggered(scraped_page: ScrapedPage, page: Page, frame: str, xpath: str) -> bool:
locator = resolve_locator(scraped_page, page, frame, xpath)
element = locator.first
tag_name = await element.evaluate("e => e.tagName")
if tag_name.lower() == "a":
href = await element.evaluate("e => e.href")
@@ -928,8 +956,13 @@ async def click_listbox_option(
text = child["text"] if "text" in child else ""
if text and (text == action.option.label or text == action.option.value):
option_xpath = scraped_page.id_to_xpath_dict[child["id"]]
option_frame = scraped_page.id_to_frame_dict[child["id"]]
try:
await page.click(f"xpath={option_xpath}", timeout=1000)
locator = resolve_locator(scraped_page, page, option_frame, option_xpath)
await locator.click(timeout=1000)
return True
except Exception:
LOG.error(
@@ -941,3 +974,28 @@ async def click_listbox_option(
if "children" in child:
bfs_queue.extend(child["children"])
return False
def resolve_locator(scrape_page: ScrapedPage, page: Page, frame: str, xpath: str) -> Locator:
    """Build a Playwright locator for *xpath*, descending through the iframe chain.

    `frame` is either the sentinel "main.frame" or the unique id of the iframe
    element that contains the target. The scraped element map records, for each
    iframe element, the id of the frame it lives in, so we can walk the chain
    upward until we reach the main frame.

    NOTE(review): the first parameter is spelled `scrape_page` in the original
    signature; kept as-is for keyword-argument compatibility.

    :raises MissingElement: a frame id in the chain has no scraped element entry.
    :raises SkyvernException: a frame element carries no parent-frame record.
    """
    # Walk from the target frame up to the main frame, collecting the
    # intermediate frame ids (innermost first).
    chain: list[str] = []
    current = frame
    while current != "main.frame":
        chain.append(current)
        meta = scrape_page.id_to_element_dict.get(current)
        if meta is None:
            raise MissingElement(element_id=current)
        parent = meta.get("frame")
        if not parent:
            raise SkyvernException(f"element without frame: {meta}")
        LOG.info(f"{current} is a child frame of {parent}")
        current = parent

    # Descend from the page through the chain, outermost frame first, using
    # the skyvern id attribute stamped on each iframe element.
    scope: Page | FrameLocator = page
    for frame_id in reversed(chain):
        scope = scope.frame_locator(f"[{SKYVERN_ID_ATTR}='{frame_id}']")
    return scope.locator(f"xpath={xpath}")

View File

@@ -342,6 +342,10 @@ function isInteractable(element) {
const tagName = element.tagName.toLowerCase();
if (tagName === "iframe") {
return false;
}
if (tagName === "a" && element.href) {
return true;
}
@@ -576,7 +580,7 @@ function uniqueId() {
return result;
}
function buildTreeFromBody() {
function buildTreeFromBody(frame = "main.frame") {
var elements = [];
var resultArray = [];
@@ -679,6 +683,7 @@ function buildTreeFromBody() {
let elementObj = {
id: element_id,
frame: frame,
interactable: interactable,
tagName: elementTagNameLower,
attributes: attrs,
@@ -760,6 +765,11 @@ function buildTreeFromBody() {
processElement(child, elementObj.id);
});
return elementObj;
} else if (element.tagName.toLowerCase() === "iframe") {
let iframeElementObject = buildElementObject(element, false);
elements.push(iframeElementObject);
resultArray.push(iframeElementObject);
} else {
// For a non-interactable element, if it has direct text, we also tagged
// it with unique_id, but with interatable=false in the element.

View File

@@ -6,7 +6,7 @@ from enum import StrEnum
from typing import Any
import structlog
from playwright.async_api import Page
from playwright.async_api import Frame, Page
from pydantic import BaseModel
from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
@@ -122,6 +122,7 @@ class ScrapedPage(BaseModel):
elements: list[dict]
id_to_element_dict: dict[str, dict] = {}
id_to_frame_dict: dict[str, str] = {}
id_to_xpath_dict: dict[str, str]
element_tree: list[dict]
element_tree_trimmed: list[dict]
@@ -187,14 +188,30 @@ async def scrape_website(
)
async def get_all_visible_text(page: Page) -> str:
async def get_frame_text(iframe: Frame) -> str:
"""
Get all the visible text on the page.
:param page: Page instance to get the text from.
:return: All the visible text on the page.
Get all the visible text in the iframe.
:param iframe: Frame instance to get the text from.
:return: All the visible text from the iframe.
"""
js_script = "() => document.body.innerText"
return await page.evaluate(js_script)
try:
text = await iframe.evaluate(js_script)
except Exception:
LOG.warning(
"failed to get text from iframe",
exc_info=True,
)
return ""
for child_frame in iframe.child_frames:
if child_frame.is_detached():
continue
text += await get_frame_text(child_frame)
return text
async def scrape_web_unsafe(
@@ -256,17 +273,22 @@ async def scrape_web_unsafe(
id_to_xpath_dict = {}
id_to_element_dict = {}
id_to_frame_dict = {}
for element in elements:
element_id = element["id"]
# get_interactable_element_tree marks each interactable element with a unique_id attribute
id_to_xpath_dict[element_id] = f"//*[@{SKYVERN_ID_ATTR}='{element_id}']"
id_to_element_dict[element_id] = element
id_to_frame_dict[element_id] = element["frame"]
text_content = await get_frame_text(page.main_frame)
text_content = await get_all_visible_text(page)
return ScrapedPage(
elements=elements,
id_to_xpath_dict=id_to_xpath_dict,
id_to_element_dict=id_to_element_dict,
id_to_frame_dict=id_to_frame_dict,
element_tree=element_tree,
element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
screenshots=screenshots,
@@ -276,6 +298,47 @@ async def scrape_web_unsafe(
)
async def get_interactable_element_tree_in_frame(
    frames: list[Frame], elements: list[dict], element_tree: list[dict]
) -> tuple[list[dict], list[dict]]:
    """Recursively scrape interactable elements from child frames and merge them in.

    For each frame, runs the injected buildTreeFromBody() JS tagged with the
    frame element's unique_id, recurses into the frame's own child frames, and
    attaches the collected elements/tree under the matching iframe element in
    the parent's `elements` / `element_tree`.

    :param frames: child frames of the page/frame currently being scraped.
    :param elements: flat element list of the parent scope; frame elements are
        appended to it and the extended list is returned.
    :param element_tree: hierarchical tree of the parent scope; frame subtrees
        are grafted in under the matching iframe node.
    :return: the (flat elements, element tree) pair with frame content merged in.
    """
    for frame in frames:
        # A frame can detach between enumeration and scraping; skip it.
        if frame.is_detached():
            continue

        try:
            frame_element = await frame.frame_element()
        except Exception:
            # Best-effort: an inaccessible frame (e.g. cross-origin or already
            # navigated away) is logged and skipped rather than failing the scrape.
            LOG.warning(
                "Unable to get frame_element",
                exc_info=True,
            )
            continue

        # The scraper stamps each iframe element with a unique_id attribute;
        # pass it to buildTreeFromBody so children are tagged with their frame.
        # NOTE(review): presumably unique_id is always set by the main-frame
        # pass before this runs — a missing attribute would yield 'None' here.
        unique_id = await frame_element.get_attribute("unique_id")

        frame_js_script = f"() => buildTreeFromBody('{unique_id}')"

        # Inject the JS helpers into this frame's context before calling them.
        await frame.evaluate(JS_FUNCTION_DEFS)
        frame_elements, frame_element_tree = await frame.evaluate(frame_js_script)

        # Depth-first: merge grandchild frames into this frame's results first.
        if len(frame.child_frames) > 0:
            frame_elements, frame_element_tree = await get_interactable_element_tree_in_frame(
                frame.child_frames, frame_elements, frame_element_tree
            )

        # Graft the frame's content under the matching iframe element in both
        # the flat list and the hierarchical tree of the parent scope.
        for element in elements:
            if element["id"] == unique_id:
                element["children"] = frame_elements

        for element_tree_item in element_tree:
            if element_tree_item["id"] == unique_id:
                element_tree_item["children"] = frame_element_tree

        elements = elements + frame_elements

    return elements, element_tree
async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]:
"""
Get the element tree of the page, including all the elements that are interactable.
@@ -283,8 +346,14 @@ async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[di
:return: Tuple containing the element tree and a map of element IDs to elements.
"""
await page.evaluate(JS_FUNCTION_DEFS)
js_script = "() => buildTreeFromBody()"
elements, element_tree = await page.evaluate(js_script)
main_frame_js_script = "() => buildTreeFromBody('main.frame')"
elements, element_tree = await page.evaluate(main_frame_js_script)
if len(page.main_frame.child_frames) > 0:
elements, element_tree = await get_interactable_element_tree_in_frame(
page.main_frame.child_frames, elements, element_tree
)
return elements, element_tree
@@ -352,6 +421,9 @@ def trim_element_tree(elements: list[dict]) -> list[dict]:
queue.append(element)
while queue:
queue_ele = queue.pop(0)
if "frame" in queue_ele:
del queue_ele["frame"]
if "attributes" in queue_ele:
tag_name = queue_ele["tagName"] if "tagName" in queue_ele else ""
new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"])

View File

@@ -308,6 +308,7 @@ with visualizer_tab:
tab_screenshot,
tab_post_action_screenshot,
tab_id_to_xpath,
tab_id_to_frame,
tab_element_tree,
tab_element_tree_trimmed,
tab_llm_prompt,
@@ -323,6 +324,7 @@ with visualizer_tab:
":rainbow[Screenshot]",
":rainbow[Action Screenshots]",
":red[ID -> XPath]",
":red[ID -> Frame]",
":orange[Element Tree]",
":blue[Element Tree (Trimmed)]",
":yellow[LLM Prompt]",
@@ -422,6 +424,13 @@ with visualizer_tab:
read_artifact_safe(uri),
"No ID -> XPath map available.",
)
elif file_name.endswith("id_frame_map.json"):
streamlit_content_safe(
tab_id_to_frame,
tab_id_to_frame.json,
read_artifact_safe(uri),
"No ID -> Frame map available.",
)
elif file_name.endswith("tree.json"):
streamlit_content_safe(
tab_element_tree,