add skyvern frame (#610)

2024-07-18 02:33:39 +08:00
parent ce2d605f98
commit 88389012be
5 changed files with 186 additions and 127 deletions
--- a/skyvern/forge/agent.py
+++ b/skyvern/forge/agent.py
@@ -57,7 +57,8 @@ from skyvern.webeye.actions.handler import ActionHandler
 from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
 from skyvern.webeye.actions.responses import ActionResult
 from skyvern.webeye.browser_factory import BrowserState
-from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, get_page_content, scrape_website
+from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
+from skyvern.webeye.utils.page import SkyvernFrame

 LOG = structlog.get_logger()

@@ -797,7 +798,8 @@ class ForgeAgent:
            )

        try:
-            html = await get_page_content(browser_state.page)
+            skyvern_frame = await SkyvernFrame.create_instance(frame=browser_state.page)
+            html = await skyvern_frame.get_content()
            await app.ARTIFACT_MANAGER.create_artifact(
                step=step,
                artifact_type=ArtifactType.HTML_ACTION,
--- a/skyvern/webeye/browser_factory.py
+++ b/skyvern/webeye/browser_factory.py
@@ -8,7 +8,6 @@ from datetime import datetime
 from typing import Any, Awaitable, Callable, Protocol

 import structlog
-from playwright._impl._errors import TimeoutError
 from playwright.async_api import BrowserContext, Error, Page, Playwright, async_playwright
 from pydantic import BaseModel

@@ -17,7 +16,6 @@ from skyvern.exceptions import (
    FailedToNavigateToUrl,
    FailedToReloadPage,
    FailedToStopLoadingPage,
-    FailedToTakeScreenshot,
    MissingBrowserStatePage,
    UnknownBrowserType,
    UnknownErrorWhileCreatingBrowserContext,
@@ -25,6 +23,7 @@ from skyvern.exceptions import (
 from skyvern.forge.sdk.core.skyvern_context import current
 from skyvern.forge.sdk.schemas.tasks import ProxyLocation
 from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.webeye.utils.page import SkyvernFrame

 LOG = structlog.get_logger()

@@ -319,40 +318,6 @@ class BrowserState:
            await self.pw.stop()
            LOG.info("Playwright is stopped")

-    @staticmethod
-    async def take_screenshot_from_page(page: Page, full_page: bool = False, file_path: str | None = None) -> bytes:
-        try:
-            await page.wait_for_load_state(timeout=SettingsManager.get_settings().BROWSER_LOADING_TIMEOUT_MS)
-            LOG.info("Page is fully loaded, agent is about to take screenshots")
-            start_time = time.time()
-            screenshot: bytes = bytes()
-            if file_path:
-                screenshot = await page.screenshot(
-                    path=file_path,
-                    full_page=full_page,
-                    timeout=SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS,
-                )
-            else:
-                screenshot = await page.screenshot(
-                    full_page=full_page,
-                    timeout=SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS,
-                    animations="disabled",
-                )
-            end_time = time.time()
-            LOG.info(
-                "Screenshot taking time",
-                screenshot_time=end_time - start_time,
-                full_page=full_page,
-                file_path=file_path,
-            )
-            return screenshot
-        except TimeoutError as e:
-            LOG.exception(f"Timeout error while taking screenshot: {str(e)}")
-            raise FailedToTakeScreenshot(error_message=str(e)) from e
-        except Exception as e:
-            LOG.exception(f"Unknown error while taking screenshot: {str(e)}")
-            raise FailedToTakeScreenshot(error_message=str(e)) from e
-
    async def take_screenshot(self, full_page: bool = False, file_path: str | None = None) -> bytes:
        page = self.__assert_page()
-        return await self.take_screenshot_from_page(page, full_page, file_path)
+        return await SkyvernFrame.take_screenshot(page=page, full_page=full_page, file_path=file_path)
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -6,13 +6,14 @@ from enum import StrEnum
 from typing import Any, Awaitable, Callable

 import structlog
-from playwright.async_api import ElementHandle, Frame, Page
+from playwright.async_api import Frame, Page
 from pydantic import BaseModel

-from skyvern.constants import PAGE_CONTENT_TIMEOUT, SKYVERN_DIR, SKYVERN_ID_ATTR
+from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
 from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
 from skyvern.forge.sdk.settings_manager import SettingsManager
 from skyvern.webeye.browser_factory import BrowserState
+from skyvern.webeye.utils.page import SkyvernFrame

 LOG = structlog.get_logger()

@@ -257,28 +258,7 @@ async def scrape_web_unsafe(
    LOG.info("Waiting for 5 seconds before scraping the website.")
    await asyncio.sleep(5)

-    screenshots: list[bytes] = []
-    scroll_y_px_old = -30.0
-    scroll_y_px = await scroll_to_top(page, drow_boxes=True)
-    # Checking max number of screenshots to prevent infinite loop
-    # We are checking the difference between the old and new scroll_y_px to determine if we have reached the end of the
-    # page. If the difference is less than 25, we assume we have reached the end of the page.
-    while (
-        abs(scroll_y_px_old - scroll_y_px) > 25
-        and len(screenshots) < SettingsManager.get_settings().MAX_NUM_SCREENSHOTS
-    ):
-        screenshot = await browser_state.take_screenshot(full_page=False)
-        screenshots.append(screenshot)
-        scroll_y_px_old = scroll_y_px
-        LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
-        scroll_y_px = await scroll_to_next_page(page, drow_boxes=True)
-        LOG.info(
-            "Scrolled to next page",
-            scroll_y_px=scroll_y_px,
-            scroll_y_px_old=scroll_y_px_old,
-        )
-    await remove_bounding_boxes(page)
-    await scroll_to_top(page, drow_boxes=False)
+    screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)

    elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
    element_tree = cleanup_elements(copy.deepcopy(element_tree))
@@ -300,7 +280,8 @@ async def scrape_web_unsafe(

    html = ""
    try:
-        html = await get_page_content(page)
+        skyvern_frame = await SkyvernFrame.create_instance(frame=page)
+        html = await skyvern_frame.get_content()
    except Exception:
        LOG.error(
            "Failed out to get HTML content",
@@ -322,23 +303,6 @@ async def scrape_web_unsafe(
    )


-async def get_page_content(page: Page, timeout: float = PAGE_CONTENT_TIMEOUT) -> str:
-    async with asyncio.timeout(timeout):
-        return await page.content()
-
-
-async def get_select2_options(frame: Page | Frame, element: ElementHandle) -> list[dict[str, Any]]:
-    await frame.evaluate(JS_FUNCTION_DEFS)
-    js_script = "async (element) => await getSelect2Options(element)"
-    return await frame.evaluate(js_script, element)
-
-
-async def get_combobox_options(frame: Page | Frame, element: ElementHandle) -> list[dict[str, Any]]:
-    await frame.evaluate(JS_FUNCTION_DEFS)
-    js_script = "async (element) => await getListboxOptions(element)"
-    return await frame.evaluate(js_script, element)
-
-
 async def get_interactable_element_tree_in_frame(
    frames: list[Frame],
    elements: list[dict],
@@ -413,41 +377,6 @@ async def get_interactable_element_tree(
    return elements, element_tree


-async def scroll_to_top(page: Page, drow_boxes: bool) -> float:
-    """
-    Scroll to the top of the page and take a screenshot.
-    :param drow_boxes: If True, draw bounding boxes around the elements.
-    :param page: Page instance to take the screenshot from.
-    :return: Screenshot of the page.
-    """
-    await page.evaluate(JS_FUNCTION_DEFS)
-    js_script = f"async () => await scrollToTop({str(drow_boxes).lower()})"
-    scroll_y_px = await page.evaluate(js_script)
-    return scroll_y_px
-
-
-async def scroll_to_next_page(page: Page, drow_boxes: bool) -> bool:
-    """
-    Scroll to the next page and take a screenshot.
-    :param drow_boxes: If True, draw bounding boxes around the elements.
-    :param page: Page instance to take the screenshot from.
-    :return: Screenshot of the page.
-    """
-    await page.evaluate(JS_FUNCTION_DEFS)
-    js_script = f"async () => await scrollToNextPage({str(drow_boxes).lower()})"
-    scroll_y_px = await page.evaluate(js_script)
-    return scroll_y_px
-
-
-async def remove_bounding_boxes(page: Page) -> None:
-    """
-    Remove the bounding boxes from the page.
-    :param page: Page instance to remove the bounding boxes from.
-    """
-    js_script = "() => removeBoundingBoxes()"
-    await page.evaluate(js_script)
-
-
 def cleanup_elements(elements: list[dict]) -> list[dict]:
    """
    Remove rect and attribute.unique_id from the elements.
--- a/skyvern/webeye/utils/dom.py
+++ b/skyvern/webeye/utils/dom.py
@@ -24,7 +24,8 @@ from skyvern.exceptions import (
    SkyvernException,
 )
 from skyvern.forge.sdk.settings_manager import SettingsManager
-from skyvern.webeye.scraper.scraper import ScrapedPage, get_combobox_options, get_select2_options
+from skyvern.webeye.scraper.scraper import ScrapedPage
+from skyvern.webeye.utils.page import SkyvernFrame

 LOG = structlog.get_logger()

@@ -164,13 +165,15 @@ class SkyvernElement:
        if not await self.is_select2_dropdown():
            raise ElementIsNotSelect2Dropdown(self.get_id(), self.__static_element)

-        return Select2Dropdown(self.get_frame(), self)
+        frame = await SkyvernFrame.create_instance(self.get_frame())
+        return Select2Dropdown(frame, self)

    async def get_combobox_dropdown(self) -> ComboboxDropdown:
        if not await self.is_combobox_dropdown():
            raise ElementIsNotComboboxDropdown(self.get_id(), self.__static_element)

-        return ComboboxDropdown(self.get_frame(), self)
+        frame = await SkyvernFrame.create_instance(self.get_frame())
+        return ComboboxDropdown(frame, self)

    def find_element_id_in_label_children(self, element_type: InteractiveElement) -> str | None:
        tag_name = self.get_tag_name()
@@ -344,12 +347,12 @@ class AbstractSelectDropdown(ABC):


 class Select2Dropdown(AbstractSelectDropdown):
-    def __init__(self, frame: Page | Frame, skyvern_element: SkyvernElement) -> None:
+    def __init__(self, skyvern_frame: SkyvernFrame, skyvern_element: SkyvernElement) -> None:
        self.skyvern_element = skyvern_element
-        self.frame = frame
+        self.skyvern_frame = skyvern_frame

    async def __find_anchor(self, timeout: float) -> Locator:
-        locator = self.frame.locator("[id='select2-drop']")
+        locator = self.skyvern_element.get_frame().locator("[id='select2-drop']")
        await locator.wait_for(state="visible", timeout=timeout)
        cnt = await locator.count()
        if cnt == 0:
@@ -397,7 +400,7 @@ class Select2Dropdown(AbstractSelectDropdown):
    ) -> typing.List[SkyvernOptionType]:
        anchor = await self.__find_anchor(timeout=timeout)
        element_handler = await anchor.element_handle(timeout=timeout)
-        options = await get_select2_options(self.frame, element_handler)
+        options = await self.skyvern_frame.get_select2_options(element_handler)
        return typing.cast(typing.List[SkyvernOptionType], options)

    async def select_by_index(
@@ -409,13 +412,13 @@ class Select2Dropdown(AbstractSelectDropdown):


 class ComboboxDropdown(AbstractSelectDropdown):
-    def __init__(self, frame: Page | Frame, skyvern_element: SkyvernElement) -> None:
+    def __init__(self, skyvern_frame: SkyvernFrame, skyvern_element: SkyvernElement) -> None:
        self.skyvern_element = skyvern_element
-        self.frame = frame
+        self.skyvern_frame = skyvern_frame

    async def __find_anchor(self, timeout: float) -> Locator:
        control_id = await self.skyvern_element.get_attr("aria-controls", timeout=timeout)
-        locator = self.frame.locator(f"[id='{control_id}']")
+        locator = self.skyvern_element.get_frame().locator(f"[id='{control_id}']")
        await locator.wait_for(state="visible", timeout=timeout)
        cnt = await locator.count()
        if cnt == 0:
@@ -445,7 +448,7 @@ class ComboboxDropdown(AbstractSelectDropdown):
    ) -> typing.List[SkyvernOptionType]:
        anchor = await self.__find_anchor(timeout=timeout)
        element_handler = await anchor.element_handle()
-        options = await get_combobox_options(self.frame, element_handler)
+        options = await self.skyvern_frame.get_combobox_options(element_handler)
        return typing.cast(typing.List[SkyvernOptionType], options)

    async def select_by_index(
--- a/skyvern/webeye/utils/page.py
+++ b/skyvern/webeye/utils/page.py
@@ -0,0 +1,160 @@
+from __future__ import annotations
+
+import asyncio
+import time
+from typing import Any, Dict, List
+
+import structlog
+from playwright._impl._errors import TimeoutError
+from playwright.async_api import ElementHandle, Frame, Page
+
+from skyvern.constants import PAGE_CONTENT_TIMEOUT, SKYVERN_DIR
+from skyvern.exceptions import FailedToTakeScreenshot
+from skyvern.forge.sdk.settings_manager import SettingsManager
+
+LOG = structlog.get_logger()
+
+
+def load_js_script() -> str:  # returns the raw text of webeye/scraper/domUtils.js under SKYVERN_DIR
+    # TODO: Handle file location better. This is a hacky way to find the file location.
+    path = f"{SKYVERN_DIR}/webeye/scraper/domUtils.js"
+    try:
+        # TODO: Implement TS of domUtils.js and use the compiled JS file instead of the raw JS file.
+        # This will allow our code to be type safe.
+        with open(path, "r") as f:
+            return f.read()
+    except FileNotFoundError as e:  # log the attempted path, then re-raise unchanged
+        LOG.exception("Failed to load the JS script", path=path)
+        raise e
+
+
+JS_FUNCTION_DEFS = load_js_script()  # read once at import time; evaluated in frames by SkyvernFrame
+
+
+class SkyvernFrame:  # wraps a Playwright Page/Frame and the domUtils.js helpers evaluated in it
+    @staticmethod
+    async def take_screenshot(
+        page: Page,
+        full_page: bool = False,
+        file_path: str | None = None,
+        timeout: float = SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS,  # not the load timeout
+    ) -> bytes:  # raises FailedToTakeScreenshot on any failure, including timeouts
+        try:
+            await page.wait_for_load_state(timeout=SettingsManager.get_settings().BROWSER_LOADING_TIMEOUT_MS)
+            LOG.info("Page is fully loaded, agent is about to take screenshots")
+            start_time = time.time()
+            screenshot: bytes = bytes()
+            if file_path:
+                screenshot = await page.screenshot(
+                    path=file_path,
+                    full_page=full_page,
+                    timeout=timeout,
+                )
+            else:
+                screenshot = await page.screenshot(
+                    full_page=full_page,
+                    timeout=timeout,
+                    animations="disabled",
+                )
+            end_time = time.time()
+            LOG.info(
+                "Screenshot taking time",
+                screenshot_time=end_time - start_time,
+                full_page=full_page,
+                file_path=file_path,
+            )
+            return screenshot
+        except TimeoutError as e:  # same wrapped exception as below; only the log message differs
+            LOG.exception(f"Timeout error while taking screenshot: {str(e)}")
+            raise FailedToTakeScreenshot(error_message=str(e)) from e
+        except Exception as e:
+            LOG.exception(f"Unknown error while taking screenshot: {str(e)}")
+            raise FailedToTakeScreenshot(error_message=str(e)) from e
+
+    @staticmethod
+    async def take_split_screenshots(
+        page: Page,
+        url: str,
+        draw_boxes: bool = False,
+        max_number: int = SettingsManager.get_settings().MAX_NUM_SCREENSHOTS,
+    ) -> List[bytes]:  # one viewport screenshot per scroll step, capped at max_number
+        skyvern_page = await SkyvernFrame.create_instance(frame=page)
+        assert isinstance(skyvern_page.frame, Page)  # narrows Page | Frame for take_screenshot(page=...)
+
+        screenshots: List[bytes] = []
+        scroll_y_px_old = -30.0  # sentinel: differs from a 0 offset by more than 25 so the loop can start
+        scroll_y_px = await skyvern_page.scroll_to_top(draw_boxes=draw_boxes)
+        # Checking max number of screenshots to prevent infinite loop
+        # We are checking the difference between the old and new scroll_y_px to determine if we have reached the end of the
+        # page. If the difference is less than 25, we assume we have reached the end of the page.
+        while abs(scroll_y_px_old - scroll_y_px) > 25 and len(screenshots) < max_number:
+            screenshot = await SkyvernFrame.take_screenshot(page=skyvern_page.frame, full_page=False)
+            screenshots.append(screenshot)
+            scroll_y_px_old = scroll_y_px
+            LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
+            scroll_y_px = await skyvern_page.scroll_to_next_page(draw_boxes=draw_boxes)
+            LOG.info(
+                "Scrolled to next page",
+                scroll_y_px=scroll_y_px,
+                scroll_y_px_old=scroll_y_px_old,
+            )
+        if draw_boxes:  # boxes are only drawn when requested, so only then is there anything to remove
+            await skyvern_page.remove_bounding_boxes()
+        await skyvern_page.scroll_to_top(draw_boxes=False)  # leave the page scrolled back to the top
+        return screenshots
+
+    @classmethod
+    async def create_instance(cls, frame: Page | Frame) -> SkyvernFrame:
+        instance = cls(frame=frame)
+        await instance.frame.evaluate(JS_FUNCTION_DEFS)  # inject the domUtils.js helpers into the frame
+        return instance
+
+    def __init__(self, frame: Page | Frame) -> None:  # plain constructor: does NOT inject the JS helpers
+        self.frame = frame
+
+    def get_frame(self) -> Page | Frame:
+        return self.frame
+
+    async def get_content(self, timeout: float = PAGE_CONTENT_TIMEOUT) -> str:
+        async with asyncio.timeout(timeout):  # asyncio deadline, in seconds
+            return await self.frame.content()
+
+    async def get_select2_options(self, element: ElementHandle) -> List[Dict[str, Any]]:
+        await self.frame.evaluate(JS_FUNCTION_DEFS)  # re-inject helpers before calling getSelect2Options
+        js_script = "async (element) => await getSelect2Options(element)"
+        return await self.frame.evaluate(js_script, element)
+
+    async def get_combobox_options(self, element: ElementHandle) -> List[Dict[str, Any]]:
+        await self.frame.evaluate(JS_FUNCTION_DEFS)  # re-inject helpers before calling getListboxOptions
+        js_script = "async (element) => await getListboxOptions(element)"
+        return await self.frame.evaluate(js_script, element)
+
+    async def scroll_to_top(self, draw_boxes: bool) -> float:
+        """
+        Scroll the frame to the top by calling the `scrollToTop` JS helper.
+        The helper is not injected here; it must already be defined in the frame (see `create_instance`).
+        :param draw_boxes: If True, draw bounding boxes around the elements.
+        :return: The value `scrollToTop` resolves to (callers use it as the vertical scroll offset in px).
+        """
+        js_script = f"async () => await scrollToTop({str(draw_boxes).lower()})"
+        scroll_y_px = await self.frame.evaluate(js_script)
+        return scroll_y_px
+
+    async def scroll_to_next_page(self, draw_boxes: bool) -> float:
+        """
+        Scroll the frame down one step by calling the `scrollToNextPage` JS helper.
+        The helper is not injected here; it must already be defined in the frame (see `create_instance`).
+        :param draw_boxes: If True, draw bounding boxes around the elements.
+        :return: The value `scrollToNextPage` resolves to (callers use it as the vertical scroll offset in px).
+        """
+        js_script = f"async () => await scrollToNextPage({str(draw_boxes).lower()})"
+        scroll_y_px = await self.frame.evaluate(js_script)
+        return scroll_y_px
+
+    async def remove_bounding_boxes(self) -> None:
+        """
+        Remove the bounding boxes from the page.
+        Calls the `removeBoundingBoxes` JS helper, which must already be defined in the frame.
+        """
+        js_script = "() => removeBoundingBoxes()"
+        await self.frame.evaluate(js_script)