Move the code over from private repository (#3)

2024-03-01 10:09:30 -08:00
parent 32dd6d92a5
commit 9eddb3d812
93 changed files with 16798 additions and 0 deletions
--- a/skyvern/webeye/init.py
+++ b/skyvern/webeye/init.py
--- a/skyvern/webeye/actions/init.py
+++ b/skyvern/webeye/actions/init.py
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -0,0 +1,204 @@
+import abc
+from enum import StrEnum
+from typing import Any, Dict, List
+
+import structlog
+from pydantic import BaseModel
+
+from skyvern.forge.sdk.schemas.tasks import Task
+
+# Module-level structlog logger used by parse_actions below.
+LOG = structlog.get_logger()
+
+
+class ActionType(StrEnum):
+    """String-valued identifiers for every kind of action modelled in this module."""
+
+    CLICK = "click"
+    INPUT_TEXT = "input_text"
+    UPLOAD_FILE = "upload_file"
+    SELECT_OPTION = "select_option"
+    CHECKBOX = "checkbox"
+    WAIT = "wait"
+    NULL_ACTION = "null_action"
+    SOLVE_CAPTCHA = "solve_captcha"
+    TERMINATE = "terminate"
+    COMPLETE = "complete"
+    # Note: Remember to update ActionTypeUnion with new actions
+
+
+class Action(BaseModel):
+    """Base model for all actions: an action type plus optional description and reasoning text."""
+
+    action_type: ActionType
+    description: str | None = None
+    # parse_actions fills this from the "reasoning" key of the LLM response, when present.
+    reasoning: str | None = None
+
+
+class WebAction(Action, abc.ABC):
+    """Base for actions that target a specific element, identified by ``element_id``."""
+
+    # parse_actions fills this from the "id" key of the raw LLM action.
+    element_id: int
+
+
+class ClickAction(WebAction):
+    action_type: ActionType = ActionType.CLICK
+    file_url: str | None = None
+
+    def __repr__(self) -> str:
+        return f"ClickAction(element_id={self.element_id}, file_url={self.file_url})"
+
+
+class InputTextAction(WebAction):
+    action_type: ActionType = ActionType.INPUT_TEXT
+    text: str
+
+    def __repr__(self) -> str:
+        return f"InputTextAction(element_id={self.element_id}, text={self.text})"
+
+
+class UploadFileAction(WebAction):
+    action_type: ActionType = ActionType.UPLOAD_FILE
+    file_url: str
+    is_upload_file_tag: bool = True
+
+    def __repr__(self) -> str:
+        return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})"
+
+
+class NullAction(Action):
+    """Placeholder action; parse_actions emits it when the LLM response has no action type."""
+
+    action_type: ActionType = ActionType.NULL_ACTION
+
+
+class SolveCaptchaAction(Action):
+    """Action indicating that a captcha on the page needs to be solved."""
+
+    action_type: ActionType = ActionType.SOLVE_CAPTCHA
+
+
+class SelectOption(BaseModel):
+    label: str | None
+    value: str | None
+    index: int | None
+
+    def __repr__(self) -> str:
+        return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
+
+
+class SelectOptionAction(WebAction):
+    action_type: ActionType = ActionType.SELECT_OPTION
+    option: SelectOption
+
+    def __repr__(self) -> str:
+        return f"SelectOptionAction(element_id={self.element_id}, option={self.option})"
+
+
+###
+# This action causes more harm than it does good.
+# It frequently misbehaves, or gets stuck in click loops.
+# Treating checkbox actions as click actions seems to perform way more reliably.
+# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
+###
+class CheckboxAction(WebAction):
+    """Set the targeted checkbox to the state given by ``is_checked`` (see the caveats above)."""
+
+    action_type: ActionType = ActionType.CHECKBOX
+    is_checked: bool
+
+    def __repr__(self) -> str:
+        return f"CheckboxAction(element_id={self.element_id}, is_checked={self.is_checked})"
+
+
+class WaitAction(Action):
+    """Action asking the agent to wait instead of interacting with the page."""
+
+    action_type: ActionType = ActionType.WAIT
+
+
+class TerminateAction(Action):
+    """Action emitted when the agent decides to terminate the task."""
+
+    action_type: ActionType = ActionType.TERMINATE
+
+
+class CompleteAction(Action):
+    """Action emitted when the navigation goal has been achieved."""
+
+    action_type: ActionType = ActionType.COMPLETE
+    # parse_actions copies this from the task's data_extraction_goal.
+    data_extraction_goal: str | None = None
+
+
+def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
+    actions = []
+    for action in json_response:
+        element_id = action["id"]
+        reasoning = action["reasoning"] if "reasoning" in action else None
+        if "action_type" not in action or action["action_type"] is None:
+            actions.append(NullAction(reasoning=reasoning))
+            continue
+        # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
+        action_type = ActionType[action["action_type"].upper()]
+        if action_type == ActionType.TERMINATE:
+            LOG.warning(
+                "Agent decided to terminate",
+                task_id=task.task_id,
+                llm_response=json_response,
+                reasoning=reasoning,
+                actions=actions,
+            )
+            actions.append(TerminateAction(reasoning=reasoning))
+        elif action_type == ActionType.CLICK:
+            file_url = action["file_url"] if "file_url" in action else None
+            actions.append(ClickAction(element_id=element_id, reasoning=reasoning, file_url=file_url))
+        elif action_type == ActionType.INPUT_TEXT:
+            actions.append(InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning))
+        elif action_type == ActionType.UPLOAD_FILE:
+            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
+
+            actions.append(UploadFileAction(element_id=element_id, file_url=action["file_url"], reasoning=reasoning))
+        elif action_type == ActionType.SELECT_OPTION:
+            actions.append(
+                SelectOptionAction(
+                    element_id=element_id,
+                    option=SelectOption(
+                        label=action["option"]["label"],
+                        value=action["option"]["value"],
+                        index=action["option"]["index"],
+                    ),
+                    reasoning=reasoning,
+                )
+            )
+        elif action_type == ActionType.CHECKBOX:
+            actions.append(CheckboxAction(element_id=element_id, is_checked=action["is_checked"], reasoning=reasoning))
+        elif action_type == ActionType.WAIT:
+            actions.append(WaitAction(reasoning=reasoning))
+        elif action_type == ActionType.COMPLETE:
+            if actions:
+                LOG.info(
+                    "Navigation goal achieved, creating complete action and discarding all other actions except "
+                    "complete action",
+                    task_id=task.task_id,
+                    nav_goal=task.navigation_goal,
+                    actions=actions,
+                    llm_response=json_response,
+                )
+            return [CompleteAction(reasoning=reasoning, data_extraction_goal=task.data_extraction_goal)]
+        elif action_type == "null":
+            actions.append(NullAction(reasoning=reasoning))
+        elif action_type == ActionType.SOLVE_CAPTCHA:
+            actions.append(SolveCaptchaAction(reasoning=reasoning))
+        else:
+            LOG.error(
+                "Unsupported action type when parsing actions",
+                task_id=task.task_id,
+                action_type=action_type,
+                raw_action=action,
+            )
+    return actions
+
+
+class ScrapeResult(BaseModel):
+    """
+    Scraped response from a webpage, including:
+    1. JSON representation of what the user is seeing
+    """
+
+    # Either a single JSON object or a list of JSON objects.
+    scraped_data: dict[str, Any] | list[dict[str, Any]]
+
+
+# Union of every concrete action model defined above; keep it in sync with ActionType.
+# Background on deserializing child classes with pydantic:
+# https://blog.devgenius.io/deserialize-child-classes-with-pydantic-that-gonna-work-784230e1cf83
+ActionTypeUnion = (
+    ClickAction
+    | InputTextAction
+    | UploadFileAction
+    | SelectOptionAction
+    | CheckboxAction
+    | WaitAction
+    | NullAction
+    | SolveCaptchaAction
+    | TerminateAction
+    | CompleteAction
+)
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -0,0 +1,445 @@
+import asyncio
+import re
+from typing import Awaitable, Callable, List
+
+import structlog
+from playwright.async_api import Locator, Page
+
+from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
+from skyvern.forge import app
+from skyvern.forge.prompts import prompt_engine
+from skyvern.forge.sdk.api.files import download_file
+from skyvern.forge.sdk.models import Step
+from skyvern.forge.sdk.schemas.tasks import Task
+from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.webeye.actions import actions
+from skyvern.webeye.actions.actions import Action, ActionType, ClickAction, ScrapeResult, UploadFileAction, WebAction
+from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
+from skyvern.webeye.browser_factory import BrowserState
+from skyvern.webeye.scraper.scraper import ScrapedPage
+
+# Module-level structlog logger shared by the action handlers below.
+LOG = structlog.get_logger()
+
+
+class ActionHandler:
+    _handled_action_types: dict[
+        ActionType, Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]]
+    ] = {}
+
+    @classmethod
+    def register_action_type(
+        cls,
+        action_type: ActionType,
+        handler: Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]],
+    ) -> None:
+        cls._handled_action_types[action_type] = handler
+
+    @staticmethod
+    async def handle_action(
+        scraped_page: ScrapedPage,
+        task: Task,
+        step: Step,
+        browser_state: BrowserState,
+        action: Action,
+    ) -> list[ActionResult]:
+        LOG.info("Handling action", action=action)
+        page = await browser_state.get_or_create_page()
+        try:
+            if action.action_type in ActionHandler._handled_action_types:
+                handler = ActionHandler._handled_action_types[action.action_type]
+                return await handler(action, page, scraped_page, task, step)
+            else:
+                LOG.error("Unsupported action type in handler", action=action, type=type(action))
+                return [ActionFailure(Exception(f"Unsupported action type: {type(action)}"))]
+        except MissingElement as e:
+            LOG.info("Known exceptions", action=action, exception_type=type(e), exception_message=str(e))
+            return [ActionFailure(e)]
+        except MultipleElementsFound as e:
+            LOG.exception(
+                "Cannot handle multiple elements with the same xpath in one action.",
+                action=action,
+                exception=e,
+            )
+            return [ActionFailure(e)]
+        except Exception as e:
+            LOG.exception("Unhandled exception in action handler", action=action, exception=e)
+            return [ActionFailure(e)]
+
+
+async def handle_solve_captcha_action(
+    action: actions.SolveCaptchaAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    LOG.warning(
+        "Please solve the captcha on the page, you have 30 seconds",
+        action=action,
+    )
+    await asyncio.sleep(30)
+    return [ActionSuccess()]
+
+
+async def handle_click_action(
+    action: actions.ClickAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    await asyncio.sleep(0.3)
+    return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+
+
+async def handle_input_text_action(
+    action: actions.InputTextAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    locator = page.locator(f"xpath={xpath}")
+    await locator.clear()
+    await locator.fill(action.text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+
+    # This is a hack that gets dropdowns to select the "best" option based on what's typed
+    # Fixes situations like tsk_228671423990405776 where the location isn't being autocompleted
+    await locator.press("Tab", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+    input_value = await locator.input_value(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+    if not input_value:
+        LOG.info("Failed to input the text, trying to press sequentially with an enter click", action=action)
+        await locator.clear()
+        await locator.press_sequentially(action.text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        await locator.press("Enter", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        input_value = await locator.input_value(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        LOG.info("Input value", input_value=input_value, action=action)
+
+    return [ActionSuccess()]
+
+
+async def handle_upload_file_action(
+    action: actions.UploadFileAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    if not action.file_url:
+        LOG.warning("InputFileAction has no file_url", action=action)
+        return [ActionFailure(MissingFileUrl())]
+    if action.file_url not in str(task.navigation_payload):
+        LOG.warning(
+            "LLM might be imagining the file url, which is not in navigation payload",
+            action=action,
+            file_url=action.file_url,
+        )
+        return [ActionFailure(ImaginaryFileUrl(action.file_url))]
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    file_path = download_file(action.file_url)
+    locator = page.locator(f"xpath={xpath}")
+    is_file_input = await is_file_input_element(locator)
+    if is_file_input:
+        LOG.info("Taking UploadFileAction. Found file input tag", action=action)
+        if file_path:
+            await page.locator(f"xpath={xpath}").set_input_files(
+                file_path, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+            )
+
+            # Sleep for 10 seconds after uploading a file to let the page process it
+            await asyncio.sleep(10)
+            return [ActionSuccess()]
+        else:
+            return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]
+    else:
+        LOG.info("Taking UploadFileAction. Found non file input tag", action=action)
+        # treat it as a click action
+        action.is_upload_file_tag = False
+        return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+
+
+async def handle_null_action(
+    action: actions.NullAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """No-op handler: does not touch the page and always reports success."""
+    return [ActionSuccess()]
+
+
+async def handle_select_option_action(
+    action: actions.SelectOptionAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+
+    try:
+        # First click by label (if it matches)
+        await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        await page.select_option(
+            xpath,
+            label=action.option.label,
+            timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
+        )
+        await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        return [ActionSuccess()]
+    except Exception as e:
+        if action.option.index is not None:
+            LOG.warning(
+                "Failed to click on the option by label, trying by index",
+                exc_info=e,
+                action=action,
+                xpath=xpath,
+            )
+        else:
+            return [ActionFailure(e)]
+
+    try:
+        option_xpath = scraped_page.id_to_xpath_dict[action.option.index]
+        match = re.search(r"option\[(\d+)]$", option_xpath)
+        if match:
+            # This means we were trying to select an option xpath, click the option
+            option_index = int(match.group(1))
+            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+            await page.select_option(
+                xpath,
+                index=option_index,
+                timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
+            )
+            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+            return [ActionSuccess()]
+        else:
+            # This means the supplied index was for the select element, not a reference to the xpath dict
+            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+            await page.select_option(
+                xpath,
+                index=action.option.index,
+                timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
+            )
+            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+        return [ActionSuccess()]
+    except Exception as e:
+        LOG.warning("Failed to click on the option by index", exception=e, action=action)
+        return [ActionFailure(e)]
+
+
+async def handle_checkbox_action(
+    self: actions.CheckboxAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """
+    ******* NOT REGISTERED *******
+    This action causes more harm than it does good.
+    It frequently mis-behaves, or gets stuck in click loops.
+    Treating checkbox actions as click actions seem to perform way more reliably
+    Developers who tried this and failed: 2 (Suchintan and Shu 😂)
+    """
+    xpath = await validate_actions_in_dom(self, page, scraped_page)
+    if self.is_checked:
+        await page.check(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+    else:
+        await page.uncheck(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+
+    # TODO (suchintan): Why does checking the label work, but not the actual input element?
+    return [ActionSuccess()]
+
+
+async def handle_wait_action(
+    action: actions.WaitAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Sleep for 10 seconds, then report the wait as an ActionFailure."""
+    await asyncio.sleep(10)
+    # Deliberate: a wait is always reported as a failure rather than a success.
+    return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
+
+
+async def handle_terminate_action(
+    action: actions.TerminateAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Handler for terminate actions: performs no page interaction and reports success."""
+    return [ActionSuccess()]
+
+
+async def handle_complete_action(
+    action: actions.CompleteAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    extracted_data = None
+    if action.data_extraction_goal:
+        scrape_action_result = await extract_information_for_navigation_goal(
+            scraped_page=scraped_page,
+            task=task,
+            step=step,
+        )
+        extracted_data = scrape_action_result.scraped_data
+    return [ActionSuccess(data=extracted_data)]
+
+
+ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
+ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
+ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
+ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
+ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
+ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action)
+ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
+ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
+ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
+
+
+async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
+    xpath = scraped_page.id_to_xpath_dict[action.element_id]
+    locator = page.locator(xpath)
+
+    num_elements = await locator.count()
+    if num_elements < 1:
+        LOG.warning("No elements found with action xpath. Validation failed.", action=action, xpath=xpath)
+        raise MissingElement(xpath=xpath, element_id=action.element_id)
+    elif num_elements > 1:
+        LOG.warning(
+            "Multiple elements found with action xpath. Expected 1. Validation failed.",
+            action=action,
+            num_elements=num_elements,
+        )
+        raise MultipleElementsFound(num=num_elements, xpath=xpath, element_id=action.element_id)
+    else:
+        LOG.info("Validated action xpath in DOM", action=action)
+
+    return xpath
+
+
+async def chain_click(
+    page: Page,
+    action: ClickAction | UploadFileAction,
+    xpath: str,
+    timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
+) -> List[ActionResult]:
+    # Add a defensive page handler here in case a click action opens a file chooser.
+    # This automatically dismisses the dialog
+    # File choosers are impossible to close if you don't expect one. Instead of dealing with it, close it!
+
+    # TODO (suchintan): This should likely result in an ActionFailure -- we can figure out how to do this later!
+    LOG.info("Chain click starts", action=action, xpath=xpath)
+    file: list[str] | str = []
+    if action.file_url:
+        file = download_file(action.file_url) or []
+
+    fc_func = lambda fc: fc.set_files(files=file)
+    page.on("filechooser", fc_func)
+
+    LOG.info("Registered file chooser listener", action=action, path=file)
+    """
+    Clicks on an element identified by the xpath and its parent if failed.
+    :param xpath: xpath of the element to click
+    """
+    javascript_triggered = await is_javascript_triggered(page, xpath)
+    try:
+        await page.click(f"xpath={xpath}", timeout=timeout)
+        LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
+        return [ActionSuccess(javascript_triggered=javascript_triggered)]
+    except Exception as e:
+        action_results: list[ActionResult] = [ActionFailure(e, javascript_triggered=javascript_triggered)]
+        if await is_input_element(page.locator(xpath)):
+            LOG.info("Chain click: it's an input element. going to try sibling click", action=action, xpath=xpath)
+            sibling_action_result = await click_sibling_of_input(page.locator(xpath), timeout=timeout)
+            action_results.append(sibling_action_result)
+            if type(sibling_action_result) == ActionSuccess:
+                return action_results
+
+        parent_xpath = f"{xpath}/.."
+        try:
+            parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
+            javascript_triggered = javascript_triggered or parent_javascript_triggered
+            parent_locator = page.locator(xpath).locator("..")
+            await parent_locator.click(timeout=timeout)
+            LOG.info("Chain click: successfully clicked parent element", action=action, parent_xpath=parent_xpath)
+            action_results.append(
+                ActionSuccess(
+                    javascript_triggered=javascript_triggered,
+                    interacted_with_parent=True,
+                )
+            )
+        except Exception as pe:
+            LOG.warning("Failed to click parent element", action=action, parent_xpath=parent_xpath, exc_info=True)
+            action_results.append(
+                ActionFailure(pe, javascript_triggered=javascript_triggered, interacted_with_parent=True)
+            )
+            # We don't raise exception here because we do log the exception, and return ActionFailure as the last action
+
+        return action_results
+    finally:
+        LOG.info("Remove file chooser listener", action=action)
+
+        # Sleep for 10 seconds after uploading a file to let the page process it
+        # Removing this breaks file uploads using the filechooser
+        # KEREM DO NOT REMOVE
+        if file:
+            await asyncio.sleep(10)
+        page.remove_listener("filechooser", fc_func)
+
+
+async def is_javascript_triggered(page: Page, xpath: str) -> bool:
+    locator = page.locator(f"xpath={xpath}")
+    element = locator.first
+    tag_name = await element.evaluate("e => e.tagName")
+    if tag_name.lower() == "a":
+        href = await element.evaluate("e => e.href")
+        if href.lower().startswith("javascript:"):
+            LOG.info("Found javascript call in anchor tag, marking step as completed. Dropping remaining actions")
+            return True
+    return False
+
+
+async def is_file_input_element(locator: Locator) -> bool:
+    element = locator.first
+    if element:
+        tag_name = await element.evaluate("el => el.tagName")
+        type_name = await element.evaluate("el => el.type")
+        return tag_name.lower() == "input" and type_name == "file"
+    return False
+
+
+async def is_input_element(locator: Locator) -> bool:
+    element = locator.first
+    if element:
+        tag_name = await element.evaluate("el => el.tagName")
+        return tag_name.lower() == "input"
+    return False
+
+
+async def click_sibling_of_input(
+    locator: Locator,
+    timeout: int,
+    javascript_triggered: bool = False,
+) -> ActionResult:
+    try:
+        input_element = locator.first
+        parent_locator = locator.locator("..")
+        if input_element:
+            input_id = await input_element.get_attribute("id")
+            sibling_label_xpath = f'//label[@for="{input_id}"]'
+            label_locator = parent_locator.locator(sibling_label_xpath)
+            await label_locator.click(timeout=timeout)
+            LOG.info(
+                "Successfully clicked sibling label of input element",
+                sibling_label_xpath=sibling_label_xpath,
+            )
+            return ActionSuccess(javascript_triggered=javascript_triggered, interacted_with_sibling=True)
+        # Should never get here
+        return ActionFailure(
+            exception=Exception("Failed while trying to click sibling of input element"),
+            javascript_triggered=javascript_triggered,
+            interacted_with_sibling=True,
+        )
+    except Exception as e:
+        LOG.warning("Failed to click sibling label of input element", exc_info=e)
+        return ActionFailure(exception=e, javascript_triggered=javascript_triggered)
+
+
+async def extract_information_for_navigation_goal(
+    task: Task,
+    step: Step,
+    scraped_page: ScrapedPage,
+) -> ScrapeResult:
+    """
+    Scrapes a webpage and returns the scraped response, including:
+    1. JSON representation of what the user is seeing
+    2. The scraped page
+    """
+    prompt_template = "extract-information"
+
+    extract_information_prompt = prompt_engine.load_prompt(
+        prompt_template,
+        navigation_goal=task.navigation_goal,
+        elements=scraped_page.element_tree,
+        data_extraction_goal=task.data_extraction_goal,
+        extracted_information_schema=task.extracted_information_schema,
+        current_url=scraped_page.url,
+        extracted_text=scraped_page.extracted_text,
+    )
+
+    json_response = await app.OPENAI_CLIENT.chat_completion(
+        step=step,
+        prompt=extract_information_prompt,
+        screenshots=scraped_page.screenshots,
+    )
+
+    return ScrapeResult(
+        scraped_data=json_response,
+    )
--- a/skyvern/webeye/actions/models.py
+++ b/skyvern/webeye/actions/models.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel
+
+from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.webeye.actions.actions import Action, ActionTypeUnion
+from skyvern.webeye.actions.responses import ActionResult
+from skyvern.webeye.scraper.scraper import ScrapedPage
+
+
+class AgentStepOutput(BaseModel):
+    """
+    Output of the agent step, this is recorded in the database.
+    """
+
+    # Will be deprecated once we move to the new format below
+    action_results: list[ActionResult] | None = None
+    # Nullable for backwards compatibility, once backfill is done, this won't be nullable anymore
+    # Each entry pairs one action with the list of results produced for it.
+    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None = None
+
+    def __repr__(self) -> str:
+        # Render the full model dump so log lines show every field.
+        return f"AgentStepOutput({self.model_dump()})"
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+
+class DetailedAgentStepOutput(BaseModel):
+    """
+    Output of the agent step, this is not recorded in the database, only used for debugging in the Jupyter notebook.
+    """
+
+    scraped_page: ScrapedPage | None
+    extract_action_prompt: str | None
+    llm_response: dict[str, Any] | None
+    actions: list[Action] | None
+    action_results: list[ActionResult] | None
+    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None
+
+    class Config:
+        exclude = ["scraped_page", "extract_action_prompt"]
+
+    def __repr__(self) -> str:
+        if SettingsManager.get_settings().DEBUG_MODE:
+            return f"DetailedAgentStepOutput({self.model_dump()})"
+        else:
+            return f"AgentStepOutput({self.to_agent_step_output().model_dump()})"
+
+    def __str__(self) -> str:
+        return self.__repr__()
+
+    def to_agent_step_output(self) -> AgentStepOutput:
+        return AgentStepOutput(
+            action_results=self.action_results if self.action_results else [],
+            actions_and_results=self.actions_and_results if self.actions_and_results else [],
+        )
--- a/skyvern/webeye/actions/responses.py
+++ b/skyvern/webeye/actions/responses.py
@@ -0,0 +1,62 @@
+from typing import Any
+
+from pydantic import BaseModel
+
+from skyvern.webeye.string_util import remove_whitespace
+
+
+class ActionResult(BaseModel):
+    success: bool
+    exception_type: str | None = None
+    exception_message: str | None = None
+    data: dict[str, Any] | list | str | None = None
+    step_retry_number: int | None = None
+    step_order: int | None = None
+    javascript_triggered: bool = False
+    # None is used for old data so that we can differentiate between old and new data which only has boolean
+    interacted_with_sibling: bool | None = None
+    interacted_with_parent: bool | None = None
+
+    def __str__(self) -> str:
+        return (
+            f"ActionResult(success={self.success}, exception_type={self.exception_type}, "
+            f"exception_message={self.exception_message}), data={self.data}"
+        )
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+class ActionSuccess(ActionResult):
+    """ActionResult with ``success=True``; all arguments are optional."""
+
+    def __init__(
+        self,
+        data: dict[str, Any] | list | str | None = None,
+        javascript_triggered: bool = False,
+        interacted_with_sibling: bool = False,
+        interacted_with_parent: bool = False,
+    ):
+        # The interaction flags default to False here (not None as on the base class).
+        super().__init__(
+            success=True,
+            data=data,
+            javascript_triggered=javascript_triggered,
+            interacted_with_sibling=interacted_with_sibling,
+            interacted_with_parent=interacted_with_parent,
+        )
+
+
+class ActionFailure(ActionResult):
+    """ActionResult with ``success=False`` built from the exception that caused the failure."""
+
+    def __init__(
+        self,
+        exception: Exception,
+        javascript_triggered: bool = False,
+        interacted_with_sibling: bool = False,
+        interacted_with_parent: bool = False,
+    ):
+        # Only the exception's class name and its message (passed through remove_whitespace) are kept;
+        # the exception object itself is not stored.
+        super().__init__(
+            success=False,
+            exception_type=type(exception).__name__,
+            exception_message=remove_whitespace(str(exception)),
+            javascript_triggered=javascript_triggered,
+            interacted_with_sibling=interacted_with_sibling,
+            interacted_with_parent=interacted_with_parent,
+        )
--- a/skyvern/webeye/browser_factory.py
+++ b/skyvern/webeye/browser_factory.py
@@ -0,0 +1,167 @@
+from __future__ import annotations
+
+import uuid
+from datetime import datetime
+from typing import Any, Awaitable, Protocol
+
+import structlog
+from playwright.async_api import BrowserContext, Error, Page, Playwright, async_playwright
+from pydantic import BaseModel
+
+from skyvern.exceptions import FailedToNavigateToUrl, UnknownBrowserType, UnknownErrorWhileCreatingBrowserContext
+from skyvern.forge.sdk.core.skyvern_context import current
+from skyvern.forge.sdk.settings_manager import SettingsManager
+
+LOG = structlog.get_logger()
+
+
+class BrowserContextCreator(Protocol):
+    """Structural type for the callables stored in ``BrowserContextFactory``.
+
+    A creator receives the running ``Playwright`` object plus arbitrary keyword
+    arguments and returns an awaitable that resolves to the new browser context
+    paired with its ``BrowserArtifacts``.
+    """
+
+    def __call__(
+        self, playwright: Playwright, **kwargs: Any
+    ) -> Awaitable[tuple[BrowserContext, BrowserArtifacts]]:
+        ...
+
+
+class BrowserContextFactory:
+    _creators: dict[str, BrowserContextCreator] = {}
+
+    @staticmethod
+    def get_subdir() -> str:
+        curr_context = current()
+        if curr_context and curr_context.task_id:
+            return curr_context.task_id
+        elif curr_context and curr_context.request_id:
+            return curr_context.request_id
+        return str(uuid.uuid4())
+
+    @staticmethod
+    def build_browser_args() -> dict[str, Any]:
+        video_dir = f"{SettingsManager.get_settings().VIDEO_PATH}/{datetime.utcnow().strftime('%Y-%m-%d')}"
+        har_dir = f"{SettingsManager.get_settings().HAR_PATH}/{datetime.utcnow().strftime('%Y-%m-%d')}/{BrowserContextFactory.get_subdir()}.har"
+        return {
+            "record_har_path": har_dir,
+            "record_video_dir": video_dir,
+            "viewport": {"width": 1920, "height": 1080},
+        }
+
+    @staticmethod
+    def build_browser_artifacts(
+        video_path: str | None = None, har_path: str | None = None, video_artifact_id: str | None = None
+    ) -> BrowserArtifacts:
+        return BrowserArtifacts(video_path=video_path, har_path=har_path, video_artifact_id=video_artifact_id)
+
+    @classmethod
+    def register_type(cls, browser_type: str, creator: BrowserContextCreator) -> None:
+        cls._creators[browser_type] = creator
+
+    @classmethod
+    async def create_browser_context(
+        cls, playwright: Playwright, **kwargs: Any
+    ) -> tuple[BrowserContext, BrowserArtifacts]:
+        browser_type = SettingsManager.get_settings().BROWSER_TYPE
+        try:
+            creator = cls._creators.get(browser_type)
+            if not creator:
+                raise UnknownBrowserType(browser_type)
+            return await creator(playwright, **kwargs)
+        except UnknownBrowserType as e:
+            raise e
+        except Exception as e:
+            raise UnknownErrorWhileCreatingBrowserContext(browser_type, e) from e
+
+
+class BrowserArtifacts(BaseModel):
+    # Recording outputs associated with one browser context. Every field is
+    # optional and starts as None until a value is assigned.
+
+    # Path of the recorded video file.
+    video_path: str | None = None
+    # Identifier of the stored video artifact.
+    video_artifact_id: str | None = None
+    # Path of the HAR recording.
+    har_path: str | None = None
+
+
+async def _create_headless_chromium(playwright: Playwright, **kwargs: dict) -> tuple[BrowserContext, BrowserArtifacts]:
+    browser = await playwright.chromium.launch(headless=True)
+    browser_args = BrowserContextFactory.build_browser_args()
+    browser_artifacts = BrowserContextFactory.build_browser_artifacts(har_path=browser_args["record_har_path"])
+    browser_context = await browser.new_context(**browser_args)
+    return browser_context, browser_artifacts
+
+
+async def _create_headful_chromium(playwright: Playwright, **kwargs: dict) -> tuple[BrowserContext, BrowserArtifacts]:
+    browser = await playwright.chromium.launch(headless=False)
+    browser_args = BrowserContextFactory.build_browser_args()
+    browser_artifacts = BrowserContextFactory.build_browser_artifacts(har_path=browser_args["record_har_path"])
+    browser_context = await browser.new_context(**browser_args)
+    return browser_context, browser_artifacts
+
+
+BrowserContextFactory.register_type("chromium-headless", _create_headless_chromium)
+BrowserContextFactory.register_type("chromium-headful", _create_headful_chromium)
+
+
+class BrowserState:
+    instance = None
+
+    def __init__(
+        self,
+        pw: Playwright | None = None,
+        browser_context: BrowserContext | None = None,
+        page: Page | None = None,
+        browser_artifacts: BrowserArtifacts = BrowserArtifacts(),
+    ):
+        self.pw = pw
+        self.browser_context = browser_context
+        self.page = page
+        self.browser_artifacts = browser_artifacts
+
+    async def _close_all_other_pages(self) -> None:
+        if not self.browser_context or not self.page:
+            return
+        pages = self.browser_context.pages
+        for page in pages:
+            if page != self.page:
+                await page.close()
+
+    async def check_and_fix_state(self, url: str | None = None) -> None:
+        if self.pw is None:
+            LOG.info("Starting playwright")
+            self.pw = await async_playwright().start()
+            LOG.info("playwright is started")
+        if self.browser_context is None:
+            LOG.info("creating browser context")
+            browser_context, browser_artifacts = await BrowserContextFactory.create_browser_context(self.pw, url=url)
+            self.browser_context = browser_context
+            self.browser_artifacts = browser_artifacts
+            LOG.info("browser context is created")
+
+        assert self.browser_context is not None
+
+        if self.page is None:
+            LOG.info("Creating a new page")
+            self.page = await self.browser_context.new_page()
+            await self._close_all_other_pages()
+            LOG.info("A new page is created")
+            if url:
+                LOG.info(f"Navigating page to {url} and waiting for 5 seconds")
+                try:
+                    await self.page.goto(url)
+                except Error as playright_error:
+                    LOG.exception(f"Error while navigating to url: {str(playright_error)}", exc_info=True)
+                    raise FailedToNavigateToUrl(url=url, error_message=str(playright_error))
+                LOG.info(f"Successfully went to {url}")
+
+        if self.browser_artifacts.video_path is None:
+            self.browser_artifacts.video_path = await self.page.video.path()
+
+    async def get_or_create_page(self, url: str | None = None) -> Page:
+        await self.check_and_fix_state(url)
+        assert self.page is not None
+        return self.page
+
+    async def close(self, close_browser_on_completion: bool = True) -> None:
+        LOG.info("Closing browser state")
+        if self.browser_context and close_browser_on_completion:
+            LOG.info("Closing browser context and its pages")
+            await self.browser_context.close()
+            LOG.info("Main browser context and all its pages are closed")
+        if self.pw and close_browser_on_completion:
+            LOG.info("Stopping playwright")
+            await self.pw.stop()
+            LOG.info("Playwright is stopped")
--- a/skyvern/webeye/browser_manager.py
+++ b/skyvern/webeye/browser_manager.py
@@ -0,0 +1,152 @@
+from __future__ import annotations
+
+import structlog
+from playwright.async_api import Browser, Playwright, async_playwright
+
+from skyvern.exceptions import MissingBrowserState
+from skyvern.forge.sdk.schemas.tasks import ProxyLocation, Task
+from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.forge.sdk.workflow.models.workflow import WorkflowRun
+from skyvern.webeye.browser_factory import BrowserContextFactory, BrowserState
+
+LOG = structlog.get_logger()
+
+
+class BrowserManager:
+    instance = None
+    pages: dict[str, BrowserState] = dict()
+
+    def __new__(cls) -> BrowserManager:
+        if cls.instance is None:
+            cls.instance = super().__new__(cls)
+        return cls.instance
+
+    @staticmethod
+    async def _create_browser_state(
+        proxy_location: ProxyLocation | None = None, url: str | None = None
+    ) -> BrowserState:
+        pw = await async_playwright().start()
+        browser_context, browser_artifacts = await BrowserContextFactory.create_browser_context(
+            pw, proxy_location=proxy_location, url=url
+        )
+        return BrowserState(pw=pw, browser_context=browser_context, page=None, browser_artifacts=browser_artifacts)
+
+    async def get_or_create_for_task(self, task: Task) -> BrowserState:
+        if task.task_id in self.pages:
+            return self.pages[task.task_id]
+        elif task.workflow_run_id in self.pages:
+            LOG.info(
+                "Browser state for task not found. Using browser state for workflow run",
+                task_id=task.task_id,
+                workflow_run_id=task.workflow_run_id,
+            )
+            self.pages[task.task_id] = self.pages[task.workflow_run_id]
+            return self.pages[task.task_id]
+        LOG.info("Creating browser state for task", task_id=task.task_id)
+        browser_state = await self._create_browser_state(task.proxy_location, task.url)
+
+        # The URL here is only used when creating a new page, and not when using an existing page.
+        # This will make sure browser_state.page is not None.
+        await browser_state.get_or_create_page(task.url)
+
+        self.pages[task.task_id] = browser_state
+        return browser_state
+
+    async def get_or_create_for_workflow_run(self, workflow_run: WorkflowRun, url: str | None = None) -> BrowserState:
+        if workflow_run.workflow_run_id in self.pages:
+            return self.pages[workflow_run.workflow_run_id]
+        LOG.info("Creating browser state for workflow run", workflow_run_id=workflow_run.workflow_run_id)
+        browser_state = await self._create_browser_state(workflow_run.proxy_location, url=url)
+
+        # The URL here is only used when creating a new page, and not when using an existing page.
+        # This will make sure browser_state.page is not None.
+        await browser_state.get_or_create_page(url)
+
+        self.pages[workflow_run.workflow_run_id] = browser_state
+        return browser_state
+
+    def set_video_artifact_for_task(self, task: Task, artifact_id: str) -> None:
+        if task.workflow_run_id and task.workflow_run_id in self.pages:
+            if self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id:
+                LOG.warning(
+                    "Video artifact is already set for workflow run. Overwriting",
+                    workflow_run_id=task.workflow_run_id,
+                    old_artifact_id=self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id,
+                    new_artifact_id=artifact_id,
+                )
+            self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id = artifact_id
+            return
+        if task.task_id in self.pages:
+            if self.pages[task.task_id].browser_artifacts.video_artifact_id:
+                LOG.warning(
+                    "Video artifact is already set for task. Overwriting",
+                    task_id=task.task_id,
+                    old_artifact_id=self.pages[task.task_id].browser_artifacts.video_artifact_id,
+                    new_artifact_id=artifact_id,
+                )
+            self.pages[task.task_id].browser_artifacts.video_artifact_id = artifact_id
+            return
+
+        raise MissingBrowserState(task_id=task.task_id)
+
+    async def get_video_data(
+        self, browser_state: BrowserState, task_id: str = "", workflow_id: str = "", workflow_run_id: str = ""
+    ) -> bytes:
+        if browser_state:
+            path = browser_state.browser_artifacts.video_path
+            if path:
+                with open(path, "rb") as f:
+                    return f.read()
+        LOG.warning(
+            "Video data not found for task", task_id=task_id, workflow_id=workflow_id, workflow_run_id=workflow_run_id
+        )
+        return b""
+
+    async def get_har_data(
+        self, browser_state: BrowserState, task_id: str = "", workflow_id: str = "", workflow_run_id: str = ""
+    ) -> bytes:
+        if browser_state:
+            path = browser_state.browser_artifacts.har_path
+            if path:
+                with open(path, "rb") as f:
+                    return f.read()
+        LOG.warning(
+            "HAR data not found for task", task_id=task_id, workflow_id=workflow_id, workflow_run_id=workflow_run_id
+        )
+        return b""
+
+    @classmethod
+    async def connect_to_scraping_browser(cls, pw: Playwright) -> Browser:
+        if not SettingsManager.get_settings().REMOTE_BROWSER_KEY:
+            raise Exception("REMOTE_BROWSER_KEY is empty. Cannot connect to remote browser.")
+        browser = await pw.chromium.connect_over_cdp(SettingsManager.get_settings().REMOTE_BROWSER_KEY)
+        LOG.info("Connected to remote browser", browser_type=SettingsManager.get_settings().BROWSER_TYPE)
+        return browser
+
+    @classmethod
+    async def close(cls) -> None:
+        LOG.info("Closing BrowserManager")
+        for browser_state in cls.pages.values():
+            await browser_state.close()
+        cls.pages = dict()
+        LOG.info("BrowserManger is closed")
+
+    async def cleanup_for_task(self, task_id: str, close_browser_on_completion: bool = True) -> BrowserState | None:
+        LOG.info("Cleaning up for task")
+        browser_state_to_close = self.pages.pop(task_id, None)
+        if browser_state_to_close:
+            await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
+        LOG.info("Task is cleaned up")
+
+        return browser_state_to_close
+
+    async def cleanup_for_workflow_run(
+        self, workflow_run_id: str, close_browser_on_completion: bool = True
+    ) -> BrowserState | None:
+        LOG.info("Cleaning up for workflow run")
+        browser_state_to_close = self.pages.pop(workflow_run_id, None)
+        if browser_state_to_close:
+            await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
+        LOG.info("Workflow run is cleaned up")
+
+        return browser_state_to_close
--- a/skyvern/webeye/scraper/init.py
+++ b/skyvern/webeye/scraper/init.py
--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -0,0 +1,806 @@
+// Commands for manipulating rects.
+class Rect {
+  // Create a rect given the top left and bottom right corners.
+  static create(x1, y1, x2, y2) {
+    return {
+      bottom: y2,
+      top: y1,
+      left: x1,
+      right: x2,
+      width: x2 - x1,
+      height: y2 - y1,
+    };
+  }
+
+  static copy(rect) {
+    return {
+      bottom: rect.bottom,
+      top: rect.top,
+      left: rect.left,
+      right: rect.right,
+      width: rect.width,
+      height: rect.height,
+    };
+  }
+
+  // Translate a rect by x horizontally and y vertically.
+  static translate(rect, x, y) {
+    if (x == null) x = 0;
+    if (y == null) y = 0;
+    return {
+      bottom: rect.bottom + y,
+      top: rect.top + y,
+      left: rect.left + x,
+      right: rect.right + x,
+      width: rect.width,
+      height: rect.height,
+    };
+  }
+
+  // Determine whether two rects overlap.
+  static intersects(rect1, rect2) {
+    return (
+      rect1.right > rect2.left &&
+      rect1.left < rect2.right &&
+      rect1.bottom > rect2.top &&
+      rect1.top < rect2.bottom
+    );
+  }
+
+  static equals(rect1, rect2) {
+    for (const property of [
+      "top",
+      "bottom",
+      "left",
+      "right",
+      "width",
+      "height",
+    ]) {
+      if (rect1[property] !== rect2[property]) return false;
+    }
+    return true;
+  }
+}
+
+// Geometry helpers that work out which part of an element is on screen.
+class DomUtils {
+  //
+  // Clamps the rect's left and top edges to the viewport origin (0, 0). Returns null when the
+  // clamped rect starts within 4px of, or beyond, the bottom or right edge of the viewport.
+  // Note: minimum-size filtering (width/height < 3) is NOT done here; callers check it.
+  //
+  static cropRectToVisible(rect) {
+    const boundedRect = Rect.create(
+      Math.max(rect.left, 0),
+      Math.max(rect.top, 0),
+      rect.right,
+      rect.bottom,
+    );
+    if (
+      boundedRect.top >= window.innerHeight - 4 ||
+      boundedRect.left >= window.innerWidth - 4
+    ) {
+      return null;
+    } else {
+      return boundedRect;
+    }
+  }
+
+  // Returns the first client rect of `element` that is at least 3x3 after cropping and whose
+  // computed visibility is "visible", or null when there is none. With `testChildren` set,
+  // a zero-sized rect is resolved by looking at the element's children instead.
+  static getVisibleClientRect(element, testChildren) {
+    // Note: this call will be expensive if we modify the DOM in between calls.
+    let clientRect;
+    if (testChildren == null) testChildren = false;
+    // Copy the live client rects into plain objects.
+    const clientRects = (() => {
+      const result = [];
+      for (clientRect of element.getClientRects()) {
+        result.push(Rect.copy(clientRect));
+      }
+      return result;
+    })();
+
+    // Inline elements with font-size: 0px; will declare a height of zero, even if a child with
+    // non-zero font-size contains text.
+    let isInlineZeroHeight = function () {
+      const elementComputedStyle = window.getComputedStyle(element, null);
+      const isInlineZeroFontSize =
+        0 ===
+          elementComputedStyle.getPropertyValue("display").indexOf("inline") &&
+        elementComputedStyle.getPropertyValue("font-size") === "0px";
+      // Override the function to return this value for the rest of this context.
+      isInlineZeroHeight = () => isInlineZeroFontSize;
+      return isInlineZeroFontSize;
+    };
+
+    for (clientRect of clientRects) {
+      // If the link has zero dimensions, it may be wrapping visible but floated elements. Check for
+      // this.
+      let computedStyle;
+      if ((clientRect.width === 0 || clientRect.height === 0) && testChildren) {
+        for (const child of Array.from(element.children)) {
+          computedStyle = window.getComputedStyle(child, null);
+          // Ignore child elements which are not floated and not absolutely positioned for parent
+          // elements with zero width/height, as long as the case described at isInlineZeroHeight
+          // does not apply.
+          // NOTE(mrmr1993): This ignores floated/absolutely positioned descendants nested within
+          // inline children.
+          const position = computedStyle.getPropertyValue("position");
+          if (
+            computedStyle.getPropertyValue("float") === "none" &&
+            !["absolute", "fixed"].includes(position) &&
+            !(
+              clientRect.height === 0 &&
+              isInlineZeroHeight() &&
+              0 === computedStyle.getPropertyValue("display").indexOf("inline")
+            )
+          ) {
+            continue;
+          }
+          // Recurse into the child; the first child rect of at least 3x3 wins.
+          const childClientRect = this.getVisibleClientRect(child, true);
+          if (
+            childClientRect === null ||
+            childClientRect.width < 3 ||
+            childClientRect.height < 3
+          )
+            continue;
+          return childClientRect;
+        }
+      } else {
+        clientRect = this.cropRectToVisible(clientRect);
+
+        // Skip rects that are off screen or smaller than 3px in either dimension.
+        if (
+          clientRect === null ||
+          clientRect.width < 3 ||
+          clientRect.height < 3
+        )
+          continue;
+
+        // eliminate invisible elements (see test_harnesses/visibility_test.html)
+        computedStyle = window.getComputedStyle(element, null);
+        if (computedStyle.getPropertyValue("visibility") !== "visible")
+          continue;
+
+        return clientRect;
+      }
+    }
+
+    return null;
+  }
+
+  // Returns { top, left } computed from the root element's bounding rect, corrected either by
+  // its margins or by its clientTop/clientLeft depending on its position and containment.
+  static getViewportTopLeft() {
+    const box = document.documentElement;
+    const style = getComputedStyle(box);
+    const rect = box.getBoundingClientRect();
+    if (
+      style.position === "static" &&
+      !/content|paint|strict/.test(style.contain || "")
+    ) {
+      // The margin is included in the client rect, so we need to subtract it back out.
+      const marginTop = parseInt(style.marginTop);
+      const marginLeft = parseInt(style.marginLeft);
+      return {
+        top: -rect.top + marginTop,
+        left: -rect.left + marginLeft,
+      };
+    } else {
+      const { clientTop, clientLeft } = box;
+      return {
+        top: -rect.top - clientTop,
+        left: -rect.left - clientLeft,
+      };
+    }
+  }
+}
+
+// from playwright
+function getElementComputedStyle(element, pseudo) {
+  return element.ownerDocument && element.ownerDocument.defaultView
+    ? element.ownerDocument.defaultView.getComputedStyle(element, pseudo)
+    : undefined;
+}
+
+// from playwright
+function isElementStyleVisibilityVisible(element, style) {
+  style = style ?? getElementComputedStyle(element);
+  if (!style) return true;
+  if (
+    !element.checkVisibility({ checkOpacity: false, checkVisibilityCSS: false })
+  )
+    return false;
+  if (style.visibility !== "visible") return false;
+  return true;
+}
+
+// from playwright
+function isElementVisible(element) {
+  // TODO: This is a hack to not check visibility for option elements
+  // because they are not visible by default. We check their parent instead for visibility.
+  if (element.tagName.toLowerCase() === "option")
+    return element.parentElement && isElementVisible(element.parentElement);
+
+  const style = getElementComputedStyle(element);
+  if (!style) return true;
+  if (style.display === "contents") {
+    // display:contents is not rendered itself, but its child nodes are.
+    for (let child = element.firstChild; child; child = child.nextSibling) {
+      if (
+        child.nodeType === 1 /* Node.ELEMENT_NODE */ &&
+        isElementVisible(child)
+      )
+        return true;
+      // skipping other nodes including text
+    }
+    return false;
+  }
+  if (!isElementStyleVisibilityVisible(element, style)) return false;
+  const rect = element.getBoundingClientRect();
+  return rect.width > 0 && rect.height > 0;
+}
+
+function isHiddenOrDisabled(element) {
+  const style = getElementComputedStyle(element);
+  return style?.display === "none" || element.hidden || element.disabled;
+}
+
+function isScriptOrStyle(element) {
+  const tagName = element.tagName.toLowerCase();
+  return tagName === "script" || tagName === "style";
+}
+
+function hasWidgetRole(element) {
+  const role = element.getAttribute("role");
+  if (!role) {
+    return false;
+  }
+  // https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles#2._widget_roles
+  // Not all roles make sense for the time being so we only check for the ones that do
+  const widgetRoles = [
+    "button",
+    "link",
+    "checkbox",
+    "menuitem",
+    "menuitemcheckbox",
+    "menuitemradio",
+    "radio",
+    "tab",
+    "combobox",
+    "textbox",
+    "searchbox",
+    "slider",
+    "spinbutton",
+    "switch",
+    "gridcell",
+  ];
+  return widgetRoles.includes(role.toLowerCase().trim());
+}
+
+function isInteractableInput(element) {
+  const tagName = element.tagName.toLowerCase();
+  const type = element.getAttribute("type");
+  if (tagName !== "input" || !type) {
+    // let other checks decide
+    return false;
+  }
+  const clickableTypes = [
+    "button",
+    "checkbox",
+    "date",
+    "datetime-local",
+    "email",
+    "file",
+    "image",
+    "month",
+    "number",
+    "password",
+    "radio",
+    "range",
+    "reset",
+    "search",
+    "submit",
+    "tel",
+    "text",
+    "time",
+    "url",
+    "week",
+  ];
+  return clickableTypes.includes(type.toLowerCase().trim());
+}
+
+function isInteractable(element) {
+  if (!isElementVisible(element)) {
+    return false;
+  }
+
+  if (isHiddenOrDisabled(element)) {
+    return false;
+  }
+
+  if (isScriptOrStyle(element)) {
+    return false;
+  }
+
+  if (hasWidgetRole(element)) {
+    return true;
+  }
+
+  if (isInteractableInput(element)) {
+    return true;
+  }
+
+  const tagName = element.tagName.toLowerCase();
+
+  if (tagName === "a" && element.href) {
+    return true;
+  }
+
+  if (
+    tagName === "button" ||
+    tagName === "select" ||
+    tagName === "option" ||
+    tagName === "textarea"
+  ) {
+    return true;
+  }
+
+  if (tagName === "label" && element.control && !element.control.disabled) {
+    return true;
+  }
+
+  if (
+    element.hasAttribute("onclick") ||
+    element.isContentEditable ||
+    element.hasAttribute("jsaction")
+  ) {
+    return true;
+  }
+
+  if (tagName === "div" || tagName === "img" || tagName === "span") {
+    const computedStyle = window.getComputedStyle(element);
+    const hasPointer = computedStyle.cursor === "pointer";
+    const hasCursor = computedStyle.cursor === "cursor";
+    return hasPointer || hasCursor;
+  }
+
+  return false;
+}
+
+function removeMultipleSpaces(str) {
+  if (!str) {
+    return str;
+  }
+  return str.replace(/\s+/g, " ");
+}
+
+function cleanupText(text) {
+  return removeMultipleSpaces(
+    text.replace("SVGs not supported by this browser.", ""),
+  ).trim();
+}
+
+function getElementContext(element) {
+  // dfs to collect the non unique_id context
+  let fullContext = "";
+  if (element.childNodes.length === 0) {
+    return fullContext;
+  }
+  let childContextList = new Array();
+  for (var child of element.childNodes) {
+    let childContext = "";
+    if (child.nodeType === Node.TEXT_NODE) {
+      if (!element.hasAttribute("unique_id")) {
+        childContext = child.data.trim();
+      }
+    } else if (child.nodeType === Node.ELEMENT_NODE) {
+      if (!child.hasAttribute("unique_id")) {
+        childContext = getElementContext(child);
+      }
+    }
+    if (childContext.length > 0) {
+      childContextList.push(childContext);
+    }
+
+    if (childContextList.length > 0) {
+      fullContext = childContextList.join(";");
+    }
+
+    const charLimit = 1000;
+    if (fullContext.length > charLimit) {
+      fullContext = "";
+    }
+  }
+  return fullContext;
+}
+
+function getElementContent(element) {
+  // DFS to get all the text content from all the nodes under the element
+
+  let textContent = element.textContent;
+  let nodeContent = "";
+  // if element has children, then build a list of text and join with a semicolon
+  if (element.childNodes.length > 0) {
+    let childTextContentList = new Array();
+    let nodeTextContentList = new Array();
+    for (var child of element.childNodes) {
+      let childText = "";
+      if (child.nodeType === Node.TEXT_NODE) {
+        childText = child.data.trim();
+        nodeTextContentList.push(childText);
+      } else if (child.nodeType === Node.ELEMENT_NODE) {
+        // childText = child.textContent.trim();
+        childText = getElementContent(child);
+      } else {
+        console.log("Unhandled node type: ", child.nodeType);
+      }
+      if (childText.length > 0) {
+        childTextContentList.push(childText);
+      }
+    }
+    textContent = childTextContentList.join(";");
+    nodeContent = cleanupText(nodeTextContentList.join(";"));
+  }
+  let finalTextContent = cleanupText(textContent);
+
+  // Currently we don't support too much context. Character limit is 1000 per element.
+  // we don't think element context has to be that big
+  const charLimit = 1000;
+  if (finalTextContent.length > charLimit) {
+    if (nodeContent.length <= charLimit) {
+      finalTextContent = nodeContent;
+    } else {
+      finalTextContent = "";
+    }
+  }
+
+  return finalTextContent;
+}
+
+function getSelectOptions(element) {
+  const options = Array.from(element.options);
+  const selectOptions = [];
+  for (const option of options) {
+    selectOptions.push({
+      optionIndex: option.index,
+      text: removeMultipleSpaces(option.textContent),
+    });
+  }
+  return selectOptions;
+}
+
+function buildTreeFromBody() {
+  var elements = [];
+  var resultArray = [];
+
+  function buildElementObject(element) {
+    var element_id = elements.length;
+    var elementTagNameLower = element.tagName.toLowerCase();
+    element.setAttribute("unique_id", element_id);
+    // if element is an "a" tag and has a target="_blank" attribute, remove the target attribute
+    // We're doing this so that skyvern can do all the navigation in a single page/tab and not open new tab
+    if (element.tagName.toLowerCase() === "a") {
+      if (element.getAttribute("target") === "_blank") {
+        element.removeAttribute("target");
+      }
+    }
+    const attrs = {};
+    for (const attr of element.attributes) {
+      var attrValue = attr.value;
+      if (
+        attr.name === "required" ||
+        attr.name === "aria-required" ||
+        attr.name === "checked" ||
+        attr.name === "aria-checked" ||
+        attr.name === "selected" ||
+        attr.name === "aria-selected" ||
+        attr.name === "readonly" ||
+        attr.name === "aria-readonly"
+      ) {
+        attrValue = true;
+      }
+      attrs[attr.name] = attrValue;
+    }
+    if (elementTagNameLower === "input" || elementTagNameLower === "textarea") {
+      attrs["value"] = element.value;
+    }
+
+    let elementObj = {
+      id: element_id,
+      tagName: elementTagNameLower,
+      attributes: attrs,
+      text: getElementContent(element),
+      children: [],
+      rect: DomUtils.getVisibleClientRect(element, true),
+    };
+
+    // get options for select element or for listbox element
+    let selectOptions = null;
+    if (elementTagNameLower === "select") {
+      selectOptions = getSelectOptions(element);
+    }
+    if (selectOptions) {
+      elementObj.options = selectOptions;
+    }
+
+    return elementObj;
+  }
+
+  function getChildElements(element) {
+    if (element.childElementCount !== 0) {
+      return Array.from(element.children);
+    } else {
+      return [];
+    }
+  }
+  function processElement(element, interactableParentId) {
+    // Check if the element is interactable
+    if (isInteractable(element)) {
+      var elementObj = buildElementObject(element);
+      elements.push(elementObj);
+      // If the element is interactable but has no interactable parent,
+      // then it starts a new tree, so add it to the result array
+      // and set its id as the interactable parent id for the next elements
+      // under it
+      if (interactableParentId === null) {
+        resultArray.push(elementObj);
+      }
+      // If the element is interactable and has an interactable parent,
+      // then add it to the children of the parent
+      else {
+        elements[interactableParentId].children.push(elementObj);
+      }
+      // Recursively process the children of the element
+      getChildElements(element).forEach((child) => {
+        processElement(child, elementObj.id);
+      });
+      return elementObj;
+    } else {
+      // For a non-interactable element, process its children
+      // and check if any of them are interactable
+      let interactableChildren = [];
+      getChildElements(element).forEach((child) => {
+        let children = processElement(child, interactableParentId);
+      });
+    }
+  }
+
+  // TODO: Handle iframes
+
+  // Clear all the unique_id attributes so that there are no conflicts
+  removeAllUniqueIdAttributes();
+  processElement(document.body, null);
+
+  for (var element of elements) {
+    if (
+      ((element.tagName === "input" && element.attributes["type"] === "text") ||
+        element.tagName === "textarea") &&
+      (element.attributes["required"] || element.attributes["aria-required"]) &&
+      element.attributes.value === ""
+    ) {
+      // TODO (kerem): we may want to pass these elements to the LLM as empty but required fields in the future
+      console.log(
+        "input element with required attribute and no value",
+        element,
+      );
+    }
+
+    // for most elements, we're going 10 layers up to see if we can find "label" as a parent
+    // if found, most likely the context under label is relevant to this element
+    let targetParentElements = new Set(["label", "fieldset"]);
+
+    // look up for 10 levels to find the most contextual parent element
+    let targetContextualParent = null;
+    let currentEle = document.querySelector(`[unique_id="${element.id}"]`);
+    let parentEle = currentEle;
+    for (var i = 0; i < 10; i++) {
+      parentEle = parentEle.parentElement;
+      if (parentEle) {
+        if (targetParentElements.has(parentEle.tagName.toLowerCase())) {
+          targetContextualParent = parentEle;
+        }
+      } else {
+        break;
+      }
+    }
+    if (targetContextualParent) {
+      let context = "";
+      var lowerCaseTagName = targetContextualParent.tagName.toLowerCase();
+      if (lowerCaseTagName === "label") {
+        context = getElementContext(targetContextualParent);
+      } else if (lowerCaseTagName === "fieldset") {
+        // fieldset is usually within a form or another element that contains the whole context
+        targetContextualParent = targetContextualParent.parentElement;
+        if (targetContextualParent) {
+          context = getElementContext(targetContextualParent);
+        }
+      }
+      if (context.length > 0) {
+        element.context = context;
+      }
+    }
+  }
+
+  return [elements, resultArray];
+}
+
+function drawBoundingBoxes(elements) {
+  // draw a red border around the elements
+  var groups = groupElementsVisually(elements);
+  var hintMarkers = createHintMarkersForGroups(groups);
+  addHintMarkersToPage(hintMarkers);
+}
+
+function removeAllUniqueIdAttributes() {
+  var elementsWithUniqueId = document.querySelectorAll("[unique_id]");
+
+  elementsWithUniqueId.forEach(function (element) {
+    element.removeAttribute("unique_id");
+  });
+}
+
+function captchaSolvedCallback() {
+  console.log("captcha solved");
+  if (!window["captchaSolvedCounter"]) {
+    window["captchaSolvedCounter"] = 0;
+  }
+  // For some reason this isn't being called.. TODO figure out why
+  window["captchaSolvedCounter"] = window["captchaSolvedCounter"] + 1;
+}
+
+function getCaptchaSolves() {
+  if (!window["captchaSolvedCounter"]) {
+    window["captchaSolvedCounter"] = 0;
+  }
+  return window["captchaSolvedCounter"];
+}
+
+function groupElementsVisually(elements) {
+  const groups = [];
+  // o n^2
+  // go through each hint and see if it overlaps with any other hints, if it does, add it to the group of the other hint
+  // *** if we start from the bigger elements (top -> bottom) we can avoid merging groups
+  for (const element of elements) {
+    if (!element.rect) {
+      continue;
+    }
+    const group = groups.find((group) => {
+      for (const groupElement of group.elements) {
+        if (Rect.intersects(groupElement.rect, element.rect)) {
+          return true;
+        }
+      }
+      return false;
+    });
+    if (group) {
+      group.elements.push(element);
+    } else {
+      groups.push({
+        elements: [element],
+      });
+    }
+  }
+
+  // go through each group and create a rectangle that encompasses all the hints in the group
+  for (const group of groups) {
+    group.rect = createRectangleForGroup(group);
+  }
+
+  return groups;
+}
+
+function createRectangleForGroup(group) {
+  const rects = group.elements.map((element) => element.rect);
+  const top = Math.min(...rects.map((rect) => rect.top));
+  const left = Math.min(...rects.map((rect) => rect.left));
+  const bottom = Math.max(...rects.map((rect) => rect.bottom));
+  const right = Math.max(...rects.map((rect) => rect.right));
+  return Rect.create(left, top, right, bottom);
+}
+
+function generateHintStrings(count) {
+  const hintCharacters = "sadfjklewcmpgh";
+  let hintStrings = [""];
+  let offset = 0;
+
+  while (hintStrings.length - offset < count || hintStrings.length === 1) {
+    const hintString = hintStrings[offset++];
+    for (const ch of hintCharacters) {
+      hintStrings.push(ch + hintString);
+    }
+  }
+  hintStrings = hintStrings.slice(offset, offset + count);
+
+  // Shuffle the hints so that they're scattered; hints starting with the same character and short
+  // hints are spread evenly throughout the array.
+  return hintStrings.sort(); // .map((str) => str.reverse())
+}
+
+function createHintMarkersForGroups(groups) {
+  if (groups.length === 0) {
+    console.log("No groups found, not adding hint markers to page.");
+    return [];
+  }
+
+  const hintMarkers = groups.map((group) => createHintMarkerForGroup(group));
+
+  // fill in marker text
+  const hintStrings = generateHintStrings(hintMarkers.length);
+  for (let i = 0; i < hintMarkers.length; i++) {
+    const hintMarker = hintMarkers[i];
+    hintMarker.hintString = hintStrings[i];
+    hintMarker.element.innerHTML = hintMarker.hintString.toUpperCase();
+  }
+
+  return hintMarkers;
+}
+
+function createHintMarkerForGroup(group) {
+  const marker = {};
+  // yellow annotation box with string
+  const el = document.createElement("div");
+  el.style.left = group.rect.left + "px";
+  el.style.top = group.rect.top + "px";
+  // Each group is assigned a different incremental z-index, we use the same z-index for the
+  // bounding box and the hint marker
+  el.style.zIndex = this.currentZIndex;
+
+  // The bounding box around the group of hints.
+  const boundingBox = document.createElement("div");
+
+  // Calculate the position of the element relative to the document
+  var scrollTop = window.pageYOffset || document.documentElement.scrollTop;
+  var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
+
+  // Set styles for the bounding box
+  boundingBox.style.position = "absolute";
+  boundingBox.style.display = "display";
+  boundingBox.style.left = group.rect.left + scrollLeft + "px";
+  boundingBox.style.top = group.rect.top + scrollTop + "px";
+  boundingBox.style.width = group.rect.width + "px";
+  boundingBox.style.height = group.rect.height + "px";
+  boundingBox.style.bottom = boundingBox.style.top + boundingBox.style.height;
+  boundingBox.style.right = boundingBox.style.left + boundingBox.style.width;
+  boundingBox.style.border = "2px solid blue"; // Change the border color as needed
+  boundingBox.style.pointerEvents = "none"; // Ensures the box doesn't interfere with other interactions
+  boundingBox.style.zIndex = this.currentZIndex++;
+
+  return Object.assign(marker, {
+    element: el,
+    boundingBox: boundingBox,
+    group: group,
+  });
+}
+
+function addHintMarkersToPage(hintMarkers) {
+  const parent = document.createElement("div");
+  parent.id = "boundingBoxContainer";
+  for (const hintMarker of hintMarkers) {
+    // parent.appendChild(hintMarker.element);
+    parent.appendChild(hintMarker.boundingBox);
+  }
+  document.documentElement.appendChild(parent);
+}
+
+function removeBoundingBoxes() {
+  var hintMarkerContainer = document.querySelector("#boundingBoxContainer");
+  if (hintMarkerContainer) {
+    hintMarkerContainer.remove();
+  }
+}
+
+function scrollToTop(draw_boxes) {
+  removeBoundingBoxes();
+  window.scrollTo(0, 0);
+  if (draw_boxes) {
+    var elementsAndResultArray = buildTreeFromBody();
+    drawBoundingBoxes(elementsAndResultArray[0]);
+  }
+  return window.scrollY;
+}
+
+function scrollToNextPage(draw_boxes) {
+  // remove bounding boxes, scroll to next page with 200px overlap, then draw bounding boxes again
+  // return true if there is a next page, false otherwise
+  removeBoundingBoxes();
+  window.scrollBy(0, window.innerHeight - 200);
+  if (draw_boxes) {
+    var elementsAndResultArray = buildTreeFromBody();
+    drawBoundingBoxes(elementsAndResultArray[0]);
+  }
+  return window.scrollY;
+}
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -0,0 +1,316 @@
+import asyncio
+import copy
+from collections import deque
+
+import structlog
+from playwright.async_api import Page
+from pydantic import BaseModel
+
+from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
+from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.webeye.browser_factory import BrowserState
+
+LOG = structlog.get_logger()
+
+# Attribute names that survive trimming: _trimmed_attributes keeps an attribute
+# only if its name is listed here (plus "id" on input/textarea/select tags).
+# NOTE(review): "aria-role" is not a standard ARIA attribute name ("role" is) —
+# confirm whether "role" was intended.
+RESERVED_ATTRIBUTES = {
+    "accept",  # for input file
+    "alt",
+    "aria-checked",  # for option tag
+    "aria-current",
+    "aria-label",
+    "aria-required",
+    "aria-role",
+    "aria-selected",  # for option tag
+    "checked",
+    "data-ui",
+    "for",
+    "href",  # For a tags
+    "maxlength",
+    "name",
+    "pattern",
+    "placeholder",
+    "readonly",
+    "required",
+    "selected",  # for option tag
+    "src",  # do we need this?
+    "text-value",
+    "title",
+    "type",
+    "value",
+}
+
+
+def load_js_script() -> str:
+    # TODO: Handle file location better. This is a hacky way to find the file location.
+    path = f"{SKYVERN_DIR}/webeye/scraper/domUtils.js"
+    try:
+        # TODO: Implement TS of domUtils.js and use the complied JS file instead of the raw JS file.
+        # This will allow our code to be type safe.
+        with open(path, "r") as f:
+            return f.read()
+    except FileNotFoundError as e:
+        LOG.exception("Failed to load the JS script", exc_info=True, path=path)
+        raise e
+
+
+# Source of domUtils.js, read once when this module is imported; the scraping helpers below
+# pass it to page.evaluate before calling the JS functions it defines.
+JS_FUNCTION_DEFS = load_js_script()
+
+
+class ScrapedPage(BaseModel):
+    """
+    Scraped response from a webpage, including:
+    1. List of elements
+    2. ID to xpath map
+    3. The element tree of the page (list of dicts), plus a trimmed copy of it.
+       Each element has children and attributes.
+    4. The screenshots (raw bytes, one per captured scroll position)
+    5. The URL of the page
+    6. The HTML of the page
+    7. The extracted text from the page
+    """
+
+    # Flat list of element dicts, as returned by buildTreeFromBody.
+    elements: list[dict]
+    # Element id -> xpath selecting the node tagged with that id.
+    id_to_xpath_dict: dict[int, str]
+    # Element tree after cleanup_elements (rect entries removed).
+    element_tree: list[dict]
+    # Copy of element_tree passed through trim_element_tree.
+    element_tree_trimmed: list[dict]
+    # Viewport screenshots taken while scrolling through the page.
+    screenshots: list[bytes]
+    url: str
+    html: str
+    # Visible text of the page (document.body.innerText), when collected.
+    extracted_text: str | None = None
+
+
+async def scrape_website(
+    browser_state: BrowserState,
+    url: str,
+    num_retry: int = 0,
+) -> ScrapedPage:
+    """
+    ************************************************************************************************
+    ************ NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production *************
+    ************************************************************************************************
+    High-level asynchronous function to scrape a web page. It sets up the Playwright environment, handles browser and
+    page initialization, and calls the safe scraping function. This function is ideal for general use where initial
+    setup and safety measures are required.
+
+    Asynchronous function that safely scrapes a web page. It handles exceptions and retries scraping up to a maximum
+    number of attempts. This function should be used when reliability and error handling are crucial, such as in
+    automated scraping tasks.
+
+    :param browser_context: BrowserContext instance used for scraping.
+    :param url: URL of the web page to be scraped.
+    :param page: Optional Page instance for scraping, a new page is created if None.
+    :param num_retry: Tracks number of retries if scraping fails, defaults to 0.
+
+    :return: Tuple containing Page instance, base64 encoded screenshot, and page elements.
+
+    :raises Exception: When scraping fails after maximum retries.
+    """
+    try:
+        num_retry += 1
+        return await scrape_web_unsafe(browser_state, url)
+    except Exception:
+        # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
+        if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
+            LOG.error(
+                "Scraping failed after max retries, aborting.",
+                max_retries=SettingsManager.get_settings().MAX_SCRAPING_RETRIES,
+                url=url,
+                exc_info=True,
+            )
+            raise Exception("Scraping failed.")
+        LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url)
+        return await scrape_website(
+            browser_state,
+            url,
+            num_retry=num_retry,
+        )
+
+
+async def get_all_visible_text(page: Page) -> str:
+    """
+    Get all the visible text on the page.
+    :param page: Page instance to get the text from.
+    :return: All the visible text on the page.
+    """
+    js_script = "() => document.body.innerText"
+    return await page.evaluate(js_script)
+
+
+async def scrape_web_unsafe(
+    browser_state: BrowserState,
+    url: str,
+) -> ScrapedPage:
+    """
+    Asynchronous function that performs web scraping without any built-in error handling. This function is intended
+    for use cases where the caller handles exceptions or in controlled environments. It directly scrapes the provided
+    URL or continues on the page the browser state already holds.
+
+    The page is captured as a series of viewport screenshots (with bounding boxes drawn) taken while scrolling down,
+    then the boxes are removed, the page is scrolled back to the top, and the element tree, HTML and visible text
+    are collected.
+
+    :param browser_state: BrowserState used to get the existing page or create a new one.
+    :param url: URL of the web page to be scraped. Used only when creating a new page.
+    :return: ScrapedPage with the elements, id-to-xpath map, element trees, screenshots, URL, HTML and text.
+    :note: This function does not handle exceptions. Ensure proper error handling in the calling context.
+    """
+    # We only create a new page if one does not exist. This is to allow keeping the same page since we want to
+    # continue working on the same page that we're taking actions on.
+    # *This also means URL is only used when creating a new page, and not when using an existing page.
+    page = await browser_state.get_or_create_page(url)
+    # Take screenshots of the page with the bounding boxes. We will remove the bounding boxes later.
+    # Scroll to the top of the page and take a screenshot.
+    # Scroll to the next page and take a screenshot until we reach the end of the page.
+    # We check if the scroll_y_px_old is the same as scroll_y_px to determine if we have reached the end of the page.
+    # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after
+    # clicking start my quote)
+
+    LOG.info("Waiting for 5 seconds before scraping the website.")
+    await asyncio.sleep(5)
+
+    screenshots: list[bytes] = []
+    # Sentinel that can never equal a real scroll offset, so the loop body runs at least once.
+    scroll_y_px_old = -1.0
+    scroll_y_px = await scroll_to_top(page, drow_boxes=True)
+    # Checking max number of screenshots to prevent infinite loop
+    while scroll_y_px_old != scroll_y_px and len(screenshots) < SettingsManager.get_settings().MAX_NUM_SCREENSHOTS:
+        # full_page=False: capture only the current viewport; scrolling provides the coverage.
+        screenshot = await page.screenshot(full_page=False)
+        screenshots.append(screenshot)
+        scroll_y_px_old = scroll_y_px
+        LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
+        scroll_y_px = await scroll_to_next_page(page, drow_boxes=True)
+        LOG.info("Scrolled to next page", scroll_y_px=scroll_y_px, scroll_y_px_old=scroll_y_px_old)
+    # Restore the page: drop the drawn boxes and return to the top without redrawing them.
+    await remove_bounding_boxes(page)
+    await scroll_to_top(page, drow_boxes=False)
+
+    elements, element_tree = await get_interactable_element_tree(page)
+    # Work on a deep copy so cleaning the tree does not mutate the dicts returned by the page.
+    element_tree = cleanup_elements(copy.deepcopy(element_tree))
+
+    id_to_xpath_dict = {}
+    for element in elements:
+        element_id = element["id"]
+        # get_interactable_element_tree marks each interactable element with a unique_id attribute
+        id_to_xpath_dict[element_id] = f"//*[@{SKYVERN_ID_ATTR}='{element_id}']"
+
+    text_content = await get_all_visible_text(page)
+    return ScrapedPage(
+        elements=elements,
+        id_to_xpath_dict=id_to_xpath_dict,
+        element_tree=element_tree,
+        element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
+        screenshots=screenshots,
+        url=page.url,
+        html=await page.content(),
+        extracted_text=text_content,
+    )
+
+
+async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]:
+    """
+    Get the element tree of the page, including all the elements that are interactable.
+    :param page: Page instance to get the element tree from.
+    :return: Tuple containing the element tree and a map of element IDs to elements.
+    """
+    await page.evaluate(JS_FUNCTION_DEFS)
+    js_script = "() => buildTreeFromBody()"
+    elements, element_tree = await page.evaluate(js_script)
+    return elements, element_tree
+
+
+async def scroll_to_top(page: Page, drow_boxes: bool) -> float:
+    """
+    Scroll to the top of the page and take a screenshot.
+    :param drow_boxes: If True, draw bounding boxes around the elements.
+    :param page: Page instance to take the screenshot from.
+    :return: Screenshot of the page.
+    """
+    await page.evaluate(JS_FUNCTION_DEFS)
+    js_script = f"() => scrollToTop({str(drow_boxes).lower()})"
+    scroll_y_px = await page.evaluate(js_script)
+    return scroll_y_px
+
+
+async def scroll_to_next_page(page: Page, drow_boxes: bool) -> bool:
+    """
+    Scroll to the next page and take a screenshot.
+    :param drow_boxes: If True, draw bounding boxes around the elements.
+    :param page: Page instance to take the screenshot from.
+    :return: Screenshot of the page.
+    """
+    await page.evaluate(JS_FUNCTION_DEFS)
+    js_script = f"() => scrollToNextPage({str(drow_boxes).lower()})"
+    scroll_y_px = await page.evaluate(js_script)
+    return scroll_y_px
+
+
+async def remove_bounding_boxes(page: Page) -> None:
+    """
+    Remove the bounding boxes from the page.
+    :param page: Page instance to remove the bounding boxes from.
+    """
+    js_script = "() => removeBoundingBoxes()"
+    await page.evaluate(js_script)
+
+
+def cleanup_elements(elements: list[dict]) -> list[dict]:
+    """
+    Remove rect and attribute.unique_id from the elements.
+    The reason we're doing it is to
+    1. reduce unnecessary data so that llm get less distrction
+    # TODO later: 2. reduce tokens sent to llm to save money
+    :param elements: List of elements to remove xpaths from.
+    :return: List of elements without xpaths.
+    """
+    queue = []
+    for element in elements:
+        queue.append(element)
+    while queue:
+        queue_ele = queue.pop(0)
+        _remove_rect(queue_ele)
+        # TODO: we can come back to test removing the unique_id
+        # from element attributes to make sure this won't increase hallucination
+        # _remove_unique_id(queue_ele)
+        if "children" in queue_ele:
+            queue.extend(queue_ele["children"])
+    return elements
+
+
+def trim_element_tree(elements: list[dict]) -> list[dict]:
+    queue = []
+    for element in elements:
+        queue.append(element)
+    while queue:
+        queue_ele = queue.pop(0)
+        if "attributes" in queue_ele:
+            tag_name = queue_ele["tagName"] if "tagName" in queue_ele else ""
+            new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"])
+            if new_attributes:
+                queue_ele["attributes"] = new_attributes
+            else:
+                del queue_ele["attributes"]
+        if "children" in queue_ele:
+            queue.extend(queue_ele["children"])
+            if not queue_ele["children"]:
+                del queue_ele["children"]
+        if "text" in queue_ele:
+            element_text = str(queue_ele["text"]).strip()
+            if not element_text:
+                del queue_ele["text"]
+    return elements
+
+
+def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
+    new_attributes: dict = {}
+    for key in attributes:
+        if key == "id" and tag_name in ["input", "textarea", "select"]:
+            # We don't want to remove the id attribute any of these elements in case there's a label for it
+            new_attributes[key] = attributes[key]
+        if key in RESERVED_ATTRIBUTES:
+            new_attributes[key] = attributes[key]
+    return new_attributes
+
+
+def _remove_rect(element: dict) -> None:
+    if "rect" in element:
+        del element["rect"]
+
+
+def _remove_unique_id(element: dict) -> None:
+    if "attributes" not in element:
+        return
+    if SKYVERN_ID_ATTR in element["attributes"]:
+        del element["attributes"][SKYVERN_ID_ATTR]
--- a/skyvern/webeye/string_util.py
+++ b/skyvern/webeye/string_util.py
@@ -0,0 +1,5 @@
+import re
+
+
+def remove_whitespace(string: str) -> str:
+    return re.sub("[ \n\t]+", " ", string)