Move the code over from private repository (#3)

2024-03-01 10:09:30 -08:00
parent 32dd6d92a5
commit 9eddb3d812
93 changed files with 16798 additions and 0 deletions
--- a/skyvern/webeye/actions/__init__.py
+++ b/skyvern/webeye/actions/__init__.py
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -0,0 +1,204 @@
+import abc
+from enum import StrEnum
+from typing import Any, Dict, List
+
+import structlog
+from pydantic import BaseModel
+
+from skyvern.forge.sdk.schemas.tasks import Task
+
+# Module-level structlog logger; parse_actions uses it to report terminate/complete/unsupported actions.
+LOG = structlog.get_logger()
+
+
+class ActionType(StrEnum):
+    """Identifiers for every kind of action the agent can emit.
+
+    Each member's value is the lower-case form of its name. ``parse_actions``
+    resolves the LLM-supplied string by upper-casing it and looking the
+    member up by name.
+    """
+
+    CLICK = "click"
+    INPUT_TEXT = "input_text"
+    UPLOAD_FILE = "upload_file"
+    SELECT_OPTION = "select_option"
+    CHECKBOX = "checkbox"
+    WAIT = "wait"
+    NULL_ACTION = "null_action"
+    SOLVE_CAPTCHA = "solve_captcha"
+    TERMINATE = "terminate"
+    COMPLETE = "complete"
+    # Note: Remember to update ActionTypeUnion with new actions
+
+
+class Action(BaseModel):
+    """Base model shared by every action produced by ``parse_actions``."""
+
+    # Discriminates the concrete action; each subclass pins it to one ActionType member.
+    action_type: ActionType
+    # Optional free-text description; parse_actions never populates it.
+    description: str | None = None
+    # The "reasoning" value copied from the LLM response, when present.
+    reasoning: str | None = None
+
+
+class WebAction(Action, abc.ABC):
+    """Base for actions that target a specific element of the scraped page."""
+
+    # Id of the target element; handler.py resolves it through ScrapedPage.id_to_xpath_dict.
+    element_id: int
+
+
+class ClickAction(WebAction):
+    """Click on the element identified by ``element_id``."""
+
+    action_type: ActionType = ActionType.CLICK
+    # Optional file URL; chain_click (handler.py) downloads it and feeds it to a file chooser opened by the click.
+    file_url: str | None = None
+
+    def __repr__(self) -> str:
+        # Compact representation: only the fields that identify the click.
+        return f"ClickAction(element_id={self.element_id}, file_url={self.file_url})"
+
+
+class InputTextAction(WebAction):
+    """Type ``text`` into the element identified by ``element_id``."""
+
+    action_type: ActionType = ActionType.INPUT_TEXT
+    # The text to enter into the element.
+    text: str
+
+    def __repr__(self) -> str:
+        # Compact representation: target element and the text to type.
+        return f"InputTextAction(element_id={self.element_id}, text={self.text})"
+
+
+class UploadFileAction(WebAction):
+    """Upload the file at ``file_url`` through the element identified by ``element_id``."""
+
+    action_type: ActionType = ActionType.UPLOAD_FILE
+    # URL of the file to upload; the upload handler downloads it before use.
+    file_url: str
+    # Set to False by the upload handler when the target turns out not to be an <input type="file">.
+    is_upload_file_tag: bool = True
+
+    def __repr__(self) -> str:
+        # Note: the file URL is rendered under the key "file", not "file_url".
+        return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})"
+
+
+class NullAction(Action):
+    """No-op action; ``parse_actions`` emits it when the LLM response has no action type."""
+
+    action_type: ActionType = ActionType.NULL_ACTION
+
+
+class SolveCaptchaAction(Action):
+    """Signals that a captcha must be solved; carries no fields beyond the base action."""
+
+    action_type: ActionType = ActionType.SOLVE_CAPTCHA
+
+
+class SelectOption(BaseModel):
+    """The option to pick inside a select element, described by label, value and/or index."""
+
+    # None of the three fields declares a default, so each must be passed explicitly (None is allowed).
+    label: str | None
+    value: str | None
+    # The select handler first treats this as a key of ScrapedPage.id_to_xpath_dict and otherwise as an option index.
+    index: int | None
+
+    def __repr__(self) -> str:
+        # Compact representation listing all three selectors.
+        return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
+
+
+class SelectOptionAction(WebAction):
+    """Choose ``option`` in the select element identified by ``element_id``."""
+
+    action_type: ActionType = ActionType.SELECT_OPTION
+    # Which option to select.
+    option: SelectOption
+
+    def __repr__(self) -> str:
+        # Compact representation; embeds the SelectOption repr.
+        return f"SelectOptionAction(element_id={self.element_id}, option={self.option})"
+
+
+###
+# This action causes more harm than good.
+# It frequently misbehaves or gets stuck in click loops.
+# Treating checkbox actions as click actions seems to perform far more reliably.
+# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
+###
+class CheckboxAction(WebAction):
+    """Set the checkbox identified by ``element_id`` to the state given by ``is_checked``."""
+
+    action_type: ActionType = ActionType.CHECKBOX
+    # Desired final state: True to check the box, False to uncheck it.
+    is_checked: bool
+
+    def __repr__(self) -> str:
+        # Compact representation: target element and desired state.
+        return f"CheckboxAction(element_id={self.element_id}, is_checked={self.is_checked})"
+
+
+class WaitAction(Action):
+    """Asks the agent to wait; carries no fields beyond the base action."""
+
+    action_type: ActionType = ActionType.WAIT
+
+
+class TerminateAction(Action):
+    """Emitted when the agent decides to terminate; carries no fields beyond the base action."""
+
+    action_type: ActionType = ActionType.TERMINATE
+
+
+class CompleteAction(Action):
+    """Emitted when the navigation goal is achieved."""
+
+    action_type: ActionType = ActionType.COMPLETE
+    # Copied from the task by parse_actions; when set, the complete handler runs data extraction.
+    data_extraction_goal: str | None = None
+
+
+def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
+    """Convert the LLM's raw action dictionaries into typed ``Action`` objects.
+
+    :param task: task being executed; supplies ids/goals for logging and the
+        data extraction goal attached to a ``CompleteAction``.
+    :param json_response: list of action dictionaries as returned by the LLM.
+        Recognised keys are ``id``, ``action_type``, ``reasoning`` plus the
+        action-specific ``text``, ``file_url``, ``option`` and ``is_checked``.
+    :return: the parsed actions in response order. As soon as a COMPLETE
+        action is seen, every other action is discarded and a single
+        ``CompleteAction`` is returned.
+
+    Entries without an action type, or with the type ``"null"``, become
+    ``NullAction``. Entries with an unknown action type are logged and skipped.
+    Web actions still require an ``id``; building one without it fails
+    pydantic validation.
+    """
+    actions: List[Action] = []
+    for action in json_response:
+        # Only web actions need an element id, so a missing "id" on e.g. a
+        # terminate/complete/wait entry must not abort parsing.
+        element_id = action.get("id")
+        reasoning = action.get("reasoning")
+        raw_action_type = action.get("action_type")
+        if raw_action_type is None:
+            actions.append(NullAction(reasoning=reasoning))
+            continue
+
+        # Upper-casing lets the lookup accept any casing of the member name
+        # (e.g. "click" as well as "CLICK").
+        normalized_action_type = str(raw_action_type).upper()
+        if normalized_action_type == "NULL":
+            # "null" is not an ActionType member name, so handle it before the lookup.
+            actions.append(NullAction(reasoning=reasoning))
+            continue
+
+        action_type = ActionType.__members__.get(normalized_action_type)
+        if action_type is None:
+            LOG.error(
+                "Unsupported action type when parsing actions",
+                task_id=task.task_id,
+                action_type=raw_action_type,
+                raw_action=action,
+            )
+            continue
+
+        if action_type == ActionType.TERMINATE:
+            LOG.warning(
+                "Agent decided to terminate",
+                task_id=task.task_id,
+                llm_response=json_response,
+                reasoning=reasoning,
+                actions=actions,
+            )
+            actions.append(TerminateAction(reasoning=reasoning))
+        elif action_type == ActionType.CLICK:
+            actions.append(
+                ClickAction(element_id=element_id, reasoning=reasoning, file_url=action.get("file_url"))
+            )
+        elif action_type == ActionType.INPUT_TEXT:
+            actions.append(InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning))
+        elif action_type == ActionType.UPLOAD_FILE:
+            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
+            actions.append(UploadFileAction(element_id=element_id, file_url=action["file_url"], reasoning=reasoning))
+        elif action_type == ActionType.SELECT_OPTION:
+            option = action["option"]
+            actions.append(
+                SelectOptionAction(
+                    element_id=element_id,
+                    # The LLM may describe the option by only some of these keys.
+                    option=SelectOption(
+                        label=option.get("label"),
+                        value=option.get("value"),
+                        index=option.get("index"),
+                    ),
+                    reasoning=reasoning,
+                )
+            )
+        elif action_type == ActionType.CHECKBOX:
+            actions.append(CheckboxAction(element_id=element_id, is_checked=action["is_checked"], reasoning=reasoning))
+        elif action_type == ActionType.WAIT:
+            actions.append(WaitAction(reasoning=reasoning))
+        elif action_type == ActionType.COMPLETE:
+            if actions:
+                LOG.info(
+                    "Navigation goal achieved, creating complete action and discarding all other actions except "
+                    "complete action",
+                    task_id=task.task_id,
+                    nav_goal=task.navigation_goal,
+                    actions=actions,
+                    llm_response=json_response,
+                )
+            return [CompleteAction(reasoning=reasoning, data_extraction_goal=task.data_extraction_goal)]
+        elif action_type == ActionType.NULL_ACTION:
+            actions.append(NullAction(reasoning=reasoning))
+        elif action_type == ActionType.SOLVE_CAPTCHA:
+            actions.append(SolveCaptchaAction(reasoning=reasoning))
+        else:
+            # Defensive: reached only if a new ActionType member is added without a branch above.
+            LOG.error(
+                "Unsupported action type when parsing actions",
+                task_id=task.task_id,
+                action_type=action_type,
+                raw_action=action,
+            )
+    return actions
+
+
+class ScrapeResult(BaseModel):
+    """
+    Scraped response from a webpage, including:
+    1. JSON representation of what the user is seeing
+    """
+
+    # Either a single JSON object or a list of JSON objects.
+    scraped_data: dict[str, Any] | list[dict[str, Any]]
+
+
+# Union of every concrete action model defined above. models.py uses it as the
+# action type inside ``actions_and_results``. Add new action classes here as well.
+# Background on deserializing child classes with pydantic:
+# https://blog.devgenius.io/deserialize-child-classes-with-pydantic-that-gonna-work-784230e1cf83
+ActionTypeUnion = (
+    ClickAction
+    | InputTextAction
+    | UploadFileAction
+    | SelectOptionAction
+    | CheckboxAction
+    | WaitAction
+    | NullAction
+    | SolveCaptchaAction
+    | TerminateAction
+    | CompleteAction
+)
--- a/skyvern/webeye/actions/handler.py
+++ b/skyvern/webeye/actions/handler.py
@@ -0,0 +1,445 @@
+import asyncio
+import re
+from typing import Awaitable, Callable, List
+
+import structlog
+from playwright.async_api import Locator, Page
+
+from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
+from skyvern.forge import app
+from skyvern.forge.prompts import prompt_engine
+from skyvern.forge.sdk.api.files import download_file
+from skyvern.forge.sdk.models import Step
+from skyvern.forge.sdk.schemas.tasks import Task
+from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.webeye.actions import actions
+from skyvern.webeye.actions.actions import Action, ActionType, ClickAction, ScrapeResult, UploadFileAction, WebAction
+from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
+from skyvern.webeye.browser_factory import BrowserState
+from skyvern.webeye.scraper.scraper import ScrapedPage
+
+# Module-level structlog logger shared by all action handlers in this file.
+LOG = structlog.get_logger()
+
+
+class ActionHandler:
+    """Registry that maps each ``ActionType`` to the coroutine executing it."""
+
+    # Shared, class-level registry filled through register_action_type.
+    _handled_action_types: dict[
+        ActionType, Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]]
+    ] = {}
+
+    @classmethod
+    def register_action_type(
+        cls,
+        action_type: ActionType,
+        handler: Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]],
+    ) -> None:
+        """Register ``handler`` for ``action_type``, replacing any previous registration."""
+        cls._handled_action_types[action_type] = handler
+
+    @staticmethod
+    async def handle_action(
+        scraped_page: ScrapedPage,
+        task: Task,
+        step: Step,
+        browser_state: BrowserState,
+        action: Action,
+    ) -> list[ActionResult]:
+        """Run the handler registered for ``action`` and return its results.
+
+        Never raises for handler errors: an unregistered action type or any
+        exception raised by the handler is converted into a single
+        ``ActionFailure``.
+        """
+        LOG.info("Handling action", action=action)
+        page = await browser_state.get_or_create_page()
+        try:
+            registered_handler = ActionHandler._handled_action_types.get(action.action_type)
+            if registered_handler is None:
+                LOG.error("Unsupported action type in handler", action=action, type=type(action))
+                return [ActionFailure(Exception(f"Unsupported action type: {type(action)}"))]
+            return await registered_handler(action, page, scraped_page, task, step)
+        except MissingElement as missing:
+            # Expected failure mode, so it is logged at info level without a traceback.
+            LOG.info(
+                "Known exceptions",
+                action=action,
+                exception_type=type(missing),
+                exception_message=str(missing),
+            )
+            return [ActionFailure(missing)]
+        except MultipleElementsFound as ambiguous:
+            LOG.exception(
+                "Cannot handle multiple elements with the same xpath in one action.",
+                action=action,
+                exception=ambiguous,
+            )
+            return [ActionFailure(ambiguous)]
+        except Exception as unexpected:
+            LOG.exception("Unhandled exception in action handler", action=action, exception=unexpected)
+            return [ActionFailure(unexpected)]
+
+
+async def handle_solve_captcha_action(
+    action: actions.SolveCaptchaAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Log a request to solve the captcha, wait 30 seconds, then report success."""
+    captcha_wait_seconds = 30
+    LOG.warning("Please solve the captcha on the page, you have 30 seconds", action=action)
+    await asyncio.sleep(captcha_wait_seconds)
+    return [ActionSuccess()]
+
+
+async def handle_click_action(
+    action: actions.ClickAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Validate the target element in the DOM, pause briefly, then click it via ``chain_click``."""
+    target_xpath = await validate_actions_in_dom(action, page, scraped_page)
+    await asyncio.sleep(0.3)
+    click_timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+    click_results = await chain_click(page, action, target_xpath, timeout=click_timeout)
+    return click_results
+
+
+async def handle_input_text_action(
+    action: actions.InputTextAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Fill the target element with ``action.text``.
+
+    The field is cleared, filled and then tabbed out of. If the field reads
+    back empty afterwards, the text is typed key by key and confirmed with
+    Enter. Success is reported in either case.
+    """
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    field = page.locator(f"xpath={xpath}")
+    await field.clear()
+    timeout_ms = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+    await field.fill(action.text, timeout=timeout_ms)
+
+    # This is a hack that gets dropdowns to select the "best" option based on what's typed
+    # Fixes situations like tsk_228671423990405776 where the location isn't being autocompleted
+    await field.press("Tab", timeout=timeout_ms)
+    if await field.input_value(timeout=timeout_ms):
+        return [ActionSuccess()]
+
+    # The fill did not stick: retype the text one key at a time and confirm with Enter.
+    LOG.info("Failed to input the text, trying to press sequentially with an enter click", action=action)
+    await field.clear()
+    await field.press_sequentially(action.text, timeout=timeout_ms)
+    await field.press("Enter", timeout=timeout_ms)
+    retyped_value = await field.input_value(timeout=timeout_ms)
+    LOG.info("Input value", input_value=retyped_value, action=action)
+    return [ActionSuccess()]
+
+
+async def handle_upload_file_action(
+    action: actions.UploadFileAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Upload the file referenced by ``action.file_url`` through the target element.
+
+    Fails early when the URL is missing or does not occur in the task's
+    navigation payload. A real file input receives the downloaded file
+    directly; any other element is handled as a click via ``chain_click``.
+    """
+    if not action.file_url:
+        LOG.warning("InputFileAction has no file_url", action=action)
+        return [ActionFailure(MissingFileUrl())]
+
+    # Only accept URLs that literally appear in the navigation payload.
+    payload_text = str(task.navigation_payload)
+    if action.file_url not in payload_text:
+        LOG.warning(
+            "LLM might be imagining the file url, which is not in navigation payload",
+            action=action,
+            file_url=action.file_url,
+        )
+        return [ActionFailure(ImaginaryFileUrl(action.file_url))]
+
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    downloaded_path = download_file(action.file_url)
+    target = page.locator(f"xpath={xpath}")
+
+    if not await is_file_input_element(target):
+        LOG.info("Taking UploadFileAction. Found non file input tag", action=action)
+        # treat it as a click action
+        action.is_upload_file_tag = False
+        return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+
+    LOG.info("Taking UploadFileAction. Found file input tag", action=action)
+    if not downloaded_path:
+        return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]
+
+    await target.set_input_files(downloaded_path, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+
+    # Sleep for 10 seconds after uploading a file to let the page process it
+    await asyncio.sleep(10)
+    return [ActionSuccess()]
+
+
+async def handle_null_action(
+    action: actions.NullAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Do nothing and report success."""
+    noop_result = ActionSuccess()
+    return [noop_result]
+
+
+async def _click_select_click(page: Page, selector: str, timeout: int, **criteria: str | int | None) -> None:
+    """Click the select element, choose the option matching ``criteria``, then click it again."""
+    await page.click(selector, timeout=timeout)
+    await page.select_option(selector, timeout=timeout, **criteria)
+    await page.click(selector, timeout=timeout)
+
+
+async def handle_select_option_action(
+    action: actions.SelectOptionAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Select an option in the target select element.
+
+    The option is first selected by label. If that fails and an index was
+    supplied, the index is tried next: when it is the id of a scraped
+    ``option[n]`` element, ``n`` is used as the option index; otherwise the
+    supplied index itself is used.
+
+    :return: a single ``ActionSuccess`` or ``ActionFailure``.
+    """
+    xpath = await validate_actions_in_dom(action, page, scraped_page)
+    # Use the explicit xpath engine for every call instead of relying on selector auto-detection.
+    selector = f"xpath={xpath}"
+    timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
+
+    try:
+        # First click by label (if it matches)
+        await _click_select_click(page, selector, timeout, label=action.option.label)
+        return [ActionSuccess()]
+    except Exception as e:
+        if action.option.index is None:
+            return [ActionFailure(e)]
+        LOG.warning(
+            "Failed to click on the option by label, trying by index",
+            exc_info=e,
+            action=action,
+            xpath=xpath,
+        )
+
+    try:
+        # The index may be the id of a scraped <option> element. A missing id must not
+        # raise here, otherwise the plain-index fallback below could never run.
+        option_xpath = scraped_page.id_to_xpath_dict.get(action.option.index) or ""
+        match = re.search(r"option\[(\d+)]$", option_xpath)
+        if match:
+            # This means we were trying to select an option xpath, click the option
+            # NOTE(review): XPath positions are normally 1-based while Playwright's index= is
+            # 0-based — confirm how the scraper numbers option[n] before relying on this value.
+            option_index = int(match.group(1))
+        else:
+            # This means the supplied index was for the select element, not a reference to the xpath dict
+            option_index = action.option.index
+        await _click_select_click(page, selector, timeout, index=option_index)
+        return [ActionSuccess()]
+    except Exception as e:
+        LOG.warning("Failed to click on the option by index", exception=e, action=action)
+        return [ActionFailure(e)]
+
+
+async def handle_checkbox_action(
+    self: actions.CheckboxAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """
+    ******* NOT REGISTERED *******
+    This action causes more harm than it does good.
+    It frequently mis-behaves, or gets stuck in click loops.
+    Treating checkbox actions as click actions seem to perform way more reliably
+    Developers who tried this and failed: 2 (Suchintan and Shu 😂)
+
+    Checks or unchecks the target element according to ``is_checked`` and reports success.
+    """
+    xpath = await validate_actions_in_dom(self, page, scraped_page)
+    # Pick the Playwright call that produces the requested state.
+    toggle = page.check if self.is_checked else page.uncheck
+    await toggle(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
+
+    # TODO (suchintan): Why does checking the label work, but not the actual input element?
+    return [ActionSuccess()]
+
+
+async def handle_wait_action(
+    action: actions.WaitAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Sleep for 10 seconds, then report the wait as a failure."""
+    await asyncio.sleep(10)
+    wait_failure = ActionFailure(exception=Exception("Wait action is treated as a failure"))
+    return [wait_failure]
+
+
+async def handle_terminate_action(
+    action: actions.TerminateAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Report success without touching the page."""
+    terminate_result = ActionSuccess()
+    return [terminate_result]
+
+
+async def handle_complete_action(
+    action: actions.CompleteAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
+) -> list[ActionResult]:
+    """Report success, attaching extracted data when the action has a data extraction goal."""
+    if not action.data_extraction_goal:
+        # Nothing to extract: succeed with no data.
+        return [ActionSuccess(data=None)]
+
+    scrape_result = await extract_information_for_navigation_goal(
+        scraped_page=scraped_page,
+        task=task,
+        step=step,
+    )
+    return [ActionSuccess(data=scrape_result.scraped_data)]
+
+
+# Register the handler for every supported action type at import time.
+# ActionType.CHECKBOX is deliberately absent (see handle_checkbox_action), so
+# ActionHandler.handle_action reports checkbox actions as unsupported.
+ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
+ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
+ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
+ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
+ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
+ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action)
+ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
+ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
+ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
+
+
+async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
+    """Check that the action's element resolves to exactly one DOM node.
+
+    :param action: web action whose ``element_id`` is looked up in
+        ``scraped_page.id_to_xpath_dict``.
+    :param page: page the xpath is evaluated against.
+    :param scraped_page: scrape result providing the id-to-xpath mapping.
+    :return: the xpath of the element (without any selector-engine prefix).
+    :raises MissingElement: when the xpath matches no element.
+    :raises MultipleElementsFound: when the xpath matches more than one element.
+    """
+    xpath = scraped_page.id_to_xpath_dict[action.element_id]
+    # Name the xpath engine explicitly, like the handlers in this module do, rather than
+    # depending on Playwright's selector auto-detection.
+    locator = page.locator(f"xpath={xpath}")
+
+    num_elements = await locator.count()
+    if num_elements < 1:
+        LOG.warning("No elements found with action xpath. Validation failed.", action=action, xpath=xpath)
+        raise MissingElement(xpath=xpath, element_id=action.element_id)
+    if num_elements > 1:
+        LOG.warning(
+            "Multiple elements found with action xpath. Expected 1. Validation failed.",
+            action=action,
+            num_elements=num_elements,
+        )
+        raise MultipleElementsFound(num=num_elements, xpath=xpath, element_id=action.element_id)
+
+    LOG.info("Validated action xpath in DOM", action=action)
+    return xpath
+
+
+async def chain_click(
+    page: Page,
+    action: ClickAction | UploadFileAction,
+    xpath: str,
+    timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
+) -> List[ActionResult]:
+    """Click the element at ``xpath``, falling back to its sibling label and then its parent.
+
+    A file chooser listener is installed for the duration of the click so that
+    a chooser opened by the click receives the file from ``action.file_url``
+    (or an empty file list, which dismisses it).
+
+    :param page: page to click on.
+    :param action: action being executed; ``file_url`` is downloaded when set.
+    :param xpath: xpath of the element to click (without engine prefix).
+    :param timeout: per-click timeout in milliseconds. Note that the default is
+        read from the settings once, when this function is defined.
+    :return: one result per attempt, in order: the main click, then (only if it
+        failed) the sibling-label click for input elements and the parent click.
+    """
+    # Add a defensive page handler here in case a click action opens a file chooser.
+    # This automatically dismisses the dialog
+    # File choosers are impossible to close if you don't expect one. Instead of dealing with it, close it!
+
+    # TODO (suchintan): This should likely result in an ActionFailure -- we can figure out how to do this later!
+    LOG.info("Chain click starts", action=action, xpath=xpath)
+    file: list[str] | str = []
+    if action.file_url:
+        file = download_file(action.file_url) or []
+
+    fc_func = lambda fc: fc.set_files(files=file)
+    page.on("filechooser", fc_func)
+    LOG.info("Registered file chooser listener", action=action, path=file)
+
+    # Always address the element through the explicit xpath engine.
+    selector = f"xpath={xpath}"
+    # Everything after the listener registration runs inside try/finally so the
+    # listener is removed even when inspecting the element fails.
+    try:
+        javascript_triggered = await is_javascript_triggered(page, xpath)
+        try:
+            await page.click(selector, timeout=timeout)
+            LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
+            return [ActionSuccess(javascript_triggered=javascript_triggered)]
+        except Exception as e:
+            action_results: list[ActionResult] = [ActionFailure(e, javascript_triggered=javascript_triggered)]
+            if await is_input_element(page.locator(selector)):
+                LOG.info("Chain click: it's an input element. going to try sibling click", action=action, xpath=xpath)
+                sibling_action_result = await click_sibling_of_input(page.locator(selector), timeout=timeout)
+                action_results.append(sibling_action_result)
+                if isinstance(sibling_action_result, ActionSuccess):
+                    return action_results
+
+            parent_xpath = f"{xpath}/.."
+            try:
+                parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
+                javascript_triggered = javascript_triggered or parent_javascript_triggered
+                parent_locator = page.locator(selector).locator("..")
+                await parent_locator.click(timeout=timeout)
+                LOG.info("Chain click: successfully clicked parent element", action=action, parent_xpath=parent_xpath)
+                action_results.append(
+                    ActionSuccess(
+                        javascript_triggered=javascript_triggered,
+                        interacted_with_parent=True,
+                    )
+                )
+            except Exception as pe:
+                LOG.warning("Failed to click parent element", action=action, parent_xpath=parent_xpath, exc_info=True)
+                action_results.append(
+                    ActionFailure(pe, javascript_triggered=javascript_triggered, interacted_with_parent=True)
+                )
+                # We don't raise exception here because we do log the exception, and return ActionFailure as the last action
+
+            return action_results
+    finally:
+        LOG.info("Remove file chooser listener", action=action)
+
+        # Sleep for 10 seconds after uploading a file to let the page process it
+        # Removing this breaks file uploads using the filechooser
+        # KEREM DO NOT REMOVE
+        if file:
+            await asyncio.sleep(10)
+        page.remove_listener("filechooser", fc_func)
+
+
+async def is_javascript_triggered(page: Page, xpath: str) -> bool:
+    """Return True when the first element at ``xpath`` is an anchor whose href starts with ``javascript:``."""
+    element = page.locator(f"xpath={xpath}").first
+    tag_name = await element.evaluate("e => e.tagName")
+    if tag_name.lower() != "a":
+        return False
+
+    href = await element.evaluate("e => e.href")
+    if not href.lower().startswith("javascript:"):
+        return False
+
+    LOG.info("Found javascript call in anchor tag, marking step as completed. Dropping remaining actions")
+    return True
+
+
+async def is_file_input_element(locator: Locator) -> bool:
+    """Return True when the first element matched by ``locator`` is an ``<input type="file">``."""
+    first_match = locator.first
+    if not first_match:
+        return False
+    tag = await first_match.evaluate("el => el.tagName")
+    input_type = await first_match.evaluate("el => el.type")
+    is_input_tag = tag.lower() == "input"
+    return is_input_tag and input_type == "file"
+
+
+async def is_input_element(locator: Locator) -> bool:
+    """Return True when the first element matched by ``locator`` is an ``<input>`` tag."""
+    first_match = locator.first
+    if not first_match:
+        return False
+    tag = await first_match.evaluate("el => el.tagName")
+    return tag.lower() == "input"
+
+
+async def click_sibling_of_input(
+    locator: Locator,
+    timeout: int,
+    javascript_triggered: bool = False,
+) -> ActionResult:
+    """Click the ``<label for=...>`` that belongs to an input element.
+
+    The label is searched under the input's parent by matching its ``for``
+    attribute against the input's ``id``.
+
+    :param locator: locator of the input element.
+    :param timeout: click timeout in milliseconds.
+    :param javascript_triggered: value copied into the returned result.
+    :return: ``ActionSuccess`` when the label was clicked, otherwise an
+        ``ActionFailure``; this function never raises.
+    """
+    try:
+        input_element = locator.first
+        parent_locator = locator.locator("..")
+        if input_element:
+            input_id = await input_element.get_attribute("id")
+            if not input_id:
+                # Without an id there is no label to look up; fail right away instead of
+                # waiting out the timeout on a selector such as label[@for="None"].
+                return ActionFailure(
+                    exception=Exception("Input element has no id, cannot locate its sibling label"),
+                    javascript_triggered=javascript_triggered,
+                )
+            sibling_label_xpath = f'//label[@for="{input_id}"]'
+            label_locator = parent_locator.locator(sibling_label_xpath)
+            await label_locator.click(timeout=timeout)
+            LOG.info(
+                "Successfully clicked sibling label of input element",
+                sibling_label_xpath=sibling_label_xpath,
+            )
+            return ActionSuccess(javascript_triggered=javascript_triggered, interacted_with_sibling=True)
+        # Should never get here
+        return ActionFailure(
+            exception=Exception("Failed while trying to click sibling of input element"),
+            javascript_triggered=javascript_triggered,
+            interacted_with_sibling=True,
+        )
+    except Exception as e:
+        LOG.warning("Failed to click sibling label of input element", exc_info=e)
+        return ActionFailure(exception=e, javascript_triggered=javascript_triggered)
+
+
+async def extract_information_for_navigation_goal(
+    task: Task,
+    step: Step,
+    scraped_page: ScrapedPage,
+) -> ScrapeResult:
+    """
+    Ask the LLM to extract information from an already scraped page.
+
+    Renders the "extract-information" prompt from the task's goals and schema and the
+    scraped page's elements, URL and text, sends it together with the page screenshots,
+    and wraps the JSON response in a ScrapeResult.
+    """
+    prompt_inputs = dict(
+        navigation_goal=task.navigation_goal,
+        elements=scraped_page.element_tree,
+        data_extraction_goal=task.data_extraction_goal,
+        extracted_information_schema=task.extracted_information_schema,
+        current_url=scraped_page.url,
+        extracted_text=scraped_page.extracted_text,
+    )
+    extraction_prompt = prompt_engine.load_prompt("extract-information", **prompt_inputs)
+
+    llm_json = await app.OPENAI_CLIENT.chat_completion(
+        step=step,
+        prompt=extraction_prompt,
+        screenshots=scraped_page.screenshots,
+    )
+    return ScrapeResult(scraped_data=llm_json)
--- a/skyvern/webeye/actions/models.py
+++ b/skyvern/webeye/actions/models.py
@@ -0,0 +1,58 @@
+from __future__ import annotations
+
+from typing import Any
+
+from pydantic import BaseModel
+
+from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.webeye.actions.actions import Action, ActionTypeUnion
+from skyvern.webeye.actions.responses import ActionResult
+from skyvern.webeye.scraper.scraper import ScrapedPage
+
+
+class AgentStepOutput(BaseModel):
+    """
+    Result of one agent step in the form that is recorded in the database.
+    """
+
+    # Will be deprecated once we move to the new format below
+    action_results: list[ActionResult] | None = None
+    # Nullable for backwards compatibility, once backfill is done, this won't be nullable anymore
+    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None = None
+
+    def __repr__(self) -> str:
+        # Render the full model dump rather than pydantic's default field listing.
+        dumped = self.model_dump()
+        return f"AgentStepOutput({dumped})"
+
+    def __str__(self) -> str:
+        return repr(self)
+
+
+class DetailedAgentStepOutput(BaseModel):
+    """
+    Verbose result of one agent step. It is not recorded in the database and is only used for
+    debugging in the Jupyter notebook.
+    """
+
+    scraped_page: ScrapedPage | None
+    extract_action_prompt: str | None
+    llm_response: dict[str, Any] | None
+    actions: list[Action] | None
+    action_results: list[ActionResult] | None
+    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None
+
+    class Config:
+        # NOTE(review): presumably meant to drop these fields from dumps — confirm that the
+        # pydantic version in use honours a model-level ``exclude`` setting.
+        exclude = ["scraped_page", "extract_action_prompt"]
+
+    def __repr__(self) -> str:
+        # Outside debug mode only the compact, database-shaped output is shown.
+        if not SettingsManager.get_settings().DEBUG_MODE:
+            return f"AgentStepOutput({self.to_agent_step_output().model_dump()})"
+        return f"DetailedAgentStepOutput({self.model_dump()})"
+
+    def __str__(self) -> str:
+        return repr(self)
+
+    def to_agent_step_output(self) -> AgentStepOutput:
+        """Build the database-shaped output, replacing missing or empty result lists with ``[]``."""
+        return AgentStepOutput(
+            action_results=self.action_results or [],
+            actions_and_results=self.actions_and_results or [],
+        )
--- a/skyvern/webeye/actions/responses.py
+++ b/skyvern/webeye/actions/responses.py
@@ -0,0 +1,62 @@
+from typing import Any
+
+from pydantic import BaseModel
+
+from skyvern.webeye.string_util import remove_whitespace
+
+
+class ActionResult(BaseModel):
+    """Outcome of executing one action (or one attempt within a chained action)."""
+
+    # True for a successful attempt, False for a failed one.
+    success: bool
+    # Class name and message of the exception behind a failure; None on success.
+    exception_type: str | None = None
+    exception_message: str | None = None
+    # Optional payload produced by the action (e.g. extracted data of a complete action).
+    data: dict[str, Any] | list | str | None = None
+    step_retry_number: int | None = None
+    step_order: int | None = None
+    # True when the clicked element was an anchor with a "javascript:" href.
+    javascript_triggered: bool = False
+    # None is used for old data so that we can differentiate between old and new data which only has boolean
+    interacted_with_sibling: bool | None = None
+    interacted_with_parent: bool | None = None
+
+    def __str__(self) -> str:
+        # Keep ``data`` inside the parentheses so the representation is well-formed.
+        return (
+            f"ActionResult(success={self.success}, exception_type={self.exception_type}, "
+            f"exception_message={self.exception_message}, data={self.data})"
+        )
+
+    def __repr__(self) -> str:
+        return self.__str__()
+
+
+class ActionSuccess(ActionResult):
+    """``ActionResult`` with ``success`` fixed to True."""
+
+    def __init__(
+        self,
+        data: dict[str, Any] | list | str | None = None,
+        javascript_triggered: bool = False,
+        interacted_with_sibling: bool = False,
+        interacted_with_parent: bool = False,
+    ):
+        # Collect the caller-supplied fields, then let the base model validate them.
+        result_fields = dict(
+            data=data,
+            javascript_triggered=javascript_triggered,
+            interacted_with_sibling=interacted_with_sibling,
+            interacted_with_parent=interacted_with_parent,
+        )
+        super().__init__(success=True, **result_fields)
+
+
+class ActionFailure(ActionResult):
+    """``ActionResult`` with ``success`` fixed to False, described by the causing exception."""
+
+    def __init__(
+        self,
+        exception: Exception,
+        javascript_triggered: bool = False,
+        interacted_with_sibling: bool = False,
+        interacted_with_parent: bool = False,
+    ):
+        # Only the exception's class name and whitespace-normalised message are stored.
+        exception_name = type(exception).__name__
+        exception_text = remove_whitespace(str(exception))
+        super().__init__(
+            success=False,
+            exception_type=exception_name,
+            exception_message=exception_text,
+            javascript_triggered=javascript_triggered,
+            interacted_with_sibling=interacted_with_sibling,
+            interacted_with_parent=interacted_with_parent,
+        )