Move the code over from private repository (#3)

This commit is contained in:
Kerem Yilmaz
2024-03-01 10:09:30 -08:00
committed by GitHub
parent 32dd6d92a5
commit 9eddb3d812
93 changed files with 16798 additions and 0 deletions

View File

View File

View File

@@ -0,0 +1,204 @@
import abc
from enum import StrEnum
from typing import Any, Dict, List
import structlog
from pydantic import BaseModel
from skyvern.forge.sdk.schemas.tasks import Task
LOG = structlog.get_logger()
class ActionType(StrEnum):
    """
    String-valued enumeration of every action kind defined in this module.
    parse_actions looks members up by their upper-cased NAME (e.g. "CLICK"), not by value.
    """

    CLICK = "click"
    INPUT_TEXT = "input_text"
    UPLOAD_FILE = "upload_file"
    SELECT_OPTION = "select_option"
    CHECKBOX = "checkbox"
    WAIT = "wait"
    NULL_ACTION = "null_action"
    SOLVE_CAPTCHA = "solve_captcha"
    TERMINATE = "terminate"
    COMPLETE = "complete"


# Note: Remember to update ActionTypeUnion when adding a new action type
class Action(BaseModel):
    """
    Base model for every action: the action kind plus optional free-text description and reasoning.
    """

    # Subclasses pin this to their own ActionType member via a default value.
    action_type: ActionType
    description: str | None = None
    reasoning: str | None = None
class WebAction(Action, abc.ABC):
    """
    Base for actions that target one element of the page, identified by ``element_id``.
    """

    # Integer id of the target element (parse_actions fills it from the "id" key of the raw action).
    element_id: int
class ClickAction(WebAction):
    """
    Click on an element. ``file_url`` is optional and defaults to None.
    """

    action_type: ActionType = ActionType.CLICK
    file_url: str | None = None

    def __repr__(self) -> str:
        # Compact form: only the fields specific to this action are shown.
        element_id = self.element_id
        file_url = self.file_url
        return f"ClickAction(element_id={element_id}, file_url={file_url})"
class InputTextAction(WebAction):
    """
    Type ``text`` into an element.
    """

    action_type: ActionType = ActionType.INPUT_TEXT
    text: str

    def __repr__(self) -> str:
        # Compact form: only the fields specific to this action are shown.
        element_id = self.element_id
        text = self.text
        return f"InputTextAction(element_id={element_id}, text={text})"
class UploadFileAction(WebAction):
    """
    Upload the file at ``file_url`` through an element.
    """

    action_type: ActionType = ActionType.UPLOAD_FILE
    file_url: str
    # Defaults to True; a flag describing whether the target is a file-input tag.
    is_upload_file_tag: bool = True

    def __repr__(self) -> str:
        element_id = self.element_id
        file_url = self.file_url
        is_upload_file_tag = self.is_upload_file_tag
        return (
            f"UploadFileAction(element_id={element_id}, file={file_url}, "
            f"is_upload_file_tag={is_upload_file_tag})"
        )
class NullAction(Action):
    """
    Action that targets no element; parse_actions emits it when a raw action has no action type.
    """

    action_type: ActionType = ActionType.NULL_ACTION
class SolveCaptchaAction(Action):
    """
    Action of type SOLVE_CAPTCHA; carries no fields beyond the base ``Action`` ones.
    """

    action_type: ActionType = ActionType.SOLVE_CAPTCHA
class SelectOption(BaseModel):
    """
    Describes which option to pick: by label, by value and/or by index.
    Each field accepts None, but none of them declares a default.
    """

    label: str | None
    value: str | None
    index: int | None

    def __repr__(self) -> str:
        label, value, index = self.label, self.value, self.index
        return f"SelectOption(label={label}, value={value}, index={index})"
class SelectOptionAction(WebAction):
    """
    Choose ``option`` on the targeted element.
    """

    action_type: ActionType = ActionType.SELECT_OPTION
    option: SelectOption

    def __repr__(self) -> str:
        element_id = self.element_id
        option = self.option
        return f"SelectOptionAction(element_id={element_id}, option={option})"
###
# This action causes more harm than good.
# It frequently misbehaves, or gets stuck in click loops.
# Treating checkbox actions as click actions seems to perform far more reliably.
# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
###
class CheckboxAction(WebAction):
    """
    Set a checkbox element to the state given by ``is_checked``.
    """

    action_type: ActionType = ActionType.CHECKBOX
    is_checked: bool

    def __repr__(self) -> str:
        element_id = self.element_id
        is_checked = self.is_checked
        return f"CheckboxAction(element_id={element_id}, is_checked={is_checked})"
class WaitAction(Action):
    """
    Action of type WAIT; carries no fields beyond the base ``Action`` ones.
    """

    action_type: ActionType = ActionType.WAIT
class TerminateAction(Action):
    """
    Action of type TERMINATE; carries no fields beyond the base ``Action`` ones.
    """

    action_type: ActionType = ActionType.TERMINATE
class CompleteAction(Action):
    """
    Action of type COMPLETE, optionally carrying a data extraction goal.
    """

    action_type: ActionType = ActionType.COMPLETE
    # parse_actions copies this from task.data_extraction_goal.
    data_extraction_goal: str | None = None
def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
    """
    Turn the raw action dicts of an LLM response into typed ``Action`` models.

    :param task: task the response belongs to; supplies ``task_id`` for logging and the goals used
        when a ``CompleteAction`` is built.
    :param json_response: one dict per action. Every dict must contain ``id``; ``action_type`` and
        ``reasoning`` are optional; the remaining keys depend on the action type.
    :return: the parsed actions in response order. A COMPLETE action short-circuits parsing: whatever
        was parsed before it is discarded and a single ``CompleteAction`` is returned.
    :raises KeyError: if a dict lacks ``id`` or a key required by its action type
        (e.g. ``text`` for INPUT_TEXT).

    Action types that are not recognised are logged and skipped instead of aborting the whole parse.
    """
    actions: List[Action] = []
    for action in json_response:
        element_id = action["id"]
        reasoning = action.get("reasoning")

        raw_action_type = action.get("action_type")
        if raw_action_type is None:
            actions.append(NullAction(reasoning=reasoning))
            continue

        # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
        action_type_name = raw_action_type.upper()

        # "NULL" is not an ActionType member name, so it must be recognised before the enum lookup;
        # NULL_ACTION is the member that NullAction itself carries.
        if action_type_name in ("NULL", ActionType.NULL_ACTION.name):
            actions.append(NullAction(reasoning=reasoning))
            continue

        try:
            action_type = ActionType[action_type_name]
        except KeyError:
            # Lookup by name raises for anything that is not a member; log and move on to the next action.
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=raw_action_type,
                raw_action=action,
            )
            continue

        if action_type == ActionType.TERMINATE:
            LOG.warning(
                "Agent decided to terminate",
                task_id=task.task_id,
                llm_response=json_response,
                reasoning=reasoning,
                actions=actions,
            )
            actions.append(TerminateAction(reasoning=reasoning))
        elif action_type == ActionType.CLICK:
            file_url = action.get("file_url")
            actions.append(ClickAction(element_id=element_id, reasoning=reasoning, file_url=file_url))
        elif action_type == ActionType.INPUT_TEXT:
            actions.append(InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning))
        elif action_type == ActionType.UPLOAD_FILE:
            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
            actions.append(UploadFileAction(element_id=element_id, file_url=action["file_url"], reasoning=reasoning))
        elif action_type == ActionType.SELECT_OPTION:
            raw_option = action["option"]
            actions.append(
                SelectOptionAction(
                    element_id=element_id,
                    option=SelectOption(
                        label=raw_option["label"],
                        value=raw_option["value"],
                        index=raw_option["index"],
                    ),
                    reasoning=reasoning,
                )
            )
        elif action_type == ActionType.CHECKBOX:
            actions.append(CheckboxAction(element_id=element_id, is_checked=action["is_checked"], reasoning=reasoning))
        elif action_type == ActionType.WAIT:
            actions.append(WaitAction(reasoning=reasoning))
        elif action_type == ActionType.COMPLETE:
            if actions:
                LOG.info(
                    "Navigation goal achieved, creating complete action and discarding all other actions except "
                    "complete action",
                    task_id=task.task_id,
                    nav_goal=task.navigation_goal,
                    actions=actions,
                    llm_response=json_response,
                )
            return [CompleteAction(reasoning=reasoning, data_extraction_goal=task.data_extraction_goal)]
        elif action_type == ActionType.SOLVE_CAPTCHA:
            actions.append(SolveCaptchaAction(reasoning=reasoning))
        else:
            # Guards against an ActionType member that was added without a branch above.
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=action_type,
                raw_action=action,
            )
    return actions
class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage, including:
    1. JSON representation of what the user is seeing
    """

    # Either a single JSON object or a list of JSON objects.
    scraped_data: dict[str, Any] | list[dict[str, Any]]
# Union of every concrete action model, used as the field type wherever a stored action has to be
# deserialized back into its specific subclass rather than the base ``Action``.
# Background: https://blog.devgenius.io/deserialize-child-classes-with-pydantic-that-gonna-work-784230e1cf83
# Keep this in sync with ActionType: every new action class must be added here.
ActionTypeUnion = (
    ClickAction
    | InputTextAction
    | UploadFileAction
    | SelectOptionAction
    | CheckboxAction
    | WaitAction
    | NullAction
    | SolveCaptchaAction
    | TerminateAction
    | CompleteAction
)

View File

@@ -0,0 +1,445 @@
import asyncio
import re
from typing import Awaitable, Callable, List
import structlog
from playwright.async_api import Locator, Page
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.api.files import download_file
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.tasks import Task
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.actions import actions
from skyvern.webeye.actions.actions import Action, ActionType, ClickAction, ScrapeResult, UploadFileAction, WebAction
from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ScrapedPage
LOG = structlog.get_logger()
class ActionHandler:
    """
    Registry that maps an ``ActionType`` to the coroutine executing it, plus the dispatch entry point.
    """

    _handled_action_types: dict[
        ActionType, Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]]
    ] = {}

    @classmethod
    def register_action_type(
        cls,
        action_type: ActionType,
        handler: Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]],
    ) -> None:
        """
        Register (or replace) the handler used for ``action_type``.
        """
        cls._handled_action_types[action_type] = handler

    @staticmethod
    async def handle_action(
        scraped_page: ScrapedPage,
        task: Task,
        step: Step,
        browser_state: BrowserState,
        action: Action,
    ) -> list[ActionResult]:
        """
        Run ``action`` through its registered handler and return the handler's results.

        Exceptions raised while handling are never propagated; they are logged and reported as a
        single ``ActionFailure``. An action type without a registered handler is reported the same way.
        Obtaining the page from ``browser_state`` happens before that protection and may raise.
        """
        LOG.info("Handling action", action=action)
        page = await browser_state.get_or_create_page()
        registry = ActionHandler._handled_action_types
        try:
            if action.action_type not in registry:
                LOG.error("Unsupported action type in handler", action=action, type=type(action))
                return [ActionFailure(Exception(f"Unsupported action type: {type(action)}"))]

            run_handler = registry[action.action_type]
            return await run_handler(action, page, scraped_page, task, step)
        except MissingElement as e:
            # Expected failure mode: logged at info level without a traceback.
            LOG.info("Known exceptions", action=action, exception_type=type(e), exception_message=str(e))
            return [ActionFailure(e)]
        except MultipleElementsFound as e:
            LOG.exception(
                "Cannot handle multiple elements with the same xpath in one action.",
                action=action,
                exception=e,
            )
            return [ActionFailure(e)]
        except Exception as e:
            LOG.exception("Unhandled exception in action handler", action=action, exception=e)
            return [ActionFailure(e)]
async def handle_solve_captcha_action(
    action: actions.SolveCaptchaAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Log a request to solve the captcha, pause for 30 seconds, then report success.
    Nothing is done to the page and the outcome of the captcha is not checked.
    """
    pause_seconds = 30
    LOG.warning(
        "Please solve the captcha on the page, you have 30 seconds",
        action=action,
    )
    await asyncio.sleep(pause_seconds)
    solved = ActionSuccess()
    return [solved]
async def handle_click_action(
    action: actions.ClickAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Validate that the target exists exactly once in the DOM, wait briefly, then click it via chain_click.
    """
    target_xpath = await validate_actions_in_dom(action, page, scraped_page)
    await asyncio.sleep(0.3)
    click_timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
    click_results = await chain_click(page, action, target_xpath, timeout=click_timeout)
    return click_results
async def handle_input_text_action(
    action: actions.InputTextAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Fill the target element with ``action.text`` and press Tab.

    If the element reports an empty value afterwards, the text is typed again key by key and
    confirmed with Enter. Success is returned either way; failures surface as exceptions.
    """

    def action_timeout() -> int:
        # Looked up on every use rather than cached once.
        return SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS

    xpath = await validate_actions_in_dom(action, page, scraped_page)
    field = page.locator(f"xpath={xpath}")
    await field.clear()
    await field.fill(action.text, timeout=action_timeout())

    # This is a hack that gets dropdowns to select the "best" option based on what's typed
    # Fixes situations like tsk_228671423990405776 where the location isn't being autocompleted
    await field.press("Tab", timeout=action_timeout())

    current_value = await field.input_value(timeout=action_timeout())
    if current_value:
        return [ActionSuccess()]

    # Nothing ended up in the field: retry with individual key presses.
    LOG.info("Failed to input the text, trying to press sequentially with an enter click", action=action)
    await field.clear()
    await field.press_sequentially(action.text, timeout=action_timeout())
    await field.press("Enter", timeout=action_timeout())
    current_value = await field.input_value(timeout=action_timeout())
    LOG.info("Input value", input_value=current_value, action=action)
    return [ActionSuccess()]
async def handle_upload_file_action(
    action: actions.UploadFileAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Upload the file referenced by ``action.file_url`` through the target element.

    The URL must be non-empty and must occur in the string form of the task's navigation payload;
    otherwise a failure is returned without touching the page. For a file input the downloaded
    file is set directly on the element. For any other element the action is flagged
    (``is_upload_file_tag = False``) and handed to chain_click.
    """
    file_url = action.file_url
    if not file_url:
        LOG.warning("InputFileAction has no file_url", action=action)
        return [ActionFailure(MissingFileUrl())]

    if file_url not in str(task.navigation_payload):
        LOG.warning(
            "LLM might be imagining the file url, which is not in navigation payload",
            action=action,
            file_url=action.file_url,
        )
        return [ActionFailure(ImaginaryFileUrl(action.file_url))]

    xpath = await validate_actions_in_dom(action, page, scraped_page)
    downloaded_path = download_file(action.file_url)
    target = page.locator(f"xpath={xpath}")

    if not await is_file_input_element(target):
        LOG.info("Taking UploadFileAction. Found non file input tag", action=action)
        # treat it as a click action
        action.is_upload_file_tag = False
        return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)

    LOG.info("Taking UploadFileAction. Found file input tag", action=action)
    if not downloaded_path:
        return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]

    await target.set_input_files(downloaded_path, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
    # Sleep for 10 seconds after uploading a file to let the page process it
    await asyncio.sleep(10)
    return [ActionSuccess()]
async def handle_null_action(
    action: actions.NullAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    No-op handler: a null action always succeeds and does not touch the page.
    """
    outcome = ActionSuccess()
    return [outcome]
async def _click_select_click(page: Page, xpath: str, **selection: str | int | None) -> None:
    """
    Click the select, choose the option described by ``selection`` (``label=`` or ``index=``), click again.
    """
    await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
    await page.select_option(
        xpath,
        timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
        **selection,
    )
    await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)


def _resolve_select_index(scraped_page: ScrapedPage, index: int) -> int:
    """
    Work out which index to pass to ``select_option`` for the index supplied with the action.
    """
    try:
        option_xpath = scraped_page.id_to_xpath_dict[index]
    except KeyError:
        # Not an element id at all, so it can only be an index into the select element itself.
        return index

    match = re.search(r"option\[(\d+)]$", option_xpath)
    if match:
        # The index referenced an <option> element in the xpath dict: use the position from its xpath.
        # NOTE(review): XPath positions are 1-based while select_option's index is 0-based -- confirm
        # against how the scraper builds these xpaths before relying on this.
        return int(match.group(1))

    # This means the supplied index was for the select element, not a reference to the xpath dict
    return index


async def handle_select_option_action(
    action: actions.SelectOptionAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Select an option on the target element, first by label and, if that fails, by index.

    :return: a single ``ActionSuccess`` when either attempt works; a single ``ActionFailure`` with the
        label error when no index is available, or with the index error when the fallback fails too.
    :raises MissingElement / MultipleElementsFound: propagated from the DOM validation of the target.
    """
    xpath = await validate_actions_in_dom(action, page, scraped_page)

    try:
        # First click by label (if it matches)
        await _click_select_click(page, xpath, label=action.option.label)
        return [ActionSuccess()]
    except Exception as e:
        if action.option.index is None:
            return [ActionFailure(e)]
        LOG.warning(
            "Failed to click on the option by label, trying by index",
            exc_info=e,
            action=action,
            xpath=xpath,
        )

    try:
        option_index = _resolve_select_index(scraped_page, action.option.index)
        await _click_select_click(page, xpath, index=option_index)
        return [ActionSuccess()]
    except Exception as e:
        LOG.warning("Failed to click on the option by index", exception=e, action=action)
        return [ActionFailure(e)]
async def handle_checkbox_action(
    self: actions.CheckboxAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    ******* NOT REGISTERED *******
    This action causes more harm than good.
    It frequently misbehaves, or gets stuck in click loops.
    Treating checkbox actions as click actions seems to perform far more reliably.
    Developers who tried this and failed: 2 (Suchintan and Shu 😂)

    Checks or unchecks the target element according to ``is_checked`` and reports success.
    """
    xpath = await validate_actions_in_dom(self, page, scraped_page)

    # Pick the page operation first, then run it with the configured timeout.
    set_state = page.check if self.is_checked else page.uncheck
    await set_state(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)

    # TODO (suchintan): Why does checking the label work, but not the actual input element?
    return [ActionSuccess()]
async def handle_wait_action(
    action: actions.WaitAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Sleep for 10 seconds. The result is deliberately an ``ActionFailure``, never a success.
    """
    wait_seconds = 10
    await asyncio.sleep(wait_seconds)
    wait_failure = ActionFailure(exception=Exception("Wait action is treated as a failure"))
    return [wait_failure]
async def handle_terminate_action(
    action: actions.TerminateAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Nothing is done to the page for a terminate action; it is simply reported as successful.
    """
    outcome = ActionSuccess()
    return [outcome]
async def handle_complete_action(
    action: actions.CompleteAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Finish the task. When the action has a data extraction goal, the information is extracted
    from the scraped page and attached to the success result; otherwise the data is None.
    """
    if not action.data_extraction_goal:
        return [ActionSuccess(data=None)]

    scrape_result = await extract_information_for_navigation_goal(
        scraped_page=scraped_page,
        task=task,
        step=step,
    )
    return [ActionSuccess(data=scrape_result.scraped_data)]
# Wire each supported action type to its handler. ActionType.CHECKBOX is not registered here.
for _action_type, _action_handler in (
    (ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action),
    (ActionType.CLICK, handle_click_action),
    (ActionType.INPUT_TEXT, handle_input_text_action),
    (ActionType.UPLOAD_FILE, handle_upload_file_action),
    (ActionType.NULL_ACTION, handle_null_action),
    (ActionType.SELECT_OPTION, handle_select_option_action),
    (ActionType.WAIT, handle_wait_action),
    (ActionType.TERMINATE, handle_terminate_action),
    (ActionType.COMPLETE, handle_complete_action),
):
    ActionHandler.register_action_type(_action_type, _action_handler)
# Do not leave the loop variables behind as module attributes.
del _action_type, _action_handler
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
    """
    Look up the xpath recorded for ``action.element_id`` and make sure it matches exactly one element.

    :return: the xpath of the target element.
    :raises MissingElement: when the xpath matches nothing on the page.
    :raises MultipleElementsFound: when the xpath matches more than one element.
    """
    xpath = scraped_page.id_to_xpath_dict[action.element_id]
    match_count = await page.locator(xpath).count()

    if match_count == 1:
        LOG.info("Validated action xpath in DOM", action=action)
        return xpath

    if match_count < 1:
        LOG.warning("No elements found with action xpath. Validation failed.", action=action, xpath=xpath)
        raise MissingElement(xpath=xpath, element_id=action.element_id)

    LOG.warning(
        "Multiple elements found with action xpath. Expected 1. Validation failed.",
        action=action,
        num_elements=match_count,
    )
    raise MultipleElementsFound(num=match_count, xpath=xpath, element_id=action.element_id)
async def chain_click(
    page: Page,
    action: ClickAction | UploadFileAction,
    xpath: str,
    timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> List[ActionResult]:
    """
    Clicks on an element identified by the xpath and its parent if failed.

    If the direct click fails and the element is an input, its sibling label is tried next; if that
    does not succeed either (or the element is not an input), the parent element is clicked.

    :param page: page to act on.
    :param action: the action being executed; its ``file_url`` (if any) is downloaded and supplied to
        any file chooser that opens while clicking.
    :param xpath: xpath of the element to click
    :param timeout: per-click timeout in milliseconds. Note that the default is read from the settings
        once, when this function is defined, not on every call.
    :return: one result per attempt, in order; the last entry reflects the final outcome.
    """
    # Add a defensive page handler here in case a click action opens a file chooser.
    # This automatically dismisses the dialog
    # File choosers are impossible to close if you don't expect one. Instead of dealing with it, close it!
    # TODO (suchintan): This should likely result in an ActionFailure -- we can figure out how to do this later!
    LOG.info("Chain click starts", action=action, xpath=xpath)
    file: list[str] | str = []
    if action.file_url:
        file = download_file(action.file_url) or []

    def fc_func(fc):  # type: ignore[no-untyped-def]
        # Answer the chooser with the downloaded file, or with no files at all to dismiss it.
        return fc.set_files(files=file)

    page.on("filechooser", fc_func)
    LOG.info("Registered file chooser listener", action=action, path=file)

    # Everything after the listener registration runs under this try so that the listener is removed
    # on every exit path, including an error raised while inspecting the element.
    try:
        javascript_triggered = await is_javascript_triggered(page, xpath)
        try:
            await page.click(f"xpath={xpath}", timeout=timeout)
            LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
            return [ActionSuccess(javascript_triggered=javascript_triggered)]
        except Exception as e:
            action_results: list[ActionResult] = [ActionFailure(e, javascript_triggered=javascript_triggered)]
            if await is_input_element(page.locator(xpath)):
                LOG.info("Chain click: it's an input element. going to try sibling click", action=action, xpath=xpath)
                sibling_action_result = await click_sibling_of_input(page.locator(xpath), timeout=timeout)
                action_results.append(sibling_action_result)
                if isinstance(sibling_action_result, ActionSuccess):
                    return action_results

            parent_xpath = f"{xpath}/.."
            try:
                parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
                javascript_triggered = javascript_triggered or parent_javascript_triggered
                parent_locator = page.locator(xpath).locator("..")
                await parent_locator.click(timeout=timeout)
                LOG.info("Chain click: successfully clicked parent element", action=action, parent_xpath=parent_xpath)
                action_results.append(
                    ActionSuccess(
                        javascript_triggered=javascript_triggered,
                        interacted_with_parent=True,
                    )
                )
            except Exception as pe:
                LOG.warning("Failed to click parent element", action=action, parent_xpath=parent_xpath, exc_info=True)
                action_results.append(
                    ActionFailure(pe, javascript_triggered=javascript_triggered, interacted_with_parent=True)
                )
                # We don't raise exception here because we do log the exception, and return ActionFailure as the last action
            return action_results
    finally:
        LOG.info("Remove file chooser listener", action=action)
        # Sleep for 10 seconds after uploading a file to let the page process it
        # Removing this breaks file uploads using the filechooser
        # KEREM DO NOT REMOVE
        if file:
            await asyncio.sleep(10)
        page.remove_listener("filechooser", fc_func)
async def is_javascript_triggered(page: Page, xpath: str) -> bool:
    """
    Tell whether the first element matching ``xpath`` is an anchor whose href starts with "javascript:".
    """
    element = page.locator(f"xpath={xpath}").first

    tag_name = await element.evaluate("e => e.tagName")
    if tag_name.lower() != "a":
        return False

    href = await element.evaluate("e => e.href")
    if not href.lower().startswith("javascript:"):
        return False

    LOG.info("Found javascript call in anchor tag, marking step as completed. Dropping remaining actions")
    return True
async def is_file_input_element(locator: Locator) -> bool:
    """
    Return True when the first element of ``locator`` is an ``<input>`` whose type is "file".
    """
    first_match = locator.first
    if not first_match:
        return False

    tag = await first_match.evaluate("el => el.tagName")
    input_type = await first_match.evaluate("el => el.type")
    is_input_tag = tag.lower() == "input"
    return is_input_tag and input_type == "file"
async def is_input_element(locator: Locator) -> bool:
    """
    Return True when the first element of ``locator`` has the tag name "input" (case-insensitive).
    """
    first_match = locator.first
    if not first_match:
        return False

    tag = await first_match.evaluate("el => el.tagName")
    return tag.lower() == "input"
async def click_sibling_of_input(
    locator: Locator,
    timeout: int,
    javascript_triggered: bool = False,
) -> ActionResult:
    """
    Click the ``<label for=...>`` that belongs to an input element, searched under the input's parent.

    :param locator: locator of the input element.
    :param timeout: click timeout in milliseconds.
    :param javascript_triggered: value copied into the returned result.
    :return: ``ActionSuccess`` (``interacted_with_sibling=True``) when the label was clicked, otherwise
        an ``ActionFailure``. Exceptions are logged and converted into a failure, never raised.
    """
    try:
        input_element = locator.first
        parent_locator = locator.locator("..")
        if input_element:
            input_id = await input_element.get_attribute("id")
            if not input_id:
                # Without an id no label can point at this input through `for`; building the selector
                # anyway would only wait out the whole timeout on a label that cannot exist.
                return ActionFailure(
                    exception=Exception("Input element has no id attribute, cannot look up its sibling label"),
                    javascript_triggered=javascript_triggered,
                )

            sibling_label_xpath = f'//label[@for="{input_id}"]'
            label_locator = parent_locator.locator(sibling_label_xpath)
            await label_locator.click(timeout=timeout)
            LOG.info(
                "Successfully clicked sibling label of input element",
                sibling_label_xpath=sibling_label_xpath,
            )
            return ActionSuccess(javascript_triggered=javascript_triggered, interacted_with_sibling=True)
        # Should never get here
        return ActionFailure(
            exception=Exception("Failed while trying to click sibling of input element"),
            javascript_triggered=javascript_triggered,
            interacted_with_sibling=True,
        )
    except Exception as e:
        LOG.warning("Failed to click sibling label of input element", exc_info=e)
        return ActionFailure(exception=e, javascript_triggered=javascript_triggered)
async def extract_information_for_navigation_goal(
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
) -> ScrapeResult:
    """
    Ask the LLM to extract information from an already scraped page.

    The "extract-information" prompt is rendered from the task's goals and schema together with the
    page's element tree, URL and extracted text, and is sent along with the page screenshots.
    The JSON answer is returned wrapped in a ``ScrapeResult``.
    """
    prompt_variables = {
        "navigation_goal": task.navigation_goal,
        "elements": scraped_page.element_tree,
        "data_extraction_goal": task.data_extraction_goal,
        "extracted_information_schema": task.extracted_information_schema,
        "current_url": scraped_page.url,
        "extracted_text": scraped_page.extracted_text,
    }
    extract_information_prompt = prompt_engine.load_prompt("extract-information", **prompt_variables)

    llm_answer = await app.OPENAI_CLIENT.chat_completion(
        step=step,
        prompt=extract_information_prompt,
        screenshots=scraped_page.screenshots,
    )
    return ScrapeResult(scraped_data=llm_answer)

View File

@@ -0,0 +1,58 @@
from __future__ import annotations
from typing import Any
from pydantic import BaseModel
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.actions.actions import Action, ActionTypeUnion
from skyvern.webeye.actions.responses import ActionResult
from skyvern.webeye.scraper.scraper import ScrapedPage
class AgentStepOutput(BaseModel):
    """
    Result of one agent step in the form that is recorded in the database.
    """

    # Will be deprecated once we move to the new format below
    action_results: list[ActionResult] | None = None
    # Nullable for backwards compatibility, once backfill is done, this won't be nullable anymore
    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None = None

    def __repr__(self) -> str:
        dumped = self.model_dump()
        return f"AgentStepOutput({dumped})"

    def __str__(self) -> str:
        # str() and repr() intentionally produce the same text.
        return repr(self)
class DetailedAgentStepOutput(BaseModel):
    """
    Verbose result of one agent step. It is not recorded in the database and only serves debugging
    in the Jupyter notebook.
    """

    scraped_page: ScrapedPage | None
    extract_action_prompt: str | None
    llm_response: dict[str, Any] | None
    actions: list[Action] | None
    action_results: list[ActionResult] | None
    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None

    class Config:
        exclude = ["scraped_page", "extract_action_prompt"]

    def __repr__(self) -> str:
        # Outside debug mode only the compact, persisted representation is shown.
        if not SettingsManager.get_settings().DEBUG_MODE:
            compact = self.to_agent_step_output().model_dump()
            return f"AgentStepOutput({compact})"
        detailed = self.model_dump()
        return f"DetailedAgentStepOutput({detailed})"

    def __str__(self) -> str:
        return repr(self)

    def to_agent_step_output(self) -> AgentStepOutput:
        """
        Reduce this object to the persisted ``AgentStepOutput``; missing or empty lists become ``[]``.
        """
        return AgentStepOutput(
            action_results=self.action_results or [],
            actions_and_results=self.actions_and_results or [],
        )

View File

@@ -0,0 +1,62 @@
from typing import Any
from pydantic import BaseModel
from skyvern.webeye.string_util import remove_whitespace
class ActionResult(BaseModel):
    """
    Outcome of executing a single action: a success flag, optional exception details, optional
    payload data and bookkeeping flags about how the page was interacted with.
    """

    success: bool
    exception_type: str | None = None
    exception_message: str | None = None
    data: dict[str, Any] | list | str | None = None
    step_retry_number: int | None = None
    step_order: int | None = None
    javascript_triggered: bool = False
    # None is used for old data so that we can differentiate between old and new data which only has boolean
    interacted_with_sibling: bool | None = None
    interacted_with_parent: bool | None = None

    def __str__(self) -> str:
        # The closing parenthesis belongs after `data`, so that the text is one balanced expression.
        return (
            f"ActionResult(success={self.success}, exception_type={self.exception_type}, "
            f"exception_message={self.exception_message}, data={self.data})"
        )

    def __repr__(self) -> str:
        return self.__str__()
class ActionSuccess(ActionResult):
    """
    ``ActionResult`` with ``success`` fixed to True.
    """

    def __init__(
        self,
        data: dict[str, Any] | list | str | None = None,
        javascript_triggered: bool = False,
        interacted_with_sibling: bool = False,
        interacted_with_parent: bool = False,
    ):
        # Collect the model fields first, then hand them to the base model in one go.
        fields = {
            "success": True,
            "data": data,
            "javascript_triggered": javascript_triggered,
            "interacted_with_sibling": interacted_with_sibling,
            "interacted_with_parent": interacted_with_parent,
        }
        super().__init__(**fields)
class ActionFailure(ActionResult):
    """
    ``ActionResult`` with ``success`` fixed to False, described by the exception that caused it.
    """

    def __init__(
        self,
        exception: Exception,
        javascript_triggered: bool = False,
        interacted_with_sibling: bool = False,
        interacted_with_parent: bool = False,
    ):
        # Only the class name and the whitespace-normalised message of the exception are kept.
        fields = {
            "success": False,
            "exception_type": type(exception).__name__,
            "exception_message": remove_whitespace(str(exception)),
            "javascript_triggered": javascript_triggered,
            "interacted_with_sibling": interacted_with_sibling,
            "interacted_with_parent": interacted_with_parent,
        }
        super().__init__(**fields)

View File

@@ -0,0 +1,167 @@
from __future__ import annotations
import uuid
from datetime import datetime
from typing import Any, Awaitable, Protocol
import structlog
from playwright.async_api import BrowserContext, Error, Page, Playwright, async_playwright
from pydantic import BaseModel
from skyvern.exceptions import FailedToNavigateToUrl, UnknownBrowserType, UnknownErrorWhileCreatingBrowserContext
from skyvern.forge.sdk.core.skyvern_context import current
from skyvern.forge.sdk.settings_manager import SettingsManager
LOG = structlog.get_logger()
class BrowserContextCreator(Protocol):
    """
    Structural type of the factories registered with BrowserContextFactory: an async callable that
    receives the Playwright instance plus arbitrary keyword arguments and produces the browser
    context together with its artifacts.
    """

    # The annotation on **kwargs describes each keyword VALUE; callers pass e.g. url / proxy_location.
    def __call__(
        self, playwright: Playwright, **kwargs: Any
    ) -> Awaitable[tuple[BrowserContext, BrowserArtifacts]]:
        ...
class BrowserContextFactory:
    """
    Registry of browser context creators keyed by browser type, plus helpers shared by the creators.
    """

    _creators: dict[str, BrowserContextCreator] = {}

    @staticmethod
    def get_subdir() -> str:
        """
        Name used for per-run artifacts: the current context's task id, else its request id,
        else a fresh random UUID.
        """
        context = current()
        if context:
            if context.task_id:
                return context.task_id
            if context.request_id:
                return context.request_id
        return str(uuid.uuid4())

    @staticmethod
    def build_browser_args() -> dict[str, Any]:
        """
        Keyword arguments for ``browser.new_context``: HAR and video recording locations (grouped by
        UTC date) and a fixed 1920x1080 viewport.
        """
        date_format = "%Y-%m-%d"
        video_dir = "/".join(
            [SettingsManager.get_settings().VIDEO_PATH, datetime.utcnow().strftime(date_format)]
        )
        video_dir = f"{video_dir}"
        har_root = SettingsManager.get_settings().HAR_PATH
        har_date = datetime.utcnow().strftime(date_format)
        har_dir = f"{har_root}/{har_date}/{BrowserContextFactory.get_subdir()}.har"
        browser_args: dict[str, Any] = {
            "record_har_path": har_dir,
            "record_video_dir": video_dir,
            "viewport": {"width": 1920, "height": 1080},
        }
        return browser_args

    @staticmethod
    def build_browser_artifacts(
        video_path: str | None = None, har_path: str | None = None, video_artifact_id: str | None = None
    ) -> BrowserArtifacts:
        """
        Bundle the given paths / ids into a ``BrowserArtifacts`` object.
        """
        artifacts = BrowserArtifacts(video_path=video_path, har_path=har_path, video_artifact_id=video_artifact_id)
        return artifacts

    @classmethod
    def register_type(cls, browser_type: str, creator: BrowserContextCreator) -> None:
        """
        Register (or replace) the creator used for ``browser_type``.
        """
        cls._creators[browser_type] = creator

    @classmethod
    async def create_browser_context(
        cls, playwright: Playwright, **kwargs: Any
    ) -> tuple[BrowserContext, BrowserArtifacts]:
        """
        Create a browser context with the creator registered for the configured BROWSER_TYPE.

        :raises UnknownBrowserType: when no creator is registered for the configured type.
        :raises UnknownErrorWhileCreatingBrowserContext: wrapping any other error raised by the creator.
        """
        browser_type = SettingsManager.get_settings().BROWSER_TYPE
        creator = cls._creators.get(browser_type)
        if not creator:
            raise UnknownBrowserType(browser_type)

        try:
            return await creator(playwright, **kwargs)
        except UnknownBrowserType:
            # Passed through untouched rather than wrapped.
            raise
        except Exception as e:
            raise UnknownErrorWhileCreatingBrowserContext(browser_type, e) from e
class BrowserArtifacts(BaseModel):
    """
    Locations of the recordings belonging to one browser context; every field is optional.
    """

    # Filled in by BrowserState.check_and_fix_state from the page's video once a page exists.
    video_path: str | None = None
    video_artifact_id: str | None = None
    # Set by the context creators to the HAR path they pass to the browser context.
    har_path: str | None = None
async def _create_headless_chromium(playwright: Playwright, **kwargs: dict) -> tuple[BrowserContext, BrowserArtifacts]:
    """
    Launch headless Chromium and open a context with the standard recording arguments.
    The extra keyword arguments are accepted but not used.
    """
    chromium = await playwright.chromium.launch(headless=True)
    context_args = BrowserContextFactory.build_browser_args()
    artifacts = BrowserContextFactory.build_browser_artifacts(har_path=context_args["record_har_path"])
    context = await chromium.new_context(**context_args)
    return context, artifacts
async def _create_headful_chromium(playwright: Playwright, **kwargs: dict) -> tuple[BrowserContext, BrowserArtifacts]:
    """
    Launch Chromium with a visible window and open a context with the standard recording arguments.
    The extra keyword arguments are accepted but not used.
    """
    chromium = await playwright.chromium.launch(headless=False)
    context_args = BrowserContextFactory.build_browser_args()
    artifacts = BrowserContextFactory.build_browser_artifacts(har_path=context_args["record_har_path"])
    context = await chromium.new_context(**context_args)
    return context, artifacts
# Make the two built-in Chromium flavours selectable through the BROWSER_TYPE setting.
for _browser_type, _creator in (
    ("chromium-headless", _create_headless_chromium),
    ("chromium-headful", _create_headful_chromium),
):
    BrowserContextFactory.register_type(_browser_type, _creator)
# Do not leave the loop variables behind as module attributes.
del _browser_type, _creator
class BrowserState:
    """
    Holds the Playwright instance, browser context, page and artifacts of one browsing session and
    lazily creates whichever of them is still missing.
    """

    instance = None

    def __init__(
        self,
        pw: Playwright | None = None,
        browser_context: BrowserContext | None = None,
        page: Page | None = None,
        browser_artifacts: BrowserArtifacts | None = None,
    ):
        """
        :param pw: running Playwright instance, or None to start one on demand.
        :param browser_context: existing context, or None to create one on demand.
        :param page: existing page, or None to open one on demand.
        :param browser_artifacts: artifacts of the context; a fresh, empty object is created when omitted.
        """
        self.pw = pw
        self.browser_context = browser_context
        self.page = page
        # A default instance in the signature would be shared by every BrowserState and then mutated
        # by check_and_fix_state (video_path), so the empty object is created per instance instead.
        self.browser_artifacts = browser_artifacts if browser_artifacts is not None else BrowserArtifacts()

    async def _close_all_other_pages(self) -> None:
        """
        Close every page of the context except ``self.page``; no-op without a context or page.
        """
        if not self.browser_context or not self.page:
            return
        pages = self.browser_context.pages
        for page in pages:
            if page != self.page:
                await page.close()

    async def check_and_fix_state(self, url: str | None = None) -> None:
        """
        Make sure Playwright, the browser context and the page all exist, creating what is missing.

        ``url`` is passed to the context creator and, only when a NEW page is opened here, the page is
        navigated to it. An already existing page is left where it is.

        :raises FailedToNavigateToUrl: when navigating the new page to ``url`` fails.
        """
        if self.pw is None:
            LOG.info("Starting playwright")
            self.pw = await async_playwright().start()
            LOG.info("playwright is started")
        if self.browser_context is None:
            LOG.info("creating browser context")
            browser_context, browser_artifacts = await BrowserContextFactory.create_browser_context(self.pw, url=url)
            self.browser_context = browser_context
            self.browser_artifacts = browser_artifacts
            LOG.info("browser context is created")

        assert self.browser_context is not None

        if self.page is None:
            LOG.info("Creating a new page")
            self.page = await self.browser_context.new_page()
            await self._close_all_other_pages()
            LOG.info("A new page is created")
            if url:
                LOG.info(f"Navigating page to {url} and waiting for 5 seconds")
                try:
                    await self.page.goto(url)
                except Error as playright_error:
                    LOG.exception(f"Error while navigating to url: {str(playright_error)}", exc_info=True)
                    # Keep the Playwright error attached as the explicit cause.
                    raise FailedToNavigateToUrl(url=url, error_message=str(playright_error)) from playright_error
                LOG.info(f"Successfully went to {url}")

        # page.video is None when the context does not record video (possible for a context that was
        # supplied from outside); in that case there is no path to remember.
        if self.browser_artifacts.video_path is None and self.page.video is not None:
            self.browser_artifacts.video_path = await self.page.video.path()

    async def get_or_create_page(self, url: str | None = None) -> Page:
        """
        Return the session's page, creating the session pieces first if necessary
        (see check_and_fix_state for how ``url`` is used).
        """
        await self.check_and_fix_state(url)
        assert self.page is not None
        return self.page

    async def close(self, close_browser_on_completion: bool = True) -> None:
        """
        Close the browser context and stop Playwright, unless ``close_browser_on_completion`` is False,
        in which case nothing is closed.
        """
        LOG.info("Closing browser state")
        if self.browser_context and close_browser_on_completion:
            LOG.info("Closing browser context and its pages")
            await self.browser_context.close()
            LOG.info("Main browser context and all its pages are closed")
        if self.pw and close_browser_on_completion:
            LOG.info("Stopping playwright")
            await self.pw.stop()
            LOG.info("Playwright is stopped")

View File

@@ -0,0 +1,152 @@
from __future__ import annotations
import structlog
from playwright.async_api import Browser, Playwright, async_playwright
from skyvern.exceptions import MissingBrowserState
from skyvern.forge.sdk.schemas.tasks import ProxyLocation, Task
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRun
from skyvern.webeye.browser_factory import BrowserContextFactory, BrowserState
LOG = structlog.get_logger()
class BrowserManager:
    """
    Singleton registry of ``BrowserState`` objects keyed by task id or workflow run id.

    ``__new__`` always returns the same instance and ``pages`` is a class attribute, so every caller shares one
    registry.
    """

    instance = None
    # Key is a task id or a workflow run id. A task that belongs to a registered workflow run is stored under its
    # own id with the *same* BrowserState object as the workflow run (see get_or_create_for_task).
    pages: dict[str, BrowserState] = dict()

    def __new__(cls) -> BrowserManager:
        if cls.instance is None:
            cls.instance = super().__new__(cls)
        return cls.instance

    @staticmethod
    async def _create_browser_state(
        proxy_location: ProxyLocation | None = None, url: str | None = None
    ) -> BrowserState:
        """Start Playwright, create a browser context and wrap both in a ``BrowserState`` that has no page yet."""
        pw = await async_playwright().start()
        browser_context, browser_artifacts = await BrowserContextFactory.create_browser_context(
            pw, proxy_location=proxy_location, url=url
        )
        return BrowserState(pw=pw, browser_context=browser_context, page=None, browser_artifacts=browser_artifacts)

    async def get_or_create_for_task(self, task: Task) -> BrowserState:
        """
        Return the browser state registered for ``task``.

        Lookup order: the task's own entry, then the entry of its workflow run (which is then also registered
        under the task id), and finally a newly created state with a page opened for ``task.url``.
        """
        if task.task_id in self.pages:
            return self.pages[task.task_id]
        elif task.workflow_run_id in self.pages:
            LOG.info(
                "Browser state for task not found. Using browser state for workflow run",
                task_id=task.task_id,
                workflow_run_id=task.workflow_run_id,
            )
            self.pages[task.task_id] = self.pages[task.workflow_run_id]
            return self.pages[task.task_id]

        LOG.info("Creating browser state for task", task_id=task.task_id)
        browser_state = await self._create_browser_state(task.proxy_location, task.url)

        # The URL here is only used when creating a new page, and not when using an existing page.
        # This will make sure browser_state.page is not None.
        await browser_state.get_or_create_page(task.url)

        self.pages[task.task_id] = browser_state
        return browser_state

    async def get_or_create_for_workflow_run(self, workflow_run: WorkflowRun, url: str | None = None) -> BrowserState:
        """Return the browser state registered for ``workflow_run``, creating and registering one if needed."""
        if workflow_run.workflow_run_id in self.pages:
            return self.pages[workflow_run.workflow_run_id]

        LOG.info("Creating browser state for workflow run", workflow_run_id=workflow_run.workflow_run_id)
        browser_state = await self._create_browser_state(workflow_run.proxy_location, url=url)

        # The URL here is only used when creating a new page, and not when using an existing page.
        # This will make sure browser_state.page is not None.
        await browser_state.get_or_create_page(url)

        self.pages[workflow_run.workflow_run_id] = browser_state
        return browser_state

    def set_video_artifact_for_task(self, task: Task, artifact_id: str) -> None:
        """
        Store ``artifact_id`` as the video artifact of the browser state used by ``task``.

        The workflow run's entry is preferred over the task's own entry. An existing id is overwritten after a
        warning.

        :raises MissingBrowserState: When neither the workflow run nor the task has a registered state.
        """
        if task.workflow_run_id and task.workflow_run_id in self.pages:
            if self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id:
                LOG.warning(
                    "Video artifact is already set for workflow run. Overwriting",
                    workflow_run_id=task.workflow_run_id,
                    old_artifact_id=self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id,
                    new_artifact_id=artifact_id,
                )
            self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id = artifact_id
            return

        if task.task_id in self.pages:
            if self.pages[task.task_id].browser_artifacts.video_artifact_id:
                LOG.warning(
                    "Video artifact is already set for task. Overwriting",
                    task_id=task.task_id,
                    old_artifact_id=self.pages[task.task_id].browser_artifacts.video_artifact_id,
                    new_artifact_id=artifact_id,
                )
            self.pages[task.task_id].browser_artifacts.video_artifact_id = artifact_id
            return

        raise MissingBrowserState(task_id=task.task_id)

    async def get_video_data(
        self, browser_state: BrowserState, task_id: str = "", workflow_id: str = "", workflow_run_id: str = ""
    ) -> bytes:
        """
        Read the recorded video file of ``browser_state``.

        :return: The file content, or ``b""`` (after a warning) when there is no state or no video path.
        """
        if browser_state:
            path = browser_state.browser_artifacts.video_path
            if path:
                with open(path, "rb") as f:
                    return f.read()
        LOG.warning(
            "Video data not found for task", task_id=task_id, workflow_id=workflow_id, workflow_run_id=workflow_run_id
        )
        return b""

    async def get_har_data(
        self, browser_state: BrowserState, task_id: str = "", workflow_id: str = "", workflow_run_id: str = ""
    ) -> bytes:
        """
        Read the HAR file of ``browser_state``.

        :return: The file content, or ``b""`` (after a warning) when there is no state or no HAR path.
        """
        if browser_state:
            path = browser_state.browser_artifacts.har_path
            if path:
                with open(path, "rb") as f:
                    return f.read()
        LOG.warning(
            "HAR data not found for task", task_id=task_id, workflow_id=workflow_id, workflow_run_id=workflow_run_id
        )
        return b""

    @classmethod
    async def connect_to_scraping_browser(cls, pw: Playwright) -> Browser:
        """
        Connect to the remote browser addressed by the ``REMOTE_BROWSER_KEY`` setting over CDP.

        :raises Exception: When ``REMOTE_BROWSER_KEY`` is empty.
        """
        if not SettingsManager.get_settings().REMOTE_BROWSER_KEY:
            raise Exception("REMOTE_BROWSER_KEY is empty. Cannot connect to remote browser.")
        browser = await pw.chromium.connect_over_cdp(SettingsManager.get_settings().REMOTE_BROWSER_KEY)
        LOG.info("Connected to remote browser", browser_type=SettingsManager.get_settings().BROWSER_TYPE)
        return browser

    @classmethod
    async def close(cls) -> None:
        """Close every registered browser state once and empty the registry."""
        LOG.info("Closing BrowserManager")
        # One BrowserState can be registered under both a task id and a workflow run id, so de-duplicate by
        # identity instead of closing the same context / stopping the same Playwright driver twice.
        # Iterate over a snapshot because the registry could change while a close() call is awaited.
        closed_state_ids: set[int] = set()
        for browser_state in list(cls.pages.values()):
            if id(browser_state) in closed_state_ids:
                continue
            closed_state_ids.add(id(browser_state))
            await browser_state.close()
        cls.pages = dict()
        LOG.info("BrowserManger is closed")

    async def cleanup_for_task(self, task_id: str, close_browser_on_completion: bool = True) -> BrowserState | None:
        """
        Unregister the browser state stored under ``task_id`` and close it.

        :param close_browser_on_completion: Forwarded to ``BrowserState.close``.
        :return: The unregistered state, or ``None`` when nothing was registered under ``task_id``.
        """
        LOG.info("Cleaning up for task")
        browser_state_to_close = self.pages.pop(task_id, None)
        if browser_state_to_close:
            await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
        LOG.info("Task is cleaned up")

        return browser_state_to_close

    async def cleanup_for_workflow_run(
        self, workflow_run_id: str, close_browser_on_completion: bool = True
    ) -> BrowserState | None:
        """
        Unregister the browser state stored under ``workflow_run_id`` and close it.

        :param close_browser_on_completion: Forwarded to ``BrowserState.close``.
        :return: The unregistered state, or ``None`` when nothing was registered under ``workflow_run_id``.
        """
        LOG.info("Cleaning up for workflow run")
        browser_state_to_close = self.pages.pop(workflow_run_id, None)
        if browser_state_to_close:
            await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
        LOG.info("Workflow run is cleaned up")

        return browser_state_to_close

View File

View File

@@ -0,0 +1,806 @@
// Static helpers that build and compare plain rect objects of the shape
// { bottom, top, left, right, width, height }.
class Rect {
  // Build a rect from its top-left corner (x1, y1) and bottom-right corner (x2, y2).
  static create(x1, y1, x2, y2) {
    const width = x2 - x1;
    const height = y2 - y1;
    return { bottom: y2, top: y1, left: x1, right: x2, width, height };
  }

  // Return a new plain object carrying the six rect fields of `rect`.
  static copy(rect) {
    const { bottom, top, left, right, width, height } = rect;
    return { bottom, top, left, right, width, height };
  }

  // Return `rect` shifted by x horizontally and y vertically; a missing offset counts as 0.
  static translate(rect, x, y) {
    const dx = x ?? 0;
    const dy = y ?? 0;
    return {
      bottom: rect.bottom + dy,
      top: rect.top + dy,
      left: rect.left + dx,
      right: rect.right + dx,
      width: rect.width,
      height: rect.height,
    };
  }

  // True when the two rects overlap; rects that merely share an edge do not count.
  static intersects(rect1, rect2) {
    const overlapsHorizontally =
      rect1.right > rect2.left && rect1.left < rect2.right;
    const overlapsVertically =
      rect1.bottom > rect2.top && rect1.top < rect2.bottom;
    return overlapsHorizontally && overlapsVertically;
  }

  // True when all six rect fields are strictly equal.
  static equals(rect1, rect2) {
    const fields = ["top", "bottom", "left", "right", "width", "height"];
    return fields.every((field) => rect1[field] === rect2[field]);
  }
}
// Static helpers that work out which part of an element is visible in the current viewport.
class DomUtils {
//
// Clamps the rect's left/top edges to the viewport origin (0, 0); right/bottom are passed through
// unchanged. Returns null when the clamped rect starts within 4px of, or beyond, the viewport's
// bottom or right edge; otherwise returns the clamped rect.
// The minimum-size check (width or height < 3) is NOT done here; callers perform it.
//
static cropRectToVisible(rect) {
const boundedRect = Rect.create(
Math.max(rect.left, 0),
Math.max(rect.top, 0),
rect.right,
rect.bottom,
);
if (
boundedRect.top >= window.innerHeight - 4 ||
boundedRect.left >= window.innerWidth - 4
) {
return null;
} else {
return boundedRect;
}
}
// Returns the first client rect of `element` that is at least 3x3 after cropping and whose
// element has visibility "visible". When `testChildren` is true and a client rect has zero
// width or height, the element's children are searched (recursively) for such a rect instead.
// Returns null when nothing qualifies.
static getVisibleClientRect(element, testChildren) {
// Note: this call will be expensive if we modify the DOM in between calls.
// `clientRect` is shared by the copy loop and the main loop below, and is reassigned inside
// the main loop with the cropped rect.
let clientRect;
if (testChildren == null) testChildren = false;
// Snapshot the client rects as plain objects.
const clientRects = (() => {
const result = [];
for (clientRect of element.getClientRects()) {
result.push(Rect.copy(clientRect));
}
return result;
})();
// Inline elements with font-size: 0px; will declare a height of zero, even if a child with
// non-zero font-size contains text.
let isInlineZeroHeight = function () {
const elementComputedStyle = window.getComputedStyle(element, null);
const isInlineZeroFontSize =
0 ===
elementComputedStyle.getPropertyValue("display").indexOf("inline") &&
elementComputedStyle.getPropertyValue("font-size") === "0px";
// Override the function to return this value for the rest of this context.
isInlineZeroHeight = () => isInlineZeroFontSize;
return isInlineZeroFontSize;
};
for (clientRect of clientRects) {
// If the link has zero dimensions, it may be wrapping visible but floated elements. Check for
// this.
let computedStyle;
if ((clientRect.width === 0 || clientRect.height === 0) && testChildren) {
for (const child of Array.from(element.children)) {
computedStyle = window.getComputedStyle(child, null);
// Ignore child elements which are not floated and not absolutely positioned for parent
// elements with zero width/height, as long as the case described at isInlineZeroHeight
// does not apply.
// NOTE(mrmr1993): This ignores floated/absolutely positioned descendants nested within
// inline children.
const position = computedStyle.getPropertyValue("position");
if (
computedStyle.getPropertyValue("float") === "none" &&
!["absolute", "fixed"].includes(position) &&
!(
clientRect.height === 0 &&
isInlineZeroHeight() &&
0 === computedStyle.getPropertyValue("display").indexOf("inline")
)
) {
continue;
}
// The first child with a visible rect of at least 3x3 wins.
const childClientRect = this.getVisibleClientRect(child, true);
if (
childClientRect === null ||
childClientRect.width < 3 ||
childClientRect.height < 3
)
continue;
return childClientRect;
}
} else {
// Crop to the viewport and drop rects that are off-screen or smaller than 3x3.
clientRect = this.cropRectToVisible(clientRect);
if (
clientRect === null ||
clientRect.width < 3 ||
clientRect.height < 3
)
continue;
// eliminate invisible elements (see test_harnesses/visibility_test.html)
computedStyle = window.getComputedStyle(element, null);
if (computedStyle.getPropertyValue("visibility") !== "visible")
continue;
return clientRect;
}
}
return null;
}
// Returns { top, left } derived from the bounding rect of document.documentElement, negated and
// corrected by its margins (static position without content/paint/strict containment) or by its
// clientTop/clientLeft (all other cases).
static getViewportTopLeft() {
const box = document.documentElement;
const style = getComputedStyle(box);
const rect = box.getBoundingClientRect();
if (
style.position === "static" &&
!/content|paint|strict/.test(style.contain || "")
) {
// The margin is included in the client rect, so we need to subtract it back out.
const marginTop = parseInt(style.marginTop);
const marginLeft = parseInt(style.marginLeft);
return {
top: -rect.top + marginTop,
left: -rect.left + marginLeft,
};
} else {
const { clientTop, clientLeft } = box;
return {
top: -rect.top - clientTop,
left: -rect.left - clientLeft,
};
}
}
}
// from playwright
// Computed style of `element` (optionally for the pseudo-element `pseudo`), taken from the window
// that owns the element's document. Returns undefined when there is no owning document or window.
function getElementComputedStyle(element, pseudo) {
  const view = element.ownerDocument && element.ownerDocument.defaultView;
  if (!view) {
    return undefined;
  }
  return view.getComputedStyle(element, pseudo);
}
// from playwright
// True when `element` passes checkVisibility() (opacity and CSS visibility checks switched off)
// and its computed `visibility` is "visible". `style` may be passed in to avoid recomputing it.
// An element without a computed style is treated as visible.
function isElementStyleVisibilityVisible(element, style) {
  const effectiveStyle = style ?? getElementComputedStyle(element);
  if (!effectiveStyle) {
    return true;
  }
  const passesVisibilityCheck = element.checkVisibility({
    checkOpacity: false,
    checkVisibilityCSS: false,
  });
  if (!passesVisibilityCheck) {
    return false;
  }
  return effectiveStyle.visibility === "visible";
}
// from playwright
// Visibility test used before an element is considered interactable.
function isElementVisible(element) {
  // TODO: This is a hack to not check visibility for option elements
  // because they are not visible by default. We check their parent instead for visibility.
  const isOption = element.tagName.toLowerCase() === "option";
  if (isOption) {
    const parent = element.parentElement;
    return parent && isElementVisible(parent);
  }

  const style = getElementComputedStyle(element);
  if (!style) {
    return true;
  }

  if (style.display === "contents") {
    // display:contents is not rendered itself, but its child nodes are.
    // Only element children (nodeType 1) are considered; text and other nodes are skipped.
    let child = element.firstChild;
    while (child) {
      if (child.nodeType === 1 /* Node.ELEMENT_NODE */ && isElementVisible(child)) {
        return true;
      }
      child = child.nextSibling;
    }
    return false;
  }

  if (!isElementStyleVisibilityVisible(element, style)) {
    return false;
  }

  // Finally the element must occupy a non-empty box.
  const { width, height } = element.getBoundingClientRect();
  return width > 0 && height > 0;
}
// Truthy when the element's computed display is "none", or its `hidden` or `disabled` property
// is set.
function isHiddenOrDisabled(element) {
  const style = getElementComputedStyle(element);
  if (style?.display === "none") {
    return true;
  }
  return element.hidden || element.disabled;
}
// True for <script> and <style> elements (tag name compared case-insensitively).
function isScriptOrStyle(element) {
  return ["script", "style"].includes(element.tagName.toLowerCase());
}
// True when the element's `role` attribute (lower-cased and trimmed) is one of the ARIA widget
// roles listed below.
function hasWidgetRole(element) {
  // https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles#2._widget_roles
  // Not all roles make sense for the time being so we only check for the ones that do
  const supportedWidgetRoles = new Set([
    "button",
    "link",
    "checkbox",
    "menuitem",
    "menuitemcheckbox",
    "menuitemradio",
    "radio",
    "tab",
    "combobox",
    "textbox",
    "searchbox",
    "slider",
    "spinbutton",
    "switch",
    "gridcell",
  ]);

  const role = element.getAttribute("role");
  if (!role) {
    return false;
  }
  return supportedWidgetRoles.has(role.toLowerCase().trim());
}
// True for <input> elements whose `type` attribute (lower-cased and trimmed) is one of the types
// listed below. Anything that is not an <input>, or has no `type`, yields false so that other
// checks can decide.
function isInteractableInput(element) {
  const supportedInputTypes = new Set([
    "button",
    "checkbox",
    "date",
    "datetime-local",
    "email",
    "file",
    "image",
    "month",
    "number",
    "password",
    "radio",
    "range",
    "reset",
    "search",
    "submit",
    "tel",
    "text",
    "time",
    "url",
    "week",
  ]);

  const isInput = element.tagName.toLowerCase() === "input";
  const type = element.getAttribute("type");
  if (!isInput || !type) {
    // let other checks decide
    return false;
  }
  return supportedInputTypes.has(type.toLowerCase().trim());
}
// Decides whether `element` is something a user could act on.
// The checks run in this order and the first decisive one wins:
//   1. not visible, hidden/disabled, or a script/style element -> false
//   2. ARIA widget role or a supported <input> type            -> true
//   3. <a> with an href; <button>, <select>, <option>, <textarea>; <label> bound to an enabled
//      control; onclick/jsaction attribute or contenteditable -> true
//   4. <div>, <img>, <span>                                    -> true only with cursor: pointer
//   5. everything else                                         -> false
function isInteractable(element) {
  if (
    !isElementVisible(element) ||
    isHiddenOrDisabled(element) ||
    isScriptOrStyle(element)
  ) {
    return false;
  }

  if (hasWidgetRole(element) || isInteractableInput(element)) {
    return true;
  }

  const tagName = element.tagName.toLowerCase();

  if (tagName === "a" && element.href) {
    return true;
  }

  if (["button", "select", "option", "textarea"].includes(tagName)) {
    return true;
  }

  if (tagName === "label" && element.control && !element.control.disabled) {
    return true;
  }

  if (
    element.hasAttribute("onclick") ||
    element.isContentEditable ||
    element.hasAttribute("jsaction")
  ) {
    return true;
  }

  if (tagName === "div" || tagName === "img" || tagName === "span") {
    // Generic containers only count when styled as clickable. "pointer" is the only value checked:
    // the former additional comparison against "cursor" could never match, because "cursor" is
    // not a valid value of the CSS `cursor` property.
    return window.getComputedStyle(element).cursor === "pointer";
  }

  return false;
}
// Collapses every run of whitespace in `str` into a single space.
// Falsy input (null, undefined, "") is returned unchanged.
function removeMultipleSpaces(str) {
  return str ? str.replace(/\s+/g, " ") : str;
}
// Normalises text scraped from the page: removes every occurrence of the SVG fallback message,
// collapses whitespace runs into single spaces and trims both ends.
function cleanupText(text) {
  // replaceAll, not replace: with a string pattern, replace() only removes the first occurrence,
  // which leaves the message behind when the text covers more than one SVG.
  const withoutSvgFallback = text.replaceAll(
    "SVGs not supported by this browser.",
    "",
  );
  return removeMultipleSpaces(withoutSvgFallback).trim();
}
// Collects, depth first, the text found under `element` that does not belong to an element
// carrying a `unique_id` attribute, and joins the pieces with ";".
//   - direct text nodes are used (trimmed) only when `element` itself has no `unique_id`
//   - child elements with a `unique_id` are skipped; the others are processed recursively
// Returns "" when there is nothing to collect or when the joined text is longer than 1000
// characters.
function getElementContext(element) {
  const charLimit = 1000;
  const childContextList = [];

  for (const child of element.childNodes) {
    let childContext = "";
    if (child.nodeType === Node.TEXT_NODE) {
      if (!element.hasAttribute("unique_id")) {
        childContext = child.data.trim();
      }
    } else if (child.nodeType === Node.ELEMENT_NODE) {
      if (!child.hasAttribute("unique_id")) {
        childContext = getElementContext(child);
      }
    }
    if (childContext.length > 0) {
      childContextList.push(childContext);
    }
  }

  // Join and apply the length limit once, after all children have been visited. Doing this inside
  // the loop gave the same final value but re-joined the whole list for every child.
  const fullContext = childContextList.join(";");
  return fullContext.length > charLimit ? "" : fullContext;
}
// Builds the text shown for `element`.
// For an element with child nodes the text of every child (text nodes trimmed, child elements
// processed recursively) is joined with ";"; otherwise element.textContent is used. The result is
// passed through cleanupText.
// Currently we don't support too much context: when the cleaned text is longer than 1000
// characters, the element's own direct text nodes are used instead, and "" when those are too
// long as well.
function getElementContent(element) {
  const CHAR_LIMIT = 1000;
  let combinedText = element.textContent;
  let ownText = "";

  if (element.childNodes.length > 0) {
    const allPieces = [];
    const ownPieces = [];
    for (const node of element.childNodes) {
      let piece = "";
      switch (node.nodeType) {
        case Node.TEXT_NODE:
          piece = node.data.trim();
          // Direct text is recorded even when empty.
          ownPieces.push(piece);
          break;
        case Node.ELEMENT_NODE:
          piece = getElementContent(node);
          break;
        default:
          console.log("Unhandled node type: ", node.nodeType);
      }
      if (piece.length > 0) {
        allPieces.push(piece);
      }
    }
    combinedText = allPieces.join(";");
    ownText = cleanupText(ownPieces.join(";"));
  }

  const cleaned = cleanupText(combinedText);
  if (cleaned.length <= CHAR_LIMIT) {
    return cleaned;
  }
  return ownText.length <= CHAR_LIMIT ? ownText : "";
}
// Describes every option of a <select> element as { optionIndex, text }, where `text` is the
// option's textContent with whitespace runs collapsed.
function getSelectOptions(element) {
  return Array.from(element.options).map((option) => ({
    optionIndex: option.index,
    text: removeMultipleSpaces(option.textContent),
  }));
}
// Walks the DOM below document.body and collects every interactable element.
// Returns [elements, resultArray]:
//   - elements: flat array of element objects; an object's `id` is its index in this array
//   - resultArray: the element objects that have no interactable ancestor (the tree roots);
//     nested interactable elements are reachable through `children`
// Side effects on the page: existing `unique_id` attributes are removed, every collected element
// gets a `unique_id` attribute equal to its id, and target="_blank" is removed from links.
function buildTreeFromBody() {
var elements = [];
var resultArray = [];
// Builds the serialisable description of one element and tags the DOM node with its id.
// Must be called before the object is pushed to `elements`, because the id is elements.length.
function buildElementObject(element) {
var element_id = elements.length;
var elementTagNameLower = element.tagName.toLowerCase();
element.setAttribute("unique_id", element_id);
// if element is an "a" tag and has a target="_blank" attribute, remove the target attribute
// We're doing this so that skyvern can do all the navigation in a single page/tab and not open new tab
if (element.tagName.toLowerCase() === "a") {
if (element.getAttribute("target") === "_blank") {
element.removeAttribute("target");
}
}
const attrs = {};
for (const attr of element.attributes) {
var attrValue = attr.value;
// These attributes are reported as `true` whenever they are present, whatever their value.
if (
attr.name === "required" ||
attr.name === "aria-required" ||
attr.name === "checked" ||
attr.name === "aria-checked" ||
attr.name === "selected" ||
attr.name === "aria-selected" ||
attr.name === "readonly" ||
attr.name === "aria-readonly"
) {
attrValue = true;
}
attrs[attr.name] = attrValue;
}
// Report the current value property, which overrides the `value` attribute copied above.
if (elementTagNameLower === "input" || elementTagNameLower === "textarea") {
attrs["value"] = element.value;
}
let elementObj = {
id: element_id,
tagName: elementTagNameLower,
attributes: attrs,
text: getElementContent(element),
children: [],
rect: DomUtils.getVisibleClientRect(element, true),
};
// get options for select element or for listbox element
// NOTE(review): only <select> is handled below; there is no listbox handling in this function.
let selectOptions = null;
if (elementTagNameLower === "select") {
selectOptions = getSelectOptions(element);
}
if (selectOptions) {
elementObj.options = selectOptions;
}
return elementObj;
}
// Element children of `element` as an array (empty when there are none).
function getChildElements(element) {
if (element.childElementCount !== 0) {
return Array.from(element.children);
} else {
return [];
}
}
// Depth-first walk. `interactableParentId` is the id of the closest interactable ancestor, or
// null when there is none. Returns the element object for interactable elements and undefined
// otherwise.
function processElement(element, interactableParentId) {
// Check if the element is interactable
if (isInteractable(element)) {
var elementObj = buildElementObject(element);
elements.push(elementObj);
// If the element is interactable but has no interactable parent,
// then it starts a new tree, so add it to the result array
// and set its id as the interactable parent id for the next elements
// under it
if (interactableParentId === null) {
resultArray.push(elementObj);
}
// If the element is interactable and has an interactable parent,
// then add it to the children of the parent
else {
elements[interactableParentId].children.push(elementObj);
}
// Recursively process the children of the element
getChildElements(element).forEach((child) => {
processElement(child, elementObj.id);
});
return elementObj;
} else {
// For a non-interactable element, process its children
// and check if any of them are interactable
// NOTE(review): `interactableChildren` and `children` are assigned but never read.
let interactableChildren = [];
getChildElements(element).forEach((child) => {
let children = processElement(child, interactableParentId);
});
}
}
// TODO: Handle iframes
// Clear all the unique_id attributes so that there are no conflicts
removeAllUniqueIdAttributes();
processElement(document.body, null);
// Second pass: attach surrounding label/fieldset text to each collected element as `context`.
for (var element of elements) {
if (
((element.tagName === "input" && element.attributes["type"] === "text") ||
element.tagName === "textarea") &&
(element.attributes["required"] || element.attributes["aria-required"]) &&
element.attributes.value === ""
) {
// TODO (kerem): we may want to pass these elements to the LLM as empty but required fields in the future
console.log(
"input element with required attribute and no value",
element,
);
}
// for most elements, we're going 10 layers up to see if we can find "label" as a parent
// if found, most likely the context under label is relevant to this element
let targetParentElements = new Set(["label", "fieldset"]);
// look up for 10 levels to find the most contextual parent element
// The loop does not stop at the first match, so the outermost label/fieldset found within the
// 10 levels is the one that is kept.
let targetContextualParent = null;
let currentEle = document.querySelector(`[unique_id="${element.id}"]`);
let parentEle = currentEle;
for (var i = 0; i < 10; i++) {
parentEle = parentEle.parentElement;
if (parentEle) {
if (targetParentElements.has(parentEle.tagName.toLowerCase())) {
targetContextualParent = parentEle;
}
} else {
break;
}
}
if (targetContextualParent) {
let context = "";
var lowerCaseTagName = targetContextualParent.tagName.toLowerCase();
if (lowerCaseTagName === "label") {
context = getElementContext(targetContextualParent);
} else if (lowerCaseTagName === "fieldset") {
// fieldset is usually within a form or another element that contains the whole context
targetContextualParent = targetContextualParent.parentElement;
if (targetContextualParent) {
context = getElementContext(targetContextualParent);
}
}
// `context` is only set on the element object when some text was found.
if (context.length > 0) {
element.context = context;
}
}
}
return [elements, resultArray];
}
// Groups the given element objects visually, builds one marker per group and adds the markers'
// bounding boxes to the page.
function drawBoundingBoxes(elements) {
  const hintMarkers = createHintMarkersForGroups(
    groupElementsVisually(elements),
  );
  addHintMarkersToPage(hintMarkers);
}
// Strips the `unique_id` attribute from every element in the document that carries one.
function removeAllUniqueIdAttributes() {
  for (const taggedElement of document.querySelectorAll("[unique_id]")) {
    taggedElement.removeAttribute("unique_id");
  }
}
// Logs that a captcha was solved and increments window.captchaSolvedCounter, starting from 0
// when the counter is not set yet.
function captchaSolvedCallback() {
  console.log("captcha solved");
  // For some reason this isn't being called.. TODO figure out why
  const solvedSoFar = window["captchaSolvedCounter"] || 0;
  window["captchaSolvedCounter"] = solvedSoFar + 1;
}
// Returns window.captchaSolvedCounter, storing 0 in it first when it is not set yet.
function getCaptchaSolves() {
  window["captchaSolvedCounter"] = window["captchaSolvedCounter"] || 0;
  return window["captchaSolvedCounter"];
}
// Partitions the element objects that have a rect into groups of overlapping elements.
// Each element joins the first existing group that contains an element whose rect intersects
// its own; otherwise it starts a new group. Existing groups are never merged afterwards.
// Returns an array of { elements, rect }, where `rect` encloses all rects of the group.
function groupElementsVisually(elements) {
  const groups = [];

  // O(n^2): every element is compared against the members of the groups built so far.
  for (const element of elements) {
    if (!element.rect) {
      continue;
    }
    const overlappingGroup = groups.find((candidate) =>
      candidate.elements.some((member) =>
        Rect.intersects(member.rect, element.rect),
      ),
    );
    if (overlappingGroup) {
      overlappingGroup.elements.push(element);
    } else {
      groups.push({ elements: [element] });
    }
  }

  // Give every group the rectangle that encompasses all of its members.
  groups.forEach((group) => {
    group.rect = createRectangleForGroup(group);
  });
  return groups;
}
// Returns the smallest rect that contains the rects of all elements in `group`.
function createRectangleForGroup(group) {
  const tops = [];
  const lefts = [];
  const bottoms = [];
  const rights = [];
  for (const { rect } of group.elements) {
    tops.push(rect.top);
    lefts.push(rect.left);
    bottoms.push(rect.bottom);
    rights.push(rect.right);
  }
  return Rect.create(
    Math.min(...lefts),
    Math.min(...tops),
    Math.max(...rights),
    Math.max(...bottoms),
  );
}
// Produces `count` distinct hint strings over the alphabet "sadfjklewcmpgh".
// Strings are generated breadth first by prepending each alphabet character to earlier strings;
// strings that were used as a suffix for longer ones are dropped from the front. The selected
// strings are returned sorted with the default Array.prototype.sort order.
function generateHintStrings(count) {
  const alphabet = "sadfjklewcmpgh";
  const queue = [""];
  let head = 0;

  // The empty seed string is always expanded once, hence do/while.
  do {
    const suffix = queue[head];
    head += 1;
    for (const ch of alphabet) {
      queue.push(ch + suffix);
    }
  } while (queue.length - head < count);

  return queue.slice(head, head + count).sort();
}
// Creates one hint marker per group and labels it with a generated hint string (stored as
// `hintString` and written, upper-cased, into the marker element).
// Returns [] (after logging) when there are no groups.
function createHintMarkersForGroups(groups) {
  if (groups.length === 0) {
    console.log("No groups found, not adding hint markers to page.");
    return [];
  }

  const hintMarkers = groups.map((group) => createHintMarkerForGroup(group));

  // fill in marker text
  const hintStrings = generateHintStrings(hintMarkers.length);
  hintMarkers.forEach((hintMarker, index) => {
    hintMarker.hintString = hintStrings[index];
    hintMarker.element.innerHTML = hintMarker.hintString.toUpperCase();
  });

  return hintMarkers;
}
// Builds the overlay nodes for one visual group and returns { element, boundingBox, group }:
//   - element: a <div> for the hint label, positioned at the group's top-left corner
//   - boundingBox: an absolutely positioned, click-through <div> outlining the group
// Both nodes share one z-index, and every call uses the next higher value.
function createHintMarkerForGroup(group) {
  // The counter lives on `window`. The former `this.currentZIndex` resolved to the same property
  // in a non-strict script but was never initialised, so it evaluated to undefined/NaN and no
  // z-index was ever applied. Number.isInteger also recovers from a NaN left behind that way.
  // NOTE(review): the starting value 1 is a choice made here; raise it if the boxes must sit
  // above page content that uses larger z-index values.
  if (!Number.isInteger(window.currentZIndex)) {
    window.currentZIndex = 1;
  }
  const zIndex = window.currentZIndex++;

  // annotation box with string
  const el = document.createElement("div");
  el.style.left = group.rect.left + "px";
  el.style.top = group.rect.top + "px";
  el.style.zIndex = zIndex;

  // The bounding box around the group of hints.
  const boundingBox = document.createElement("div");

  // group.rect is relative to the viewport; add the scroll offsets to position the box relative
  // to the document.
  const scrollTop = window.pageYOffset || document.documentElement.scrollTop;
  const scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;

  // left/top/width/height fully define the box. The former `display = "display"` and the
  // bottom/right values built by concatenating two px strings were invalid CSS and have been
  // removed.
  boundingBox.style.position = "absolute";
  boundingBox.style.left = group.rect.left + scrollLeft + "px";
  boundingBox.style.top = group.rect.top + scrollTop + "px";
  boundingBox.style.width = group.rect.width + "px";
  boundingBox.style.height = group.rect.height + "px";
  boundingBox.style.border = "2px solid blue"; // Change the border color as needed
  boundingBox.style.pointerEvents = "none"; // Ensures the box doesn't interfere with other interactions
  boundingBox.style.zIndex = zIndex;

  return {
    element: el,
    boundingBox: boundingBox,
    group: group,
  };
}
// Appends a <div id="boundingBoxContainer"> holding every marker's bounding box to the document
// element. The markers' label elements are not added.
function addHintMarkersToPage(hintMarkers) {
  const container = document.createElement("div");
  container.id = "boundingBoxContainer";
  hintMarkers.forEach(({ boundingBox }) => {
    container.appendChild(boundingBox);
  });
  document.documentElement.appendChild(container);
}
// Removes the #boundingBoxContainer element from the page, if there is one.
function removeBoundingBoxes() {
  document.querySelector("#boundingBoxContainer")?.remove();
}
// Removes existing bounding boxes, scrolls to the top of the page and, when `draw_boxes` is
// truthy, rebuilds the element tree and draws bounding boxes again.
// Returns window.scrollY after scrolling.
function scrollToTop(draw_boxes) {
  removeBoundingBoxes();
  window.scrollTo(0, 0);

  if (draw_boxes) {
    const [elements] = buildTreeFromBody();
    drawBoundingBoxes(elements);
  }
  return window.scrollY;
}
// Removes existing bounding boxes, scrolls down by one viewport height minus 200px (so that
// consecutive pages overlap by 200px) and, when `draw_boxes` is truthy, rebuilds the element
// tree and draws bounding boxes again.
// Returns window.scrollY after scrolling.
function scrollToNextPage(draw_boxes) {
  removeBoundingBoxes();
  window.scrollBy(0, window.innerHeight - 200);

  if (draw_boxes) {
    const [elements] = buildTreeFromBody();
    drawBoundingBoxes(elements);
  }
  return window.scrollY;
}

View File

@@ -0,0 +1,316 @@
import asyncio
import copy
import structlog
from playwright.async_api import Page
from pydantic import BaseModel
from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.browser_factory import BrowserState
LOG = structlog.get_logger()
# HTML attribute names that the scraper treats as reserved.
# NOTE(review): the code that consumes this set is not in this part of the file — confirm how it
# is used before adding or removing entries.
RESERVED_ATTRIBUTES = {
    # identity and description
    "alt",
    "data-ui",
    "for",
    "name",
    "title",
    "type",
    # content and targets
    "href",  # For a tags
    "src",  # do we need this?
    "text-value",
    "value",
    # input constraints
    "accept",  # for input file
    "maxlength",
    "pattern",
    "placeholder",
    "readonly",
    "required",
    # state
    "checked",
    "selected",  # for option tag
    # ARIA
    "aria-checked",  # for option tag
    "aria-current",
    "aria-label",
    "aria-required",
    "aria-role",
    "aria-selected",  # for option tag
}
def load_js_script() -> str:
    """
    Read ``domUtils.js`` from the Skyvern installation directory and return its source text.

    :return: The content of ``{SKYVERN_DIR}/webeye/scraper/domUtils.js``.
    :raises FileNotFoundError: When the file does not exist; the failure is logged before re-raising.
    """
    # TODO: Handle file location better. This is a hacky way to find the file location.
    path = f"{SKYVERN_DIR}/webeye/scraper/domUtils.js"
    try:
        # TODO: Implement TS of domUtils.js and use the complied JS file instead of the raw JS file.
        #  This will allow our code to be type safe.
        # Decode explicitly as UTF-8 so the result does not depend on the locale's default encoding.
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        LOG.exception("Failed to load the JS script", exc_info=True, path=path)
        # Bare raise keeps the original exception and traceback.
        raise
# Source text of domUtils.js, read once when this module is imported.
JS_FUNCTION_DEFS = load_js_script()
class ScrapedPage(BaseModel):
"""
Scraped response from a webpage, including:
1. elements: list of elements (one dict per element)
2. id_to_xpath_dict: element ID to xpath map
3. element_tree: the element tree of the page (list of dicts). Each element has children and attributes.
4. element_tree_trimmed: presumably a reduced version of element_tree — TODO confirm where it is built
5. screenshots: the screenshots of the page, one ``bytes`` object per screenshot
(NOTE(review): an earlier version of this docstring said "base64 encoded" — confirm the actual encoding)
6. url: the URL of the page
7. html: the HTML of the page
8. extracted_text: the extracted text from the page (optional, defaults to None)
"""
elements: list[dict]
id_to_xpath_dict: dict[int, str]
element_tree: list[dict]
element_tree_trimmed: list[dict]
screenshots: list[bytes]
url: str
html: str
extracted_text: str | None = None
async def scrape_website(
    browser_state: BrowserState,
    url: str,
    num_retry: int = 0,
) -> ScrapedPage:
    """
    ************************************************************************************************
    ************ NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production *************
    ************************************************************************************************
    Scrape a web page with ``scrape_web_unsafe`` and retry when it fails.

    Every failed attempt is counted. Once the number of attempts exceeds the ``MAX_SCRAPING_RETRIES`` setting the
    failure is logged and a generic exception is raised; otherwise the scrape is attempted again.

    :param browser_state: BrowserState handed to ``scrape_web_unsafe``.
    :param url: URL handed to ``scrape_web_unsafe``.
    :param num_retry: Number of attempts already made before this call, defaults to 0.
    :return: The ScrapedPage returned by ``scrape_web_unsafe``.
    :raises Exception: When scraping still fails after the allowed number of attempts. The last failure is
        attached as the cause.
    """
    # A loop instead of recursion: each retry no longer adds a stack frame or nests exception contexts.
    while True:
        num_retry += 1
        try:
            return await scrape_web_unsafe(browser_state, url)
        except Exception as scrape_error:
            # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
            max_retries = SettingsManager.get_settings().MAX_SCRAPING_RETRIES
            if num_retry > max_retries:
                LOG.error(
                    "Scraping failed after max retries, aborting.",
                    max_retries=max_retries,
                    url=url,
                    exc_info=True,
                )
                raise Exception("Scraping failed.") from scrape_error
            LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url)
async def get_all_visible_text(page: Page) -> str:
    """
    Evaluate ``document.body.innerText`` in the page and return the result.

    :param page: Page instance to get the text from.
    :return: The value of ``document.body.innerText`` for that page.
    """
    return await page.evaluate("() => document.body.innerText")
async def scrape_web_unsafe(
    browser_state: BrowserState,
    url: str,
) -> ScrapedPage:
    """
    Asynchronous function that performs web scraping without any built-in error handling. This function is intended
    for use cases where the caller handles exceptions or in controlled environments. It obtains a page from the
    browser state, captures viewport screenshots while scrolling down the page, and then collects the interactable
    elements, the element tree, the page HTML and the visible text.

    :param browser_state: BrowserState used to obtain the page to scrape (via get_or_create_page).
    :param url: URL handed to get_or_create_page. Per the comment below, it is only used when a new page is created.
    :return: ScrapedPage holding the elements, the element-id-to-xpath map, the cleaned element tree, a trimmed copy
        of that tree, the screenshots, the page URL, the page HTML and the extracted visible text.
    :note: This function does not handle exceptions. Ensure proper error handling in the calling context.
    """
    # We only create a new page if one does not exist. This is to allow keeping the same page since we want to
    # continue working on the same page that we're taking actions on.
    # *This also means URL is only used when creating a new page, and not when using an existing page.
    page = await browser_state.get_or_create_page(url)

    # Take screenshots of the page with the bounding boxes. We will remove the bounding boxes later.
    # Scroll to the top of the page and take a screenshot.
    # Scroll to the next page and take a screenshot until we reach the end of the page.
    # We check if the scroll_y_px_old is the same as scroll_y_px to determine if we have reached the end of the page.
    # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after
    # clicking start my quote)

    LOG.info("Waiting for 5 seconds before scraping the website.")
    await asyncio.sleep(5)

    screenshots: list[bytes] = []
    # Start with a value that differs from the first scroll offset (assumed to be non-negative -- TODO confirm)
    # so that the first loop comparison does not end the loop immediately.
    scroll_y_px_old = -1.0
    scroll_y_px = await scroll_to_top(page, drow_boxes=True)
    # Checking max number of screenshots to prevent infinite loop
    while scroll_y_px_old != scroll_y_px and len(screenshots) < SettingsManager.get_settings().MAX_NUM_SCREENSHOTS:
        # full_page=False: capture only the current viewport; the loop scrolls to cover the rest of the page.
        screenshot = await page.screenshot(full_page=False)
        screenshots.append(screenshot)
        scroll_y_px_old = scroll_y_px
        LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
        scroll_y_px = await scroll_to_next_page(page, drow_boxes=True)
        LOG.info("Scrolled to next page", scroll_y_px=scroll_y_px, scroll_y_px_old=scroll_y_px_old)
    # Screenshots are done: drop the bounding boxes and go back to the top without redrawing them.
    await remove_bounding_boxes(page)
    await scroll_to_top(page, drow_boxes=False)

    elements, element_tree = await get_interactable_element_tree(page)
    # cleanup_elements and trim_element_tree modify the dicts they receive, hence the deep copies here and below.
    element_tree = cleanup_elements(copy.deepcopy(element_tree))

    id_to_xpath_dict = {}
    for element in elements:
        element_id = element["id"]
        # get_interactable_element_tree marks each interactable element with a unique_id attribute
        id_to_xpath_dict[element_id] = f"//*[@{SKYVERN_ID_ATTR}='{element_id}']"

    text_content = await get_all_visible_text(page)
    return ScrapedPage(
        elements=elements,
        id_to_xpath_dict=id_to_xpath_dict,
        element_tree=element_tree,
        element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
        screenshots=screenshots,
        url=page.url,
        html=await page.content(),
        extracted_text=text_content,
    )
async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]:
    """
    Run the ``buildTreeFromBody`` JS function in the page and return its two results.

    :param page: Page instance to get the element tree from.
    :return: Tuple of (elements, element_tree), in the order returned by ``buildTreeFromBody``.
    """
    # Evaluate the JS definitions first; presumably this is what makes buildTreeFromBody available in the page.
    await page.evaluate(JS_FUNCTION_DEFS)
    flat_elements, tree = await page.evaluate("() => buildTreeFromBody()")
    return flat_elements, tree
async def scroll_to_top(page: Page, drow_boxes: bool) -> float:
    """
    Call the ``scrollToTop`` JS function in the page.

    :param page: Page instance to scroll.
    :param drow_boxes: Flag forwarded to ``scrollToTop``; if True, bounding boxes are drawn around the elements.
    :return: The value returned by ``scrollToTop``, which callers treat as the vertical scroll offset in pixels.
    """
    await page.evaluate(JS_FUNCTION_DEFS)
    # Python's True/False become the JS literals true/false.
    js_flag = str(drow_boxes).lower()
    return await page.evaluate(f"() => scrollToTop({js_flag})")
async def scroll_to_next_page(page: Page, drow_boxes: bool) -> float:
    """
    Call the ``scrollToNextPage`` JS function in the page.

    :param page: Page instance to scroll.
    :param drow_boxes: Flag forwarded to ``scrollToNextPage``; if True, bounding boxes are drawn around the elements.
    :return: The value returned by ``scrollToNextPage``. The caller compares it with the previous vertical scroll
        offset to detect the end of the page, so it is annotated as a float like ``scroll_to_top`` (not a bool).
    """
    await page.evaluate(JS_FUNCTION_DEFS)
    # Python's True/False become the JS literals true/false.
    js_script = f"() => scrollToNextPage({str(drow_boxes).lower()})"
    scroll_y_px = await page.evaluate(js_script)
    return scroll_y_px
async def remove_bounding_boxes(page: Page) -> None:
    """
    Call the ``removeBoundingBoxes`` JS function in the page.

    This function does not evaluate JS_FUNCTION_DEFS itself, so ``removeBoundingBoxes`` must already be defined
    in the page when it is called.

    :param page: Page instance to remove the bounding boxes from.
    """
    await page.evaluate("() => removeBoundingBoxes()")
def cleanup_elements(elements: list[dict]) -> list[dict]:
    """
    Remove the "rect" entry from every element in the tree.

    The reason we're doing it is to
    1. reduce unnecessary data so that the llm gets less distraction
    TODO later: 2. reduce tokens sent to llm to save money

    The element dicts are modified in place; pass a copy if the original tree must stay intact.

    :param elements: List of root elements. Nested elements are reached through their "children" lists.
    :return: The same list object that was passed in, with "rect" removed from every element.
    """
    # Use a stack: list.pop() is O(1), while list.pop(0) shifts the whole list on every call.
    # The visiting order is irrelevant because every element is cleaned independently.
    pending = list(elements)
    while pending:
        node = pending.pop()
        node.pop("rect", None)
        # TODO: we can come back to test removing the unique_id
        # from element attributes to make sure this won't increase hallucination
        children = node.get("children")
        if children:
            pending.extend(children)
    return elements
def trim_element_tree(elements: list[dict]) -> list[dict]:
    """
    Strip every element in the tree down to the fields worth keeping.

    For each element (the roots and everything reachable through "children"):
    - "attributes" is replaced by the result of _trimmed_attributes, or removed when nothing is left;
    - an empty "children" value is removed;
    - "text" is removed when it is blank after stripping whitespace.

    The element dicts are modified in place; pass a copy if the original tree must stay intact.

    :param elements: List of root elements of the tree.
    :return: The same list object that was passed in, with every element trimmed.
    """
    # Use a stack: list.pop() is O(1), while list.pop(0) shifts the whole list on every call.
    # The visiting order is irrelevant because every element is trimmed independently.
    pending = list(elements)
    while pending:
        node = pending.pop()
        if "attributes" in node:
            kept = _trimmed_attributes(node.get("tagName", ""), node["attributes"])
            if kept:
                node["attributes"] = kept
            else:
                del node["attributes"]
        if "children" in node:
            children = node["children"]
            if children:
                pending.extend(children)
            else:
                del node["children"]
        # Non-string text is stringified before the blank check, so e.g. 0 is kept.
        if "text" in node and not str(node["text"]).strip():
            del node["text"]
    return elements
def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
new_attributes: dict = {}
for key in attributes:
if key == "id" and tag_name in ["input", "textarea", "select"]:
# We don't want to remove the id attribute any of these elements in case there's a label for it
new_attributes[key] = attributes[key]
if key in RESERVED_ATTRIBUTES:
new_attributes[key] = attributes[key]
return new_attributes
def _remove_rect(element: dict) -> None:
if "rect" in element:
del element["rect"]
def _remove_unique_id(element: dict) -> None:
if "attributes" not in element:
return
if SKYVERN_ID_ATTR in element["attributes"]:
del element["attributes"][SKYVERN_ID_ATTR]

View File

@@ -0,0 +1,5 @@
import re
def remove_whitespace(string: str) -> str:
    """
    Collapse every run of spaces, newlines and tabs into a single space.

    Other whitespace characters (e.g. carriage returns) are left untouched, and the result is not stripped.

    :param string: Text to normalize.
    :return: The text with each run of spaces/newlines/tabs replaced by one space.
    """
    run_of_blanks = re.compile("[ \n\t]+")
    return run_of_blanks.sub(" ", string)