Move the code over from private repository (#3)
This commit is contained in:
0
skyvern/webeye/__init__.py
Normal file
0
skyvern/webeye/__init__.py
Normal file
0
skyvern/webeye/actions/__init__.py
Normal file
0
skyvern/webeye/actions/__init__.py
Normal file
204
skyvern/webeye/actions/actions.py
Normal file
204
skyvern/webeye/actions/actions.py
Normal file
@@ -0,0 +1,204 @@
|
||||
import abc
|
||||
from enum import StrEnum
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import structlog
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
|
||||
class ActionType(StrEnum):
    """Every kind of action the agent can take on a web page."""

    CLICK = "click"
    INPUT_TEXT = "input_text"
    UPLOAD_FILE = "upload_file"
    SELECT_OPTION = "select_option"
    CHECKBOX = "checkbox"
    WAIT = "wait"
    NULL_ACTION = "null_action"
    SOLVE_CAPTCHA = "solve_captcha"
    TERMINATE = "terminate"
    COMPLETE = "complete"
    # Note: Remember to update ActionTypeUnion with new actions
|
||||
|
||||
|
||||
class Action(BaseModel):
    """Base model for all agent actions; concrete subclasses pin ``action_type``."""

    action_type: ActionType
    # Optional free-text description of the action.
    description: str | None = None
    # The LLM's stated reasoning for taking this action, when provided.
    reasoning: str | None = None
|
||||
|
||||
|
||||
class WebAction(Action, abc.ABC):
    """An action that targets a specific DOM element, referenced by its scraped element id."""

    element_id: int
|
||||
|
||||
|
||||
class ClickAction(WebAction):
    """Click a DOM element; ``file_url`` is set when the click may open a file chooser."""

    action_type: ActionType = ActionType.CLICK
    file_url: str | None = None

    def __repr__(self) -> str:
        return f"ClickAction(element_id={self.element_id}, file_url={self.file_url})"
|
||||
|
||||
|
||||
class InputTextAction(WebAction):
    """Type ``text`` into the targeted input element."""

    action_type: ActionType = ActionType.INPUT_TEXT
    text: str

    def __repr__(self) -> str:
        return f"InputTextAction(element_id={self.element_id}, text={self.text})"
|
||||
|
||||
|
||||
class UploadFileAction(WebAction):
    """Upload the file at ``file_url`` into the targeted element."""

    action_type: ActionType = ActionType.UPLOAD_FILE
    file_url: str
    # Set to False by the handler when the target turns out not to be an
    # <input type="file"> and the action is retried as a click.
    is_upload_file_tag: bool = True

    def __repr__(self) -> str:
        return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})"
|
||||
|
||||
|
||||
class NullAction(Action):
    """No-op action, used when the LLM response carries no actionable type."""

    action_type: ActionType = ActionType.NULL_ACTION
|
||||
|
||||
|
||||
class SolveCaptchaAction(Action):
    """Pause so a captcha on the page can be solved (see handle_solve_captcha_action)."""

    action_type: ActionType = ActionType.SOLVE_CAPTCHA
|
||||
|
||||
|
||||
class SelectOption(BaseModel):
    """One choice of a <select> element, identified by label, value, and/or index."""

    label: str | None
    value: str | None
    index: int | None

    def __repr__(self) -> str:
        return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
|
||||
|
||||
|
||||
class SelectOptionAction(WebAction):
    """Select ``option`` on the targeted <select> element."""

    action_type: ActionType = ActionType.SELECT_OPTION
    option: SelectOption

    def __repr__(self) -> str:
        return f"SelectOptionAction(element_id={self.element_id}, option={self.option})"
|
||||
|
||||
|
||||
###
# This action causes more harm than it does good.
# It frequently mis-behaves, or gets stuck in click loops.
# Treating checkbox actions as click actions seem to perform way more reliably
# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
###
class CheckboxAction(WebAction):
    """Check or uncheck the targeted element (handler intentionally unregistered; see note above)."""

    action_type: ActionType = ActionType.CHECKBOX
    is_checked: bool

    def __repr__(self) -> str:
        return f"CheckboxAction(element_id={self.element_id}, is_checked={self.is_checked})"
|
||||
|
||||
|
||||
class WaitAction(Action):
    """Wait before continuing (the handler sleeps, then reports a failure)."""

    action_type: ActionType = ActionType.WAIT
|
||||
|
||||
|
||||
class TerminateAction(Action):
    """The agent decided to stop working on the task."""

    action_type: ActionType = ActionType.TERMINATE
|
||||
|
||||
|
||||
class CompleteAction(Action):
    """The navigation goal is achieved; optionally triggers data extraction."""

    action_type: ActionType = ActionType.COMPLETE
    # When set, the complete handler scrapes the page against this goal.
    data_extraction_goal: str | None = None
|
||||
|
||||
|
||||
def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
    """Convert the LLM's raw JSON action list into typed :class:`Action` objects.

    Args:
        task: The task being executed; used for logging and for the COMPLETE
            action's data extraction goal.
        json_response: List of raw action dicts as returned by the LLM.

    Returns:
        The parsed actions. A COMPLETE action short-circuits: every other action
        in the response is discarded and only the CompleteAction is returned.
        Unsupported action types are logged and skipped.
    """
    actions: List[Action] = []
    for action in json_response:
        # Not every action carries an element id (wait/terminate/complete/null
        # don't target an element), so don't raise KeyError when it's absent.
        element_id = action.get("id")
        reasoning = action.get("reasoning")
        if "action_type" not in action or action["action_type"] is None:
            actions.append(NullAction(reasoning=reasoning))
            continue
        # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
        action_type = ActionType[action["action_type"].upper()]
        if action_type == ActionType.TERMINATE:
            LOG.warning(
                "Agent decided to terminate",
                task_id=task.task_id,
                llm_response=json_response,
                reasoning=reasoning,
                actions=actions,
            )
            actions.append(TerminateAction(reasoning=reasoning))
        elif action_type == ActionType.CLICK:
            file_url = action.get("file_url")
            actions.append(ClickAction(element_id=element_id, reasoning=reasoning, file_url=file_url))
        elif action_type == ActionType.INPUT_TEXT:
            actions.append(InputTextAction(element_id=element_id, text=action["text"], reasoning=reasoning))
        elif action_type == ActionType.UPLOAD_FILE:
            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
            actions.append(UploadFileAction(element_id=element_id, file_url=action["file_url"], reasoning=reasoning))
        elif action_type == ActionType.SELECT_OPTION:
            actions.append(
                SelectOptionAction(
                    element_id=element_id,
                    option=SelectOption(
                        label=action["option"]["label"],
                        value=action["option"]["value"],
                        index=action["option"]["index"],
                    ),
                    reasoning=reasoning,
                )
            )
        elif action_type == ActionType.CHECKBOX:
            actions.append(CheckboxAction(element_id=element_id, is_checked=action["is_checked"], reasoning=reasoning))
        elif action_type == ActionType.WAIT:
            actions.append(WaitAction(reasoning=reasoning))
        elif action_type == ActionType.COMPLETE:
            if actions:
                LOG.info(
                    "Navigation goal achieved, creating complete action and discarding all other actions except "
                    "complete action",
                    task_id=task.task_id,
                    nav_goal=task.navigation_goal,
                    actions=actions,
                    llm_response=json_response,
                )
            return [CompleteAction(reasoning=reasoning, data_extraction_goal=task.data_extraction_goal)]
        elif action_type == "null":
            # NOTE(review): this branch looks unreachable - ActionType["NULL"] would
            # raise KeyError above before action_type could compare equal to "null".
            actions.append(NullAction(reasoning=reasoning))
        elif action_type == ActionType.SOLVE_CAPTCHA:
            actions.append(SolveCaptchaAction(reasoning=reasoning))
        else:
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=action_type,
                raw_action=action,
            )
    return actions
|
||||
|
||||
|
||||
class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage, including:
    1. JSON representation of what the user is seeing
    """

    # Structured data extracted from the page; shape depends on the extraction goal.
    scraped_data: dict[str, Any] | list[dict[str, Any]]
|
||||
|
||||
|
||||
# https://blog.devgenius.io/deserialize-child-classes-with-pydantic-that-gonna-work-784230e1cf83
# Union of every concrete Action subclass, so pydantic can deserialize stored
# actions back into their specific types instead of the Action base class.
ActionTypeUnion = (
    ClickAction
    | InputTextAction
    | UploadFileAction
    | SelectOptionAction
    | CheckboxAction
    | WaitAction
    | NullAction
    | SolveCaptchaAction
    | TerminateAction
    | CompleteAction
)
|
||||
445
skyvern/webeye/actions/handler.py
Normal file
445
skyvern/webeye/actions/handler.py
Normal file
@@ -0,0 +1,445 @@
|
||||
import asyncio
|
||||
import re
|
||||
from typing import Awaitable, Callable, List
|
||||
|
||||
import structlog
|
||||
from playwright.async_api import Locator, Page
|
||||
|
||||
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.api.files import download_file
|
||||
from skyvern.forge.sdk.models import Step
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
from skyvern.webeye.actions import actions
|
||||
from skyvern.webeye.actions.actions import Action, ActionType, ClickAction, ScrapeResult, UploadFileAction, WebAction
|
||||
from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
|
||||
from skyvern.webeye.browser_factory import BrowserState
|
||||
from skyvern.webeye.scraper.scraper import ScrapedPage
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
|
||||
class ActionHandler:
    """Registry that maps each ActionType to the async coroutine that executes it."""

    # ActionType -> handler coroutine taking (action, page, scraped_page, task, step)
    # and returning the list of results for that action.
    _handled_action_types: dict[
        ActionType, Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]]
    ] = {}

    @classmethod
    def register_action_type(
        cls,
        action_type: ActionType,
        handler: Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]],
    ) -> None:
        """Register ``handler`` as the coroutine that executes ``action_type``."""
        cls._handled_action_types[action_type] = handler

    @staticmethod
    async def handle_action(
        scraped_page: ScrapedPage,
        task: Task,
        step: Step,
        browser_state: BrowserState,
        action: Action,
    ) -> list[ActionResult]:
        """Dispatch ``action`` to its registered handler.

        Never raises: known exceptions (MissingElement, MultipleElementsFound)
        and any unexpected exception are logged and converted into a single
        ActionFailure result.
        """
        LOG.info("Handling action", action=action)
        page = await browser_state.get_or_create_page()
        try:
            if action.action_type in ActionHandler._handled_action_types:
                handler = ActionHandler._handled_action_types[action.action_type]
                return await handler(action, page, scraped_page, task, step)
            else:
                LOG.error("Unsupported action type in handler", action=action, type=type(action))
                return [ActionFailure(Exception(f"Unsupported action type: {type(action)}"))]
        except MissingElement as e:
            # Expected failure mode; logged at info level rather than as an error.
            LOG.info("Known exceptions", action=action, exception_type=type(e), exception_message=str(e))
            return [ActionFailure(e)]
        except MultipleElementsFound as e:
            LOG.exception(
                "Cannot handle multiple elements with the same xpath in one action.",
                action=action,
                exception=e,
            )
            return [ActionFailure(e)]
        except Exception as e:
            LOG.exception("Unhandled exception in action handler", action=action, exception=e)
            return [ActionFailure(e)]
|
||||
|
||||
|
||||
async def handle_solve_captcha_action(
    action: actions.SolveCaptchaAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Give a human 30 seconds to solve the captcha manually, then report success."""
    LOG.warning(
        "Please solve the captcha on the page, you have 30 seconds",
        action=action,
    )
    await asyncio.sleep(30)
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_click_action(
    action: actions.ClickAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Validate the target element exists, then click it via chain_click (which
    retries on sibling/parent elements on failure)."""
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    # Brief pause before clicking; presumably lets the page settle - TODO confirm.
    await asyncio.sleep(0.3)
    return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
|
||||
|
||||
|
||||
async def handle_input_text_action(
    action: actions.InputTextAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Clear the target element and fill it with the action's text, falling back
    to key-by-key typing plus Enter when fill() leaves the field empty."""
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    locator = page.locator(f"xpath={xpath}")
    await locator.clear()
    await locator.fill(action.text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)

    # This is a hack that gets dropdowns to select the "best" option based on what's typed
    # Fixes situations like tsk_228671423990405776 where the location isn't being autocompleted
    await locator.press("Tab", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
    input_value = await locator.input_value(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
    if not input_value:
        # fill() can silently fail on some widgets; type sequentially and confirm with Enter.
        LOG.info("Failed to input the text, trying to press sequentially with an enter click", action=action)
        await locator.clear()
        await locator.press_sequentially(action.text, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        await locator.press("Enter", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        input_value = await locator.input_value(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        LOG.info("Input value", input_value=input_value, action=action)

    # The final input value is logged but not verified; success is reported either way.
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_upload_file_action(
    action: actions.UploadFileAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Upload the action's file into the target element, or fall back to a click
    when the target is not an <input type="file">.

    Fails fast when the file_url is missing, or when it does not appear in the
    task's navigation payload (i.e. the LLM likely hallucinated it).
    """
    if not action.file_url:
        LOG.warning("InputFileAction has no file_url", action=action)
        return [ActionFailure(MissingFileUrl())]
    # Only file urls present in the navigation payload are trusted.
    if action.file_url not in str(task.navigation_payload):
        LOG.warning(
            "LLM might be imagining the file url, which is not in navigation payload",
            action=action,
            file_url=action.file_url,
        )
        return [ActionFailure(ImaginaryFileUrl(action.file_url))]
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    file_path = download_file(action.file_url)
    locator = page.locator(f"xpath={xpath}")
    is_file_input = await is_file_input_element(locator)
    if is_file_input:
        LOG.info("Taking UploadFileAction. Found file input tag", action=action)
        if file_path:
            await page.locator(f"xpath={xpath}").set_input_files(
                file_path, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
            )

            # Sleep for 10 seconds after uploading a file to let the page process it
            await asyncio.sleep(10)
            return [ActionSuccess()]
        else:
            return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]
    else:
        LOG.info("Taking UploadFileAction. Found non file input tag", action=action)
        # treat it as a click action
        action.is_upload_file_tag = False
        return await chain_click(page, action, xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
|
||||
|
||||
|
||||
async def handle_null_action(
    action: actions.NullAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """No-op: a null action always succeeds without touching the page."""
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_select_option_action(
    action: actions.SelectOptionAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Select an option on a <select> element: first by label, then - if that
    fails and the action carries an index - by option index.

    The index fallback distinguishes two cases: the supplied index may be an
    element id referencing an <option>'s xpath in the scraped page, or a plain
    positional index into the <select> element itself.
    """
    xpath = await validate_actions_in_dom(action, page, scraped_page)

    try:
        # First click by label (if it matches)
        await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        await page.select_option(
            xpath,
            label=action.option.label,
            timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
        )
        # Second click - presumably closes the dropdown after selecting; TODO confirm.
        await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
        return [ActionSuccess()]
    except Exception as e:
        if action.option.index is not None:
            LOG.warning(
                "Failed to click on the option by label, trying by index",
                exc_info=e,
                action=action,
                xpath=xpath,
            )
        else:
            # No index to fall back on; report the label-based failure.
            return [ActionFailure(e)]

    try:
        option_xpath = scraped_page.id_to_xpath_dict[action.option.index]
        match = re.search(r"option\[(\d+)]$", option_xpath)
        if match:
            # This means we were trying to select an option xpath, click the option
            option_index = int(match.group(1))
            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
            await page.select_option(
                xpath,
                index=option_index,
                timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
            )
            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
            return [ActionSuccess()]
        else:
            # This means the supplied index was for the select element, not a reference to the xpath dict
            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
            await page.select_option(
                xpath,
                index=action.option.index,
                timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
            )
            await page.click(f"xpath={xpath}", timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
            return [ActionSuccess()]
    except Exception as e:
        LOG.warning("Failed to click on the option by index", exception=e, action=action)
        return [ActionFailure(e)]
|
||||
|
||||
|
||||
async def handle_checkbox_action(
    action: actions.CheckboxAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    ******* NOT REGISTERED *******
    This action causes more harm than it does good.
    It frequently mis-behaves, or gets stuck in click loops.
    Treating checkbox actions as click actions seem to perform way more reliably
    Developers who tried this and failed: 2 (Suchintan and Shu 😂)
    """
    # Fixed: the first parameter was named `self`, but this is a module-level
    # handler coroutine (not a method) - renamed to `action` to match every
    # sibling handler. All callers pass the argument positionally.
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    if action.is_checked:
        await page.check(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
    else:
        await page.uncheck(xpath, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)

    # TODO (suchintan): Why does checking the label work, but not the actual input element?
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_wait_action(
    action: actions.WaitAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Sleep for 10 seconds, then deliberately report the action as a failure
    (see the exception message)."""
    await asyncio.sleep(10)
    return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
|
||||
|
||||
|
||||
async def handle_terminate_action(
    action: actions.TerminateAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """No browser work needed for termination; the handler itself always succeeds."""
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_complete_action(
    action: actions.CompleteAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Finish the task; when the action carries a data extraction goal, scrape
    the page and attach the extracted data to the success result."""
    extracted_data = None
    if action.data_extraction_goal:
        scrape_action_result = await extract_information_for_navigation_goal(
            scraped_page=scraped_page,
            task=task,
            step=step,
        )
        extracted_data = scrape_action_result.scraped_data
    return [ActionSuccess(data=extracted_data)]
|
||||
|
||||
|
||||
# Wire each ActionType to its handler coroutine. CHECKBOX is deliberately not
# registered - see the docstring on handle_checkbox_action.
ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action)
ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
|
||||
|
||||
|
||||
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
    """Resolve the action's element id to its xpath and verify that exactly one
    DOM element matches it.

    Raises:
        MissingElement: no element matches the xpath.
        MultipleElementsFound: more than one element matches the xpath.
    """
    xpath = scraped_page.id_to_xpath_dict[action.element_id]
    num_elements = await page.locator(xpath).count()

    # Guard clauses: anything other than exactly one match is a validation failure.
    if num_elements < 1:
        LOG.warning("No elements found with action xpath. Validation failed.", action=action, xpath=xpath)
        raise MissingElement(xpath=xpath, element_id=action.element_id)
    if num_elements > 1:
        LOG.warning(
            "Multiple elements found with action xpath. Expected 1. Validation failed.",
            action=action,
            num_elements=num_elements,
        )
        raise MultipleElementsFound(num=num_elements, xpath=xpath, element_id=action.element_id)

    LOG.info("Validated action xpath in DOM", action=action)
    return xpath
|
||||
|
||||
|
||||
async def chain_click(
    page: Page,
    action: ClickAction | UploadFileAction,
    xpath: str,
    timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> List[ActionResult]:
    """Click the element identified by ``xpath``, falling back to a sibling
    label (for input elements) and then to the parent element when the direct
    click fails.

    Returns the list of ActionResults accumulated across the attempts; the
    first ActionSuccess ends the chain.
    """
    # Add a defensive page handler here in case a click action opens a file chooser.
    # This automatically dismisses the dialog
    # File choosers are impossible to close if you don't expect one. Instead of dealing with it, close it!

    # TODO (suchintan): This should likely result in an ActionFailure -- we can figure out how to do this later!
    LOG.info("Chain click starts", action=action, xpath=xpath)
    file: list[str] | str = []
    if action.file_url:
        file = download_file(action.file_url) or []

    # Named function instead of a lambda (PEP 8 E731); the same reference is
    # needed again for remove_listener in the finally block.
    def fc_func(fc):
        fc.set_files(files=file)

    page.on("filechooser", fc_func)

    LOG.info("Registered file chooser listener", action=action, path=file)
    javascript_triggered = await is_javascript_triggered(page, xpath)
    try:
        await page.click(f"xpath={xpath}", timeout=timeout)
        LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
        return [ActionSuccess(javascript_triggered=javascript_triggered)]
    except Exception as e:
        action_results: list[ActionResult] = [ActionFailure(e, javascript_triggered=javascript_triggered)]
        if await is_input_element(page.locator(xpath)):
            LOG.info("Chain click: it's an input element. going to try sibling click", action=action, xpath=xpath)
            sibling_action_result = await click_sibling_of_input(page.locator(xpath), timeout=timeout)
            action_results.append(sibling_action_result)
            # isinstance instead of an exact type comparison (ActionFailure does
            # not derive from ActionSuccess, so behavior is unchanged).
            if isinstance(sibling_action_result, ActionSuccess):
                return action_results

        parent_xpath = f"{xpath}/.."
        try:
            parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
            javascript_triggered = javascript_triggered or parent_javascript_triggered
            parent_locator = page.locator(xpath).locator("..")
            await parent_locator.click(timeout=timeout)
            LOG.info("Chain click: successfully clicked parent element", action=action, parent_xpath=parent_xpath)
            action_results.append(
                ActionSuccess(
                    javascript_triggered=javascript_triggered,
                    interacted_with_parent=True,
                )
            )
        except Exception as pe:
            LOG.warning("Failed to click parent element", action=action, parent_xpath=parent_xpath, exc_info=True)
            action_results.append(
                ActionFailure(pe, javascript_triggered=javascript_triggered, interacted_with_parent=True)
            )
            # We don't raise exception here because we do log the exception, and return ActionFailure as the last action

        return action_results
    finally:
        LOG.info("Remove file chooser listener", action=action)

        # Sleep for 10 seconds after uploading a file to let the page process it
        # Removing this breaks file uploads using the filechooser
        # KEREM DO NOT REMOVE
        if file:
            await asyncio.sleep(10)
        page.remove_listener("filechooser", fc_func)
|
||||
|
||||
|
||||
async def is_javascript_triggered(page: Page, xpath: str) -> bool:
    """Return True when the element at ``xpath`` is an anchor whose href is a
    ``javascript:`` call."""
    element = page.locator(f"xpath={xpath}").first
    tag_name = await element.evaluate("e => e.tagName")
    if tag_name.lower() != "a":
        return False
    href = await element.evaluate("e => e.href")
    if not href.lower().startswith("javascript:"):
        return False
    LOG.info("Found javascript call in anchor tag, marking step as completed. Dropping remaining actions")
    return True
|
||||
|
||||
|
||||
async def is_file_input_element(locator: Locator) -> bool:
    """Return True when the first matched element is an ``<input type="file">``."""
    first = locator.first
    if not first:
        return False
    tag = await first.evaluate("el => el.tagName")
    input_type = await first.evaluate("el => el.type")
    return tag.lower() == "input" and input_type == "file"
|
||||
|
||||
|
||||
async def is_input_element(locator: Locator) -> bool:
    """Return True when the first matched element is an ``<input>`` tag."""
    first = locator.first
    if not first:
        return False
    return (await first.evaluate("el => el.tagName")).lower() == "input"
|
||||
|
||||
|
||||
async def click_sibling_of_input(
    locator: Locator,
    timeout: int,
    javascript_triggered: bool = False,
) -> ActionResult:
    """Click the ``<label for="...">`` sibling of an input element; used as a
    fallback when clicking the input itself failed.

    Returns an ActionResult (never raises): any exception is logged and turned
    into an ActionFailure.
    """
    try:
        input_element = locator.first
        parent_locator = locator.locator("..")
        if input_element:
            input_id = await input_element.get_attribute("id")
            sibling_label_xpath = f'//label[@for="{input_id}"]'
            label_locator = parent_locator.locator(sibling_label_xpath)
            await label_locator.click(timeout=timeout)
            LOG.info(
                "Successfully clicked sibling label of input element",
                sibling_label_xpath=sibling_label_xpath,
            )
            return ActionSuccess(javascript_triggered=javascript_triggered, interacted_with_sibling=True)
        # Should never get here
        # NOTE(review): Locator.first appears to always be truthy, which would make
        # this fallback unreachable - confirm against playwright semantics.
        return ActionFailure(
            exception=Exception("Failed while trying to click sibling of input element"),
            javascript_triggered=javascript_triggered,
            interacted_with_sibling=True,
        )
    except Exception as e:
        LOG.warning("Failed to click sibling label of input element", exc_info=e)
        return ActionFailure(exception=e, javascript_triggered=javascript_triggered)
|
||||
|
||||
|
||||
async def extract_information_for_navigation_goal(
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
) -> ScrapeResult:
    """
    Scrapes a webpage and returns the scraped response, including:
    1. JSON representation of what the user is seeing

    Builds the "extract-information" prompt from the task's extraction goal and
    the scraped page, then asks the LLM (with screenshots) for the data.
    """
    prompt_template = "extract-information"

    extract_information_prompt = prompt_engine.load_prompt(
        prompt_template,
        navigation_goal=task.navigation_goal,
        elements=scraped_page.element_tree,
        data_extraction_goal=task.data_extraction_goal,
        extracted_information_schema=task.extracted_information_schema,
        current_url=scraped_page.url,
        extracted_text=scraped_page.extracted_text,
    )

    # The LLM's JSON response is used directly as the scraped data payload.
    json_response = await app.OPENAI_CLIENT.chat_completion(
        step=step,
        prompt=extract_information_prompt,
        screenshots=scraped_page.screenshots,
    )

    return ScrapeResult(
        scraped_data=json_response,
    )
|
||||
58
skyvern/webeye/actions/models.py
Normal file
58
skyvern/webeye/actions/models.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
from skyvern.webeye.actions.actions import Action, ActionTypeUnion
|
||||
from skyvern.webeye.actions.responses import ActionResult
|
||||
from skyvern.webeye.scraper.scraper import ScrapedPage
|
||||
|
||||
|
||||
class AgentStepOutput(BaseModel):
    """
    Output of the agent step, this is recorded in the database.
    """

    # Will be deprecated once we move to the new format below
    action_results: list[ActionResult] | None = None
    # Nullable for backwards compatibility, once backfill is done, this won't be nullable anymore
    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None = None

    def __repr__(self) -> str:
        return f"AgentStepOutput({self.model_dump()})"

    def __str__(self) -> str:
        return self.__repr__()
|
||||
|
||||
|
||||
class DetailedAgentStepOutput(BaseModel):
    """
    Output of the agent step, this is not recorded in the database, only used for debugging in the Jupyter notebook.
    """

    scraped_page: ScrapedPage | None
    extract_action_prompt: str | None
    llm_response: dict[str, Any] | None
    actions: list[Action] | None
    action_results: list[ActionResult] | None
    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None

    class Config:
        # NOTE(review): `exclude` is not a standard pydantic Config key - verify
        # this actually keeps these fields out of serialized output.
        exclude = ["scraped_page", "extract_action_prompt"]

    def __repr__(self) -> str:
        # Full dump only in debug mode; otherwise fall back to the compact
        # AgentStepOutput form to avoid printing the whole scraped page.
        if SettingsManager.get_settings().DEBUG_MODE:
            return f"DetailedAgentStepOutput({self.model_dump()})"
        else:
            return f"AgentStepOutput({self.to_agent_step_output().model_dump()})"

    def __str__(self) -> str:
        return self.__repr__()

    def to_agent_step_output(self) -> AgentStepOutput:
        """Collapse this debug output into the persisted AgentStepOutput form."""
        return AgentStepOutput(
            action_results=self.action_results if self.action_results else [],
            actions_and_results=self.actions_and_results if self.actions_and_results else [],
        )
|
||||
62
skyvern/webeye/actions/responses.py
Normal file
62
skyvern/webeye/actions/responses.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.webeye.string_util import remove_whitespace
|
||||
|
||||
|
||||
class ActionResult(BaseModel):
    """Outcome of executing a single action attempt."""

    success: bool
    exception_type: str | None = None
    exception_message: str | None = None
    data: dict[str, Any] | list | str | None = None
    step_retry_number: int | None = None
    step_order: int | None = None
    javascript_triggered: bool = False
    # None is used for old data so that we can differentiate between old and new data which only has boolean
    interacted_with_sibling: bool | None = None
    interacted_with_parent: bool | None = None

    def __str__(self) -> str:
        # Fixed: the closing parenthesis used to appear before `data`, producing
        # "ActionResult(...), data=..." instead of one parenthesized repr.
        return (
            f"ActionResult(success={self.success}, exception_type={self.exception_type}, "
            f"exception_message={self.exception_message}, data={self.data})"
        )

    def __repr__(self) -> str:
        return self.__str__()
|
||||
|
||||
|
||||
class ActionSuccess(ActionResult):
    """ActionResult with ``success=True``; convenience constructor for handlers."""

    def __init__(
        self,
        data: dict[str, Any] | list | str | None = None,
        javascript_triggered: bool = False,
        interacted_with_sibling: bool = False,
        interacted_with_parent: bool = False,
    ):
        super().__init__(
            success=True,
            data=data,
            javascript_triggered=javascript_triggered,
            interacted_with_sibling=interacted_with_sibling,
            interacted_with_parent=interacted_with_parent,
        )
|
||||
|
||||
|
||||
class ActionFailure(ActionResult):
    """ActionResult with ``success=False``, built from the causing exception.

    The exception's class name and whitespace-normalized message are recorded;
    the exception object itself is not retained.
    """

    def __init__(
        self,
        exception: Exception,
        javascript_triggered: bool = False,
        interacted_with_sibling: bool = False,
        interacted_with_parent: bool = False,
    ):
        super().__init__(
            success=False,
            exception_type=type(exception).__name__,
            exception_message=remove_whitespace(str(exception)),
            javascript_triggered=javascript_triggered,
            interacted_with_sibling=interacted_with_sibling,
            interacted_with_parent=interacted_with_parent,
        )
|
||||
167
skyvern/webeye/browser_factory.py
Normal file
167
skyvern/webeye/browser_factory.py
Normal file
@@ -0,0 +1,167 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
from typing import Any, Awaitable, Protocol
|
||||
|
||||
import structlog
|
||||
from playwright.async_api import BrowserContext, Error, Page, Playwright, async_playwright
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.exceptions import FailedToNavigateToUrl, UnknownBrowserType, UnknownErrorWhileCreatingBrowserContext
|
||||
from skyvern.forge.sdk.core.skyvern_context import current
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
|
||||
class BrowserContextCreator(Protocol):
|
||||
def __call__(
|
||||
self, playwright: Playwright, **kwargs: dict[str, Any]
|
||||
) -> Awaitable[tuple[BrowserContext, BrowserArtifacts]]:
|
||||
...
|
||||
|
||||
|
||||
class BrowserContextFactory:
|
||||
_creators: dict[str, BrowserContextCreator] = {}
|
||||
|
||||
@staticmethod
|
||||
def get_subdir() -> str:
|
||||
curr_context = current()
|
||||
if curr_context and curr_context.task_id:
|
||||
return curr_context.task_id
|
||||
elif curr_context and curr_context.request_id:
|
||||
return curr_context.request_id
|
||||
return str(uuid.uuid4())
|
||||
|
||||
@staticmethod
|
||||
def build_browser_args() -> dict[str, Any]:
|
||||
video_dir = f"{SettingsManager.get_settings().VIDEO_PATH}/{datetime.utcnow().strftime('%Y-%m-%d')}"
|
||||
har_dir = f"{SettingsManager.get_settings().HAR_PATH}/{datetime.utcnow().strftime('%Y-%m-%d')}/{BrowserContextFactory.get_subdir()}.har"
|
||||
return {
|
||||
"record_har_path": har_dir,
|
||||
"record_video_dir": video_dir,
|
||||
"viewport": {"width": 1920, "height": 1080},
|
||||
}
|
||||
|
||||
@staticmethod
|
||||
def build_browser_artifacts(
|
||||
video_path: str | None = None, har_path: str | None = None, video_artifact_id: str | None = None
|
||||
) -> BrowserArtifacts:
|
||||
return BrowserArtifacts(video_path=video_path, har_path=har_path, video_artifact_id=video_artifact_id)
|
||||
|
||||
@classmethod
|
||||
def register_type(cls, browser_type: str, creator: BrowserContextCreator) -> None:
|
||||
cls._creators[browser_type] = creator
|
||||
|
||||
@classmethod
|
||||
async def create_browser_context(
|
||||
cls, playwright: Playwright, **kwargs: Any
|
||||
) -> tuple[BrowserContext, BrowserArtifacts]:
|
||||
browser_type = SettingsManager.get_settings().BROWSER_TYPE
|
||||
try:
|
||||
creator = cls._creators.get(browser_type)
|
||||
if not creator:
|
||||
raise UnknownBrowserType(browser_type)
|
||||
return await creator(playwright, **kwargs)
|
||||
except UnknownBrowserType as e:
|
||||
raise e
|
||||
except Exception as e:
|
||||
raise UnknownErrorWhileCreatingBrowserContext(browser_type, e) from e
|
||||
|
||||
|
||||
class BrowserArtifacts(BaseModel):
|
||||
video_path: str | None = None
|
||||
video_artifact_id: str | None = None
|
||||
har_path: str | None = None
|
||||
|
||||
|
||||
async def _create_headless_chromium(playwright: Playwright, **kwargs: dict) -> tuple[BrowserContext, BrowserArtifacts]:
|
||||
browser = await playwright.chromium.launch(headless=True)
|
||||
browser_args = BrowserContextFactory.build_browser_args()
|
||||
browser_artifacts = BrowserContextFactory.build_browser_artifacts(har_path=browser_args["record_har_path"])
|
||||
browser_context = await browser.new_context(**browser_args)
|
||||
return browser_context, browser_artifacts
|
||||
|
||||
|
||||
async def _create_headful_chromium(playwright: Playwright, **kwargs: dict) -> tuple[BrowserContext, BrowserArtifacts]:
|
||||
browser = await playwright.chromium.launch(headless=False)
|
||||
browser_args = BrowserContextFactory.build_browser_args()
|
||||
browser_artifacts = BrowserContextFactory.build_browser_artifacts(har_path=browser_args["record_har_path"])
|
||||
browser_context = await browser.new_context(**browser_args)
|
||||
return browser_context, browser_artifacts
|
||||
|
||||
|
||||
BrowserContextFactory.register_type("chromium-headless", _create_headless_chromium)
|
||||
BrowserContextFactory.register_type("chromium-headful", _create_headful_chromium)
|
||||
|
||||
|
||||
class BrowserState:
|
||||
instance = None
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
pw: Playwright | None = None,
|
||||
browser_context: BrowserContext | None = None,
|
||||
page: Page | None = None,
|
||||
browser_artifacts: BrowserArtifacts = BrowserArtifacts(),
|
||||
):
|
||||
self.pw = pw
|
||||
self.browser_context = browser_context
|
||||
self.page = page
|
||||
self.browser_artifacts = browser_artifacts
|
||||
|
||||
async def _close_all_other_pages(self) -> None:
|
||||
if not self.browser_context or not self.page:
|
||||
return
|
||||
pages = self.browser_context.pages
|
||||
for page in pages:
|
||||
if page != self.page:
|
||||
await page.close()
|
||||
|
||||
async def check_and_fix_state(self, url: str | None = None) -> None:
|
||||
if self.pw is None:
|
||||
LOG.info("Starting playwright")
|
||||
self.pw = await async_playwright().start()
|
||||
LOG.info("playwright is started")
|
||||
if self.browser_context is None:
|
||||
LOG.info("creating browser context")
|
||||
browser_context, browser_artifacts = await BrowserContextFactory.create_browser_context(self.pw, url=url)
|
||||
self.browser_context = browser_context
|
||||
self.browser_artifacts = browser_artifacts
|
||||
LOG.info("browser context is created")
|
||||
|
||||
assert self.browser_context is not None
|
||||
|
||||
if self.page is None:
|
||||
LOG.info("Creating a new page")
|
||||
self.page = await self.browser_context.new_page()
|
||||
await self._close_all_other_pages()
|
||||
LOG.info("A new page is created")
|
||||
if url:
|
||||
LOG.info(f"Navigating page to {url} and waiting for 5 seconds")
|
||||
try:
|
||||
await self.page.goto(url)
|
||||
except Error as playright_error:
|
||||
LOG.exception(f"Error while navigating to url: {str(playright_error)}", exc_info=True)
|
||||
raise FailedToNavigateToUrl(url=url, error_message=str(playright_error))
|
||||
LOG.info(f"Successfully went to {url}")
|
||||
|
||||
if self.browser_artifacts.video_path is None:
|
||||
self.browser_artifacts.video_path = await self.page.video.path()
|
||||
|
||||
async def get_or_create_page(self, url: str | None = None) -> Page:
|
||||
await self.check_and_fix_state(url)
|
||||
assert self.page is not None
|
||||
return self.page
|
||||
|
||||
async def close(self, close_browser_on_completion: bool = True) -> None:
|
||||
LOG.info("Closing browser state")
|
||||
if self.browser_context and close_browser_on_completion:
|
||||
LOG.info("Closing browser context and its pages")
|
||||
await self.browser_context.close()
|
||||
LOG.info("Main browser context and all its pages are closed")
|
||||
if self.pw and close_browser_on_completion:
|
||||
LOG.info("Stopping playwright")
|
||||
await self.pw.stop()
|
||||
LOG.info("Playwright is stopped")
|
||||
152
skyvern/webeye/browser_manager.py
Normal file
152
skyvern/webeye/browser_manager.py
Normal file
@@ -0,0 +1,152 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import structlog
|
||||
from playwright.async_api import Browser, Playwright, async_playwright
|
||||
|
||||
from skyvern.exceptions import MissingBrowserState
|
||||
from skyvern.forge.sdk.schemas.tasks import ProxyLocation, Task
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRun
|
||||
from skyvern.webeye.browser_factory import BrowserContextFactory, BrowserState
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
|
||||
class BrowserManager:
|
||||
instance = None
|
||||
pages: dict[str, BrowserState] = dict()
|
||||
|
||||
def __new__(cls) -> BrowserManager:
|
||||
if cls.instance is None:
|
||||
cls.instance = super().__new__(cls)
|
||||
return cls.instance
|
||||
|
||||
@staticmethod
|
||||
async def _create_browser_state(
|
||||
proxy_location: ProxyLocation | None = None, url: str | None = None
|
||||
) -> BrowserState:
|
||||
pw = await async_playwright().start()
|
||||
browser_context, browser_artifacts = await BrowserContextFactory.create_browser_context(
|
||||
pw, proxy_location=proxy_location, url=url
|
||||
)
|
||||
return BrowserState(pw=pw, browser_context=browser_context, page=None, browser_artifacts=browser_artifacts)
|
||||
|
||||
async def get_or_create_for_task(self, task: Task) -> BrowserState:
|
||||
if task.task_id in self.pages:
|
||||
return self.pages[task.task_id]
|
||||
elif task.workflow_run_id in self.pages:
|
||||
LOG.info(
|
||||
"Browser state for task not found. Using browser state for workflow run",
|
||||
task_id=task.task_id,
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
)
|
||||
self.pages[task.task_id] = self.pages[task.workflow_run_id]
|
||||
return self.pages[task.task_id]
|
||||
LOG.info("Creating browser state for task", task_id=task.task_id)
|
||||
browser_state = await self._create_browser_state(task.proxy_location, task.url)
|
||||
|
||||
# The URL here is only used when creating a new page, and not when using an existing page.
|
||||
# This will make sure browser_state.page is not None.
|
||||
await browser_state.get_or_create_page(task.url)
|
||||
|
||||
self.pages[task.task_id] = browser_state
|
||||
return browser_state
|
||||
|
||||
async def get_or_create_for_workflow_run(self, workflow_run: WorkflowRun, url: str | None = None) -> BrowserState:
|
||||
if workflow_run.workflow_run_id in self.pages:
|
||||
return self.pages[workflow_run.workflow_run_id]
|
||||
LOG.info("Creating browser state for workflow run", workflow_run_id=workflow_run.workflow_run_id)
|
||||
browser_state = await self._create_browser_state(workflow_run.proxy_location, url=url)
|
||||
|
||||
# The URL here is only used when creating a new page, and not when using an existing page.
|
||||
# This will make sure browser_state.page is not None.
|
||||
await browser_state.get_or_create_page(url)
|
||||
|
||||
self.pages[workflow_run.workflow_run_id] = browser_state
|
||||
return browser_state
|
||||
|
||||
def set_video_artifact_for_task(self, task: Task, artifact_id: str) -> None:
|
||||
if task.workflow_run_id and task.workflow_run_id in self.pages:
|
||||
if self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id:
|
||||
LOG.warning(
|
||||
"Video artifact is already set for workflow run. Overwriting",
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
old_artifact_id=self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id,
|
||||
new_artifact_id=artifact_id,
|
||||
)
|
||||
self.pages[task.workflow_run_id].browser_artifacts.video_artifact_id = artifact_id
|
||||
return
|
||||
if task.task_id in self.pages:
|
||||
if self.pages[task.task_id].browser_artifacts.video_artifact_id:
|
||||
LOG.warning(
|
||||
"Video artifact is already set for task. Overwriting",
|
||||
task_id=task.task_id,
|
||||
old_artifact_id=self.pages[task.task_id].browser_artifacts.video_artifact_id,
|
||||
new_artifact_id=artifact_id,
|
||||
)
|
||||
self.pages[task.task_id].browser_artifacts.video_artifact_id = artifact_id
|
||||
return
|
||||
|
||||
raise MissingBrowserState(task_id=task.task_id)
|
||||
|
||||
async def get_video_data(
|
||||
self, browser_state: BrowserState, task_id: str = "", workflow_id: str = "", workflow_run_id: str = ""
|
||||
) -> bytes:
|
||||
if browser_state:
|
||||
path = browser_state.browser_artifacts.video_path
|
||||
if path:
|
||||
with open(path, "rb") as f:
|
||||
return f.read()
|
||||
LOG.warning(
|
||||
"Video data not found for task", task_id=task_id, workflow_id=workflow_id, workflow_run_id=workflow_run_id
|
||||
)
|
||||
return b""
|
||||
|
||||
async def get_har_data(
|
||||
self, browser_state: BrowserState, task_id: str = "", workflow_id: str = "", workflow_run_id: str = ""
|
||||
) -> bytes:
|
||||
if browser_state:
|
||||
path = browser_state.browser_artifacts.har_path
|
||||
if path:
|
||||
with open(path, "rb") as f:
|
||||
return f.read()
|
||||
LOG.warning(
|
||||
"HAR data not found for task", task_id=task_id, workflow_id=workflow_id, workflow_run_id=workflow_run_id
|
||||
)
|
||||
return b""
|
||||
|
||||
@classmethod
|
||||
async def connect_to_scraping_browser(cls, pw: Playwright) -> Browser:
|
||||
if not SettingsManager.get_settings().REMOTE_BROWSER_KEY:
|
||||
raise Exception("REMOTE_BROWSER_KEY is empty. Cannot connect to remote browser.")
|
||||
browser = await pw.chromium.connect_over_cdp(SettingsManager.get_settings().REMOTE_BROWSER_KEY)
|
||||
LOG.info("Connected to remote browser", browser_type=SettingsManager.get_settings().BROWSER_TYPE)
|
||||
return browser
|
||||
|
||||
@classmethod
|
||||
async def close(cls) -> None:
|
||||
LOG.info("Closing BrowserManager")
|
||||
for browser_state in cls.pages.values():
|
||||
await browser_state.close()
|
||||
cls.pages = dict()
|
||||
LOG.info("BrowserManger is closed")
|
||||
|
||||
async def cleanup_for_task(self, task_id: str, close_browser_on_completion: bool = True) -> BrowserState | None:
|
||||
LOG.info("Cleaning up for task")
|
||||
browser_state_to_close = self.pages.pop(task_id, None)
|
||||
if browser_state_to_close:
|
||||
await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
|
||||
LOG.info("Task is cleaned up")
|
||||
|
||||
return browser_state_to_close
|
||||
|
||||
async def cleanup_for_workflow_run(
|
||||
self, workflow_run_id: str, close_browser_on_completion: bool = True
|
||||
) -> BrowserState | None:
|
||||
LOG.info("Cleaning up for workflow run")
|
||||
browser_state_to_close = self.pages.pop(workflow_run_id, None)
|
||||
if browser_state_to_close:
|
||||
await browser_state_to_close.close(close_browser_on_completion=close_browser_on_completion)
|
||||
LOG.info("Workflow run is cleaned up")
|
||||
|
||||
return browser_state_to_close
|
||||
0
skyvern/webeye/scraper/__init__.py
Normal file
0
skyvern/webeye/scraper/__init__.py
Normal file
806
skyvern/webeye/scraper/domUtils.js
Normal file
806
skyvern/webeye/scraper/domUtils.js
Normal file
@@ -0,0 +1,806 @@
|
||||
// Commands for manipulating rects.
|
||||
class Rect {
|
||||
// Create a rect given the top left and bottom right corners.
|
||||
static create(x1, y1, x2, y2) {
|
||||
return {
|
||||
bottom: y2,
|
||||
top: y1,
|
||||
left: x1,
|
||||
right: x2,
|
||||
width: x2 - x1,
|
||||
height: y2 - y1,
|
||||
};
|
||||
}
|
||||
|
||||
static copy(rect) {
|
||||
return {
|
||||
bottom: rect.bottom,
|
||||
top: rect.top,
|
||||
left: rect.left,
|
||||
right: rect.right,
|
||||
width: rect.width,
|
||||
height: rect.height,
|
||||
};
|
||||
}
|
||||
|
||||
// Translate a rect by x horizontally and y vertically.
|
||||
static translate(rect, x, y) {
|
||||
if (x == null) x = 0;
|
||||
if (y == null) y = 0;
|
||||
return {
|
||||
bottom: rect.bottom + y,
|
||||
top: rect.top + y,
|
||||
left: rect.left + x,
|
||||
right: rect.right + x,
|
||||
width: rect.width,
|
||||
height: rect.height,
|
||||
};
|
||||
}
|
||||
|
||||
// Determine whether two rects overlap.
|
||||
static intersects(rect1, rect2) {
|
||||
return (
|
||||
rect1.right > rect2.left &&
|
||||
rect1.left < rect2.right &&
|
||||
rect1.bottom > rect2.top &&
|
||||
rect1.top < rect2.bottom
|
||||
);
|
||||
}
|
||||
|
||||
static equals(rect1, rect2) {
|
||||
for (const property of [
|
||||
"top",
|
||||
"bottom",
|
||||
"left",
|
||||
"right",
|
||||
"width",
|
||||
"height",
|
||||
]) {
|
||||
if (rect1[property] !== rect2[property]) return false;
|
||||
}
|
||||
return true;
|
||||
}
|
||||
}
|
||||
|
||||
class DomUtils {
|
||||
//
|
||||
// Bounds the rect by the current viewport dimensions. If the rect is offscreen or has a height or
|
||||
// width < 3 then null is returned instead of a rect.
|
||||
//
|
||||
static cropRectToVisible(rect) {
|
||||
const boundedRect = Rect.create(
|
||||
Math.max(rect.left, 0),
|
||||
Math.max(rect.top, 0),
|
||||
rect.right,
|
||||
rect.bottom,
|
||||
);
|
||||
if (
|
||||
boundedRect.top >= window.innerHeight - 4 ||
|
||||
boundedRect.left >= window.innerWidth - 4
|
||||
) {
|
||||
return null;
|
||||
} else {
|
||||
return boundedRect;
|
||||
}
|
||||
}
|
||||
|
||||
static getVisibleClientRect(element, testChildren) {
|
||||
// Note: this call will be expensive if we modify the DOM in between calls.
|
||||
let clientRect;
|
||||
if (testChildren == null) testChildren = false;
|
||||
const clientRects = (() => {
|
||||
const result = [];
|
||||
for (clientRect of element.getClientRects()) {
|
||||
result.push(Rect.copy(clientRect));
|
||||
}
|
||||
return result;
|
||||
})();
|
||||
|
||||
// Inline elements with font-size: 0px; will declare a height of zero, even if a child with
|
||||
// non-zero font-size contains text.
|
||||
let isInlineZeroHeight = function () {
|
||||
const elementComputedStyle = window.getComputedStyle(element, null);
|
||||
const isInlineZeroFontSize =
|
||||
0 ===
|
||||
elementComputedStyle.getPropertyValue("display").indexOf("inline") &&
|
||||
elementComputedStyle.getPropertyValue("font-size") === "0px";
|
||||
// Override the function to return this value for the rest of this context.
|
||||
isInlineZeroHeight = () => isInlineZeroFontSize;
|
||||
return isInlineZeroFontSize;
|
||||
};
|
||||
|
||||
for (clientRect of clientRects) {
|
||||
// If the link has zero dimensions, it may be wrapping visible but floated elements. Check for
|
||||
// this.
|
||||
let computedStyle;
|
||||
if ((clientRect.width === 0 || clientRect.height === 0) && testChildren) {
|
||||
for (const child of Array.from(element.children)) {
|
||||
computedStyle = window.getComputedStyle(child, null);
|
||||
// Ignore child elements which are not floated and not absolutely positioned for parent
|
||||
// elements with zero width/height, as long as the case described at isInlineZeroHeight
|
||||
// does not apply.
|
||||
// NOTE(mrmr1993): This ignores floated/absolutely positioned descendants nested within
|
||||
// inline children.
|
||||
const position = computedStyle.getPropertyValue("position");
|
||||
if (
|
||||
computedStyle.getPropertyValue("float") === "none" &&
|
||||
!["absolute", "fixed"].includes(position) &&
|
||||
!(
|
||||
clientRect.height === 0 &&
|
||||
isInlineZeroHeight() &&
|
||||
0 === computedStyle.getPropertyValue("display").indexOf("inline")
|
||||
)
|
||||
) {
|
||||
continue;
|
||||
}
|
||||
const childClientRect = this.getVisibleClientRect(child, true);
|
||||
if (
|
||||
childClientRect === null ||
|
||||
childClientRect.width < 3 ||
|
||||
childClientRect.height < 3
|
||||
)
|
||||
continue;
|
||||
return childClientRect;
|
||||
}
|
||||
} else {
|
||||
clientRect = this.cropRectToVisible(clientRect);
|
||||
|
||||
if (
|
||||
clientRect === null ||
|
||||
clientRect.width < 3 ||
|
||||
clientRect.height < 3
|
||||
)
|
||||
continue;
|
||||
|
||||
// eliminate invisible elements (see test_harnesses/visibility_test.html)
|
||||
computedStyle = window.getComputedStyle(element, null);
|
||||
if (computedStyle.getPropertyValue("visibility") !== "visible")
|
||||
continue;
|
||||
|
||||
return clientRect;
|
||||
}
|
||||
}
|
||||
|
||||
return null;
|
||||
}
|
||||
|
||||
static getViewportTopLeft() {
|
||||
const box = document.documentElement;
|
||||
const style = getComputedStyle(box);
|
||||
const rect = box.getBoundingClientRect();
|
||||
if (
|
||||
style.position === "static" &&
|
||||
!/content|paint|strict/.test(style.contain || "")
|
||||
) {
|
||||
// The margin is included in the client rect, so we need to subtract it back out.
|
||||
const marginTop = parseInt(style.marginTop);
|
||||
const marginLeft = parseInt(style.marginLeft);
|
||||
return {
|
||||
top: -rect.top + marginTop,
|
||||
left: -rect.left + marginLeft,
|
||||
};
|
||||
} else {
|
||||
const { clientTop, clientLeft } = box;
|
||||
return {
|
||||
top: -rect.top - clientTop,
|
||||
left: -rect.left - clientLeft,
|
||||
};
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// from playwright
|
||||
function getElementComputedStyle(element, pseudo) {
|
||||
return element.ownerDocument && element.ownerDocument.defaultView
|
||||
? element.ownerDocument.defaultView.getComputedStyle(element, pseudo)
|
||||
: undefined;
|
||||
}
|
||||
|
||||
// from playwright
|
||||
function isElementStyleVisibilityVisible(element, style) {
|
||||
style = style ?? getElementComputedStyle(element);
|
||||
if (!style) return true;
|
||||
if (
|
||||
!element.checkVisibility({ checkOpacity: false, checkVisibilityCSS: false })
|
||||
)
|
||||
return false;
|
||||
if (style.visibility !== "visible") return false;
|
||||
return true;
|
||||
}
|
||||
|
||||
// from playwright
|
||||
function isElementVisible(element) {
|
||||
// TODO: This is a hack to not check visibility for option elements
|
||||
// because they are not visible by default. We check their parent instead for visibility.
|
||||
if (element.tagName.toLowerCase() === "option")
|
||||
return element.parentElement && isElementVisible(element.parentElement);
|
||||
|
||||
const style = getElementComputedStyle(element);
|
||||
if (!style) return true;
|
||||
if (style.display === "contents") {
|
||||
// display:contents is not rendered itself, but its child nodes are.
|
||||
for (let child = element.firstChild; child; child = child.nextSibling) {
|
||||
if (
|
||||
child.nodeType === 1 /* Node.ELEMENT_NODE */ &&
|
||||
isElementVisible(child)
|
||||
)
|
||||
return true;
|
||||
// skipping other nodes including text
|
||||
}
|
||||
return false;
|
||||
}
|
||||
if (!isElementStyleVisibilityVisible(element, style)) return false;
|
||||
const rect = element.getBoundingClientRect();
|
||||
return rect.width > 0 && rect.height > 0;
|
||||
}
|
||||
|
||||
function isHiddenOrDisabled(element) {
|
||||
const style = getElementComputedStyle(element);
|
||||
return style?.display === "none" || element.hidden || element.disabled;
|
||||
}
|
||||
|
||||
function isScriptOrStyle(element) {
|
||||
const tagName = element.tagName.toLowerCase();
|
||||
return tagName === "script" || tagName === "style";
|
||||
}
|
||||
|
||||
function hasWidgetRole(element) {
|
||||
const role = element.getAttribute("role");
|
||||
if (!role) {
|
||||
return false;
|
||||
}
|
||||
// https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles#2._widget_roles
|
||||
// Not all roles make sense for the time being so we only check for the ones that do
|
||||
const widgetRoles = [
|
||||
"button",
|
||||
"link",
|
||||
"checkbox",
|
||||
"menuitem",
|
||||
"menuitemcheckbox",
|
||||
"menuitemradio",
|
||||
"radio",
|
||||
"tab",
|
||||
"combobox",
|
||||
"textbox",
|
||||
"searchbox",
|
||||
"slider",
|
||||
"spinbutton",
|
||||
"switch",
|
||||
"gridcell",
|
||||
];
|
||||
return widgetRoles.includes(role.toLowerCase().trim());
|
||||
}
|
||||
|
||||
function isInteractableInput(element) {
|
||||
const tagName = element.tagName.toLowerCase();
|
||||
const type = element.getAttribute("type");
|
||||
if (tagName !== "input" || !type) {
|
||||
// let other checks decide
|
||||
return false;
|
||||
}
|
||||
const clickableTypes = [
|
||||
"button",
|
||||
"checkbox",
|
||||
"date",
|
||||
"datetime-local",
|
||||
"email",
|
||||
"file",
|
||||
"image",
|
||||
"month",
|
||||
"number",
|
||||
"password",
|
||||
"radio",
|
||||
"range",
|
||||
"reset",
|
||||
"search",
|
||||
"submit",
|
||||
"tel",
|
||||
"text",
|
||||
"time",
|
||||
"url",
|
||||
"week",
|
||||
];
|
||||
return clickableTypes.includes(type.toLowerCase().trim());
|
||||
}
|
||||
|
||||
function isInteractable(element) {
|
||||
if (!isElementVisible(element)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (isHiddenOrDisabled(element)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (isScriptOrStyle(element)) {
|
||||
return false;
|
||||
}
|
||||
|
||||
if (hasWidgetRole(element)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (isInteractableInput(element)) {
|
||||
return true;
|
||||
}
|
||||
|
||||
const tagName = element.tagName.toLowerCase();
|
||||
|
||||
if (tagName === "a" && element.href) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (
|
||||
tagName === "button" ||
|
||||
tagName === "select" ||
|
||||
tagName === "option" ||
|
||||
tagName === "textarea"
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (tagName === "label" && element.control && !element.control.disabled) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (
|
||||
element.hasAttribute("onclick") ||
|
||||
element.isContentEditable ||
|
||||
element.hasAttribute("jsaction")
|
||||
) {
|
||||
return true;
|
||||
}
|
||||
|
||||
if (tagName === "div" || tagName === "img" || tagName === "span") {
|
||||
const computedStyle = window.getComputedStyle(element);
|
||||
const hasPointer = computedStyle.cursor === "pointer";
|
||||
const hasCursor = computedStyle.cursor === "cursor";
|
||||
return hasPointer || hasCursor;
|
||||
}
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
function removeMultipleSpaces(str) {
|
||||
if (!str) {
|
||||
return str;
|
||||
}
|
||||
return str.replace(/\s+/g, " ");
|
||||
}
|
||||
|
||||
function cleanupText(text) {
|
||||
return removeMultipleSpaces(
|
||||
text.replace("SVGs not supported by this browser.", ""),
|
||||
).trim();
|
||||
}
|
||||
|
||||
function getElementContext(element) {
|
||||
// dfs to collect the non unique_id context
|
||||
let fullContext = "";
|
||||
if (element.childNodes.length === 0) {
|
||||
return fullContext;
|
||||
}
|
||||
let childContextList = new Array();
|
||||
for (var child of element.childNodes) {
|
||||
let childContext = "";
|
||||
if (child.nodeType === Node.TEXT_NODE) {
|
||||
if (!element.hasAttribute("unique_id")) {
|
||||
childContext = child.data.trim();
|
||||
}
|
||||
} else if (child.nodeType === Node.ELEMENT_NODE) {
|
||||
if (!child.hasAttribute("unique_id")) {
|
||||
childContext = getElementContext(child);
|
||||
}
|
||||
}
|
||||
if (childContext.length > 0) {
|
||||
childContextList.push(childContext);
|
||||
}
|
||||
|
||||
if (childContextList.length > 0) {
|
||||
fullContext = childContextList.join(";");
|
||||
}
|
||||
|
||||
const charLimit = 1000;
|
||||
if (fullContext.length > charLimit) {
|
||||
fullContext = "";
|
||||
}
|
||||
}
|
||||
return fullContext;
|
||||
}
|
||||
|
||||
function getElementContent(element) {
|
||||
// DFS to get all the text content from all the nodes under the element
|
||||
|
||||
let textContent = element.textContent;
|
||||
let nodeContent = "";
|
||||
// if element has children, then build a list of text and join with a semicolon
|
||||
if (element.childNodes.length > 0) {
|
||||
let childTextContentList = new Array();
|
||||
let nodeTextContentList = new Array();
|
||||
for (var child of element.childNodes) {
|
||||
let childText = "";
|
||||
if (child.nodeType === Node.TEXT_NODE) {
|
||||
childText = child.data.trim();
|
||||
nodeTextContentList.push(childText);
|
||||
} else if (child.nodeType === Node.ELEMENT_NODE) {
|
||||
// childText = child.textContent.trim();
|
||||
childText = getElementContent(child);
|
||||
} else {
|
||||
console.log("Unhandled node type: ", child.nodeType);
|
||||
}
|
||||
if (childText.length > 0) {
|
||||
childTextContentList.push(childText);
|
||||
}
|
||||
}
|
||||
textContent = childTextContentList.join(";");
|
||||
nodeContent = cleanupText(nodeTextContentList.join(";"));
|
||||
}
|
||||
let finalTextContent = cleanupText(textContent);
|
||||
|
||||
// Currently we don't support too much context. Character limit is 1000 per element.
|
||||
// we don't think element context has to be that big
|
||||
const charLimit = 1000;
|
||||
if (finalTextContent.length > charLimit) {
|
||||
if (nodeContent.length <= charLimit) {
|
||||
finalTextContent = nodeContent;
|
||||
} else {
|
||||
finalTextContent = "";
|
||||
}
|
||||
}
|
||||
|
||||
return finalTextContent;
|
||||
}
|
||||
|
||||
function getSelectOptions(element) {
|
||||
const options = Array.from(element.options);
|
||||
const selectOptions = [];
|
||||
for (const option of options) {
|
||||
selectOptions.push({
|
||||
optionIndex: option.index,
|
||||
text: removeMultipleSpaces(option.textContent),
|
||||
});
|
||||
}
|
||||
return selectOptions;
|
||||
}
|
||||
|
||||
function buildTreeFromBody() {
|
||||
var elements = [];
|
||||
var resultArray = [];
|
||||
|
||||
function buildElementObject(element) {
|
||||
var element_id = elements.length;
|
||||
var elementTagNameLower = element.tagName.toLowerCase();
|
||||
element.setAttribute("unique_id", element_id);
|
||||
// if element is an "a" tag and has a target="_blank" attribute, remove the target attribute
|
||||
// We're doing this so that skyvern can do all the navigation in a single page/tab and not open new tab
|
||||
if (element.tagName.toLowerCase() === "a") {
|
||||
if (element.getAttribute("target") === "_blank") {
|
||||
element.removeAttribute("target");
|
||||
}
|
||||
}
|
||||
const attrs = {};
|
||||
for (const attr of element.attributes) {
|
||||
var attrValue = attr.value;
|
||||
if (
|
||||
attr.name === "required" ||
|
||||
attr.name === "aria-required" ||
|
||||
attr.name === "checked" ||
|
||||
attr.name === "aria-checked" ||
|
||||
attr.name === "selected" ||
|
||||
attr.name === "aria-selected" ||
|
||||
attr.name === "readonly" ||
|
||||
attr.name === "aria-readonly"
|
||||
) {
|
||||
attrValue = true;
|
||||
}
|
||||
attrs[attr.name] = attrValue;
|
||||
}
|
||||
if (elementTagNameLower === "input" || elementTagNameLower === "textarea") {
|
||||
attrs["value"] = element.value;
|
||||
}
|
||||
|
||||
let elementObj = {
|
||||
id: element_id,
|
||||
tagName: elementTagNameLower,
|
||||
attributes: attrs,
|
||||
text: getElementContent(element),
|
||||
children: [],
|
||||
rect: DomUtils.getVisibleClientRect(element, true),
|
||||
};
|
||||
|
||||
// get options for select element or for listbox element
|
||||
let selectOptions = null;
|
||||
if (elementTagNameLower === "select") {
|
||||
selectOptions = getSelectOptions(element);
|
||||
}
|
||||
if (selectOptions) {
|
||||
elementObj.options = selectOptions;
|
||||
}
|
||||
|
||||
return elementObj;
|
||||
}
|
||||
|
||||
function getChildElements(element) {
|
||||
if (element.childElementCount !== 0) {
|
||||
return Array.from(element.children);
|
||||
} else {
|
||||
return [];
|
||||
}
|
||||
}
|
||||
function processElement(element, interactableParentId) {
|
||||
// Check if the element is interactable
|
||||
if (isInteractable(element)) {
|
||||
var elementObj = buildElementObject(element);
|
||||
elements.push(elementObj);
|
||||
// If the element is interactable but has no interactable parent,
|
||||
// then it starts a new tree, so add it to the result array
|
||||
// and set its id as the interactable parent id for the next elements
|
||||
// under it
|
||||
if (interactableParentId === null) {
|
||||
resultArray.push(elementObj);
|
||||
}
|
||||
// If the element is interactable and has an interactable parent,
|
||||
// then add it to the children of the parent
|
||||
else {
|
||||
elements[interactableParentId].children.push(elementObj);
|
||||
}
|
||||
// Recursively process the children of the element
|
||||
getChildElements(element).forEach((child) => {
|
||||
processElement(child, elementObj.id);
|
||||
});
|
||||
return elementObj;
|
||||
} else {
|
||||
// For a non-interactable element, process its children
|
||||
// and check if any of them are interactable
|
||||
let interactableChildren = [];
|
||||
getChildElements(element).forEach((child) => {
|
||||
let children = processElement(child, interactableParentId);
|
||||
});
|
||||
}
|
||||
}
|
||||
|
||||
// TODO: Handle iframes
|
||||
|
||||
// Clear all the unique_id attributes so that there are no conflicts
|
||||
removeAllUniqueIdAttributes();
|
||||
processElement(document.body, null);
|
||||
|
||||
for (var element of elements) {
|
||||
if (
|
||||
((element.tagName === "input" && element.attributes["type"] === "text") ||
|
||||
element.tagName === "textarea") &&
|
||||
(element.attributes["required"] || element.attributes["aria-required"]) &&
|
||||
element.attributes.value === ""
|
||||
) {
|
||||
// TODO (kerem): we may want to pass these elements to the LLM as empty but required fields in the future
|
||||
console.log(
|
||||
"input element with required attribute and no value",
|
||||
element,
|
||||
);
|
||||
}
|
||||
|
||||
// for most elements, we're going 10 layers up to see if we can find "label" as a parent
|
||||
// if found, most likely the context under label is relevant to this element
|
||||
let targetParentElements = new Set(["label", "fieldset"]);
|
||||
|
||||
// look up for 10 levels to find the most contextual parent element
|
||||
let targetContextualParent = null;
|
||||
let currentEle = document.querySelector(`[unique_id="${element.id}"]`);
|
||||
let parentEle = currentEle;
|
||||
for (var i = 0; i < 10; i++) {
|
||||
parentEle = parentEle.parentElement;
|
||||
if (parentEle) {
|
||||
if (targetParentElements.has(parentEle.tagName.toLowerCase())) {
|
||||
targetContextualParent = parentEle;
|
||||
}
|
||||
} else {
|
||||
break;
|
||||
}
|
||||
}
|
||||
if (targetContextualParent) {
|
||||
let context = "";
|
||||
var lowerCaseTagName = targetContextualParent.tagName.toLowerCase();
|
||||
if (lowerCaseTagName === "label") {
|
||||
context = getElementContext(targetContextualParent);
|
||||
} else if (lowerCaseTagName === "fieldset") {
|
||||
// fieldset is usually within a form or another element that contains the whole context
|
||||
targetContextualParent = targetContextualParent.parentElement;
|
||||
if (targetContextualParent) {
|
||||
context = getElementContext(targetContextualParent);
|
||||
}
|
||||
}
|
||||
if (context.length > 0) {
|
||||
element.context = context;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
return [elements, resultArray];
|
||||
}
|
||||
|
||||
// Overlay the page with bounding boxes: cluster visually-overlapping
// elements, build one hint marker per cluster, then mount the markers.
function drawBoundingBoxes(elements) {
  const groups = groupElementsVisually(elements);
  const markers = createHintMarkersForGroups(groups);
  addHintMarkersToPage(markers);
}
|
||||
|
||||
// Strip every unique_id attribute previously stamped on the page so a
// fresh scrape starts without stale ids.
function removeAllUniqueIdAttributes() {
  for (const el of document.querySelectorAll("[unique_id]")) {
    el.removeAttribute("unique_id");
  }
}
|
||||
|
||||
// Invoked when a captcha is solved; bumps the page-global counter that
// getCaptchaSolves() reads back.
function captchaSolvedCallback() {
  console.log("captcha solved");
  // For some reason this isn't being called.. TODO figure out why
  window["captchaSolvedCounter"] = (window["captchaSolvedCounter"] || 0) + 1;
}
|
||||
|
||||
// Read the page-global captcha-solve counter, initializing it to 0 on
// first access so callers always get a number.
function getCaptchaSolves() {
  window["captchaSolvedCounter"] = window["captchaSolvedCounter"] || 0;
  return window["captchaSolvedCounter"];
}
|
||||
|
||||
// Cluster elements whose rects overlap into groups (O(n^2) pairwise scan),
// then attach an enclosing rectangle to each group. Elements without a
// rect are skipped.
function groupElementsVisually(elements) {
  const groups = [];
  for (const element of elements) {
    if (!element.rect) {
      continue;
    }
    // First existing group containing a member whose rect intersects ours.
    const existing = groups.find((candidate) =>
      candidate.elements.some((member) =>
        Rect.intersects(member.rect, element.rect),
      ),
    );
    if (existing) {
      existing.elements.push(element);
    } else {
      groups.push({ elements: [element] });
    }
  }

  // Compute the rectangle that encompasses each group's members.
  for (const group of groups) {
    group.rect = createRectangleForGroup(group);
  }

  return groups;
}
|
||||
|
||||
// Smallest rectangle that encloses every element rect in the group.
function createRectangleForGroup(group) {
  let top = Infinity;
  let left = Infinity;
  let bottom = -Infinity;
  let right = -Infinity;
  for (const { rect } of group.elements) {
    top = Math.min(top, rect.top);
    left = Math.min(left, rect.left);
    bottom = Math.max(bottom, rect.bottom);
    right = Math.max(right, rect.right);
  }
  return Rect.create(left, top, right, bottom);
}
|
||||
|
||||
// Generate `count` distinct hint strings over a fixed alphabet.
// Breadth-first expansion: repeatedly consume the next hint and prepend
// each alphabet character, until enough unconsumed hints exist; then keep
// exactly `count` of them. Sorting scatters hints that share a leading
// character so short/similar hints spread evenly through the array.
function generateHintStrings(count) {
  const alphabet = "sadfjklewcmpgh";
  let hints = [""];
  let consumed = 0;

  while (hints.length - consumed < count || hints.length === 1) {
    const base = hints[consumed++];
    for (const ch of alphabet) {
      hints.push(ch + base);
    }
  }

  return hints.slice(consumed, consumed + count).sort(); // .map((str) => str.reverse())
}
|
||||
|
||||
// Build one hint marker per group and label each marker with a generated
// hint string (uppercased into the marker's element).
function createHintMarkersForGroups(groups) {
  if (groups.length === 0) {
    console.log("No groups found, not adding hint markers to page.");
    return [];
  }

  const markers = groups.map((group) => createHintMarkerForGroup(group));

  // Assign one generated label per marker.
  const labels = generateHintStrings(markers.length);
  markers.forEach((marker, i) => {
    marker.hintString = labels[i];
    marker.element.innerHTML = marker.hintString.toUpperCase();
  });

  return markers;
}
|
||||
|
||||
// Build the marker object for one element group: a label element (`element`)
// plus an absolutely-positioned bounding-box <div> drawn around the group.
function createHintMarkerForGroup(group) {
  const marker = {};
  // yellow annotation box with string
  const el = document.createElement("div");
  el.style.left = group.rect.left + "px";
  el.style.top = group.rect.top + "px";
  // Each group is assigned a different incremental z-index, we use the same
  // z-index for the bounding box and the hint marker.
  // NOTE(review): in a plain (non-strict) call `this` is the global object,
  // so this reads window.currentZIndex — confirm it is initialized elsewhere.
  el.style.zIndex = this.currentZIndex;

  // The bounding box around the group of hints.
  const boundingBox = document.createElement("div");

  // Calculate the position of the element relative to the document.
  var scrollTop = window.pageYOffset || document.documentElement.scrollTop;
  var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;

  // Set styles for the bounding box.
  boundingBox.style.position = "absolute";
  // BUGFIX: "display" is not a valid value for the display property (the
  // browser silently ignored the assignment); "block" is the intended value.
  boundingBox.style.display = "block";
  boundingBox.style.left = group.rect.left + scrollLeft + "px";
  boundingBox.style.top = group.rect.top + scrollTop + "px";
  boundingBox.style.width = group.rect.width + "px";
  boundingBox.style.height = group.rect.height + "px";
  // BUGFIX: the old bottom/right assignments concatenated px strings
  // (e.g. "10px20px") and were ignored by the CSSOM; left/top/width/height
  // fully determine the box, so those assignments are dropped.
  boundingBox.style.border = "2px solid blue"; // Change the border color as needed
  boundingBox.style.pointerEvents = "none"; // Ensures the box doesn't interfere with other interactions
  boundingBox.style.zIndex = this.currentZIndex++;

  return Object.assign(marker, {
    element: el,
    boundingBox: boundingBox,
    group: group,
  });
}
|
||||
|
||||
// Mount all bounding boxes under a single container div so they can be
// removed in one shot later (see removeBoundingBoxes).
function addHintMarkersToPage(hintMarkers) {
  const container = document.createElement("div");
  container.id = "boundingBoxContainer";
  for (const marker of hintMarkers) {
    // The hint label (marker.element) is intentionally not mounted.
    // parent.appendChild(hintMarker.element);
    container.appendChild(marker.boundingBox);
  }
  document.documentElement.appendChild(container);
}
|
||||
|
||||
// Remove the bounding-box container added by addHintMarkersToPage, if any.
function removeBoundingBoxes() {
  const container = document.querySelector("#boundingBoxContainer");
  if (container) {
    container.remove();
  }
}
|
||||
|
||||
// Scroll to the very top of the page, optionally re-drawing bounding
// boxes, and report the resulting vertical scroll offset in pixels.
function scrollToTop(draw_boxes) {
  removeBoundingBoxes();
  window.scrollTo(0, 0);
  if (draw_boxes) {
    const [elements] = buildTreeFromBody();
    drawBoundingBoxes(elements);
  }
  return window.scrollY;
}
|
||||
|
||||
// Scroll one viewport further down (keeping a 200px overlap with the
// previous view), optionally re-drawing bounding boxes. Returns the new
// scroll offset; the caller compares successive values to detect the
// end of the page.
function scrollToNextPage(draw_boxes) {
  removeBoundingBoxes();
  window.scrollBy(0, window.innerHeight - 200);
  if (draw_boxes) {
    const [elements] = buildTreeFromBody();
    drawBoundingBoxes(elements);
  }
  return window.scrollY;
}
|
||||
316
skyvern/webeye/scraper/scraper.py
Normal file
316
skyvern/webeye/scraper/scraper.py
Normal file
@@ -0,0 +1,316 @@
|
||||
import asyncio
import copy
from collections import deque

import structlog
from playwright.async_api import Page
from pydantic import BaseModel

from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.browser_factory import BrowserState
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
# Attribute allowlist: the only HTML attributes kept on scraped elements
# when trimming the element tree (see _trimmed_attributes). Everything else
# is dropped to reduce noise before the tree is handed to the LLM.
RESERVED_ATTRIBUTES = {
    "accept",  # for input file
    "alt",
    "aria-checked",  # for option tag
    "aria-current",
    "aria-label",
    "aria-required",
    "aria-role",
    "aria-selected",  # for option tag
    "checked",
    "data-ui",
    "for",
    "href",  # For a tags
    "maxlength",
    "name",
    "pattern",
    "placeholder",
    "readonly",
    "required",
    "selected",  # for option tag
    "src",  # do we need this?
    "text-value",
    "title",
    "type",
    "value",
}
|
||||
|
||||
|
||||
def load_js_script() -> str:
    """Read the domUtils.js helper bundle from disk and return its text.

    :return: The raw JavaScript source as a single string.
    :raises FileNotFoundError: if domUtils.js is missing from the package tree.
    """
    # TODO: Handle file location better. This is a hacky way to find the file location.
    path = f"{SKYVERN_DIR}/webeye/scraper/domUtils.js"
    try:
        # TODO: Implement TS of domUtils.js and use the complied JS file instead of the raw JS file.
        # This will allow our code to be type safe.
        with open(path, "r") as f:
            return f.read()
    except FileNotFoundError:
        # LOG.exception already records the active exception; the explicit
        # exc_info=True flag was redundant.
        LOG.exception("Failed to load the JS script", path=path)
        # Bare `raise` re-raises with the original traceback intact
        # (`raise e` would add a redundant frame).
        raise
|
||||
|
||||
|
||||
# Loaded once at import time; evaluated on each Playwright page before any
# of the JS helper functions defined in domUtils.js are invoked.
JS_FUNCTION_DEFS = load_js_script()
|
||||
|
||||
|
||||
class ScrapedPage(BaseModel):
    """
    Scraped response from a webpage, including:
    1. List of elements
    2. ID to xpath map
    3. The element tree of the page (list of dicts). Each element has children and attributes.
    4. The screenshot (base64 encoded)
    5. The URL of the page
    6. The HTML of the page
    7. The extracted text from the page
    """

    # Flat list of interactable elements produced by the JS tree builder.
    elements: list[dict]
    # element id -> xpath that selects the element via its unique_id attribute
    id_to_xpath_dict: dict[int, str]
    # Nested element forest (nodes carry "children"/"attributes"), rects removed.
    element_tree: list[dict]
    # Same forest with attributes trimmed to RESERVED_ATTRIBUTES (for the LLM).
    element_tree_trimmed: list[dict]
    # Screenshots captured while scrolling through the page (raw bytes).
    screenshots: list[bytes]
    url: str
    html: str
    # document.body.innerText, when extraction succeeded.
    extracted_text: str | None = None
|
||||
|
||||
|
||||
async def scrape_website(
    browser_state: BrowserState,
    url: str,
    num_retry: int = 0,
) -> ScrapedPage:
    """
    ************************************************************************************************
    ************ NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production *************
    ************************************************************************************************
    High-level asynchronous function to scrape a web page. It sets up the Playwright environment, handles browser and
    page initialization, and calls the safe scraping function. This function is ideal for general use where initial
    setup and safety measures are required.

    Asynchronous function that safely scrapes a web page. It handles exceptions and retries scraping up to a maximum
    number of attempts (by recursing with an incremented retry counter). This function should be used when reliability
    and error handling are crucial, such as in automated scraping tasks.

    :param browser_state: BrowserState instance used for scraping.
    :param url: URL of the web page to be scraped.
    :param num_retry: Tracks number of retries if scraping fails, defaults to 0.

    :return: ScrapedPage bundling elements, xpath map, element trees, screenshots, URL, HTML and extracted text.

    :raises Exception: When scraping fails after maximum retries.
    """
    try:
        num_retry += 1
        return await scrape_web_unsafe(browser_state, url)
    except Exception as e:
        # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
        if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
            LOG.error(
                "Scraping failed after max retries, aborting.",
                max_retries=SettingsManager.get_settings().MAX_SCRAPING_RETRIES,
                url=url,
                exc_info=True,
            )
            # BUGFIX: chain the original failure so callers (and logs) can
            # see the root cause instead of a bare "Scraping failed.".
            raise Exception("Scraping failed.") from e
        LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url)
        return await scrape_website(
            browser_state,
            url,
            num_retry=num_retry,
        )
|
||||
|
||||
|
||||
async def get_all_visible_text(page: Page) -> str:
    """Return the page's visible text via ``document.body.innerText``.

    :param page: Page instance to get the text from.
    :return: All the visible text on the page.
    """
    return await page.evaluate("() => document.body.innerText")
|
||||
|
||||
|
||||
async def scrape_web_unsafe(
    browser_state: BrowserState,
    url: str,
) -> ScrapedPage:
    """
    Asynchronous function that performs web scraping without any built-in error handling. This function is intended
    for use cases where the caller handles exceptions or in controlled environments. It directly scrapes the provided
    URL or continues on the given page.

    :param browser_state: BrowserState instance used for scraping.
    :param url: URL of the web page to be scraped. Used only when creating a new page.
    :return: ScrapedPage bundling elements, xpath map, element trees, screenshots, URL, HTML and extracted text.
    :note: This function does not handle exceptions. Ensure proper error handling in the calling context.
    """
    # We only create a new page if one does not exist. This is to allow keeping the same page since we want to
    # continue working on the same page that we're taking actions on.
    # *This also means URL is only used when creating a new page, and not when using an existing page.
    page = await browser_state.get_or_create_page(url)
    # Take screenshots of the page with the bounding boxes. We will remove the bounding boxes later.
    # Scroll to the top of the page and take a screenshot.
    # Scroll to the next page and take a screenshot until we reach the end of the page.
    # We check if the scroll_y_px_old is the same as scroll_y_px to determine if we have reached the end of the page.
    # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after
    # clicking start my quote)

    # NOTE(review): fixed sleep to let the page settle before scraping —
    # presumably a workaround for late-loading content; confirm.
    LOG.info("Waiting for 5 seconds before scraping the website.")
    await asyncio.sleep(5)

    screenshots: list[bytes] = []
    scroll_y_px_old = -1.0
    # "drow_boxes" is the (misspelled) keyword parameter of the scroll helpers.
    scroll_y_px = await scroll_to_top(page, drow_boxes=True)
    # Checking max number of screenshots to prevent infinite loop
    while scroll_y_px_old != scroll_y_px and len(screenshots) < SettingsManager.get_settings().MAX_NUM_SCREENSHOTS:
        screenshot = await page.screenshot(full_page=False)
        screenshots.append(screenshot)
        scroll_y_px_old = scroll_y_px
        LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
        scroll_y_px = await scroll_to_next_page(page, drow_boxes=True)
        LOG.info("Scrolled to next page", scroll_y_px=scroll_y_px, scroll_y_px_old=scroll_y_px_old)
    # Clean up the overlay and restore the original scroll position (top).
    await remove_bounding_boxes(page)
    await scroll_to_top(page, drow_boxes=False)

    elements, element_tree = await get_interactable_element_tree(page)
    # Deep-copy so cleanup does not mutate the tree shared with `elements`.
    element_tree = cleanup_elements(copy.deepcopy(element_tree))

    id_to_xpath_dict = {}
    for element in elements:
        element_id = element["id"]
        # get_interactable_element_tree marks each interactable element with a unique_id attribute
        id_to_xpath_dict[element_id] = f"//*[@{SKYVERN_ID_ATTR}='{element_id}']"

    text_content = await get_all_visible_text(page)
    return ScrapedPage(
        elements=elements,
        id_to_xpath_dict=id_to_xpath_dict,
        element_tree=element_tree,
        element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
        screenshots=screenshots,
        url=page.url,
        html=await page.content(),
        extracted_text=text_content,
    )
|
||||
|
||||
|
||||
async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]:
    """Run the injected JS tree builder and return its two outputs.

    :param page: Page instance to get the element tree from.
    :return: Tuple of (flat list of interactable elements, element tree roots).
    """
    # Define the helper functions on the page, then invoke the builder.
    await page.evaluate(JS_FUNCTION_DEFS)
    built = await page.evaluate("() => buildTreeFromBody()")
    elements, element_tree = built
    return elements, element_tree
|
||||
|
||||
|
||||
async def scroll_to_top(page: Page, drow_boxes: bool) -> float:
    """Scroll the page to the very top via the injected JS helper.

    :param drow_boxes: If True, draw bounding boxes around the elements.
    :param page: Page instance to act on.
    :return: The resulting vertical scroll offset in pixels.
    """
    await page.evaluate(JS_FUNCTION_DEFS)
    # JS booleans are lowercase; build the call accordingly.
    flag = "true" if drow_boxes else "false"
    return await page.evaluate(f"() => scrollToTop({flag})")
|
||||
|
||||
|
||||
async def scroll_to_next_page(page: Page, drow_boxes: bool) -> float:
    """
    Scroll one viewport further down the page (with a 200px overlap).
    :param drow_boxes: If True, draw bounding boxes around the elements.
    :param page: Page instance to act on.
    :return: The vertical scroll offset (pixels) after scrolling; the caller
        compares successive values to detect the end of the page.
    """
    await page.evaluate(JS_FUNCTION_DEFS)
    js_script = f"() => scrollToNextPage({str(drow_boxes).lower()})"
    scroll_y_px = await page.evaluate(js_script)
    return scroll_y_px
|
||||
|
||||
|
||||
async def remove_bounding_boxes(page: Page) -> None:
    """Remove the bounding boxes from the page.

    :param page: Page instance to remove the bounding boxes from.
    """
    await page.evaluate("() => removeBoundingBoxes()")
|
||||
|
||||
|
||||
def cleanup_elements(elements: list[dict]) -> list[dict]:
    """
    Remove rect and attribute.unique_id from the elements.
    The reason we're doing it is to
    1. reduce unnecessary data so that llm gets less distraction
    # TODO later: 2. reduce tokens sent to llm to save money
    :param elements: Roots of the element tree to clean up (mutated in place).
    :return: The same list, with rects removed from every node.
    """
    # Breadth-first traversal. PERF: deque.popleft() is O(1) where the
    # previous list.pop(0) shifted the whole queue (O(n) per node).
    queue: deque = deque(elements)
    while queue:
        queue_ele = queue.popleft()
        _remove_rect(queue_ele)
        # TODO: we can come back to test removing the unique_id
        # from element attributes to make sure this won't increase hallucination
        # _remove_unique_id(queue_ele)
        if "children" in queue_ele:
            queue.extend(queue_ele["children"])
    return elements
|
||||
|
||||
|
||||
def trim_element_tree(elements: list[dict]) -> list[dict]:
    """Prune the element tree in place for LLM consumption.

    Attributes are filtered via _trimmed_attributes; empty "attributes",
    empty "children" and whitespace-only "text" entries are dropped.

    :param elements: Roots of the element tree (mutated in place).
    :return: The same list of roots.
    """
    # Breadth-first walk over a growing work list, using an index cursor.
    pending = list(elements)
    cursor = 0
    while cursor < len(pending):
        node = pending[cursor]
        cursor += 1
        if "attributes" in node:
            tag = node["tagName"] if "tagName" in node else ""
            trimmed = _trimmed_attributes(tag, node["attributes"])
            if trimmed:
                node["attributes"] = trimmed
            else:
                del node["attributes"]
        if "children" in node:
            pending.extend(node["children"])
            if not node["children"]:
                del node["children"]
        if "text" in node and not str(node["text"]).strip():
            del node["text"]
    return elements
|
||||
|
||||
|
||||
def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
    """Return only the attributes worth keeping for the LLM.

    Keeps everything in RESERVED_ATTRIBUTES, plus "id" on form controls
    (input/textarea/select) because a <label for=...> may reference it.
    """
    id_bearing_tags = ("input", "textarea", "select")
    return {
        key: value
        for key, value in attributes.items()
        if (key == "id" and tag_name in id_bearing_tags) or key in RESERVED_ATTRIBUTES
    }
|
||||
|
||||
|
||||
def _remove_rect(element: dict) -> None:
|
||||
if "rect" in element:
|
||||
del element["rect"]
|
||||
|
||||
|
||||
def _remove_unique_id(element: dict) -> None:
    """Strip the injected unique-id attribute from an element's attributes."""
    if "attributes" not in element:
        return
    element["attributes"].pop(SKYVERN_ID_ATTR, None)
|
||||
5
skyvern/webeye/string_util.py
Normal file
5
skyvern/webeye/string_util.py
Normal file
@@ -0,0 +1,5 @@
|
||||
import re
|
||||
|
||||
|
||||
def remove_whitespace(string: str) -> str:
    """Collapse each run of spaces, newlines and tabs into a single space.

    :param string: Text to normalize.
    :return: The normalized text (other whitespace such as carriage
        returns is deliberately left untouched).
    """
    # Raw string for the regex per convention; the character class is
    # intentionally narrower than \s (it leaves \r, \f, \v alone).
    return re.sub(r"[ \n\t]+", " ", string)
|
||||
Reference in New Issue
Block a user