Move the code over from private repository (#3)
This commit is contained in:
0
skyvern/webeye/actions/__init__.py
Normal file
0
skyvern/webeye/actions/__init__.py
Normal file
204
skyvern/webeye/actions/actions.py
Normal file
204
skyvern/webeye/actions/actions.py
Normal file
@@ -0,0 +1,204 @@
|
||||
import abc
|
||||
from enum import StrEnum
|
||||
from typing import Any, Dict, List
|
||||
|
||||
import structlog
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
|
||||
class ActionType(StrEnum):
|
||||
CLICK = "click"
|
||||
INPUT_TEXT = "input_text"
|
||||
UPLOAD_FILE = "upload_file"
|
||||
SELECT_OPTION = "select_option"
|
||||
CHECKBOX = "checkbox"
|
||||
WAIT = "wait"
|
||||
NULL_ACTION = "null_action"
|
||||
SOLVE_CAPTCHA = "solve_captcha"
|
||||
TERMINATE = "terminate"
|
||||
COMPLETE = "complete"
|
||||
# Note: Remember to update ActionTypeUnion with new actions
|
||||
|
||||
|
||||
class Action(BaseModel):
    """Base model for all actions: a type tag plus optional free-text annotations."""

    # Identifies which concrete kind of action this is
    action_type: ActionType
    # Optional text fields; both default to None
    description: str | None = None
    reasoning: str | None = None
|
||||
|
||||
|
||||
class WebAction(Action, abc.ABC):
    """Base for actions that operate on a single scraped page element."""

    # Identifier of the target element (handlers use it as a key into the scraped page's xpath map)
    element_id: int
|
||||
|
||||
|
||||
class ClickAction(WebAction):
    """Click on an element, optionally carrying a file URL."""

    action_type: ActionType = ActionType.CLICK
    # Optional URL of a file associated with the click; None when not provided
    file_url: str | None = None

    def __repr__(self) -> str:
        """Return a compact representation with the element id and file URL."""
        return f"ClickAction(element_id={self.element_id}, file_url={self.file_url})"
|
||||
|
||||
|
||||
class InputTextAction(WebAction):
    """Enter text into an element."""

    action_type: ActionType = ActionType.INPUT_TEXT
    # The text to enter (required)
    text: str

    def __repr__(self) -> str:
        """Return a compact representation with the element id and text."""
        return f"InputTextAction(element_id={self.element_id}, text={self.text})"
|
||||
|
||||
|
||||
class UploadFileAction(WebAction):
    """Upload the file found at `file_url` through an element."""

    action_type: ActionType = ActionType.UPLOAD_FILE
    # URL of the file to upload (required)
    file_url: str
    # Defaults to True; the upload handler flips it to False when the element is not a file input
    is_upload_file_tag: bool = True

    def __repr__(self) -> str:
        """Return a compact representation with the element id, file URL and tag flag."""
        return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})"
|
||||
|
||||
|
||||
class NullAction(Action):
    """Placeholder action that carries no element and no extra fields."""

    action_type: ActionType = ActionType.NULL_ACTION
|
||||
|
||||
|
||||
class SolveCaptchaAction(Action):
    """Action signalling that a captcha on the page needs to be solved."""

    action_type: ActionType = ActionType.SOLVE_CAPTCHA
|
||||
|
||||
|
||||
class SelectOption(BaseModel):
    """Describes the option to pick inside a select element."""

    # Each field accepts None, but none of them declares a default value.
    label: str | None
    value: str | None
    index: int | None

    def __repr__(self) -> str:
        """Return a compact representation listing label, value and index."""
        return f"SelectOption(label={self.label}, value={self.value}, index={self.index})"
|
||||
|
||||
|
||||
class SelectOptionAction(WebAction):
    """Choose an option in the select element identified by `element_id`."""

    action_type: ActionType = ActionType.SELECT_OPTION
    # Which option to choose
    option: SelectOption

    def __repr__(self) -> str:
        """Return a compact representation with the element id and chosen option."""
        return f"SelectOptionAction(element_id={self.element_id}, option={self.option})"
|
||||
|
||||
|
||||
###
# This action causes more harm than good.
# It frequently misbehaves, or gets stuck in click loops.
# Treating checkbox actions as click actions seems to perform far more reliably.
# Developers who tried this and failed: 2 (Suchintan and Shu 😂)
###
class CheckboxAction(WebAction):
    """Set a checkbox element to the checked or unchecked state."""

    action_type: ActionType = ActionType.CHECKBOX
    # Desired final state of the checkbox
    is_checked: bool

    def __repr__(self) -> str:
        """Return a compact representation with the element id and target state."""
        return f"CheckboxAction(element_id={self.element_id}, is_checked={self.is_checked})"
|
||||
|
||||
|
||||
class WaitAction(Action):
    """Action asking the agent to wait; carries no extra fields."""

    action_type: ActionType = ActionType.WAIT
|
||||
|
||||
|
||||
class TerminateAction(Action):
    """Action recording that the agent decided to terminate."""

    action_type: ActionType = ActionType.TERMINATE
|
||||
|
||||
|
||||
class CompleteAction(Action):
    """Action marking the navigation goal as achieved."""

    action_type: ActionType = ActionType.COMPLETE
    # When set, the complete handler runs a data-extraction pass before reporting success
    data_extraction_goal: str | None = None
|
||||
|
||||
|
||||
def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Action]:
    """
    Convert the raw action dictionaries returned by the LLM into typed Action models.

    :param task: task the response belongs to; used for logging and for the data extraction
        goal attached to a CompleteAction
    :param json_response: list of raw action dictionaries
    :return: the parsed actions in order. As soon as a COMPLETE action is seen, a single-element
        list holding only the CompleteAction is returned and every other action is discarded.
    :raises KeyError: if an element-targeting action lacks "id" or another required key

    Entries with a missing/None action type, or with the type "null" / "null_action", become
    NullAction. Entries with an unrecognised action type are logged and skipped.
    """
    actions: List[Action] = []
    for action in json_response:
        reasoning = action.get("reasoning")
        raw_action_type = action.get("action_type")
        if raw_action_type is None:
            actions.append(NullAction(reasoning=reasoning))
            continue

        # `.upper()` handles the case where the LLM returns a lowercase action type (e.g. "click" instead of "CLICK")
        action_name = str(raw_action_type).upper()
        if action_name == "NULL":
            # "null" is not an ActionType member name, so it has to be matched before the enum lookup
            actions.append(NullAction(reasoning=reasoning))
            continue

        try:
            action_type = ActionType[action_name]
        except KeyError:
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=raw_action_type,
                raw_action=action,
            )
            continue

        # The element id is only read inside the branches that need it, so element-less actions
        # (wait, terminate, complete, ...) do not fail when the LLM omits "id".
        if action_type == ActionType.TERMINATE:
            LOG.warning(
                "Agent decided to terminate",
                task_id=task.task_id,
                llm_response=json_response,
                reasoning=reasoning,
                actions=actions,
            )
            actions.append(TerminateAction(reasoning=reasoning))
        elif action_type == ActionType.CLICK:
            actions.append(
                ClickAction(element_id=action["id"], reasoning=reasoning, file_url=action.get("file_url"))
            )
        elif action_type == ActionType.INPUT_TEXT:
            actions.append(InputTextAction(element_id=action["id"], text=action["text"], reasoning=reasoning))
        elif action_type == ActionType.UPLOAD_FILE:
            # TODO: see if the element is a file input element. if it's not, convert this action into a click action
            actions.append(
                UploadFileAction(element_id=action["id"], file_url=action["file_url"], reasoning=reasoning)
            )
        elif action_type == ActionType.SELECT_OPTION:
            raw_option = action["option"]
            actions.append(
                SelectOptionAction(
                    element_id=action["id"],
                    option=SelectOption(
                        label=raw_option["label"],
                        value=raw_option["value"],
                        index=raw_option["index"],
                    ),
                    reasoning=reasoning,
                )
            )
        elif action_type == ActionType.CHECKBOX:
            actions.append(
                CheckboxAction(element_id=action["id"], is_checked=action["is_checked"], reasoning=reasoning)
            )
        elif action_type == ActionType.WAIT:
            actions.append(WaitAction(reasoning=reasoning))
        elif action_type == ActionType.COMPLETE:
            if actions:
                LOG.info(
                    "Navigation goal achieved, creating complete action and discarding all other actions except "
                    "complete action",
                    task_id=task.task_id,
                    nav_goal=task.navigation_goal,
                    actions=actions,
                    llm_response=json_response,
                )
            return [CompleteAction(reasoning=reasoning, data_extraction_goal=task.data_extraction_goal)]
        elif action_type == ActionType.NULL_ACTION:
            actions.append(NullAction(reasoning=reasoning))
        elif action_type == ActionType.SOLVE_CAPTCHA:
            actions.append(SolveCaptchaAction(reasoning=reasoning))
        else:
            # Reached only by an ActionType member that has no branch above
            LOG.error(
                "Unsupported action type when parsing actions",
                task_id=task.task_id,
                action_type=action_type,
                raw_action=action,
            )
    return actions
|
||||
|
||||
|
||||
class ScrapeResult(BaseModel):
    """
    Scraped response from a webpage.

    Holds a JSON representation of what the user is seeing.
    """

    # Either a single JSON object or a list of JSON objects
    scraped_data: dict[str, Any] | list[dict[str, Any]]
|
||||
|
||||
|
||||
# Union of every concrete action model, so that pydantic can deserialize child classes.
# Background: https://blog.devgenius.io/deserialize-child-classes-with-pydantic-that-gonna-work-784230e1cf83
ActionTypeUnion = (
    ClickAction
    | InputTextAction
    | UploadFileAction
    | SelectOptionAction
    | CheckboxAction
    | WaitAction
    | NullAction
    | SolveCaptchaAction
    | TerminateAction
    | CompleteAction
)
|
||||
445
skyvern/webeye/actions/handler.py
Normal file
445
skyvern/webeye/actions/handler.py
Normal file
@@ -0,0 +1,445 @@
|
||||
import asyncio
|
||||
import re
|
||||
from typing import Awaitable, Callable, List
|
||||
|
||||
import structlog
|
||||
from playwright.async_api import Locator, Page
|
||||
|
||||
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.api.files import download_file
|
||||
from skyvern.forge.sdk.models import Step
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
from skyvern.webeye.actions import actions
|
||||
from skyvern.webeye.actions.actions import Action, ActionType, ClickAction, ScrapeResult, UploadFileAction, WebAction
|
||||
from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
|
||||
from skyvern.webeye.browser_factory import BrowserState
|
||||
from skyvern.webeye.scraper.scraper import ScrapedPage
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
|
||||
class ActionHandler:
    """Registry that maps each ActionType to the coroutine which executes it."""

    # Shared, class-level table filled in through register_action_type
    _handled_action_types: dict[
        ActionType, Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]]
    ] = {}

    @classmethod
    def register_action_type(
        cls,
        action_type: ActionType,
        handler: Callable[[Action, Page, ScrapedPage, Task, Step], Awaitable[list[ActionResult]]],
    ) -> None:
        """Associate `handler` with `action_type`, replacing any earlier registration."""
        cls._handled_action_types[action_type] = handler

    @staticmethod
    async def handle_action(
        scraped_page: ScrapedPage,
        task: Task,
        step: Step,
        browser_state: BrowserState,
        action: Action,
    ) -> list[ActionResult]:
        """
        Execute `action` with its registered handler and return the handler's results.

        Exceptions raised by the handler are logged and converted into a single ActionFailure;
        an action type without a registered handler also yields an ActionFailure.
        """
        LOG.info("Handling action", action=action)
        page = await browser_state.get_or_create_page()
        try:
            registered_handler = ActionHandler._handled_action_types.get(action.action_type)
            if registered_handler is None:
                LOG.error("Unsupported action type in handler", action=action, type=type(action))
                return [ActionFailure(Exception(f"Unsupported action type: {type(action)}"))]
            return await registered_handler(action, page, scraped_page, task, step)
        except MissingElement as e:
            # Expected failure mode: logged at info level, without a traceback
            LOG.info("Known exceptions", action=action, exception_type=type(e), exception_message=str(e))
            return [ActionFailure(e)]
        except MultipleElementsFound as e:
            LOG.exception(
                "Cannot handle multiple elements with the same xpath in one action.",
                action=action,
                exception=e,
            )
            return [ActionFailure(e)]
        except Exception as e:
            LOG.exception("Unhandled exception in action handler", action=action, exception=e)
            return [ActionFailure(e)]
|
||||
|
||||
|
||||
async def handle_solve_captcha_action(
    action: actions.SolveCaptchaAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Log a request to solve the captcha, pause for 30 seconds, then report success."""
    LOG.warning("Please solve the captcha on the page, you have 30 seconds", action=action)
    # Fixed pause; nothing here checks whether the captcha was actually solved
    await asyncio.sleep(30)
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_click_action(
    action: actions.ClickAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Validate the target element, pause briefly, then click it through `chain_click`."""
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    # Short pause before the click
    await asyncio.sleep(0.3)
    click_timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
    return await chain_click(page, action, xpath, timeout=click_timeout)
|
||||
|
||||
|
||||
async def handle_input_text_action(
    action: actions.InputTextAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Type `action.text` into the target element.

    The field is cleared and filled, then Tab is pressed. If the field reads back empty, the text
    is typed key by key and confirmed with Enter instead. Always returns a single ActionSuccess
    unless one of the browser calls raises.
    """
    xpath = await validate_actions_in_dom(action, page, scraped_page)
    field = page.locator(f"xpath={xpath}")
    timeout_ms = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS

    await field.clear()
    await field.fill(action.text, timeout=timeout_ms)

    # This is a hack that gets dropdowns to select the "best" option based on what's typed
    # Fixes situations like tsk_228671423990405776 where the location isn't being autocompleted
    await field.press("Tab", timeout=timeout_ms)

    current_value = await field.input_value(timeout=timeout_ms)
    if not current_value:
        LOG.info("Failed to input the text, trying to press sequentially with an enter click", action=action)
        await field.clear()
        await field.press_sequentially(action.text, timeout=timeout_ms)
        await field.press("Enter", timeout=timeout_ms)
        current_value = await field.input_value(timeout=timeout_ms)
    LOG.info("Input value", input_value=current_value, action=action)

    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_upload_file_action(
    action: actions.UploadFileAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Upload the file at `action.file_url` through the target element.

    Fails early when the URL is empty or does not appear in the task's navigation payload.
    If the element is a file input, the file is downloaded and set on it directly; otherwise the
    action is treated as a click and delegated to `chain_click`, which downloads the file itself.

    :return: a single-element result list for the file-input path, or whatever `chain_click` returns
    """
    if not action.file_url:
        LOG.warning("InputFileAction has no file_url", action=action)
        return [ActionFailure(MissingFileUrl())]
    if action.file_url not in str(task.navigation_payload):
        LOG.warning(
            "LLM might be imagining the file url, which is not in navigation payload",
            action=action,
            file_url=action.file_url,
        )
        return [ActionFailure(ImaginaryFileUrl(action.file_url))]

    xpath = await validate_actions_in_dom(action, page, scraped_page)
    timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
    locator = page.locator(f"xpath={xpath}")

    if not await is_file_input_element(locator):
        LOG.info("Taking UploadFileAction. Found non file input tag", action=action)
        # treat it as a click action; chain_click downloads the file, so it is not downloaded here
        action.is_upload_file_tag = False
        return await chain_click(page, action, xpath, timeout=timeout)

    LOG.info("Taking UploadFileAction. Found file input tag", action=action)
    file_path = download_file(action.file_url)
    if not file_path:
        return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))]

    await locator.set_input_files(file_path, timeout=timeout)
    # Sleep for 10 seconds after uploading a file to let the page process it
    await asyncio.sleep(10)
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_null_action(
    action: actions.NullAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Do nothing and report success."""
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def _click_select_click(page: Page, xpath: str, **option_criteria: object) -> None:
    """Click the element, select the option matching `option_criteria`, then click the element again."""
    selector = f"xpath={xpath}"
    timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
    await page.click(selector, timeout=timeout)
    await page.select_option(selector, timeout=timeout, **option_criteria)
    await page.click(selector, timeout=timeout)


async def handle_select_option_action(
    action: actions.SelectOptionAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Select an option in the target select element, first by label and then by index.

    :return: a single ActionSuccess when either attempt works. Otherwise a single ActionFailure
        wrapping the label error (when no index was supplied) or the index error.
    """
    xpath = await validate_actions_in_dom(action, page, scraped_page)

    try:
        # First click by label (if it matches)
        await _click_select_click(page, xpath, label=action.option.label)
        return [ActionSuccess()]
    except Exception as e:
        if action.option.index is None:
            return [ActionFailure(e)]
        LOG.warning(
            "Failed to click on the option by label, trying by index",
            exc_info=e,
            action=action,
            xpath=xpath,
        )

    try:
        # The index may be an element id pointing at an <option> in the xpath map, or a plain
        # position inside the select. An id missing from the map is treated as a plain position.
        try:
            option_xpath = scraped_page.id_to_xpath_dict[action.option.index]
        except KeyError:
            option_xpath = None
        match = re.search(r"option\[(\d+)]$", option_xpath) if option_xpath else None
        if match:
            # This means we were trying to select an option xpath, click the option
            # NOTE(review): the number in `option[n]` is passed to select_option unchanged; confirm it
            # uses the same base as select_option's index.
            option_index = int(match.group(1))
        else:
            # This means the supplied index was for the select element, not a reference to the xpath dict
            option_index = action.option.index
        await _click_select_click(page, xpath, index=option_index)
        return [ActionSuccess()]
    except Exception as e:
        LOG.warning("Failed to click on the option by index", exception=e, action=action)
        return [ActionFailure(e)]
|
||||
|
||||
|
||||
async def handle_checkbox_action(
    self: actions.CheckboxAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    ******* NOT REGISTERED *******
    This action causes more harm than good.
    It frequently misbehaves, or gets stuck in click loops.
    Treating checkbox actions as click actions seems to perform far more reliably.
    Developers who tried this and failed: 2 (Suchintan and Shu 😂)

    Checks or unchecks the target element according to `self.is_checked` and reports success.
    The first parameter is the action itself; it is named `self` for historical reasons.
    """
    xpath = await validate_actions_in_dom(self, page, scraped_page)
    # Use the explicit xpath engine prefix, like the other handlers in this module
    selector = f"xpath={xpath}"
    timeout = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS
    if self.is_checked:
        await page.check(selector, timeout=timeout)
    else:
        await page.uncheck(selector, timeout=timeout)

    # TODO (suchintan): Why does checking the label work, but not the actual input element?
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_wait_action(
    action: actions.WaitAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Pause for 10 seconds, then report a failure (wait actions are deliberately treated as failures)."""
    await asyncio.sleep(10)
    wait_failure = ActionFailure(exception=Exception("Wait action is treated as a failure"))
    return [wait_failure]
|
||||
|
||||
|
||||
async def handle_terminate_action(
    action: actions.TerminateAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """Report success without touching the page."""
    return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_complete_action(
    action: actions.CompleteAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
    """
    Finish the task, extracting data first when the action carries a data extraction goal.

    :return: a single ActionSuccess whose `data` is the extracted data, or None when no goal is set
    """
    if not action.data_extraction_goal:
        return [ActionSuccess(data=None)]

    scrape_result = await extract_information_for_navigation_goal(
        scraped_page=scraped_page,
        task=task,
        step=step,
    )
    return [ActionSuccess(data=scrape_result.scraped_data)]
|
||||
|
||||
|
||||
# Register the handler for each supported action type, in a fixed order.
# ActionType.CHECKBOX is intentionally left out (see handle_checkbox_action's docstring).
for _action_type, _action_handler in (
    (ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action),
    (ActionType.CLICK, handle_click_action),
    (ActionType.INPUT_TEXT, handle_input_text_action),
    (ActionType.UPLOAD_FILE, handle_upload_file_action),
    (ActionType.NULL_ACTION, handle_null_action),
    (ActionType.SELECT_OPTION, handle_select_option_action),
    (ActionType.WAIT, handle_wait_action),
    (ActionType.TERMINATE, handle_terminate_action),
    (ActionType.COMPLETE, handle_complete_action),
):
    ActionHandler.register_action_type(_action_type, _action_handler)
|
||||
|
||||
|
||||
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> str:
    """
    Check that the action's element resolves to exactly one element on the page.

    :param action: action whose `element_id` is looked up in `scraped_page.id_to_xpath_dict`
    :param page: page on which the xpath is evaluated
    :param scraped_page: scrape result providing the element id to xpath mapping
    :return: the xpath of the element
    :raises MissingElement: if the xpath matches no element
    :raises MultipleElementsFound: if the xpath matches more than one element
    """
    xpath = scraped_page.id_to_xpath_dict[action.element_id]
    # Use the explicit xpath engine prefix, like the handlers that consume the returned xpath
    num_elements = await page.locator(f"xpath={xpath}").count()

    if num_elements < 1:
        LOG.warning("No elements found with action xpath. Validation failed.", action=action, xpath=xpath)
        raise MissingElement(xpath=xpath, element_id=action.element_id)
    if num_elements > 1:
        LOG.warning(
            "Multiple elements found with action xpath. Expected 1. Validation failed.",
            action=action,
            num_elements=num_elements,
        )
        raise MultipleElementsFound(num=num_elements, xpath=xpath, element_id=action.element_id)

    LOG.info("Validated action xpath in DOM", action=action)
    return xpath
|
||||
|
||||
|
||||
async def chain_click(
    page: Page,
    action: ClickAction | UploadFileAction,
    xpath: str,
    timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> List[ActionResult]:
    """
    Clicks on an element identified by the xpath and its parent if failed.

    If the direct click fails and the element is an input, its sibling label is tried first;
    if that does not succeed either, the parent element is clicked.

    :param page: page to act on
    :param action: action being executed; its `file_url`, if set, is downloaded and supplied to
        any file chooser opened during the click
    :param xpath: xpath of the element to click
    :param timeout: timeout passed to each click, in milliseconds
    :return: one ActionSuccess when the first click works; otherwise the failure of the first
        click followed by the result of each fallback that was attempted
    """
    # Add a defensive page handler here in case a click action opens a file chooser.
    # This automatically dismisses the dialog
    # File choosers are impossible to close if you don't expect one. Instead of dealing with it, close it!

    # TODO (suchintan): This should likely result in an ActionFailure -- we can figure out how to do this later!
    LOG.info("Chain click starts", action=action, xpath=xpath)
    file: list[str] | str = []
    if action.file_url:
        file = download_file(action.file_url) or []

    def fc_func(fc):  # type: ignore[no-untyped-def]
        # Supplies the downloaded file (or an empty list) to the file chooser
        return fc.set_files(files=file)

    page.on("filechooser", fc_func)
    LOG.info("Registered file chooser listener", action=action, path=file)

    # Everything after the registration runs under try/finally so the listener is always removed,
    # even when the javascript check below raises.
    try:
        javascript_triggered = await is_javascript_triggered(page, xpath)
        try:
            await page.click(f"xpath={xpath}", timeout=timeout)
            LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
            return [ActionSuccess(javascript_triggered=javascript_triggered)]
        except Exception as e:
            action_results: list[ActionResult] = [ActionFailure(e, javascript_triggered=javascript_triggered)]
            element_locator = page.locator(f"xpath={xpath}")
            if await is_input_element(element_locator):
                LOG.info("Chain click: it's an input element. going to try sibling click", action=action, xpath=xpath)
                sibling_action_result = await click_sibling_of_input(element_locator, timeout=timeout)
                action_results.append(sibling_action_result)
                if isinstance(sibling_action_result, ActionSuccess):
                    return action_results

            parent_xpath = f"{xpath}/.."
            try:
                parent_javascript_triggered = await is_javascript_triggered(page, parent_xpath)
                javascript_triggered = javascript_triggered or parent_javascript_triggered
                parent_locator = element_locator.locator("..")
                await parent_locator.click(timeout=timeout)
                LOG.info("Chain click: successfully clicked parent element", action=action, parent_xpath=parent_xpath)
                action_results.append(
                    ActionSuccess(
                        javascript_triggered=javascript_triggered,
                        interacted_with_parent=True,
                    )
                )
            except Exception as pe:
                LOG.warning("Failed to click parent element", action=action, parent_xpath=parent_xpath, exc_info=True)
                action_results.append(
                    ActionFailure(pe, javascript_triggered=javascript_triggered, interacted_with_parent=True)
                )
                # We don't raise exception here because we do log the exception, and return ActionFailure as the last action

            return action_results
    finally:
        LOG.info("Remove file chooser listener", action=action)

        # Sleep for 10 seconds after uploading a file to let the page process it
        # Removing this breaks file uploads using the filechooser
        # KEREM DO NOT REMOVE
        if file:
            await asyncio.sleep(10)
        page.remove_listener("filechooser", fc_func)
|
||||
|
||||
|
||||
async def is_javascript_triggered(page: Page, xpath: str) -> bool:
    """Return True when the first element at `xpath` is an anchor whose href starts with "javascript:"."""
    element = page.locator(f"xpath={xpath}").first
    tag_name = await element.evaluate("e => e.tagName")
    if tag_name.lower() != "a":
        return False

    href = await element.evaluate("e => e.href")
    if not href.lower().startswith("javascript:"):
        return False

    LOG.info("Found javascript call in anchor tag, marking step as completed. Dropping remaining actions")
    return True
|
||||
|
||||
|
||||
async def is_file_input_element(locator: Locator) -> bool:
    """Return True when the first element matched by `locator` is an `<input>` of type "file"."""
    element = locator.first
    if not element:
        return False
    tag_name = await element.evaluate("el => el.tagName")
    type_name = await element.evaluate("el => el.type")
    is_input_tag = tag_name.lower() == "input"
    return is_input_tag and type_name == "file"
|
||||
|
||||
|
||||
async def is_input_element(locator: Locator) -> bool:
    """Return True when the first element matched by `locator` has the tag name "input"."""
    element = locator.first
    if not element:
        return False
    tag_name = await element.evaluate("el => el.tagName")
    return tag_name.lower() == "input"
|
||||
|
||||
|
||||
async def click_sibling_of_input(
    locator: Locator,
    timeout: int,
    javascript_triggered: bool = False,
) -> ActionResult:
    """
    Click the `<label for=...>` that belongs to the input matched by `locator`.

    The label is searched for under the input's parent element, using the input's `id`.

    :param locator: locator of the input element
    :param timeout: timeout passed to the label click, in milliseconds
    :param javascript_triggered: value copied onto the returned result
    :return: ActionSuccess when the label was clicked, otherwise ActionFailure (never raises)
    """
    try:
        input_element = locator.first
        parent_locator = locator.locator("..")
        if input_element:
            input_id = await input_element.get_attribute("id")
            if not input_id:
                # Without an id no label can reference this input; fail right away instead of
                # waiting out the click timeout on a label that cannot match.
                return ActionFailure(
                    exception=Exception("Input element has no id, cannot locate a sibling label"),
                    javascript_triggered=javascript_triggered,
                )
            sibling_label_xpath = f'//label[@for="{input_id}"]'
            label_locator = parent_locator.locator(sibling_label_xpath)
            await label_locator.click(timeout=timeout)
            LOG.info(
                "Successfully clicked sibling label of input element",
                sibling_label_xpath=sibling_label_xpath,
            )
            return ActionSuccess(javascript_triggered=javascript_triggered, interacted_with_sibling=True)
        # Should never get here
        return ActionFailure(
            exception=Exception("Failed while trying to click sibling of input element"),
            javascript_triggered=javascript_triggered,
            interacted_with_sibling=True,
        )
    except Exception as e:
        LOG.warning("Failed to click sibling label of input element", exc_info=e)
        return ActionFailure(exception=e, javascript_triggered=javascript_triggered)
|
||||
|
||||
|
||||
async def extract_information_for_navigation_goal(
    task: Task,
    step: Step,
    scraped_page: ScrapedPage,
) -> ScrapeResult:
    """
    Ask the LLM to extract information from an already scraped page.

    Renders the "extract-information" prompt from the task's goals and schema plus the page's
    element tree, URL and extracted text, sends it together with the page screenshots, and wraps
    the JSON reply in a ScrapeResult.
    """
    prompt_context = {
        "navigation_goal": task.navigation_goal,
        "elements": scraped_page.element_tree,
        "data_extraction_goal": task.data_extraction_goal,
        "extracted_information_schema": task.extracted_information_schema,
        "current_url": scraped_page.url,
        "extracted_text": scraped_page.extracted_text,
    }
    extract_information_prompt = prompt_engine.load_prompt("extract-information", **prompt_context)

    llm_json = await app.OPENAI_CLIENT.chat_completion(
        step=step,
        prompt=extract_information_prompt,
        screenshots=scraped_page.screenshots,
    )
    return ScrapeResult(scraped_data=llm_json)
|
||||
58
skyvern/webeye/actions/models.py
Normal file
58
skyvern/webeye/actions/models.py
Normal file
@@ -0,0 +1,58 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.forge.sdk.settings_manager import SettingsManager
|
||||
from skyvern.webeye.actions.actions import Action, ActionTypeUnion
|
||||
from skyvern.webeye.actions.responses import ActionResult
|
||||
from skyvern.webeye.scraper.scraper import ScrapedPage
|
||||
|
||||
|
||||
class AgentStepOutput(BaseModel):
    """
    Output of the agent step, this is recorded in the database.
    """

    # Will be deprecated once we move to the new format below
    action_results: list[ActionResult] | None = None
    # Nullable for backwards compatibility, once backfill is done, this won't be nullable anymore
    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None = None

    def __repr__(self) -> str:
        """Return the class name wrapped around the model's dumped fields."""
        return f"AgentStepOutput({self.model_dump()})"

    def __str__(self) -> str:
        """Same text as `__repr__`."""
        return repr(self)
|
||||
|
||||
|
||||
class DetailedAgentStepOutput(BaseModel):
    """
    Output of the agent step, this is not recorded in the database, only used for debugging in the Jupyter notebook.
    """

    # Every field accepts None but declares no default value.
    scraped_page: ScrapedPage | None
    extract_action_prompt: str | None
    llm_response: dict[str, Any] | None
    actions: list[Action] | None
    action_results: list[ActionResult] | None
    actions_and_results: list[tuple[ActionTypeUnion, list[ActionResult]]] | None

    class Config:
        # NOTE(review): presumably meant to keep these fields out of dumps — confirm the installed
        # pydantic version honours an `exclude` config key.
        exclude = ["scraped_page", "extract_action_prompt"]

    def __repr__(self) -> str:
        """Return the full dump in debug mode, otherwise the trimmed AgentStepOutput form."""
        if not SettingsManager.get_settings().DEBUG_MODE:
            return f"AgentStepOutput({self.to_agent_step_output().model_dump()})"
        return f"DetailedAgentStepOutput({self.model_dump()})"

    def __str__(self) -> str:
        """Same text as `__repr__`."""
        return repr(self)

    def to_agent_step_output(self) -> AgentStepOutput:
        """Build the persisted AgentStepOutput, replacing missing or empty result lists with `[]`."""
        return AgentStepOutput(
            action_results=self.action_results or [],
            actions_and_results=self.actions_and_results or [],
        )
|
||||
62
skyvern/webeye/actions/responses.py
Normal file
62
skyvern/webeye/actions/responses.py
Normal file
@@ -0,0 +1,62 @@
|
||||
from typing import Any
|
||||
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.webeye.string_util import remove_whitespace
|
||||
|
||||
|
||||
class ActionResult(BaseModel):
    """Outcome of executing a single action."""

    success: bool
    # Populated by ActionFailure from the exception it wraps
    exception_type: str | None = None
    exception_message: str | None = None
    # Optional payload attached to the result (the complete handler stores extracted data here)
    data: dict[str, Any] | list | str | None = None
    step_retry_number: int | None = None
    step_order: int | None = None
    javascript_triggered: bool = False
    # None is used for old data so that we can differentiate between old and new data which only has boolean
    interacted_with_sibling: bool | None = None
    interacted_with_parent: bool | None = None

    def __str__(self) -> str:
        """Return a short summary containing success, exception details and data."""
        # The closing parenthesis comes after `data` so the whole summary is one balanced expression
        return (
            f"ActionResult(success={self.success}, exception_type={self.exception_type}, "
            f"exception_message={self.exception_message}, data={self.data})"
        )

    def __repr__(self) -> str:
        """Same text as `__str__`."""
        return self.__str__()
|
||||
|
||||
|
||||
class ActionSuccess(ActionResult):
    """ActionResult whose `success` is always True."""

    def __init__(
        self,
        data: dict[str, Any] | list | str | None = None,
        javascript_triggered: bool = False,
        interacted_with_sibling: bool = False,
        interacted_with_parent: bool = False,
    ):
        """Build a successful result; the interaction flags default to False rather than None."""
        result_fields = {
            "success": True,
            "data": data,
            "javascript_triggered": javascript_triggered,
            "interacted_with_sibling": interacted_with_sibling,
            "interacted_with_parent": interacted_with_parent,
        }
        super().__init__(**result_fields)
|
||||
|
||||
|
||||
class ActionFailure(ActionResult):
    """ActionResult whose `success` is always False, described by the exception that caused it."""

    def __init__(
        self,
        exception: Exception,
        javascript_triggered: bool = False,
        interacted_with_sibling: bool = False,
        interacted_with_parent: bool = False,
    ):
        """Record the exception's class name and its message (passed through `remove_whitespace`)."""
        result_fields = {
            "success": False,
            "exception_type": type(exception).__name__,
            "exception_message": remove_whitespace(str(exception)),
            "javascript_triggered": javascript_triggered,
            "interacted_with_sibling": interacted_with_sibling,
            "interacted_with_parent": interacted_with_parent,
        }
        super().__init__(**result_fields)
|
||||
Reference in New Issue
Block a user