use css selector instead of xpath (#551)

This commit is contained in:
LawyZheng
2024-07-04 10:45:47 +08:00
committed by GitHub
parent 80c7c43bd6
commit cd0d563070
7 changed files with 92 additions and 122 deletions

View File

@@ -47,17 +47,17 @@ class ScriptNotFound(SkyvernException):
class MissingElement(SkyvernException):
def __init__(self, xpath: str | None = None, element_id: str | None = None):
def __init__(self, selector: str | None = None, element_id: str | None = None):
super().__init__(
f"Found no elements. Might be due to previous actions which removed this element."
f" xpath={xpath} element_id={element_id}",
f" selector={selector} element_id={element_id}",
)
class MultipleElementsFound(SkyvernException):
def __init__(self, num: int, xpath: str | None = None, element_id: str | None = None):
def __init__(self, num: int, selector: str | None = None, element_id: str | None = None):
super().__init__(
f"Found {num} elements. Expected 1. num_elements={num} xpath={xpath} element_id={element_id}",
f"Found {num} elements. Expected 1. num_elements={num} selector={selector} element_id={element_id}",
)
@@ -318,6 +318,11 @@ class MissingElementInIframe(SkyvernException):
super().__init__(f"Found no iframe includes the element. element_id={element_id}")
class MissingElementInCSSMap(SkyvernException):
    """Signals that no CSS selector is recorded in the CSS map for an element id."""

    def __init__(self, element_id: str) -> None:
        """Build the error message for the given element.

        :param element_id: id of the element whose CSS selector could not be found.
        """
        message = f"Found no css selector in the CSS map for the element. element_id={element_id}"
        super().__init__(message)
class InputActionOnSelect2Dropdown(SkyvernException):
def __init__(self, element_id: str):
super().__init__(

View File

@@ -966,8 +966,8 @@ class ForgeAgent:
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP,
data=json.dumps(scraped_page.id_to_xpath_dict, indent=2).encode(),
artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_CSS_MAP,
data=json.dumps(scraped_page.id_to_css_dict, indent=2).encode(),
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,

View File

@@ -21,12 +21,15 @@ class ArtifactType(StrEnum):
LLM_REQUEST = "llm_request"
LLM_RESPONSE = "llm_response"
LLM_RESPONSE_PARSED = "llm_response_parsed"
VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
VISIBLE_ELEMENTS_ID_CSS_MAP = "visible_elements_id_css_map"
VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map"
VISIBLE_ELEMENTS_TREE = "visible_elements_tree"
VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed"
VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt"
# DEPRECATED. Please use VISIBLE_ELEMENTS_ID_CSS_MAP instead.
VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
# DEPRECATED. Please use HTML_SCRAPE or HTML_ACTION instead.
HTML = "html"

View File

@@ -13,7 +13,7 @@ FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
ArtifactType.LLM_REQUEST: "json",
ArtifactType.LLM_RESPONSE: "json",
ArtifactType.LLM_RESPONSE_PARSED: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_CSS_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE: "json",
ArtifactType.VISIBLE_ELEMENTS_TREE_TRIMMED: "json",
@@ -22,6 +22,8 @@ FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
ArtifactType.HTML_ACTION: "html",
ArtifactType.TRACE: "zip",
ArtifactType.HAR: "har",
# DEPRECATED: we're using CSS selector map now
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
}

View File

@@ -48,7 +48,7 @@ from skyvern.webeye.actions.actions import (
from skyvern.webeye.actions.responses import ActionFailure, ActionResult, ActionSuccess
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ScrapedPage
from skyvern.webeye.utils.dom import DomUtil, InteractiveElement, Select2Dropdown, SkyvernElement, resolve_locator
from skyvern.webeye.utils.dom import DomUtil, InteractiveElement, Select2Dropdown, SkyvernElement
LOG = structlog.get_logger()
TEXT_INPUT_DELAY = 10 # 10ms between each character input
@@ -152,7 +152,7 @@ class ActionHandler:
return [ActionFailure(e)]
except MultipleElementsFound as e:
LOG.exception(
"Cannot handle multiple elements with the same xpath in one action.",
"Cannot handle multiple elements with the same selector in one action.",
action=action,
)
return [ActionFailure(e)]
@@ -206,7 +206,8 @@ async def handle_click_action(
num_downloaded_files_before=num_downloaded_files_before,
download_dir=download_dir,
)
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
await asyncio.sleep(0.3)
if action.download:
results = await handle_click_to_download_file_action(action, page, scraped_page)
@@ -216,8 +217,7 @@ async def handle_click_action(
scraped_page,
page,
action,
xpath,
frame,
skyvern_element,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
@@ -241,9 +241,9 @@ async def handle_click_to_download_file_action(
page: Page,
scraped_page: ScrapedPage,
) -> list[ActionResult]:
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
locator = skyvern_element.locator
try:
await locator.click(
@@ -269,10 +269,7 @@ async def handle_input_text_action(
if await skyvern_element.is_select2_dropdown():
return [ActionFailure(InputActionOnSelect2Dropdown(element_id=action.element_id))]
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
locator = skyvern_element.locator
current_text = await get_input_value(locator)
if current_text == action.text:
return [ActionSuccess()]
@@ -323,19 +320,16 @@ async def handle_upload_file_action(
)
return [ActionFailure(ImaginaryFileUrl(action.file_url))]
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
locator = skyvern_element.locator
file_path = await download_file(file_url)
locator = resolve_locator(scraped_page, page, frame, xpath)
is_file_input = await is_file_input_element(locator)
if is_file_input:
LOG.info("Taking UploadFileAction. Found file input tag", action=action)
if file_path:
locator = resolve_locator(scraped_page, page, frame, xpath)
await locator.set_input_files(
file_path,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
@@ -356,8 +350,7 @@ async def handle_upload_file_action(
scraped_page,
page,
action,
xpath,
frame,
skyvern_element,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
)
@@ -370,7 +363,9 @@ async def handle_download_file_action(
task: Task,
step: Step,
) -> list[ActionResult]:
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
file_name = f"{action.file_name or uuid.uuid4()}"
full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
try:
@@ -378,8 +373,7 @@ async def handle_download_file_action(
async with page.expect_download() as download_info:
await asyncio.sleep(0.3)
locator = resolve_locator(scraped_page, page, frame, xpath)
locator = skyvern_element.locator
await locator.click(
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
modifiers=["Alt"],
@@ -422,12 +416,9 @@ async def handle_select_option_action(
) -> list[ActionResult]:
dom = DomUtil(scraped_page, page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
locator = skyvern_element.locator
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
tag_name = await get_tag_name_lowercase(locator)
tag_name = skyvern_element.get_tag_name()
element_dict = scraped_page.id_to_element_dict[action.element_id]
LOG.info(
"SelectOptionAction",
@@ -574,6 +565,7 @@ async def handle_select_option_action(
return result
elif tag_name == "ul" or tag_name == "div" or tag_name == "li":
# DEPRECATED: This was used to handle select2 dropdowns and isn't used anymore.
# if the role is listbox, find the option with the "label" or "value" and click that option element
# references:
# https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles/listbox_role
@@ -603,7 +595,7 @@ async def handle_select_option_action(
)
# click the option element
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, scraped_page, page, click_action, xpath, frame)
return await chain_click(task, scraped_page, page, click_action, skyvern_element)
else:
LOG.error(
"SelectOptionAction on a non-listbox element. Cannot handle this action",
@@ -622,7 +614,7 @@ async def handle_select_option_action(
action=action,
)
click_action = ClickAction(element_id=action.element_id)
return await chain_click(task, scraped_page, page, click_action, xpath, frame)
return await chain_click(task, scraped_page, page, click_action, skyvern_element)
try:
current_text = await locator.input_value()
@@ -631,7 +623,7 @@ async def handle_select_option_action(
except Exception:
LOG.info("failed to confirm if the select option has been done, force to take the action again.")
return await normal_select(action=action, skyvern_element=skyvern_element, xpath=xpath, frame=frame)
return await normal_select(action=action, skyvern_element=skyvern_element)
async def handle_checkbox_action(
@@ -648,9 +640,10 @@ async def handle_checkbox_action(
Treating checkbox actions as click actions seem to perform way more reliably
Developers who tried this and failed: 2 (Suchintan and Shu 😂)
"""
xpath, frame = await validate_actions_in_dom(action, page, scraped_page)
locator = resolve_locator(scraped_page, page, frame, xpath)
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
locator = skyvern_element.locator
if action.is_checked:
await locator.check(timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS)
@@ -732,48 +725,21 @@ def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
return secret_value if secret_value is not None else parameter
async def validate_actions_in_dom(action: WebAction, page: Page, scraped_page: ScrapedPage) -> tuple[str, str]:
xpath = scraped_page.id_to_xpath_dict[action.element_id]
frame = scraped_page.id_to_frame_dict[action.element_id]
locator = resolve_locator(scraped_page, page, frame, xpath)
num_elements = await locator.count()
if num_elements < 1:
LOG.warning(
"No elements found with action xpath. Validation failed.",
action=action,
xpath=xpath,
)
raise MissingElement(xpath=xpath, element_id=action.element_id)
elif num_elements > 1:
LOG.warning(
"Multiple elements found with action xpath. Expected 1. Validation failed.",
action=action,
num_elements=num_elements,
)
raise MultipleElementsFound(num=num_elements, xpath=xpath, element_id=action.element_id)
else:
LOG.info("Validated action xpath in DOM", action=action)
return xpath, frame
async def chain_click(
task: Task,
scraped_page: ScrapedPage,
page: Page,
action: ClickAction | UploadFileAction,
xpath: str,
frame: str,
skyvern_element: SkyvernElement,
timeout: int = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> List[ActionResult]:
# Add a defensive page handler here in case a click action opens a file chooser.
# This automatically dismisses the dialog
# File choosers are impossible to close if you don't expect one. Instead of dealing with it, close it!
locator = skyvern_element.locator
# TODO (suchintan): This should likely result in an ActionFailure -- we can figure out how to do this later!
LOG.info("Chain click starts", action=action, xpath=xpath)
LOG.info("Chain click starts", action=action, locator=locator)
file: list[str] | str = []
if action.file_url:
file_url = get_actual_value_of_parameter_if_secret(task, action.file_url)
@@ -792,15 +758,14 @@ async def chain_click(
LOG.info("Registered file chooser listener", action=action, path=file)
"""
Clicks on an element identified by the xpath and its parent if failed.
:param xpath: xpath of the element to click
Clicks on an element identified by the css and its parent if failed.
:param css: css of the element to click
"""
javascript_triggered = await is_javascript_triggered(scraped_page, page, frame, xpath)
locator = resolve_locator(scraped_page, page, frame, xpath)
javascript_triggered = await is_javascript_triggered(scraped_page, page, locator)
try:
await locator.click(timeout=timeout)
LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
LOG.info("Chain click: main element click succeeded", action=action, locator=locator)
return [
ActionSuccess(
javascript_triggered=javascript_triggered,
@@ -818,25 +783,25 @@ async def chain_click(
LOG.info(
"Chain click: it's an input element. going to try sibling click",
action=action,
xpath=xpath,
locator=locator,
)
sibling_action_result = await click_sibling_of_input(locator, timeout=timeout)
action_results.append(sibling_action_result)
if type(sibling_action_result) == ActionSuccess:
return action_results
parent_xpath = f"{xpath}/.."
try:
parent_javascript_triggered = await is_javascript_triggered(scraped_page, page, frame, parent_xpath)
parent_locator = locator.locator("..")
parent_javascript_triggered = await is_javascript_triggered(scraped_page, page, parent_locator)
javascript_triggered = javascript_triggered or parent_javascript_triggered
parent_locator = resolve_locator(scraped_page, page, frame, xpath).locator("..")
await parent_locator.click(timeout=timeout)
LOG.info(
"Chain click: successfully clicked parent element",
action=action,
parent_xpath=parent_xpath,
parent_locator=parent_locator,
)
action_results.append(
ActionSuccess(
@@ -848,7 +813,7 @@ async def chain_click(
LOG.warning(
"Failed to click parent element",
action=action,
parent_xpath=parent_xpath,
parent_locator=parent_locator,
exc_info=True,
)
action_results.append(
@@ -875,8 +840,6 @@ async def chain_click(
async def normal_select(
action: actions.SelectOptionAction,
skyvern_element: SkyvernElement,
xpath: str,
frame: str,
) -> List[ActionResult]:
action_result: List[ActionResult] = []
is_success = False
@@ -891,8 +854,7 @@ async def normal_select(
"Failed to click before select action",
exc_info=True,
action=action,
xpath=xpath,
frame=frame,
locator=locator,
)
action_result.append(ActionFailure(e))
return action_result
@@ -912,8 +874,7 @@ async def normal_select(
"Failed to take select action by label",
exc_info=True,
action=action,
xpath=xpath,
frame=frame,
locator=locator,
)
if not is_success and action.option.value is not None:
@@ -931,8 +892,7 @@ async def normal_select(
"Failed to take select action by value",
exc_info=True,
action=action,
xpath=xpath,
frame=frame,
locator=locator,
)
if not is_success and action.option.index is not None:
@@ -941,12 +901,11 @@ async def normal_select(
LOG.error(
"option index is out of bound",
action=action,
xpath=xpath,
frame=frame,
locator=locator,
)
else:
try:
# This means the supplied index was for the select element, not a reference to the xpath dict
# This means the supplied index was for the select element, not a reference to the css dict
await locator.select_option(
index=action.option.index,
timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
@@ -959,8 +918,7 @@ async def normal_select(
"Failed to click on the option by index",
exc_info=True,
action=action,
xpath=xpath,
frame=frame,
locator=locator,
)
try:
@@ -972,8 +930,7 @@ async def normal_select(
"Failed to click after select action",
exc_info=True,
action=action,
xpath=xpath,
frame=frame,
locator=locator,
)
action_result.append(ActionFailure(e))
return action_result
@@ -993,7 +950,7 @@ def get_anchor_to_click(scraped_page: ScrapedPage, element_id: str) -> str | Non
if "id" in ele and ele["id"] == element_id:
for child in ele["children"]:
if "tagName" in child and child["tagName"] == "a":
return scraped_page.id_to_xpath_dict[child["id"]]
return scraped_page.id_to_css_dict[child["id"]]
return None
@@ -1029,8 +986,8 @@ def get_checkbox_id_in_label_children(scraped_page: ScrapedPage, element_id: str
return None
async def is_javascript_triggered(scraped_page: ScrapedPage, page: Page, frame: str, xpath: str) -> bool:
locator = resolve_locator(scraped_page, page, frame, xpath)
@deprecated("This function is deprecated. It was used for select2 dropdown, but we don't use it anymore.")
async def is_javascript_triggered(scraped_page: ScrapedPage, page: Page, locator: Locator) -> bool:
element = locator.first
tag_name = await element.evaluate("e => e.tagName")
@@ -1077,12 +1034,12 @@ async def click_sibling_of_input(
parent_locator = locator.locator("..")
if input_element:
input_id = await input_element.get_attribute("id")
sibling_label_xpath = f'//label[@for="{input_id}"]'
label_locator = parent_locator.locator(sibling_label_xpath)
sibling_label_css = f'label[for="{input_id}"]'
label_locator = parent_locator.locator(sibling_label_css)
await label_locator.click(timeout=timeout)
LOG.info(
"Successfully clicked sibling label of input element",
sibling_label_xpath=sibling_label_xpath,
sibling_label_css=sibling_label_css,
)
return ActionSuccess(javascript_triggered=javascript_triggered, interacted_with_sibling=True)
# Should never get here
@@ -1154,12 +1111,10 @@ async def click_listbox_option(
LOG.info("found option", element_id=child["id"])
text = child["text"] if "text" in child else ""
if text and (text == action.option.label or text == action.option.value):
option_xpath = scraped_page.id_to_xpath_dict[child["id"]]
option_frame = scraped_page.id_to_frame_dict[child["id"]]
dom = DomUtil(scraped_page=scraped_page, page=page)
try:
locator = resolve_locator(scraped_page, page, option_frame, option_xpath)
skyvern_element = await dom.get_skyvern_element_by_id(child["id"])
locator = skyvern_element.locator
await locator.click(timeout=1000)
return True
@@ -1167,7 +1122,7 @@ async def click_listbox_option(
LOG.error(
"Failed to click on the option",
action=action,
option_xpath=option_xpath,
locator=locator,
exc_info=True,
)
if "children" in child:

View File

@@ -111,7 +111,7 @@ class ScrapedPage(BaseModel):
"""
Scraped response from a webpage, including:
1. List of elements
2. ID to xpath map
2. ID to css map
3. The element tree of the page (list of dicts). Each element has children and attributes.
4. The screenshot (base64 encoded)
5. The URL of the page
@@ -122,7 +122,7 @@ class ScrapedPage(BaseModel):
elements: list[dict]
id_to_element_dict: dict[str, dict] = {}
id_to_frame_dict: dict[str, str] = {}
id_to_xpath_dict: dict[str, str]
id_to_css_dict: dict[str, str]
element_tree: list[dict]
element_tree_trimmed: list[dict]
screenshots: list[bytes]
@@ -276,14 +276,14 @@ async def scrape_web_unsafe(
_build_element_links(elements)
id_to_xpath_dict = {}
id_to_css_dict = {}
id_to_element_dict = {}
id_to_frame_dict = {}
for element in elements:
element_id = element["id"]
# get_interactable_element_tree marks each interactable element with a unique_id attribute
id_to_xpath_dict[element_id] = f"//*[@{SKYVERN_ID_ATTR}='{element_id}']"
id_to_css_dict[element_id] = f"[{SKYVERN_ID_ATTR}='{element_id}']"
id_to_element_dict[element_id] = element
id_to_frame_dict[element_id] = element["frame"]
@@ -301,7 +301,7 @@ async def scrape_web_unsafe(
return ScrapedPage(
elements=elements,
id_to_xpath_dict=id_to_xpath_dict,
id_to_css_dict=id_to_css_dict,
id_to_element_dict=id_to_element_dict,
id_to_frame_dict=id_to_frame_dict,
element_tree=element_tree,

View File

@@ -10,6 +10,7 @@ from skyvern.exceptions import (
ElementIsNotLabel,
MissingElement,
MissingElementDict,
MissingElementInCSSMap,
MissingElementInIframe,
MultipleElementsFound,
SkyvernException,
@@ -20,7 +21,7 @@ from skyvern.webeye.scraper.scraper import ScrapedPage, get_select2_options
LOG = structlog.get_logger()
def resolve_locator(scrape_page: ScrapedPage, page: Page, frame: str, xpath: str) -> Locator:
def resolve_locator(scrape_page: ScrapedPage, page: Page, frame: str, css: str) -> Locator:
iframe_path: list[str] = []
while frame != "main.frame":
@@ -42,7 +43,7 @@ def resolve_locator(scrape_page: ScrapedPage, page: Page, frame: str, xpath: str
child_frame = iframe_path.pop()
current_page = current_page.frame_locator(f"[{SKYVERN_ID_ATTR}='{child_frame}']")
return current_page.locator(f"xpath={xpath}")
return current_page.locator(css)
class InteractiveElement(StrEnum):
@@ -161,21 +162,25 @@ class DomUtil:
if not frame:
raise MissingElementInIframe(element_id)
xpath = self.scraped_page.id_to_xpath_dict[element_id]
css = self.scraped_page.id_to_css_dict.get(element_id)
if not css:
raise MissingElementInCSSMap(element_id)
locator = resolve_locator(self.scraped_page, self.page, frame, xpath)
locator = resolve_locator(self.scraped_page, self.page, frame, css)
num_elements = await locator.count()
if num_elements < 1:
LOG.warning("No elements found with xpath. Validation failed.", xpath=xpath)
raise MissingElement(xpath=xpath, element_id=element_id)
LOG.warning("No elements found with css. Validation failed.", css=css, element_id=element_id)
raise MissingElement(selector=css, element_id=element_id)
elif num_elements > 1:
LOG.warning(
"Multiple elements found with xpath. Expected 1. Validation failed.",
"Multiple elements found with css. Expected 1. Validation failed.",
num_elements=num_elements,
selector=css,
element_id=element_id,
)
raise MultipleElementsFound(num=num_elements, xpath=xpath, element_id=element_id)
raise MultipleElementsFound(num=num_elements, selector=css, element_id=element_id)
return SkyvernElement(locator, element)