diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 794e8911..c019163e 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -818,6 +818,11 @@ class ForgeAgent: artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP, data=json.dumps(scraped_page.id_to_xpath_dict, indent=2).encode(), ) + await app.ARTIFACT_MANAGER.create_artifact( + step=step, + artifact_type=ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP, + data=json.dumps(scraped_page.id_to_frame_dict, indent=2).encode(), + ) await app.ARTIFACT_MANAGER.create_artifact( step=step, artifact_type=ArtifactType.VISIBLE_ELEMENTS_TREE, diff --git a/skyvern/forge/sdk/artifact/models.py b/skyvern/forge/sdk/artifact/models.py index 52f50770..e22558b4 100644 --- a/skyvern/forge/sdk/artifact/models.py +++ b/skyvern/forge/sdk/artifact/models.py @@ -22,6 +22,7 @@ class ArtifactType(StrEnum): LLM_RESPONSE = "llm_response" LLM_RESPONSE_PARSED = "llm_response_parsed" VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map" + VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map" VISIBLE_ELEMENTS_TREE = "visible_elements_tree" VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed" VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt" diff --git a/skyvern/forge/sdk/artifact/storage/base.py b/skyvern/forge/sdk/artifact/storage/base.py index e8ca77b1..5858eb48 100644 --- a/skyvern/forge/sdk/artifact/storage/base.py +++ b/skyvern/forge/sdk/artifact/storage/base.py @@ -14,6 +14,7 @@ FILE_EXTENTSION_MAP: dict[ArtifactType, str] = { ArtifactType.LLM_RESPONSE: "json", ArtifactType.LLM_RESPONSE_PARSED: "json", ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json", + ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json", ArtifactType.VISIBLE_ELEMENTS_TREE: "json", ArtifactType.VISIBLE_ELEMENTS_TREE_TRIMMED: "json", ArtifactType.VISIBLE_ELEMENTS_TREE_IN_PROMPT: "txt", diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index a4689ea7..bd55d74e 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -6,10 +6,10 @@ from typing import Any, Awaitable, Callable, List import structlog from deprecation import deprecated -from playwright.async_api import Locator, Page +from playwright.async_api import FrameLocator, Locator, Page -from skyvern.constants import REPO_ROOT_DIR -from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound +from skyvern.constants import REPO_ROOT_DIR, SKYVERN_ID_ATTR +from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound, SkyvernException from skyvern.forge import app from skyvern.forge.prompts import prompt_engine from skyvern.forge.sdk.api.files import ( @@ -175,16 +175,18 @@ async def handle_click_action( num_downloaded_files_before=num_downloaded_files_before, download_dir=download_dir, ) - xpath = await validate_actions_in_dom(action, page, scraped_page) + xpath, frame = await validate_actions_in_dom(action, page, scraped_page) await asyncio.sleep(0.3) if action.download: results = await handle_click_to_download_file_action(action, page, scraped_page) else: results = await chain_click( task, + scraped_page, page, action, xpath, + frame, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS, ) @@ -208,10 +210,12 @@ async def handle_click_to_download_file_action( page: Page, scraped_page: ScrapedPage, ) -> list[ActionResult]: - xpath = await validate_actions_in_dom(action, page, scraped_page) + xpath, frame = await validate_actions_in_dom(action, page, scraped_page) + + locator = resolve_locator(scraped_page, page, frame, xpath) + try: - await page.click( - f"xpath={xpath}", + await locator.click( timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS, modifiers=["Alt"], ) @@ -229,8 +233,9 @@ async def handle_input_text_action( task: Task, step: Step, ) -> list[ActionResult]: - xpath = await validate_actions_in_dom(action, page, scraped_page) - locator = page.locator(f"xpath={xpath}") + xpath, frame = await validate_actions_in_dom(action, page, scraped_page) + + locator = resolve_locator(scraped_page, page, frame, xpath) current_text = await locator.input_value() if current_text == action.text: @@ -269,20 +274,28 @@ async def handle_upload_file_action( file_url=action.file_url, ) return [ActionFailure(ImaginaryFileUrl(action.file_url))] - xpath = await validate_actions_in_dom(action, page, scraped_page) + + xpath, frame = await validate_actions_in_dom(action, page, scraped_page) + file_path = await download_file(file_url) - locator = page.locator(f"xpath={xpath}") + + locator = resolve_locator(scraped_page, page, frame, xpath) + is_file_input = await is_file_input_element(locator) + if is_file_input: LOG.info("Taking UploadFileAction. Found file input tag", action=action) if file_path: - await page.locator(f"xpath={xpath}").set_input_files( + locator = resolve_locator(scraped_page, page, frame, xpath) + + await locator.set_input_files( file_path, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS, ) # Sleep for 10 seconds after uploading a file to let the page process it await asyncio.sleep(10) + return [ActionSuccess()] else: return [ActionFailure(Exception(f"Failed to download file from {action.file_url}"))] @@ -292,9 +305,11 @@ async def handle_upload_file_action( action.is_upload_file_tag = False return await chain_click( task, + scraped_page, page, action, xpath, + frame, timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS, ) @@ -307,15 +322,17 @@ async def handle_download_file_action( task: Task, step: Step, ) -> list[ActionResult]: - xpath = await validate_actions_in_dom(action, page, scraped_page) + xpath, frame = await validate_actions_in_dom(action, page, scraped_page) file_name = f"{action.file_name or uuid.uuid4()}" full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}" try: # Start waiting for the download async with page.expect_download() as download_info: await asyncio.sleep(0.3) - await page.click( - f"xpath={xpath}", + + locator = resolve_locator(scraped_page, page, frame, xpath) + + await locator.click( timeout=SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS, modifiers=["Alt"], ) @@ -355,9 +372,10 @@ async def handle_select_option_action( task: Task, step: Step, ) -> list[ActionResult]: - xpath = await validate_actions_in_dom(action, page, scraped_page) + xpath, frame = await validate_actions_in_dom(action, page, scraped_page) + + locator = resolve_locator(scraped_page, page, frame, xpath) - locator = page.locator(f"xpath={xpath}") tag_name = await get_tag_name_lowercase(locator) element_dict = scraped_page.id_to_element_dict[action.element_id] LOG.info( @@ -400,7 +418,7 @@ async def handle_select_option_action( child_anchor_xpath=child_anchor_xpath, ) click_action = ClickAction(element_id=action.element_id) - return await chain_click(task, page, click_action, child_anchor_xpath) + return await chain_click(task, scraped_page, page, click_action, child_anchor_xpath, frame) # handler the select action on