Ykeremy/click instead of download (#275)

This commit is contained in:
Kerem Yilmaz
2024-05-07 23:54:07 -07:00
committed by GitHub
parent 2788b53a0c
commit cc91c1b2b6
5 changed files with 42 additions and 12 deletions

View File

@@ -12,11 +12,10 @@ Reply in JSON format with the following keys:
[{
"reasoning": str, // The reasoning behind the action. Be specific, referencing any user information and their fields and element ids in your reasoning. Mention why you chose the action type, and why you chose the element id. Keep the reasoning short and to the point.
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "DOWNLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "DOWNLOAD_FILE" is an element you'd like to download a file from, and the file will be saved with the name provided in the "file_name" field. You can download multiple files in one action by returning multiple "DOWNLOAD_FILE" actions in one step. The "file_name" field should be unique for each file download action. Do not download the same file multiple times, check action history to see if the file has already been downloaded. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
"action_type": str, // It's a string enum: "CLICK", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE". "CLICK" is an element you'd like to click. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the user goal has been achieved AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the user goal is achieved. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.
"id": int, // The id of the element to take action on. The id has to be one from the elements list
"text": str, // Text for INPUT_TEXT action only
"file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
"file_name": str, // The name of the file to save the downloaded file as. This field must be present for DOWNLOAD_FILE only. It should be null otherwise.
"option": { // The option to select for SELECT_OPTION action only. null if not SELECT_OPTION action
"label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE
"index": int, // the id corresponding to the optionIndex under the select element.

View File

@@ -7,7 +7,7 @@ from urllib.parse import urlparse
import aiohttp
import structlog
from skyvern.constants import SKYVERN_DIR
from skyvern.constants import REPO_ROOT_DIR
from skyvern.exceptions import DownloadFileMaxSizeExceeded
LOG = structlog.get_logger()
@@ -67,4 +67,4 @@ def zip_files(files_path: str, zip_file_path: str) -> str:
def get_path_for_workflow_download_directory(workflow_run_id: str) -> Path:
return Path(f"{SKYVERN_DIR}/downloads/{workflow_run_id}/")
return Path(f"{REPO_ROOT_DIR}/downloads/{workflow_run_id}/")

View File

@@ -3,6 +3,7 @@ from enum import StrEnum
from typing import Any, Dict, List
import structlog
from deprecation import deprecated
from pydantic import BaseModel, Field
from skyvern.forge.sdk.schemas.tasks import Task
@@ -14,7 +15,10 @@ class ActionType(StrEnum):
CLICK = "click"
INPUT_TEXT = "input_text"
UPLOAD_FILE = "upload_file"
# This action is not used in the current implementation. Click actions are used instead.
DOWNLOAD_FILE = "download_file"
SELECT_OPTION = "select_option"
CHECKBOX = "checkbox"
WAIT = "wait"
@@ -70,6 +74,7 @@ class UploadFileAction(WebAction):
return f"UploadFileAction(element_id={self.element_id}, file={self.file_url}, is_upload_file_tag={self.is_upload_file_tag})"
@deprecated("This action is not used in the current implementation. Click actions are used instead.")
class DownloadFileAction(WebAction):
action_type: ActionType = ActionType.DOWNLOAD_FILE
file_name: str
@@ -158,6 +163,7 @@ def parse_actions(task: Task, json_response: List[Dict[str, Any]]) -> List[Actio
# TODO: see if the element is a file input element. if it's not, convert this action into a click action
actions.append(UploadFileAction(element_id=element_id, file_url=action["file_url"], reasoning=reasoning))
# This action is not used in the current implementation. Click actions are used instead.
elif action_type == ActionType.DOWNLOAD_FILE:
actions.append(
DownloadFileAction(element_id=element_id, file_name=action["file_name"], reasoning=reasoning)
@@ -214,7 +220,8 @@ ActionTypeUnion = (
ClickAction
| InputTextAction
| UploadFileAction
| DownloadFileAction
# Deprecated
# | DownloadFileAction
| SelectOptionAction
| CheckboxAction
| WaitAction

View File

@@ -6,9 +6,10 @@ import uuid
from typing import Any, Awaitable, Callable, List
import structlog
from deprecation import deprecated
from playwright.async_api import Locator, Page
from skyvern.constants import SKYVERN_DIR
from skyvern.constants import REPO_ROOT_DIR
from skyvern.exceptions import ImaginaryFileUrl, MissingElement, MissingFileUrl, MultipleElementsFound
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
@@ -205,12 +206,13 @@ async def handle_upload_file_action(
)
@deprecated("This function is deprecated. Downloads are handled by the click action handler now.")
async def handle_download_file_action(
action: actions.DownloadFileAction, page: Page, scraped_page: ScrapedPage, task: Task, step: Step
) -> list[ActionResult]:
xpath = await validate_actions_in_dom(action, page, scraped_page)
file_name = f"{action.file_name or uuid.uuid4()}"
full_file_path = f"{SKYVERN_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
full_file_path = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}/{file_name}"
try:
# Start waiting for the download
async with page.expect_download() as download_info:
@@ -222,7 +224,7 @@ async def handle_download_file_action(
download = await download_info.value
# Create download folders if they don't exist
download_folder = f"{SKYVERN_DIR}/downloads/{task.workflow_run_id or task.task_id}"
download_folder = f"{REPO_ROOT_DIR}/downloads/{task.workflow_run_id or task.task_id}"
os.makedirs(download_folder, exist_ok=True)
# Wait for the download process to complete and save the downloaded file
await download.save_as(full_file_path)
@@ -452,7 +454,7 @@ ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captch
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
ActionHandler.register_action_type(ActionType.UPLOAD_FILE, handle_upload_file_action)
ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
# ActionHandler.register_action_type(ActionType.DOWNLOAD_FILE, handle_download_file_action)
ActionHandler.register_action_type(ActionType.NULL_ACTION, handle_null_action)
ActionHandler.register_action_type(ActionType.SELECT_OPTION, handle_select_option_action)
ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
@@ -525,8 +527,18 @@ async def chain_click(
fc_func = lambda fc: fc.set_files(files=file)
page.on("filechooser", fc_func)
LOG.info("Registered file chooser listener", action=action, path=file)
# If the click triggers a download, we need to let the LLM know via action_results
download_triggered = False
def download_func(download: Any) -> None:
nonlocal download_triggered
download_triggered = True
page.on("download", download_func)
LOG.info("Registered download listener", action=action)
"""
Clicks on an element identified by the xpath and its parent if failed.
:param xpath: xpath of the element to click
@@ -535,12 +547,15 @@ async def chain_click(
try:
await page.click(f"xpath={xpath}", timeout=timeout)
LOG.info("Chain click: main element click succeeded", action=action, xpath=xpath)
return [ActionSuccess(javascript_triggered=javascript_triggered)]
return [ActionSuccess(javascript_triggered=javascript_triggered, download_triggered=download_triggered)]
except Exception as e:
action_results: list[ActionResult] = [ActionFailure(e, javascript_triggered=javascript_triggered)]
action_results: list[ActionResult] = [
ActionFailure(e, javascript_triggered=javascript_triggered, download_triggered=download_triggered)
]
if await is_input_element(page.locator(xpath)):
LOG.info("Chain click: it's an input element. going to try sibling click", action=action, xpath=xpath)
sibling_action_result = await click_sibling_of_input(page.locator(xpath), timeout=timeout)
sibling_action_result.download_triggered = download_triggered
action_results.append(sibling_action_result)
if type(sibling_action_result) == ActionSuccess:
return action_results
@@ -556,6 +571,7 @@ async def chain_click(
ActionSuccess(
javascript_triggered=javascript_triggered,
interacted_with_parent=True,
download_triggered=download_triggered,
)
)
except Exception as pe:
@@ -575,6 +591,7 @@ async def chain_click(
if file:
await asyncio.sleep(10)
page.remove_listener("filechooser", fc_func)
page.remove_listener("download", download_func)
def get_anchor_to_click(scraped_page: ScrapedPage, element_id: int) -> str | None:

View File

@@ -13,6 +13,7 @@ class ActionResult(BaseModel):
step_retry_number: int | None = None
step_order: int | None = None
javascript_triggered: bool = False
download_triggered: bool | None = None
# None is used for old data so that we can tell it apart from new data, which only ever holds a boolean
interacted_with_sibling: bool | None = None
interacted_with_parent: bool | None = None
@@ -32,6 +33,7 @@ class ActionSuccess(ActionResult):
self,
data: dict[str, Any] | list | str | None = None,
javascript_triggered: bool = False,
download_triggered: bool | None = None,
interacted_with_sibling: bool = False,
interacted_with_parent: bool = False,
):
@@ -39,6 +41,7 @@ class ActionSuccess(ActionResult):
success=True,
data=data,
javascript_triggered=javascript_triggered,
download_triggered=download_triggered,
interacted_with_sibling=interacted_with_sibling,
interacted_with_parent=interacted_with_parent,
)
@@ -49,6 +52,7 @@ class ActionFailure(ActionResult):
self,
exception: Exception,
javascript_triggered: bool = False,
download_triggered: bool | None = None,
interacted_with_sibling: bool = False,
interacted_with_parent: bool = False,
):
@@ -57,6 +61,7 @@ class ActionFailure(ActionResult):
exception_type=type(exception).__name__,
exception_message=remove_whitespace(str(exception)),
javascript_triggered=javascript_triggered,
download_triggered=download_triggered,
interacted_with_sibling=interacted_with_sibling,
interacted_with_parent=interacted_with_parent,
)
@@ -68,12 +73,14 @@ class ActionAbort(ActionResult):
def __init__(
self,
javascript_triggered: bool = False,
download_triggered: bool | None = None,
interacted_with_sibling: bool = False,
interacted_with_parent: bool = False,
):
super().__init__(
success=True,
javascript_triggered=javascript_triggered,
download_triggered=download_triggered,
interacted_with_sibling=interacted_with_sibling,
interacted_with_parent=interacted_with_parent,
)