integration with CUA (#2126)
This commit is contained in:
@@ -28,6 +28,9 @@ class ActionType(StrEnum):
|
||||
RELOAD_PAGE = "reload_page"
|
||||
|
||||
EXTRACT = "extract"
|
||||
SCROLL = "scroll"
|
||||
KEYPRESS = "keypress"
|
||||
TYPE = "type"
|
||||
|
||||
def is_web_action(self) -> bool:
|
||||
return self in [
|
||||
@@ -177,6 +180,9 @@ class ClickAction(WebAction):
|
||||
action_type: ActionType = ActionType.CLICK
|
||||
file_url: str | None = None
|
||||
download: bool = False
|
||||
x: int | None = None
|
||||
y: int | None = None
|
||||
button: str = "left"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download})"
|
||||
@@ -240,6 +246,7 @@ class CheckboxAction(WebAction):
|
||||
|
||||
class WaitAction(Action):
    """Action that asks the agent to wait before continuing."""

    action_type: ActionType = ActionType.WAIT
    # Length of the wait, in seconds.
    seconds: int = 20
|
||||
|
||||
|
||||
class TerminateAction(DecisiveAction):
|
||||
@@ -258,6 +265,19 @@ class ExtractAction(Action):
|
||||
data_extraction_schema: dict[str, Any] | None = None
|
||||
|
||||
|
||||
class ScrollAction(Action):
    """Action describing a scroll performed at a given pointer position."""

    action_type: ActionType = ActionType.SCROLL
    # Pointer position at which the scroll is issued.
    x: int
    y: int
    # Horizontal and vertical scroll offsets.
    scroll_x: int
    scroll_y: int
|
||||
|
||||
|
||||
class KeypressAction(Action):
    """Action describing one or more keyboard keys to press."""

    action_type: ActionType = ActionType.KEYPRESS
    # Names of the keys to press, in the order they were requested.
    keys: list[str] = []
|
||||
|
||||
|
||||
class ScrapeResult(BaseModel):
|
||||
"""
|
||||
Scraped response from a webpage, including:
|
||||
|
||||
@@ -77,6 +77,7 @@ from skyvern.webeye.actions.actions import (
|
||||
CheckboxAction,
|
||||
ClickAction,
|
||||
InputOrSelectContext,
|
||||
InputTextAction,
|
||||
ScrapeResult,
|
||||
SelectOption,
|
||||
SelectOptionAction,
|
||||
@@ -392,6 +393,12 @@ def check_for_invalid_web_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
if isinstance(action, ClickAction) and action.x is not None and action.y is not None:
|
||||
return []
|
||||
|
||||
if isinstance(action, InputTextAction) and not action.element_id:
|
||||
return []
|
||||
|
||||
if isinstance(action, WebAction) and action.element_id not in scraped_page.id_to_element_dict:
|
||||
return [ActionFailure(MissingElement(element_id=action.element_id), stop_execution_on_failure=False)]
|
||||
|
||||
@@ -420,6 +427,36 @@ async def handle_click_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
if action.x is not None and action.y is not None:
|
||||
# Find the element at the clicked location using JavaScript evaluation
|
||||
element_id = await page.evaluate(
|
||||
"""data => {
|
||||
const element = document.elementFromPoint(data.x, data.y);
|
||||
if (!element) return null;
|
||||
|
||||
// Function to get the unique_id attribute of an element
|
||||
function getElementUniqueId(element) {
|
||||
if (element && element.nodeType === 1) {
|
||||
// Check if the element has the unique_id attribute
|
||||
if (element.hasAttribute('unique_id')) {
|
||||
return element.getAttribute('unique_id');
|
||||
}
|
||||
|
||||
// If no unique_id attribute is found, return null
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
return getElementUniqueId(element);
|
||||
}""",
|
||||
{"x": action.x, "y": action.y},
|
||||
)
|
||||
LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button)
|
||||
|
||||
await page.mouse.click(x=action.x, y=action.y, button=action.button)
|
||||
return [ActionSuccess()]
|
||||
|
||||
dom = DomUtil(scraped_page=scraped_page, page=page)
|
||||
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
|
||||
await asyncio.sleep(0.3)
|
||||
@@ -591,6 +628,11 @@ async def handle_input_text_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
if not action.element_id:
|
||||
# This is a CUA type action
|
||||
await page.keyboard.type(action.text)
|
||||
return [ActionSuccess()]
|
||||
|
||||
dom = DomUtil(scraped_page, page)
|
||||
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
|
||||
skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
|
||||
@@ -1348,7 +1390,7 @@ async def handle_wait_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
await asyncio.sleep(20)
|
||||
await asyncio.sleep(action.seconds)
|
||||
return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
|
||||
|
||||
|
||||
@@ -1422,6 +1464,35 @@ async def handle_extract_action(
|
||||
return [ActionFailure(exception=Exception("No data extraction goal"))]
|
||||
|
||||
|
||||
async def handle_scroll_action(
    action: actions.ScrollAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Move the pointer to the action's position and scroll the window.

    Args:
        action: Scroll request carrying the pointer position (``x``, ``y``)
            and the scroll offsets (``scroll_x``, ``scroll_y``).
        page: Page whose mouse is moved and whose window is scrolled.
        scraped_page: Unused here; part of the common handler signature.
        task: Unused here; part of the common handler signature.
        step: Unused here; part of the common handler signature.

    Returns:
        A single-element list holding an ``ActionSuccess``.
    """
    pointer_x, pointer_y = action.x, action.y
    await page.mouse.move(pointer_x, pointer_y)

    # The offsets are interpolated straight into the script that is evaluated in the page.
    scroll_script = f"window.scrollBy({action.scroll_x}, {action.scroll_y})"
    await page.evaluate(scroll_script)

    return [ActionSuccess()]
|
||||
|
||||
|
||||
# CUA key names (lower-cased) mapped to the key names Playwright's keyboard API expects.
# Names that are not listed are forwarded to Playwright unchanged.
_CUA_KEY_TO_PLAYWRIGHT_KEY: dict[str, str] = {
    "alt": "Alt",
    "option": "Alt",
    "ctrl": "Control",
    "control": "Control",
    "cmd": "Meta",
    "command": "Meta",
    "meta": "Meta",
    "super": "Meta",
    "win": "Meta",
    "shift": "Shift",
    "enter": "Enter",
    "return": "Enter",
    "esc": "Escape",
    "escape": "Escape",
    "space": " ",
    "tab": "Tab",
    "backspace": "Backspace",
    "delete": "Delete",
    "del": "Delete",
    "insert": "Insert",
    "home": "Home",
    "end": "End",
    "pageup": "PageUp",
    "pagedown": "PageDown",
    "arrowup": "ArrowUp",
    "arrowdown": "ArrowDown",
    "arrowleft": "ArrowLeft",
    "arrowright": "ArrowRight",
    "capslock": "CapsLock",
}


async def handle_keypress_action(
    action: actions.KeypressAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Press the requested keys on the page's keyboard as one combination.

    Key names are normalised case-insensitively to Playwright key names
    (e.g. ``CTRL`` -> ``Control``, ``space`` -> ``" "``). The keys are then
    held down in order and released in reverse order, so ``["CTRL", "A"]``
    acts as a chord while a single key behaves like a plain key press.

    Args:
        action: Keypress request; ``action.keys`` lists the keys to press.
        page: Page whose keyboard receives the key events.
        scraped_page: Unused here; part of the common handler signature.
        task: Unused here; part of the common handler signature.
        step: Unused here; part of the common handler signature.

    Returns:
        A single-element list holding an ``ActionSuccess``.
    """
    playwright_keys = [_CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key) for key in action.keys]

    held_keys: list[str] = []
    try:
        for key in playwright_keys:
            await page.keyboard.down(key)
            held_keys.append(key)
    finally:
        # Always release what was pressed so a failure mid-combination
        # cannot leave a modifier stuck down for later actions.
        for key in reversed(held_keys):
            await page.keyboard.up(key)

    return [ActionSuccess()]
|
||||
|
||||
|
||||
ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
|
||||
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
|
||||
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
|
||||
@@ -1433,6 +1504,8 @@ ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
|
||||
ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
|
||||
ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
|
||||
ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
|
||||
ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action)
|
||||
ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
|
||||
|
||||
|
||||
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from openai.types.responses.response import Response as OpenAIResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.config import settings
|
||||
@@ -40,6 +41,7 @@ class DetailedAgentStepOutput(BaseModel):
|
||||
action_results: list[ActionResult] | None
|
||||
actions_and_results: list[tuple[Action, list[ActionResult]]] | None
|
||||
step_exception: str | None = None
|
||||
cua_response: OpenAIResponse | None = None
|
||||
|
||||
class Config:
|
||||
exclude = ["scraped_page", "extract_action_prompt"]
|
||||
@@ -72,6 +74,7 @@ class DetailedAgentStepOutput(BaseModel):
|
||||
if self.actions_and_results is None
|
||||
else [(action, result) for action, result in self.actions_and_results if result],
|
||||
step_exception=self.step_exception,
|
||||
cua_response=self.cua_response,
|
||||
)
|
||||
|
||||
def to_agent_step_output(self) -> AgentStepOutput:
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
import structlog
|
||||
from openai.types.responses.response import Response as OpenAIResponse
|
||||
from pydantic import ValidationError
|
||||
|
||||
from skyvern.exceptions import UnsupportedActionType
|
||||
from skyvern.forge.sdk.models import Step
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
from skyvern.webeye.actions.actions import (
|
||||
Action,
|
||||
@@ -13,7 +15,9 @@ from skyvern.webeye.actions.actions import (
|
||||
CompleteAction,
|
||||
DownloadFileAction,
|
||||
InputTextAction,
|
||||
KeypressAction,
|
||||
NullAction,
|
||||
ScrollAction,
|
||||
SelectOption,
|
||||
SelectOptionAction,
|
||||
SolveCaptchaAction,
|
||||
@@ -194,3 +198,104 @@ def parse_actions(
|
||||
)
|
||||
############################ This part of code might not be needed ############################
|
||||
return actions
|
||||
|
||||
|
||||
def parse_cua_actions(
|
||||
task: Task,
|
||||
step: Step,
|
||||
response: OpenAIResponse,
|
||||
) -> list[Action]:
|
||||
computer_calls = [item for item in response.output if item.type == "computer_call"]
|
||||
reasonings = [item for item in response.output if item.type == "reasoning"]
|
||||
actions: list[Action] = []
|
||||
for idx, computer_call in enumerate(computer_calls):
|
||||
cua_action = computer_call.action
|
||||
action_type = cua_action.type
|
||||
try:
|
||||
reasoning = None
|
||||
if idx < len(reasonings):
|
||||
try:
|
||||
reasoning = reasonings[idx].summary[0].text
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Failed to parse reasoning",
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
step_order=step.order,
|
||||
action_order=idx,
|
||||
)
|
||||
|
||||
match action_type:
|
||||
case "click":
|
||||
button = cua_action.button
|
||||
if button != "left" and button != "right":
|
||||
button = "left"
|
||||
reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})"
|
||||
action = ClickAction(
|
||||
element_id="",
|
||||
x=cua_action.x,
|
||||
y=cua_action.y,
|
||||
button=button,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=f"Click at: ({cua_action.x}, {cua_action.y})",
|
||||
)
|
||||
case "scroll":
|
||||
reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})"
|
||||
action = ScrollAction(
|
||||
element_id="",
|
||||
x=cua_action.x,
|
||||
y=cua_action.y,
|
||||
scroll_x=cua_action.scroll_x,
|
||||
scroll_y=cua_action.scroll_y,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=f"Scroll by: ({cua_action.x}, {cua_action.y})",
|
||||
)
|
||||
case "keypress":
|
||||
reasoning_str = f"Press keys: {cua_action.keys}"
|
||||
if len(cua_action.keys) == 1:
|
||||
reasoning_str = f"Press the '{cua_action.keys[0]}' key"
|
||||
reasoning = reasoning or reasoning_str
|
||||
action = KeypressAction(
|
||||
element_id="",
|
||||
keys=cua_action.keys,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=str(cua_action.keys),
|
||||
)
|
||||
case "type":
|
||||
action = InputTextAction(
|
||||
element_id="",
|
||||
text=cua_action.text,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=cua_action.text,
|
||||
)
|
||||
case "wait":
|
||||
action = WaitAction(
|
||||
seconds=5,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(f"Unsupported action type: {action_type}")
|
||||
action.organization_id = task.organization_id
|
||||
action.workflow_run_id = task.workflow_run_id
|
||||
action.task_id = task.task_id
|
||||
action.step_id = step.step_id
|
||||
action.step_order = step.order
|
||||
action.action_order = idx
|
||||
actions.append(action)
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Failed to parse action",
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
step_order=step.order,
|
||||
action_order=idx,
|
||||
)
|
||||
break
|
||||
if not actions:
|
||||
return [CompleteAction(reasoning="No actions generated", verified=True)]
|
||||
return actions
|
||||
|
||||
@@ -326,13 +326,14 @@ class ScrapedPage(BaseModel):
|
||||
element["children"] = new_children
|
||||
return element
|
||||
|
||||
async def refresh(self, draw_boxes: bool = True) -> Self:
|
||||
async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self:
|
||||
refreshed_page = await scrape_website(
|
||||
browser_state=self._browser_state,
|
||||
url=self.url,
|
||||
cleanup_element_tree=self._clean_up_func,
|
||||
scrape_exclude=self._scrape_exclude,
|
||||
draw_boxes=draw_boxes,
|
||||
scroll=scroll,
|
||||
)
|
||||
self.elements = refreshed_page.elements
|
||||
self.id_to_css_dict = refreshed_page.id_to_css_dict
|
||||
@@ -366,6 +367,8 @@ async def scrape_website(
|
||||
scrape_exclude: ScrapeExcludeFunc | None = None,
|
||||
take_screenshots: bool = True,
|
||||
draw_boxes: bool = True,
|
||||
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||
scroll: bool = True,
|
||||
) -> ScrapedPage:
|
||||
"""
|
||||
************************************************************************************************
|
||||
@@ -397,6 +400,8 @@ async def scrape_website(
|
||||
scrape_exclude=scrape_exclude,
|
||||
take_screenshots=take_screenshots,
|
||||
draw_boxes=draw_boxes,
|
||||
max_screenshot_number=max_screenshot_number,
|
||||
scroll=scroll,
|
||||
)
|
||||
except Exception as e:
|
||||
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
|
||||
@@ -420,6 +425,8 @@ async def scrape_website(
|
||||
scrape_exclude=scrape_exclude,
|
||||
take_screenshots=take_screenshots,
|
||||
draw_boxes=draw_boxes,
|
||||
max_screenshot_number=max_screenshot_number,
|
||||
scroll=scroll,
|
||||
)
|
||||
|
||||
|
||||
@@ -469,6 +476,8 @@ async def scrape_web_unsafe(
|
||||
scrape_exclude: ScrapeExcludeFunc | None = None,
|
||||
take_screenshots: bool = True,
|
||||
draw_boxes: bool = True,
|
||||
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||
scroll: bool = True,
|
||||
) -> ScrapedPage:
|
||||
"""
|
||||
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
|
||||
@@ -503,7 +512,6 @@ async def scrape_web_unsafe(
|
||||
json_to_html(element, need_skyvern_attrs=False) for element in element_tree_trimmed
|
||||
)
|
||||
token_count = count_tokens(element_tree_trimmed_html_str)
|
||||
max_screenshot_number = settings.MAX_NUM_SCREENSHOTS
|
||||
if token_count > DEFAULT_MAX_TOKENS:
|
||||
max_screenshot_number = min(max_screenshot_number, 1)
|
||||
|
||||
@@ -512,6 +520,7 @@ async def scrape_web_unsafe(
|
||||
url=url,
|
||||
draw_boxes=draw_boxes,
|
||||
max_number=max_screenshot_number,
|
||||
scroll=scroll,
|
||||
)
|
||||
id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids = build_element_dict(
|
||||
elements
|
||||
|
||||
@@ -98,8 +98,11 @@ class SkyvernFrame:
|
||||
url: str,
|
||||
draw_boxes: bool = False,
|
||||
max_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||
scroll: bool = True,
|
||||
) -> List[bytes]:
|
||||
skyvern_page = await SkyvernFrame.create_instance(frame=page)
|
||||
if not scroll:
|
||||
return [await SkyvernFrame.take_screenshot(page=skyvern_page.frame, full_page=False)]
|
||||
|
||||
# page is the main frame and the index must be 0
|
||||
assert isinstance(skyvern_page.frame, Page)
|
||||
|
||||
Reference in New Issue
Block a user