integration with CUA (#2126)

This commit is contained in:
Shuchang Zheng
2025-04-11 11:18:53 -07:00
committed by GitHub
parent 2ac65c4a9b
commit f883b91180
13 changed files with 420 additions and 53 deletions

View File

@@ -28,6 +28,9 @@ class ActionType(StrEnum):
RELOAD_PAGE = "reload_page"
EXTRACT = "extract"
SCROLL = "scroll"
KEYPRESS = "keypress"
TYPE = "type"
def is_web_action(self) -> bool:
return self in [
@@ -177,6 +180,9 @@ class ClickAction(WebAction):
action_type: ActionType = ActionType.CLICK
file_url: str | None = None
download: bool = False
x: int | None = None
y: int | None = None
button: str = "left"
def __repr__(self) -> str:
    """Return a debug representation of the click.

    Includes the coordinate fields (``x``, ``y``) and ``button`` alongside the
    element-based fields, because a coordinate-based click is created with an
    empty ``element_id`` and would otherwise print with no way to tell what
    was clicked.
    """
    return (
        f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download}, "
        f"x={self.x}, y={self.y}, button={self.button})"
    )
@@ -240,6 +246,7 @@ class CheckboxAction(WebAction):
# Action that pauses execution for a fixed amount of time.
class WaitAction(Action):
    action_type: ActionType = ActionType.WAIT
    # Number of seconds the wait handler sleeps for; defaults to 20.
    seconds: int = 20
class TerminateAction(DecisiveAction):
@@ -258,6 +265,19 @@ class ExtractAction(Action):
data_extraction_schema: dict[str, Any] | None = None
# Action that scrolls the page after positioning the mouse pointer.
class ScrollAction(Action):
    action_type: ActionType = ActionType.SCROLL
    # Viewport coordinates the scroll handler moves the mouse to before scrolling.
    x: int
    y: int
    # Horizontal / vertical amounts the scroll handler passes to window.scrollBy
    # (presumably CSS pixels -- confirm against the producer of these values).
    scroll_x: int
    scroll_y: int
# Action that presses one or more keyboard keys.
class KeypressAction(Action):
    action_type: ActionType = ActionType.KEYPRESS
    # Key names to press; the keypress handler iterates them in list order.
    # NOTE(review): mutable default -- presumably safe because Action is a
    # pydantic model (defaults are copied per instance); confirm.
    keys: list[str] = []
class ScrapeResult(BaseModel):
"""
Scraped response from a webpage, including:

View File

@@ -77,6 +77,7 @@ from skyvern.webeye.actions.actions import (
CheckboxAction,
ClickAction,
InputOrSelectContext,
InputTextAction,
ScrapeResult,
SelectOption,
SelectOptionAction,
@@ -392,6 +393,12 @@ def check_for_invalid_web_action(
task: Task,
step: Step,
) -> list[ActionResult]:
if isinstance(action, ClickAction) and action.x is not None and action.y is not None:
return []
if isinstance(action, InputTextAction) and not action.element_id:
return []
if isinstance(action, WebAction) and action.element_id not in scraped_page.id_to_element_dict:
return [ActionFailure(MissingElement(element_id=action.element_id), stop_execution_on_failure=False)]
@@ -420,6 +427,36 @@ async def handle_click_action(
task: Task,
step: Step,
) -> list[ActionResult]:
if action.x is not None and action.y is not None:
# Find the element at the clicked location using JavaScript evaluation
element_id = await page.evaluate(
"""data => {
const element = document.elementFromPoint(data.x, data.y);
if (!element) return null;
// Function to get the unique_id attribute of an element
function getElementUniqueId(element) {
if (element && element.nodeType === 1) {
// Check if the element has the unique_id attribute
if (element.hasAttribute('unique_id')) {
return element.getAttribute('unique_id');
}
// If no unique_id attribute is found, return null
return null;
}
return null;
}
return getElementUniqueId(element);
}""",
{"x": action.x, "y": action.y},
)
LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button)
await page.mouse.click(x=action.x, y=action.y, button=action.button)
return [ActionSuccess()]
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
await asyncio.sleep(0.3)
@@ -591,6 +628,11 @@ async def handle_input_text_action(
task: Task,
step: Step,
) -> list[ActionResult]:
if not action.element_id:
# This is a CUA type action
await page.keyboard.type(action.text)
return [ActionSuccess()]
dom = DomUtil(scraped_page, page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
@@ -1348,7 +1390,7 @@ async def handle_wait_action(
task: Task,
step: Step,
) -> list[ActionResult]:
await asyncio.sleep(20)
await asyncio.sleep(action.seconds)
return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
@@ -1422,6 +1464,35 @@ async def handle_extract_action(
return [ActionFailure(exception=Exception("No data extraction goal"))]
async def handle_scroll_action(
    action: actions.ScrollAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Move the mouse to the action's coordinates, then scroll the window.

    Args:
        action: Scroll request; ``x``/``y`` give the pointer position and
            ``scroll_x``/``scroll_y`` the amounts handed to ``window.scrollBy``.
        page: Page to operate on.
        scraped_page: Unused; present to match the action-handler signature.
        task: Unused; present to match the action-handler signature.
        step: Unused; present to match the action-handler signature.

    Returns:
        A single-element list containing ``ActionSuccess``.
    """
    await page.mouse.move(action.x, action.y)
    # Pass the deltas as an evaluate() argument instead of interpolating them
    # into the script source, so the JavaScript text is constant and the values
    # are serialized by the browser driver rather than by string formatting.
    await page.evaluate(
        "([dx, dy]) => window.scrollBy(dx, dy)",
        [action.scroll_x, action.scroll_y],
    )
    return [ActionSuccess()]
# Lower-cased key names as they may arrive from the model -> the key names the
# browser keyboard API accepts. Every target value maps to itself when looked
# up again, so keys that are already well-formed pass through unchanged.
_CUA_KEY_TO_PLAYWRIGHT_KEY: dict[str, str] = {
    "enter": "Enter",
    "return": "Enter",
    "space": " ",
    "tab": "Tab",
    "esc": "Escape",
    "escape": "Escape",
    "backspace": "Backspace",
    "delete": "Delete",
    "del": "Delete",
    "insert": "Insert",
    "home": "Home",
    "end": "End",
    "pageup": "PageUp",
    "pagedown": "PageDown",
    "up": "ArrowUp",
    "down": "ArrowDown",
    "left": "ArrowLeft",
    "right": "ArrowRight",
    "arrowup": "ArrowUp",
    "arrowdown": "ArrowDown",
    "arrowleft": "ArrowLeft",
    "arrowright": "ArrowRight",
    "ctrl": "Control",
    "control": "Control",
    "alt": "Alt",
    "option": "Alt",
    "shift": "Shift",
    "cmd": "Meta",
    "command": "Meta",
    "meta": "Meta",
    "win": "Meta",
    "super": "Meta",
}


async def handle_keypress_action(
    action: actions.KeypressAction,
    page: Page,
    scraped_page: ScrapedPage,
    task: Task,
    step: Step,
) -> list[ActionResult]:
    """Press each key of the action, one after another, on the page keyboard.

    Key names are matched case-insensitively against a table of common
    aliases (e.g. "ENTER", "ctrl", "esc") and translated to the name the
    keyboard API expects. Names not found in the table are forwarded exactly
    as received, which keeps single characters and already-valid names working.

    Args:
        action: Keypress request carrying the list of key names.
        page: Page whose keyboard receives the key presses.
        scraped_page: Unused; present to match the action-handler signature.
        task: Unused; present to match the action-handler signature.
        step: Unused; present to match the action-handler signature.

    Returns:
        A single-element list containing ``ActionSuccess``.
    """
    for key in action.keys:
        # Fall back to the raw key so unknown names behave as they did before.
        await page.keyboard.press(_CUA_KEY_TO_PLAYWRIGHT_KEY.get(key.lower(), key))
    return [ActionSuccess()]
ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
@@ -1433,6 +1504,8 @@ ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action)
ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
from typing import Any
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import BaseModel
from skyvern.config import settings
@@ -40,6 +41,7 @@ class DetailedAgentStepOutput(BaseModel):
action_results: list[ActionResult] | None
actions_and_results: list[tuple[Action, list[ActionResult]]] | None
step_exception: str | None = None
cua_response: OpenAIResponse | None = None
class Config:
exclude = ["scraped_page", "extract_action_prompt"]
@@ -72,6 +74,7 @@ class DetailedAgentStepOutput(BaseModel):
if self.actions_and_results is None
else [(action, result) for action, result in self.actions_and_results if result],
step_exception=self.step_exception,
cua_response=self.cua_response,
)
def to_agent_step_output(self) -> AgentStepOutput:

View File

@@ -1,9 +1,11 @@
from typing import Any, Dict
import structlog
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import ValidationError
from skyvern.exceptions import UnsupportedActionType
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.tasks import Task
from skyvern.webeye.actions.actions import (
Action,
@@ -13,7 +15,9 @@ from skyvern.webeye.actions.actions import (
CompleteAction,
DownloadFileAction,
InputTextAction,
KeypressAction,
NullAction,
ScrollAction,
SelectOption,
SelectOptionAction,
SolveCaptchaAction,
@@ -194,3 +198,104 @@ def parse_actions(
)
############################ This part of code might not be needed ############################
return actions
def parse_cua_actions(
task: Task,
step: Step,
response: OpenAIResponse,
) -> list[Action]:
computer_calls = [item for item in response.output if item.type == "computer_call"]
reasonings = [item for item in response.output if item.type == "reasoning"]
actions: list[Action] = []
for idx, computer_call in enumerate(computer_calls):
cua_action = computer_call.action
action_type = cua_action.type
try:
reasoning = None
if idx < len(reasonings):
try:
reasoning = reasonings[idx].summary[0].text
except Exception:
LOG.exception(
"Failed to parse reasoning",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
)
match action_type:
case "click":
button = cua_action.button
if button != "left" and button != "right":
button = "left"
reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})"
action = ClickAction(
element_id="",
x=cua_action.x,
y=cua_action.y,
button=button,
reasoning=reasoning,
intention=reasoning,
response=f"Click at: ({cua_action.x}, {cua_action.y})",
)
case "scroll":
reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})"
action = ScrollAction(
element_id="",
x=cua_action.x,
y=cua_action.y,
scroll_x=cua_action.scroll_x,
scroll_y=cua_action.scroll_y,
reasoning=reasoning,
intention=reasoning,
response=f"Scroll by: ({cua_action.x}, {cua_action.y})",
)
case "keypress":
reasoning_str = f"Press keys: {cua_action.keys}"
if len(cua_action.keys) == 1:
reasoning_str = f"Press the '{cua_action.keys[0]}' key"
reasoning = reasoning or reasoning_str
action = KeypressAction(
element_id="",
keys=cua_action.keys,
reasoning=reasoning,
intention=reasoning,
response=str(cua_action.keys),
)
case "type":
action = InputTextAction(
element_id="",
text=cua_action.text,
reasoning=reasoning,
intention=reasoning,
response=cua_action.text,
)
case "wait":
action = WaitAction(
seconds=5,
reasoning=reasoning,
intention=reasoning,
)
case _:
raise ValueError(f"Unsupported action type: {action_type}")
action.organization_id = task.organization_id
action.workflow_run_id = task.workflow_run_id
action.task_id = task.task_id
action.step_id = step.step_id
action.step_order = step.order
action.action_order = idx
actions.append(action)
except Exception:
LOG.exception(
"Failed to parse action",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
)
break
if not actions:
return [CompleteAction(reasoning="No actions generated", verified=True)]
return actions

View File

@@ -326,13 +326,14 @@ class ScrapedPage(BaseModel):
element["children"] = new_children
return element
async def refresh(self, draw_boxes: bool = True) -> Self:
async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self:
refreshed_page = await scrape_website(
browser_state=self._browser_state,
url=self.url,
cleanup_element_tree=self._clean_up_func,
scrape_exclude=self._scrape_exclude,
draw_boxes=draw_boxes,
scroll=scroll,
)
self.elements = refreshed_page.elements
self.id_to_css_dict = refreshed_page.id_to_css_dict
@@ -366,6 +367,8 @@ async def scrape_website(
scrape_exclude: ScrapeExcludeFunc | None = None,
take_screenshots: bool = True,
draw_boxes: bool = True,
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
) -> ScrapedPage:
"""
************************************************************************************************
@@ -397,6 +400,8 @@ async def scrape_website(
scrape_exclude=scrape_exclude,
take_screenshots=take_screenshots,
draw_boxes=draw_boxes,
max_screenshot_number=max_screenshot_number,
scroll=scroll,
)
except Exception as e:
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
@@ -420,6 +425,8 @@ async def scrape_website(
scrape_exclude=scrape_exclude,
take_screenshots=take_screenshots,
draw_boxes=draw_boxes,
max_screenshot_number=max_screenshot_number,
scroll=scroll,
)
@@ -469,6 +476,8 @@ async def scrape_web_unsafe(
scrape_exclude: ScrapeExcludeFunc | None = None,
take_screenshots: bool = True,
draw_boxes: bool = True,
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -503,7 +512,6 @@ async def scrape_web_unsafe(
json_to_html(element, need_skyvern_attrs=False) for element in element_tree_trimmed
)
token_count = count_tokens(element_tree_trimmed_html_str)
max_screenshot_number = settings.MAX_NUM_SCREENSHOTS
if token_count > DEFAULT_MAX_TOKENS:
max_screenshot_number = min(max_screenshot_number, 1)
@@ -512,6 +520,7 @@ async def scrape_web_unsafe(
url=url,
draw_boxes=draw_boxes,
max_number=max_screenshot_number,
scroll=scroll,
)
id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids = build_element_dict(
elements

View File

@@ -98,8 +98,11 @@ class SkyvernFrame:
url: str,
draw_boxes: bool = False,
max_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
) -> List[bytes]:
skyvern_page = await SkyvernFrame.create_instance(frame=page)
if not scroll:
return [await SkyvernFrame.take_screenshot(page=skyvern_page.frame, full_page=False)]
# page is the main frame and the index must be 0
assert isinstance(skyvern_page.frame, Page)