integration with CUA (#2126)

This commit is contained in:
Shuchang Zheng
2025-04-11 11:18:53 -07:00
committed by GitHub
parent 2ac65c4a9b
commit f883b91180
13 changed files with 420 additions and 53 deletions

View File

@@ -116,6 +116,7 @@ class Settings(BaseSettings):
ENABLE_AZURE_O3_MINI: bool = False
ENABLE_BEDROCK: bool = False
ENABLE_GEMINI: bool = False
ENABLE_AZURE_CUA: bool = False
# OPENAI
OPENAI_API_KEY: str | None = None
# ANTHROPIC
@@ -125,6 +126,10 @@ class Settings(BaseSettings):
AZURE_API_KEY: str | None = None
AZURE_API_BASE: str | None = None
AZURE_API_VERSION: str | None = None
AZURE_CUA_API_KEY: str | None = None
AZURE_CUA_ENDPOINT: str | None = None
AZURE_CUA_DEPLOYMENT: str | None = "computer-use-preview"
AZURE_CUA_API_VERSION: str | None = "2025-03-01-preview"
# AZURE GPT-4o mini
AZURE_GPT4O_MINI_DEPLOYMENT: str | None = None

View File

@@ -1,4 +1,5 @@
import asyncio
import base64
import json
import os
import random
@@ -10,6 +11,7 @@ from typing import Any, Tuple
import httpx
import structlog
from openai.types.responses.response import Response as OpenAIResponse
from playwright._impl._errors import TargetClosedError
from playwright.async_api import Page
@@ -68,6 +70,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest, TaskResponse, Tas
from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
from skyvern.schemas.runs import RunEngine, RunType
from skyvern.utils.prompt_engine import load_prompt_with_elements
from skyvern.webeye.actions.actions import (
Action,
@@ -85,7 +88,7 @@ from skyvern.webeye.actions.actions import (
from skyvern.webeye.actions.caching import retrieve_action_plan
from skyvern.webeye.actions.handler import ActionHandler, poll_verification_code
from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
from skyvern.webeye.actions.parse_actions import parse_actions
from skyvern.webeye.actions.parse_actions import parse_actions, parse_cua_actions
from skyvern.webeye.actions.responses import ActionResult, ActionSuccess
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
@@ -248,6 +251,8 @@ class ForgeAgent:
task_block: BaseTaskBlock | None = None,
browser_session_id: str | None = None,
complete_verification: bool = True,
engine: RunEngine = RunEngine.skyvern_v1,
cua_response: OpenAIResponse | None = None,
) -> Tuple[Step, DetailedAgentStepOutput | None, Step | None]:
workflow_run: WorkflowRun | None = None
if task.workflow_run_id:
@@ -380,6 +385,8 @@ class ForgeAgent:
organization=organization,
task_block=task_block,
complete_verification=complete_verification,
engine=engine,
cua_response=cua_response,
)
await app.AGENT_FUNCTION.post_step_execution(task, step)
task = await self.update_task_errors_from_detailed_output(task, detailed_output)
@@ -506,6 +513,10 @@ class ForgeAgent:
step_status=step.status,
)
cua_response_param = detailed_output.cua_response if detailed_output else None
if not cua_response_param and cua_response:
cua_response_param = cua_response
if retry and next_step:
return await self.execute_step(
organization,
@@ -516,6 +527,8 @@ class ForgeAgent:
browser_session_id=browser_session_id,
task_block=task_block,
complete_verification=complete_verification,
engine=engine,
cua_response=cua_response_param,
)
elif settings.execute_all_steps() and next_step:
return await self.execute_step(
@@ -527,6 +540,8 @@ class ForgeAgent:
browser_session_id=browser_session_id,
task_block=task_block,
complete_verification=complete_verification,
engine=engine,
cua_response=cua_response_param,
)
else:
LOG.info(
@@ -757,9 +772,11 @@ class ForgeAgent:
task: Task,
step: Step,
browser_state: BrowserState,
engine: RunEngine = RunEngine.skyvern_v1,
organization: Organization | None = None,
task_block: BaseTaskBlock | None = None,
complete_verification: bool = True,
cua_response: OpenAIResponse | None = None,
) -> tuple[Step, DetailedAgentStepOutput]:
detailed_agent_step_output = DetailedAgentStepOutput(
scraped_page=None,
@@ -768,6 +785,7 @@ class ForgeAgent:
actions=None,
action_results=None,
actions_and_results=None,
cua_response=None,
)
try:
LOG.info(
@@ -789,52 +807,62 @@ class ForgeAgent:
task,
step,
browser_state,
engine,
)
detailed_agent_step_output.scraped_page = scraped_page
detailed_agent_step_output.extract_action_prompt = extract_action_prompt
json_response = None
actions: list[Action]
using_cached_action_plan = False
if not task.navigation_goal and not isinstance(task_block, ValidationBlock):
actions = [await self.create_extract_action(task, step, scraped_page)]
elif (
task_block
and task_block.cache_actions
and (actions := await retrieve_action_plan(task, step, scraped_page))
):
using_cached_action_plan = True
else:
self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm)
json_response = await app.LLM_API_HANDLER(
prompt=extract_action_prompt,
prompt_name="extract-actions",
if engine == RunEngine.openai_cua:
actions, new_cua_response = await self._generate_cua_actions(
task=task,
step=step,
screenshots=scraped_page.screenshots,
scraped_page=scraped_page,
previous_response=cua_response,
)
try:
json_response = await self.handle_potential_verification_code(
task,
step,
scraped_page,
browser_state,
json_response,
detailed_agent_step_output.cua_response = new_cua_response
else:
using_cached_action_plan = False
if not task.navigation_goal and not isinstance(task_block, ValidationBlock):
actions = [await self.create_extract_action(task, step, scraped_page)]
elif (
task_block
and task_block.cache_actions
and (actions := await retrieve_action_plan(task, step, scraped_page))
):
using_cached_action_plan = True
else:
self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm)
json_response = await app.LLM_API_HANDLER(
prompt=extract_action_prompt,
prompt_name="extract-actions",
step=step,
screenshots=scraped_page.screenshots,
)
detailed_agent_step_output.llm_response = json_response
actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"])
except NoTOTPVerificationCodeFound:
actions = [
TerminateAction(
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=0,
reasoning="No TOTP verification code found. Going to terminate.",
intention="No TOTP verification code found. Going to terminate.",
try:
json_response = await self.handle_potential_verification_code(
task,
step,
scraped_page,
browser_state,
json_response,
)
]
detailed_agent_step_output.llm_response = json_response
actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"])
except NoTOTPVerificationCodeFound:
actions = [
TerminateAction(
organization_id=task.organization_id,
workflow_run_id=task.workflow_run_id,
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=0,
reasoning="No TOTP verification code found. Going to terminate.",
intention="No TOTP verification code found. Going to terminate.",
)
]
detailed_agent_step_output.actions = actions
if len(actions) == 0:
@@ -1187,6 +1215,77 @@ class ForgeAgent:
)
return failed_step, detailed_agent_step_output.get_clean_detailed_output()
async def _generate_cua_actions(
self,
task: Task,
step: Step,
scraped_page: ScrapedPage,
previous_response: OpenAIResponse | None = None,
) -> tuple[list[Action], OpenAIResponse]:
if not previous_response:
# this is the first step
first_response: OpenAIResponse = await app.OPENAI_CLIENT.responses.create(
model="computer-use-preview",
tools=[
{
"type": "computer_use_preview",
"display_width": settings.BROWSER_WIDTH,
"display_height": settings.BROWSER_HEIGHT,
"environment": "browser",
}
],
input=[
{
"role": "user",
"content": task.navigation_goal,
}
],
reasoning={
"generate_summary": "concise",
},
truncation="auto",
)
previous_response = first_response
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
if not computer_calls:
return [], previous_response
if not scraped_page.screenshots:
return [], previous_response
last_call_id = computer_calls[-1].call_id
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
current_response = await app.OPENAI_CLIENT.responses.create(
model="computer-use-preview",
previous_response_id=previous_response.id,
tools=[
{
"type": "computer_use_preview",
"display_width": settings.BROWSER_WIDTH,
"display_height": settings.BROWSER_HEIGHT,
"environment": "browser",
}
],
input=[
{
"call_id": last_call_id,
"type": "computer_call_output",
"output": {
"type": "input_image",
"image_url": f"data:image/png;base64,{screenshot_base64}",
},
}
],
reasoning={
"generate_summary": "concise",
},
truncation="auto",
)
return parse_cua_actions(task, step, current_response), current_response
@staticmethod
async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
LOG.info(
@@ -1195,7 +1294,12 @@ class ForgeAgent:
step_id=step.step_id,
workflow_run_id=task.workflow_run_id,
)
scraped_page_refreshed = await scraped_page.refresh(draw_boxes=False)
run_obj = await app.DATABASE.get_run(run_id=task.task_id, organization_id=task.organization_id)
scroll = True
if run_obj and run_obj.task_run_type == RunType.openai_cua:
scroll = False
scraped_page_refreshed = await scraped_page.refresh(draw_boxes=False, scroll=scroll)
verification_prompt = load_prompt_with_elements(
scraped_page=scraped_page_refreshed,
@@ -1351,6 +1455,7 @@ class ForgeAgent:
step: Step,
browser_state: BrowserState,
scrape_type: ScrapeType,
engine: RunEngine,
) -> ScrapedPage:
if scrape_type == ScrapeType.NORMAL:
pass
@@ -1370,11 +1475,21 @@ class ForgeAgent:
)
await browser_state.reload_page()
max_screenshot_number = settings.MAX_NUM_SCREENSHOTS
draw_boxes = True
scroll = True
if engine == RunEngine.openai_cua:
max_screenshot_number = 1
draw_boxes = False
scroll = False
return await scrape_website(
browser_state,
task.url,
app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step),
scrape_exclude=app.scrape_exclude,
max_screenshot_number=max_screenshot_number,
draw_boxes=draw_boxes,
scroll=scroll,
)
async def build_and_record_step_prompt(
@@ -1382,6 +1497,7 @@ class ForgeAgent:
task: Task,
step: Step,
browser_state: BrowserState,
engine: RunEngine,
) -> tuple[ScrapedPage, str]:
# start the async tasks while running scrape_website
self.async_operation_pool.run_operation(task.task_id, AgentPhase.scrape)
@@ -1399,6 +1515,7 @@ class ForgeAgent:
step=step,
browser_state=browser_state,
scrape_type=scrape_type,
engine=engine,
)
break
except (FailedToTakeScreenshot, ScrapingFailed) as e:
@@ -1431,14 +1548,16 @@ class ForgeAgent:
# TODO: we only use HTML element for now, introduce a way to switch in the future
element_tree_format = ElementTreeFormat.HTML
element_tree_in_prompt: str = scraped_page.build_element_tree(element_tree_format)
extract_action_prompt = await self._build_extract_action_prompt(
task,
step,
browser_state,
scraped_page,
verification_code_check=bool(task.totp_verification_url or task.totp_identifier),
expire_verification_code=True,
)
extract_action_prompt = ""
if engine != RunEngine.openai_cua:
extract_action_prompt = await self._build_extract_action_prompt(
task,
step,
browser_state,
scraped_page,
verification_code_check=bool(task.totp_verification_url or task.totp_identifier),
expire_verification_code=True,
)
await app.ARTIFACT_MANAGER.create_artifact(
step=step,
@@ -2146,9 +2265,14 @@ class ForgeAgent:
step_result["actions_result"] = action_result_summary
steps_results.append(step_result)
run_obj = await app.DATABASE.get_run(run_id=task.task_id, organization_id=task.organization_id)
scroll = True
if run_obj and run_obj.task_run_type == RunType.openai_cua:
scroll = False
screenshots: list[bytes] = []
if page is not None:
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url)
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url, scroll=scroll)
prompt = prompt_engine.load_prompt(
"summarize-max-steps-reason",

View File

@@ -1,6 +1,7 @@
from typing import Awaitable, Callable
from fastapi import FastAPI
from openai import AsyncAzureOpenAI, AsyncOpenAI
from skyvern.forge.agent import ForgeAgent
from skyvern.forge.agent_functions import AgentFunction
@@ -32,6 +33,15 @@ ARTIFACT_MANAGER = ArtifactManager()
BROWSER_MANAGER = BrowserManager()
EXPERIMENTATION_PROVIDER: BaseExperimentationProvider = NoOpExperimentationProvider()
LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(SettingsManager.get_settings().LLM_KEY)
OPENAI_CLIENT = AsyncOpenAI(api_key=SettingsManager.get_settings().OPENAI_API_KEY)
if SettingsManager.get_settings().ENABLE_AZURE_CUA:
OPENAI_CLIENT = AsyncAzureOpenAI(
api_key=SettingsManager.get_settings().AZURE_CUA_API_KEY,
api_version=SettingsManager.get_settings().AZURE_CUA_API_VERSION,
azure_endpoint=SettingsManager.get_settings().AZURE_CUA_ENDPOINT,
azure_deployment=SettingsManager.get_settings().AZURE_CUA_DEPLOYMENT,
)
SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(
SETTINGS_MANAGER.SECONDARY_LLM_KEY if SETTINGS_MANAGER.SECONDARY_LLM_KEY else SETTINGS_MANAGER.LLM_KEY
)

View File

@@ -10,6 +10,7 @@ from skyvern.forge.sdk.core.skyvern_context import SkyvernContext
from skyvern.forge.sdk.schemas.task_v2 import TaskV2Status
from skyvern.forge.sdk.schemas.tasks import TaskStatus
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRunStatus
from skyvern.schemas.runs import RunEngine, RunType
from skyvern.services import task_v2_service
LOG = structlog.get_logger()
@@ -91,6 +92,10 @@ class BackgroundTaskExecutor(AsyncExecutor):
status=TaskStatus.running,
organization_id=organization_id,
)
run_obj = await app.DATABASE.get_run(run_id=task_id, organization_id=organization_id)
engine = RunEngine.skyvern_v1
if run_obj and run_obj.task_run_type == RunType.openai_cua:
engine = RunEngine.openai_cua
context: SkyvernContext = skyvern_context.ensure_context()
context.task_id = task.task_id
@@ -106,6 +111,7 @@ class BackgroundTaskExecutor(AsyncExecutor):
api_key,
close_browser_on_completion=close_browser_on_completion,
browser_session_id=browser_session_id,
engine=engine,
)
async def execute_workflow(

View File

@@ -1465,7 +1465,7 @@ async def run_task(
analytics.capture("skyvern-oss-run-task", data={"url": run_request.url})
await PermissionCheckerFactory.get_instance().check(current_org, browser_session_id=run_request.browser_session_id)
if run_request.engine == RunEngine.skyvern_v1:
if run_request.engine in [RunEngine.skyvern_v1, RunEngine.openai_cua]:
# create task v1
# if there's no url, call task generation first to generate the url, data schema if any
url = run_request.url
@@ -1497,6 +1497,7 @@ async def run_task(
task_v1_response = await task_v1_service.run_task(
task=task_v1_request,
organization=current_org,
engine=run_request.engine,
x_max_steps_override=run_request.max_steps,
x_api_key=x_api_key,
request=request,
@@ -1577,6 +1578,8 @@ async def run_task(
publish_workflow=run_request.publish_workflow,
),
)
if run_request.engine == RunEngine.openai_cua:
pass
raise HTTPException(status_code=400, detail=f"Invalid agent engine: {run_request.engine}")

View File

@@ -92,11 +92,13 @@ class RunType(StrEnum):
task_v1 = "task_v1"
task_v2 = "task_v2"
workflow_run = "workflow_run"
openai_cua = "openai_cua"
class RunEngine(StrEnum):
skyvern_v1 = "skyvern-1.0"
skyvern_v2 = "skyvern-2.0"
openai_cua = "openai-cua"
class RunStatus(StrEnum):

View File

@@ -13,7 +13,7 @@ from skyvern.forge.sdk.executor.factory import AsyncExecutorFactory
from skyvern.forge.sdk.schemas.organizations import Organization
from skyvern.forge.sdk.schemas.task_generations import TaskGeneration, TaskGenerationBase
from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest
from skyvern.schemas.runs import RunType
from skyvern.schemas.runs import RunEngine, RunType
LOG = structlog.get_logger()
@@ -76,6 +76,7 @@ async def generate_task(user_prompt: str, organization: Organization) -> TaskGen
async def run_task(
task: TaskRequest,
organization: Organization,
engine: RunEngine = RunEngine.skyvern_v1,
x_max_steps_override: int | None = None,
x_api_key: str | None = None,
request: Request | None = None,
@@ -83,8 +84,11 @@ async def run_task(
) -> Task:
created_task = await app.agent.create_task(task, organization.organization_id)
url_hash = generate_url_hash(task.url)
run_type = RunType.task_v1
if engine == RunEngine.openai_cua:
run_type = RunType.openai_cua
await app.DATABASE.create_task_run(
task_run_type=RunType.task_v1,
task_run_type=run_type,
organization_id=organization.organization_id,
run_id=created_task.task_id,
title=task.title,

View File

@@ -28,6 +28,9 @@ class ActionType(StrEnum):
RELOAD_PAGE = "reload_page"
EXTRACT = "extract"
SCROLL = "scroll"
KEYPRESS = "keypress"
TYPE = "type"
def is_web_action(self) -> bool:
return self in [
@@ -177,6 +180,9 @@ class ClickAction(WebAction):
action_type: ActionType = ActionType.CLICK
file_url: str | None = None
download: bool = False
x: int | None = None
y: int | None = None
button: str = "left"
def __repr__(self) -> str:
return f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download})"
@@ -240,6 +246,7 @@ class CheckboxAction(WebAction):
class WaitAction(Action):
action_type: ActionType = ActionType.WAIT
seconds: int = 20
class TerminateAction(DecisiveAction):
@@ -258,6 +265,19 @@ class ExtractAction(Action):
data_extraction_schema: dict[str, Any] | None = None
class ScrollAction(Action):
action_type: ActionType = ActionType.SCROLL
x: int
y: int
scroll_x: int
scroll_y: int
class KeypressAction(Action):
action_type: ActionType = ActionType.KEYPRESS
keys: list[str] = []
class ScrapeResult(BaseModel):
"""
Scraped response from a webpage, including:

View File

@@ -77,6 +77,7 @@ from skyvern.webeye.actions.actions import (
CheckboxAction,
ClickAction,
InputOrSelectContext,
InputTextAction,
ScrapeResult,
SelectOption,
SelectOptionAction,
@@ -392,6 +393,12 @@ def check_for_invalid_web_action(
task: Task,
step: Step,
) -> list[ActionResult]:
if isinstance(action, ClickAction) and action.x is not None and action.y is not None:
return []
if isinstance(action, InputTextAction) and not action.element_id:
return []
if isinstance(action, WebAction) and action.element_id not in scraped_page.id_to_element_dict:
return [ActionFailure(MissingElement(element_id=action.element_id), stop_execution_on_failure=False)]
@@ -420,6 +427,36 @@ async def handle_click_action(
task: Task,
step: Step,
) -> list[ActionResult]:
if action.x is not None and action.y is not None:
# Find the element at the clicked location using JavaScript evaluation
element_id = await page.evaluate(
"""data => {
const element = document.elementFromPoint(data.x, data.y);
if (!element) return null;
// Function to get the unique_id attribute of an element
function getElementUniqueId(element) {
if (element && element.nodeType === 1) {
// Check if the element has the unique_id attribute
if (element.hasAttribute('unique_id')) {
return element.getAttribute('unique_id');
}
// If no unique_id attribute is found, return null
return null;
}
return null;
}
return getElementUniqueId(element);
}""",
{"x": action.x, "y": action.y},
)
LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button)
await page.mouse.click(x=action.x, y=action.y, button=action.button)
return [ActionSuccess()]
dom = DomUtil(scraped_page=scraped_page, page=page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
await asyncio.sleep(0.3)
@@ -591,6 +628,11 @@ async def handle_input_text_action(
task: Task,
step: Step,
) -> list[ActionResult]:
if not action.element_id:
# This is a CUA type action
await page.keyboard.type(action.text)
return [ActionSuccess()]
dom = DomUtil(scraped_page, page)
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
@@ -1348,7 +1390,7 @@ async def handle_wait_action(
task: Task,
step: Step,
) -> list[ActionResult]:
await asyncio.sleep(20)
await asyncio.sleep(action.seconds)
return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
@@ -1422,6 +1464,35 @@ async def handle_extract_action(
return [ActionFailure(exception=Exception("No data extraction goal"))]
async def handle_scroll_action(
action: actions.ScrollAction,
page: Page,
scraped_page: ScrapedPage,
task: Task,
step: Step,
) -> list[ActionResult]:
await page.mouse.move(action.x, action.y)
await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
return [ActionSuccess()]
async def handle_keypress_action(
action: actions.KeypressAction,
page: Page,
scraped_page: ScrapedPage,
task: Task,
step: Step,
) -> list[ActionResult]:
for key in action.keys:
if key.lower() == "enter":
await page.keyboard.press("Enter")
elif key.lower() == "space":
await page.keyboard.press(" ")
else:
await page.keyboard.press(key)
return [ActionSuccess()]
ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
@@ -1433,6 +1504,8 @@ ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action)
ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:

View File

@@ -2,6 +2,7 @@ from __future__ import annotations
from typing import Any
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import BaseModel
from skyvern.config import settings
@@ -40,6 +41,7 @@ class DetailedAgentStepOutput(BaseModel):
action_results: list[ActionResult] | None
actions_and_results: list[tuple[Action, list[ActionResult]]] | None
step_exception: str | None = None
cua_response: OpenAIResponse | None = None
class Config:
exclude = ["scraped_page", "extract_action_prompt"]
@@ -72,6 +74,7 @@ class DetailedAgentStepOutput(BaseModel):
if self.actions_and_results is None
else [(action, result) for action, result in self.actions_and_results if result],
step_exception=self.step_exception,
cua_response=self.cua_response,
)
def to_agent_step_output(self) -> AgentStepOutput:

View File

@@ -1,9 +1,11 @@
from typing import Any, Dict
import structlog
from openai.types.responses.response import Response as OpenAIResponse
from pydantic import ValidationError
from skyvern.exceptions import UnsupportedActionType
from skyvern.forge.sdk.models import Step
from skyvern.forge.sdk.schemas.tasks import Task
from skyvern.webeye.actions.actions import (
Action,
@@ -13,7 +15,9 @@ from skyvern.webeye.actions.actions import (
CompleteAction,
DownloadFileAction,
InputTextAction,
KeypressAction,
NullAction,
ScrollAction,
SelectOption,
SelectOptionAction,
SolveCaptchaAction,
@@ -194,3 +198,104 @@ def parse_actions(
)
############################ This part of code might not be needed ############################
return actions
def parse_cua_actions(
task: Task,
step: Step,
response: OpenAIResponse,
) -> list[Action]:
computer_calls = [item for item in response.output if item.type == "computer_call"]
reasonings = [item for item in response.output if item.type == "reasoning"]
actions: list[Action] = []
for idx, computer_call in enumerate(computer_calls):
cua_action = computer_call.action
action_type = cua_action.type
try:
reasoning = None
if idx < len(reasonings):
try:
reasoning = reasonings[idx].summary[0].text
except Exception:
LOG.exception(
"Failed to parse reasoning",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
)
match action_type:
case "click":
button = cua_action.button
if button != "left" and button != "right":
button = "left"
reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})"
action = ClickAction(
element_id="",
x=cua_action.x,
y=cua_action.y,
button=button,
reasoning=reasoning,
intention=reasoning,
response=f"Click at: ({cua_action.x}, {cua_action.y})",
)
case "scroll":
reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})"
action = ScrollAction(
element_id="",
x=cua_action.x,
y=cua_action.y,
scroll_x=cua_action.scroll_x,
scroll_y=cua_action.scroll_y,
reasoning=reasoning,
intention=reasoning,
response=f"Scroll by: ({cua_action.x}, {cua_action.y})",
)
case "keypress":
reasoning_str = f"Press keys: {cua_action.keys}"
if len(cua_action.keys) == 1:
reasoning_str = f"Press the '{cua_action.keys[0]}' key"
reasoning = reasoning or reasoning_str
action = KeypressAction(
element_id="",
keys=cua_action.keys,
reasoning=reasoning,
intention=reasoning,
response=str(cua_action.keys),
)
case "type":
action = InputTextAction(
element_id="",
text=cua_action.text,
reasoning=reasoning,
intention=reasoning,
response=cua_action.text,
)
case "wait":
action = WaitAction(
seconds=5,
reasoning=reasoning,
intention=reasoning,
)
case _:
raise ValueError(f"Unsupported action type: {action_type}")
action.organization_id = task.organization_id
action.workflow_run_id = task.workflow_run_id
action.task_id = task.task_id
action.step_id = step.step_id
action.step_order = step.order
action.action_order = idx
actions.append(action)
except Exception:
LOG.exception(
"Failed to parse action",
task_id=task.task_id,
step_id=step.step_id,
step_order=step.order,
action_order=idx,
)
break
if not actions:
return [CompleteAction(reasoning="No actions generated", verified=True)]
return actions

View File

@@ -326,13 +326,14 @@ class ScrapedPage(BaseModel):
element["children"] = new_children
return element
async def refresh(self, draw_boxes: bool = True) -> Self:
async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self:
refreshed_page = await scrape_website(
browser_state=self._browser_state,
url=self.url,
cleanup_element_tree=self._clean_up_func,
scrape_exclude=self._scrape_exclude,
draw_boxes=draw_boxes,
scroll=scroll,
)
self.elements = refreshed_page.elements
self.id_to_css_dict = refreshed_page.id_to_css_dict
@@ -366,6 +367,8 @@ async def scrape_website(
scrape_exclude: ScrapeExcludeFunc | None = None,
take_screenshots: bool = True,
draw_boxes: bool = True,
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
) -> ScrapedPage:
"""
************************************************************************************************
@@ -397,6 +400,8 @@ async def scrape_website(
scrape_exclude=scrape_exclude,
take_screenshots=take_screenshots,
draw_boxes=draw_boxes,
max_screenshot_number=max_screenshot_number,
scroll=scroll,
)
except Exception as e:
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
@@ -420,6 +425,8 @@ async def scrape_website(
scrape_exclude=scrape_exclude,
take_screenshots=take_screenshots,
draw_boxes=draw_boxes,
max_screenshot_number=max_screenshot_number,
scroll=scroll,
)
@@ -469,6 +476,8 @@ async def scrape_web_unsafe(
scrape_exclude: ScrapeExcludeFunc | None = None,
take_screenshots: bool = True,
draw_boxes: bool = True,
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -503,7 +512,6 @@ async def scrape_web_unsafe(
json_to_html(element, need_skyvern_attrs=False) for element in element_tree_trimmed
)
token_count = count_tokens(element_tree_trimmed_html_str)
max_screenshot_number = settings.MAX_NUM_SCREENSHOTS
if token_count > DEFAULT_MAX_TOKENS:
max_screenshot_number = min(max_screenshot_number, 1)
@@ -512,6 +520,7 @@ async def scrape_web_unsafe(
url=url,
draw_boxes=draw_boxes,
max_number=max_screenshot_number,
scroll=scroll,
)
id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids = build_element_dict(
elements

View File

@@ -98,8 +98,11 @@ class SkyvernFrame:
url: str,
draw_boxes: bool = False,
max_number: int = settings.MAX_NUM_SCREENSHOTS,
scroll: bool = True,
) -> List[bytes]:
skyvern_page = await SkyvernFrame.create_instance(frame=page)
if not scroll:
return [await SkyvernFrame.take_screenshot(page=skyvern_page.frame, full_page=False)]
# page is the main frame and the index must be 0
assert isinstance(skyvern_page.frame, Page)