integration with CUA (#2126)
This commit is contained in:
@@ -116,6 +116,7 @@ class Settings(BaseSettings):
|
||||
ENABLE_AZURE_O3_MINI: bool = False
|
||||
ENABLE_BEDROCK: bool = False
|
||||
ENABLE_GEMINI: bool = False
|
||||
ENABLE_AZURE_CUA: bool = False
|
||||
# OPENAI
|
||||
OPENAI_API_KEY: str | None = None
|
||||
# ANTHROPIC
|
||||
@@ -125,6 +126,10 @@ class Settings(BaseSettings):
|
||||
AZURE_API_KEY: str | None = None
|
||||
AZURE_API_BASE: str | None = None
|
||||
AZURE_API_VERSION: str | None = None
|
||||
AZURE_CUA_API_KEY: str | None = None
|
||||
AZURE_CUA_ENDPOINT: str | None = None
|
||||
AZURE_CUA_DEPLOYMENT: str | None = "computer-use-preview"
|
||||
AZURE_CUA_API_VERSION: str | None = "2025-03-01-preview"
|
||||
|
||||
# AZURE GPT-4o mini
|
||||
AZURE_GPT4O_MINI_DEPLOYMENT: str | None = None
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
import asyncio
|
||||
import base64
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
@@ -10,6 +11,7 @@ from typing import Any, Tuple
|
||||
|
||||
import httpx
|
||||
import structlog
|
||||
from openai.types.responses.response import Response as OpenAIResponse
|
||||
from playwright._impl._errors import TargetClosedError
|
||||
from playwright.async_api import Page
|
||||
|
||||
@@ -68,6 +70,7 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest, TaskResponse, Tas
|
||||
from skyvern.forge.sdk.workflow.context_manager import WorkflowRunContext
|
||||
from skyvern.forge.sdk.workflow.models.block import ActionBlock, BaseTaskBlock, ValidationBlock
|
||||
from skyvern.forge.sdk.workflow.models.workflow import Workflow, WorkflowRun, WorkflowRunStatus
|
||||
from skyvern.schemas.runs import RunEngine, RunType
|
||||
from skyvern.utils.prompt_engine import load_prompt_with_elements
|
||||
from skyvern.webeye.actions.actions import (
|
||||
Action,
|
||||
@@ -85,7 +88,7 @@ from skyvern.webeye.actions.actions import (
|
||||
from skyvern.webeye.actions.caching import retrieve_action_plan
|
||||
from skyvern.webeye.actions.handler import ActionHandler, poll_verification_code
|
||||
from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput
|
||||
from skyvern.webeye.actions.parse_actions import parse_actions
|
||||
from skyvern.webeye.actions.parse_actions import parse_actions, parse_cua_actions
|
||||
from skyvern.webeye.actions.responses import ActionResult, ActionSuccess
|
||||
from skyvern.webeye.browser_factory import BrowserState
|
||||
from skyvern.webeye.scraper.scraper import ElementTreeFormat, ScrapedPage, scrape_website
|
||||
@@ -248,6 +251,8 @@ class ForgeAgent:
|
||||
task_block: BaseTaskBlock | None = None,
|
||||
browser_session_id: str | None = None,
|
||||
complete_verification: bool = True,
|
||||
engine: RunEngine = RunEngine.skyvern_v1,
|
||||
cua_response: OpenAIResponse | None = None,
|
||||
) -> Tuple[Step, DetailedAgentStepOutput | None, Step | None]:
|
||||
workflow_run: WorkflowRun | None = None
|
||||
if task.workflow_run_id:
|
||||
@@ -380,6 +385,8 @@ class ForgeAgent:
|
||||
organization=organization,
|
||||
task_block=task_block,
|
||||
complete_verification=complete_verification,
|
||||
engine=engine,
|
||||
cua_response=cua_response,
|
||||
)
|
||||
await app.AGENT_FUNCTION.post_step_execution(task, step)
|
||||
task = await self.update_task_errors_from_detailed_output(task, detailed_output)
|
||||
@@ -506,6 +513,10 @@ class ForgeAgent:
|
||||
step_status=step.status,
|
||||
)
|
||||
|
||||
cua_response_param = detailed_output.cua_response if detailed_output else None
|
||||
if not cua_response_param and cua_response:
|
||||
cua_response_param = cua_response
|
||||
|
||||
if retry and next_step:
|
||||
return await self.execute_step(
|
||||
organization,
|
||||
@@ -516,6 +527,8 @@ class ForgeAgent:
|
||||
browser_session_id=browser_session_id,
|
||||
task_block=task_block,
|
||||
complete_verification=complete_verification,
|
||||
engine=engine,
|
||||
cua_response=cua_response_param,
|
||||
)
|
||||
elif settings.execute_all_steps() and next_step:
|
||||
return await self.execute_step(
|
||||
@@ -527,6 +540,8 @@ class ForgeAgent:
|
||||
browser_session_id=browser_session_id,
|
||||
task_block=task_block,
|
||||
complete_verification=complete_verification,
|
||||
engine=engine,
|
||||
cua_response=cua_response_param,
|
||||
)
|
||||
else:
|
||||
LOG.info(
|
||||
@@ -757,9 +772,11 @@ class ForgeAgent:
|
||||
task: Task,
|
||||
step: Step,
|
||||
browser_state: BrowserState,
|
||||
engine: RunEngine = RunEngine.skyvern_v1,
|
||||
organization: Organization | None = None,
|
||||
task_block: BaseTaskBlock | None = None,
|
||||
complete_verification: bool = True,
|
||||
cua_response: OpenAIResponse | None = None,
|
||||
) -> tuple[Step, DetailedAgentStepOutput]:
|
||||
detailed_agent_step_output = DetailedAgentStepOutput(
|
||||
scraped_page=None,
|
||||
@@ -768,6 +785,7 @@ class ForgeAgent:
|
||||
actions=None,
|
||||
action_results=None,
|
||||
actions_and_results=None,
|
||||
cua_response=None,
|
||||
)
|
||||
try:
|
||||
LOG.info(
|
||||
@@ -789,52 +807,62 @@ class ForgeAgent:
|
||||
task,
|
||||
step,
|
||||
browser_state,
|
||||
engine,
|
||||
)
|
||||
detailed_agent_step_output.scraped_page = scraped_page
|
||||
detailed_agent_step_output.extract_action_prompt = extract_action_prompt
|
||||
json_response = None
|
||||
actions: list[Action]
|
||||
|
||||
using_cached_action_plan = False
|
||||
if not task.navigation_goal and not isinstance(task_block, ValidationBlock):
|
||||
actions = [await self.create_extract_action(task, step, scraped_page)]
|
||||
elif (
|
||||
task_block
|
||||
and task_block.cache_actions
|
||||
and (actions := await retrieve_action_plan(task, step, scraped_page))
|
||||
):
|
||||
using_cached_action_plan = True
|
||||
else:
|
||||
self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm)
|
||||
json_response = await app.LLM_API_HANDLER(
|
||||
prompt=extract_action_prompt,
|
||||
prompt_name="extract-actions",
|
||||
if engine == RunEngine.openai_cua:
|
||||
actions, new_cua_response = await self._generate_cua_actions(
|
||||
task=task,
|
||||
step=step,
|
||||
screenshots=scraped_page.screenshots,
|
||||
scraped_page=scraped_page,
|
||||
previous_response=cua_response,
|
||||
)
|
||||
try:
|
||||
json_response = await self.handle_potential_verification_code(
|
||||
task,
|
||||
step,
|
||||
scraped_page,
|
||||
browser_state,
|
||||
json_response,
|
||||
detailed_agent_step_output.cua_response = new_cua_response
|
||||
else:
|
||||
using_cached_action_plan = False
|
||||
if not task.navigation_goal and not isinstance(task_block, ValidationBlock):
|
||||
actions = [await self.create_extract_action(task, step, scraped_page)]
|
||||
elif (
|
||||
task_block
|
||||
and task_block.cache_actions
|
||||
and (actions := await retrieve_action_plan(task, step, scraped_page))
|
||||
):
|
||||
using_cached_action_plan = True
|
||||
else:
|
||||
self.async_operation_pool.run_operation(task.task_id, AgentPhase.llm)
|
||||
json_response = await app.LLM_API_HANDLER(
|
||||
prompt=extract_action_prompt,
|
||||
prompt_name="extract-actions",
|
||||
step=step,
|
||||
screenshots=scraped_page.screenshots,
|
||||
)
|
||||
detailed_agent_step_output.llm_response = json_response
|
||||
actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"])
|
||||
except NoTOTPVerificationCodeFound:
|
||||
actions = [
|
||||
TerminateAction(
|
||||
organization_id=task.organization_id,
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
step_order=step.order,
|
||||
action_order=0,
|
||||
reasoning="No TOTP verification code found. Going to terminate.",
|
||||
intention="No TOTP verification code found. Going to terminate.",
|
||||
try:
|
||||
json_response = await self.handle_potential_verification_code(
|
||||
task,
|
||||
step,
|
||||
scraped_page,
|
||||
browser_state,
|
||||
json_response,
|
||||
)
|
||||
]
|
||||
detailed_agent_step_output.llm_response = json_response
|
||||
actions = parse_actions(task, step.step_id, step.order, scraped_page, json_response["actions"])
|
||||
except NoTOTPVerificationCodeFound:
|
||||
actions = [
|
||||
TerminateAction(
|
||||
organization_id=task.organization_id,
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
step_order=step.order,
|
||||
action_order=0,
|
||||
reasoning="No TOTP verification code found. Going to terminate.",
|
||||
intention="No TOTP verification code found. Going to terminate.",
|
||||
)
|
||||
]
|
||||
|
||||
detailed_agent_step_output.actions = actions
|
||||
if len(actions) == 0:
|
||||
@@ -1187,6 +1215,77 @@ class ForgeAgent:
|
||||
)
|
||||
return failed_step, detailed_agent_step_output.get_clean_detailed_output()
|
||||
|
||||
async def _generate_cua_actions(
|
||||
self,
|
||||
task: Task,
|
||||
step: Step,
|
||||
scraped_page: ScrapedPage,
|
||||
previous_response: OpenAIResponse | None = None,
|
||||
) -> tuple[list[Action], OpenAIResponse]:
|
||||
if not previous_response:
|
||||
# this is the first step
|
||||
first_response: OpenAIResponse = await app.OPENAI_CLIENT.responses.create(
|
||||
model="computer-use-preview",
|
||||
tools=[
|
||||
{
|
||||
"type": "computer_use_preview",
|
||||
"display_width": settings.BROWSER_WIDTH,
|
||||
"display_height": settings.BROWSER_HEIGHT,
|
||||
"environment": "browser",
|
||||
}
|
||||
],
|
||||
input=[
|
||||
{
|
||||
"role": "user",
|
||||
"content": task.navigation_goal,
|
||||
}
|
||||
],
|
||||
reasoning={
|
||||
"generate_summary": "concise",
|
||||
},
|
||||
truncation="auto",
|
||||
)
|
||||
previous_response = first_response
|
||||
|
||||
computer_calls = [item for item in previous_response.output if item.type == "computer_call"]
|
||||
if not computer_calls:
|
||||
return [], previous_response
|
||||
|
||||
if not scraped_page.screenshots:
|
||||
return [], previous_response
|
||||
|
||||
last_call_id = computer_calls[-1].call_id
|
||||
screenshot_base64 = base64.b64encode(scraped_page.screenshots[0]).decode("utf-8")
|
||||
|
||||
current_response = await app.OPENAI_CLIENT.responses.create(
|
||||
model="computer-use-preview",
|
||||
previous_response_id=previous_response.id,
|
||||
tools=[
|
||||
{
|
||||
"type": "computer_use_preview",
|
||||
"display_width": settings.BROWSER_WIDTH,
|
||||
"display_height": settings.BROWSER_HEIGHT,
|
||||
"environment": "browser",
|
||||
}
|
||||
],
|
||||
input=[
|
||||
{
|
||||
"call_id": last_call_id,
|
||||
"type": "computer_call_output",
|
||||
"output": {
|
||||
"type": "input_image",
|
||||
"image_url": f"data:image/png;base64,{screenshot_base64}",
|
||||
},
|
||||
}
|
||||
],
|
||||
reasoning={
|
||||
"generate_summary": "concise",
|
||||
},
|
||||
truncation="auto",
|
||||
)
|
||||
|
||||
return parse_cua_actions(task, step, current_response), current_response
|
||||
|
||||
@staticmethod
|
||||
async def complete_verify(page: Page, scraped_page: ScrapedPage, task: Task, step: Step) -> CompleteVerifyResult:
|
||||
LOG.info(
|
||||
@@ -1195,7 +1294,12 @@ class ForgeAgent:
|
||||
step_id=step.step_id,
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
)
|
||||
scraped_page_refreshed = await scraped_page.refresh(draw_boxes=False)
|
||||
run_obj = await app.DATABASE.get_run(run_id=task.task_id, organization_id=task.organization_id)
|
||||
scroll = True
|
||||
if run_obj and run_obj.task_run_type == RunType.openai_cua:
|
||||
scroll = False
|
||||
|
||||
scraped_page_refreshed = await scraped_page.refresh(draw_boxes=False, scroll=scroll)
|
||||
|
||||
verification_prompt = load_prompt_with_elements(
|
||||
scraped_page=scraped_page_refreshed,
|
||||
@@ -1351,6 +1455,7 @@ class ForgeAgent:
|
||||
step: Step,
|
||||
browser_state: BrowserState,
|
||||
scrape_type: ScrapeType,
|
||||
engine: RunEngine,
|
||||
) -> ScrapedPage:
|
||||
if scrape_type == ScrapeType.NORMAL:
|
||||
pass
|
||||
@@ -1370,11 +1475,21 @@ class ForgeAgent:
|
||||
)
|
||||
await browser_state.reload_page()
|
||||
|
||||
max_screenshot_number = settings.MAX_NUM_SCREENSHOTS
|
||||
draw_boxes = True
|
||||
scroll = True
|
||||
if engine == RunEngine.openai_cua:
|
||||
max_screenshot_number = 1
|
||||
draw_boxes = False
|
||||
scroll = False
|
||||
return await scrape_website(
|
||||
browser_state,
|
||||
task.url,
|
||||
app.AGENT_FUNCTION.cleanup_element_tree_factory(task=task, step=step),
|
||||
scrape_exclude=app.scrape_exclude,
|
||||
max_screenshot_number=max_screenshot_number,
|
||||
draw_boxes=draw_boxes,
|
||||
scroll=scroll,
|
||||
)
|
||||
|
||||
async def build_and_record_step_prompt(
|
||||
@@ -1382,6 +1497,7 @@ class ForgeAgent:
|
||||
task: Task,
|
||||
step: Step,
|
||||
browser_state: BrowserState,
|
||||
engine: RunEngine,
|
||||
) -> tuple[ScrapedPage, str]:
|
||||
# start the async tasks while running scrape_website
|
||||
self.async_operation_pool.run_operation(task.task_id, AgentPhase.scrape)
|
||||
@@ -1399,6 +1515,7 @@ class ForgeAgent:
|
||||
step=step,
|
||||
browser_state=browser_state,
|
||||
scrape_type=scrape_type,
|
||||
engine=engine,
|
||||
)
|
||||
break
|
||||
except (FailedToTakeScreenshot, ScrapingFailed) as e:
|
||||
@@ -1431,14 +1548,16 @@ class ForgeAgent:
|
||||
# TODO: we only use HTML element for now, introduce a way to switch in the future
|
||||
element_tree_format = ElementTreeFormat.HTML
|
||||
element_tree_in_prompt: str = scraped_page.build_element_tree(element_tree_format)
|
||||
extract_action_prompt = await self._build_extract_action_prompt(
|
||||
task,
|
||||
step,
|
||||
browser_state,
|
||||
scraped_page,
|
||||
verification_code_check=bool(task.totp_verification_url or task.totp_identifier),
|
||||
expire_verification_code=True,
|
||||
)
|
||||
extract_action_prompt = ""
|
||||
if engine != RunEngine.openai_cua:
|
||||
extract_action_prompt = await self._build_extract_action_prompt(
|
||||
task,
|
||||
step,
|
||||
browser_state,
|
||||
scraped_page,
|
||||
verification_code_check=bool(task.totp_verification_url or task.totp_identifier),
|
||||
expire_verification_code=True,
|
||||
)
|
||||
|
||||
await app.ARTIFACT_MANAGER.create_artifact(
|
||||
step=step,
|
||||
@@ -2146,9 +2265,14 @@ class ForgeAgent:
|
||||
step_result["actions_result"] = action_result_summary
|
||||
steps_results.append(step_result)
|
||||
|
||||
run_obj = await app.DATABASE.get_run(run_id=task.task_id, organization_id=task.organization_id)
|
||||
scroll = True
|
||||
if run_obj and run_obj.task_run_type == RunType.openai_cua:
|
||||
scroll = False
|
||||
|
||||
screenshots: list[bytes] = []
|
||||
if page is not None:
|
||||
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url)
|
||||
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=page.url, scroll=scroll)
|
||||
|
||||
prompt = prompt_engine.load_prompt(
|
||||
"summarize-max-steps-reason",
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
from typing import Awaitable, Callable
|
||||
|
||||
from fastapi import FastAPI
|
||||
from openai import AsyncAzureOpenAI, AsyncOpenAI
|
||||
|
||||
from skyvern.forge.agent import ForgeAgent
|
||||
from skyvern.forge.agent_functions import AgentFunction
|
||||
@@ -32,6 +33,15 @@ ARTIFACT_MANAGER = ArtifactManager()
|
||||
BROWSER_MANAGER = BrowserManager()
|
||||
EXPERIMENTATION_PROVIDER: BaseExperimentationProvider = NoOpExperimentationProvider()
|
||||
LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(SettingsManager.get_settings().LLM_KEY)
|
||||
OPENAI_CLIENT = AsyncOpenAI(api_key=SettingsManager.get_settings().OPENAI_API_KEY)
|
||||
if SettingsManager.get_settings().ENABLE_AZURE_CUA:
|
||||
OPENAI_CLIENT = AsyncAzureOpenAI(
|
||||
api_key=SettingsManager.get_settings().AZURE_CUA_API_KEY,
|
||||
api_version=SettingsManager.get_settings().AZURE_CUA_API_VERSION,
|
||||
azure_endpoint=SettingsManager.get_settings().AZURE_CUA_ENDPOINT,
|
||||
azure_deployment=SettingsManager.get_settings().AZURE_CUA_DEPLOYMENT,
|
||||
)
|
||||
|
||||
SECONDARY_LLM_API_HANDLER = LLMAPIHandlerFactory.get_llm_api_handler(
|
||||
SETTINGS_MANAGER.SECONDARY_LLM_KEY if SETTINGS_MANAGER.SECONDARY_LLM_KEY else SETTINGS_MANAGER.LLM_KEY
|
||||
)
|
||||
|
||||
@@ -10,6 +10,7 @@ from skyvern.forge.sdk.core.skyvern_context import SkyvernContext
|
||||
from skyvern.forge.sdk.schemas.task_v2 import TaskV2Status
|
||||
from skyvern.forge.sdk.schemas.tasks import TaskStatus
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRunStatus
|
||||
from skyvern.schemas.runs import RunEngine, RunType
|
||||
from skyvern.services import task_v2_service
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
@@ -91,6 +92,10 @@ class BackgroundTaskExecutor(AsyncExecutor):
|
||||
status=TaskStatus.running,
|
||||
organization_id=organization_id,
|
||||
)
|
||||
run_obj = await app.DATABASE.get_run(run_id=task_id, organization_id=organization_id)
|
||||
engine = RunEngine.skyvern_v1
|
||||
if run_obj and run_obj.task_run_type == RunType.openai_cua:
|
||||
engine = RunEngine.openai_cua
|
||||
|
||||
context: SkyvernContext = skyvern_context.ensure_context()
|
||||
context.task_id = task.task_id
|
||||
@@ -106,6 +111,7 @@ class BackgroundTaskExecutor(AsyncExecutor):
|
||||
api_key,
|
||||
close_browser_on_completion=close_browser_on_completion,
|
||||
browser_session_id=browser_session_id,
|
||||
engine=engine,
|
||||
)
|
||||
|
||||
async def execute_workflow(
|
||||
|
||||
@@ -1465,7 +1465,7 @@ async def run_task(
|
||||
analytics.capture("skyvern-oss-run-task", data={"url": run_request.url})
|
||||
await PermissionCheckerFactory.get_instance().check(current_org, browser_session_id=run_request.browser_session_id)
|
||||
|
||||
if run_request.engine == RunEngine.skyvern_v1:
|
||||
if run_request.engine in [RunEngine.skyvern_v1, RunEngine.openai_cua]:
|
||||
# create task v1
|
||||
# if there's no url, call task generation first to generate the url, data schema if any
|
||||
url = run_request.url
|
||||
@@ -1497,6 +1497,7 @@ async def run_task(
|
||||
task_v1_response = await task_v1_service.run_task(
|
||||
task=task_v1_request,
|
||||
organization=current_org,
|
||||
engine=run_request.engine,
|
||||
x_max_steps_override=run_request.max_steps,
|
||||
x_api_key=x_api_key,
|
||||
request=request,
|
||||
@@ -1577,6 +1578,8 @@ async def run_task(
|
||||
publish_workflow=run_request.publish_workflow,
|
||||
),
|
||||
)
|
||||
if run_request.engine == RunEngine.openai_cua:
|
||||
pass
|
||||
raise HTTPException(status_code=400, detail=f"Invalid agent engine: {run_request.engine}")
|
||||
|
||||
|
||||
|
||||
@@ -92,11 +92,13 @@ class RunType(StrEnum):
|
||||
task_v1 = "task_v1"
|
||||
task_v2 = "task_v2"
|
||||
workflow_run = "workflow_run"
|
||||
openai_cua = "openai_cua"
|
||||
|
||||
|
||||
class RunEngine(StrEnum):
|
||||
skyvern_v1 = "skyvern-1.0"
|
||||
skyvern_v2 = "skyvern-2.0"
|
||||
openai_cua = "openai-cua"
|
||||
|
||||
|
||||
class RunStatus(StrEnum):
|
||||
|
||||
@@ -13,7 +13,7 @@ from skyvern.forge.sdk.executor.factory import AsyncExecutorFactory
|
||||
from skyvern.forge.sdk.schemas.organizations import Organization
|
||||
from skyvern.forge.sdk.schemas.task_generations import TaskGeneration, TaskGenerationBase
|
||||
from skyvern.forge.sdk.schemas.tasks import Task, TaskRequest
|
||||
from skyvern.schemas.runs import RunType
|
||||
from skyvern.schemas.runs import RunEngine, RunType
|
||||
|
||||
LOG = structlog.get_logger()
|
||||
|
||||
@@ -76,6 +76,7 @@ async def generate_task(user_prompt: str, organization: Organization) -> TaskGen
|
||||
async def run_task(
|
||||
task: TaskRequest,
|
||||
organization: Organization,
|
||||
engine: RunEngine = RunEngine.skyvern_v1,
|
||||
x_max_steps_override: int | None = None,
|
||||
x_api_key: str | None = None,
|
||||
request: Request | None = None,
|
||||
@@ -83,8 +84,11 @@ async def run_task(
|
||||
) -> Task:
|
||||
created_task = await app.agent.create_task(task, organization.organization_id)
|
||||
url_hash = generate_url_hash(task.url)
|
||||
run_type = RunType.task_v1
|
||||
if engine == RunEngine.openai_cua:
|
||||
run_type = RunType.openai_cua
|
||||
await app.DATABASE.create_task_run(
|
||||
task_run_type=RunType.task_v1,
|
||||
task_run_type=run_type,
|
||||
organization_id=organization.organization_id,
|
||||
run_id=created_task.task_id,
|
||||
title=task.title,
|
||||
|
||||
@@ -28,6 +28,9 @@ class ActionType(StrEnum):
|
||||
RELOAD_PAGE = "reload_page"
|
||||
|
||||
EXTRACT = "extract"
|
||||
SCROLL = "scroll"
|
||||
KEYPRESS = "keypress"
|
||||
TYPE = "type"
|
||||
|
||||
def is_web_action(self) -> bool:
|
||||
return self in [
|
||||
@@ -177,6 +180,9 @@ class ClickAction(WebAction):
|
||||
action_type: ActionType = ActionType.CLICK
|
||||
file_url: str | None = None
|
||||
download: bool = False
|
||||
x: int | None = None
|
||||
y: int | None = None
|
||||
button: str = "left"
|
||||
|
||||
def __repr__(self) -> str:
|
||||
return f"ClickAction(element_id={self.element_id}, file_url={self.file_url}, download={self.download})"
|
||||
@@ -240,6 +246,7 @@ class CheckboxAction(WebAction):
|
||||
|
||||
class WaitAction(Action):
|
||||
action_type: ActionType = ActionType.WAIT
|
||||
seconds: int = 20
|
||||
|
||||
|
||||
class TerminateAction(DecisiveAction):
|
||||
@@ -258,6 +265,19 @@ class ExtractAction(Action):
|
||||
data_extraction_schema: dict[str, Any] | None = None
|
||||
|
||||
|
||||
class ScrollAction(Action):
|
||||
action_type: ActionType = ActionType.SCROLL
|
||||
x: int
|
||||
y: int
|
||||
scroll_x: int
|
||||
scroll_y: int
|
||||
|
||||
|
||||
class KeypressAction(Action):
|
||||
action_type: ActionType = ActionType.KEYPRESS
|
||||
keys: list[str] = []
|
||||
|
||||
|
||||
class ScrapeResult(BaseModel):
|
||||
"""
|
||||
Scraped response from a webpage, including:
|
||||
|
||||
@@ -77,6 +77,7 @@ from skyvern.webeye.actions.actions import (
|
||||
CheckboxAction,
|
||||
ClickAction,
|
||||
InputOrSelectContext,
|
||||
InputTextAction,
|
||||
ScrapeResult,
|
||||
SelectOption,
|
||||
SelectOptionAction,
|
||||
@@ -392,6 +393,12 @@ def check_for_invalid_web_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
if isinstance(action, ClickAction) and action.x is not None and action.y is not None:
|
||||
return []
|
||||
|
||||
if isinstance(action, InputTextAction) and not action.element_id:
|
||||
return []
|
||||
|
||||
if isinstance(action, WebAction) and action.element_id not in scraped_page.id_to_element_dict:
|
||||
return [ActionFailure(MissingElement(element_id=action.element_id), stop_execution_on_failure=False)]
|
||||
|
||||
@@ -420,6 +427,36 @@ async def handle_click_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
if action.x is not None and action.y is not None:
|
||||
# Find the element at the clicked location using JavaScript evaluation
|
||||
element_id = await page.evaluate(
|
||||
"""data => {
|
||||
const element = document.elementFromPoint(data.x, data.y);
|
||||
if (!element) return null;
|
||||
|
||||
// Function to get the unique_id attribute of an element
|
||||
function getElementUniqueId(element) {
|
||||
if (element && element.nodeType === 1) {
|
||||
// Check if the element has the unique_id attribute
|
||||
if (element.hasAttribute('unique_id')) {
|
||||
return element.getAttribute('unique_id');
|
||||
}
|
||||
|
||||
// If no unique_id attribute is found, return null
|
||||
return null;
|
||||
}
|
||||
return null;
|
||||
}
|
||||
|
||||
return getElementUniqueId(element);
|
||||
}""",
|
||||
{"x": action.x, "y": action.y},
|
||||
)
|
||||
LOG.info("Clicked element at location", x=action.x, y=action.y, element_id=element_id, button=action.button)
|
||||
|
||||
await page.mouse.click(x=action.x, y=action.y, button=action.button)
|
||||
return [ActionSuccess()]
|
||||
|
||||
dom = DomUtil(scraped_page=scraped_page, page=page)
|
||||
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
|
||||
await asyncio.sleep(0.3)
|
||||
@@ -591,6 +628,11 @@ async def handle_input_text_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
if not action.element_id:
|
||||
# This is a CUA type action
|
||||
await page.keyboard.type(action.text)
|
||||
return [ActionSuccess()]
|
||||
|
||||
dom = DomUtil(scraped_page, page)
|
||||
skyvern_element = await dom.get_skyvern_element_by_id(action.element_id)
|
||||
skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
|
||||
@@ -1348,7 +1390,7 @@ async def handle_wait_action(
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
await asyncio.sleep(20)
|
||||
await asyncio.sleep(action.seconds)
|
||||
return [ActionFailure(exception=Exception("Wait action is treated as a failure"))]
|
||||
|
||||
|
||||
@@ -1422,6 +1464,35 @@ async def handle_extract_action(
|
||||
return [ActionFailure(exception=Exception("No data extraction goal"))]
|
||||
|
||||
|
||||
async def handle_scroll_action(
|
||||
action: actions.ScrollAction,
|
||||
page: Page,
|
||||
scraped_page: ScrapedPage,
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
await page.mouse.move(action.x, action.y)
|
||||
await page.evaluate(f"window.scrollBy({action.scroll_x}, {action.scroll_y})")
|
||||
return [ActionSuccess()]
|
||||
|
||||
|
||||
async def handle_keypress_action(
|
||||
action: actions.KeypressAction,
|
||||
page: Page,
|
||||
scraped_page: ScrapedPage,
|
||||
task: Task,
|
||||
step: Step,
|
||||
) -> list[ActionResult]:
|
||||
for key in action.keys:
|
||||
if key.lower() == "enter":
|
||||
await page.keyboard.press("Enter")
|
||||
elif key.lower() == "space":
|
||||
await page.keyboard.press(" ")
|
||||
else:
|
||||
await page.keyboard.press(key)
|
||||
return [ActionSuccess()]
|
||||
|
||||
|
||||
ActionHandler.register_action_type(ActionType.SOLVE_CAPTCHA, handle_solve_captcha_action)
|
||||
ActionHandler.register_action_type(ActionType.CLICK, handle_click_action)
|
||||
ActionHandler.register_action_type(ActionType.INPUT_TEXT, handle_input_text_action)
|
||||
@@ -1433,6 +1504,8 @@ ActionHandler.register_action_type(ActionType.WAIT, handle_wait_action)
|
||||
ActionHandler.register_action_type(ActionType.TERMINATE, handle_terminate_action)
|
||||
ActionHandler.register_action_type(ActionType.COMPLETE, handle_complete_action)
|
||||
ActionHandler.register_action_type(ActionType.EXTRACT, handle_extract_action)
|
||||
ActionHandler.register_action_type(ActionType.SCROLL, handle_scroll_action)
|
||||
ActionHandler.register_action_type(ActionType.KEYPRESS, handle_keypress_action)
|
||||
|
||||
|
||||
async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> Any:
|
||||
|
||||
@@ -2,6 +2,7 @@ from __future__ import annotations
|
||||
|
||||
from typing import Any
|
||||
|
||||
from openai.types.responses.response import Response as OpenAIResponse
|
||||
from pydantic import BaseModel
|
||||
|
||||
from skyvern.config import settings
|
||||
@@ -40,6 +41,7 @@ class DetailedAgentStepOutput(BaseModel):
|
||||
action_results: list[ActionResult] | None
|
||||
actions_and_results: list[tuple[Action, list[ActionResult]]] | None
|
||||
step_exception: str | None = None
|
||||
cua_response: OpenAIResponse | None = None
|
||||
|
||||
class Config:
|
||||
exclude = ["scraped_page", "extract_action_prompt"]
|
||||
@@ -72,6 +74,7 @@ class DetailedAgentStepOutput(BaseModel):
|
||||
if self.actions_and_results is None
|
||||
else [(action, result) for action, result in self.actions_and_results if result],
|
||||
step_exception=self.step_exception,
|
||||
cua_response=self.cua_response,
|
||||
)
|
||||
|
||||
def to_agent_step_output(self) -> AgentStepOutput:
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
import structlog
|
||||
from openai.types.responses.response import Response as OpenAIResponse
|
||||
from pydantic import ValidationError
|
||||
|
||||
from skyvern.exceptions import UnsupportedActionType
|
||||
from skyvern.forge.sdk.models import Step
|
||||
from skyvern.forge.sdk.schemas.tasks import Task
|
||||
from skyvern.webeye.actions.actions import (
|
||||
Action,
|
||||
@@ -13,7 +15,9 @@ from skyvern.webeye.actions.actions import (
|
||||
CompleteAction,
|
||||
DownloadFileAction,
|
||||
InputTextAction,
|
||||
KeypressAction,
|
||||
NullAction,
|
||||
ScrollAction,
|
||||
SelectOption,
|
||||
SelectOptionAction,
|
||||
SolveCaptchaAction,
|
||||
@@ -194,3 +198,104 @@ def parse_actions(
|
||||
)
|
||||
############################ This part of code might not be needed ############################
|
||||
return actions
|
||||
|
||||
|
||||
def parse_cua_actions(
|
||||
task: Task,
|
||||
step: Step,
|
||||
response: OpenAIResponse,
|
||||
) -> list[Action]:
|
||||
computer_calls = [item for item in response.output if item.type == "computer_call"]
|
||||
reasonings = [item for item in response.output if item.type == "reasoning"]
|
||||
actions: list[Action] = []
|
||||
for idx, computer_call in enumerate(computer_calls):
|
||||
cua_action = computer_call.action
|
||||
action_type = cua_action.type
|
||||
try:
|
||||
reasoning = None
|
||||
if idx < len(reasonings):
|
||||
try:
|
||||
reasoning = reasonings[idx].summary[0].text
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Failed to parse reasoning",
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
step_order=step.order,
|
||||
action_order=idx,
|
||||
)
|
||||
|
||||
match action_type:
|
||||
case "click":
|
||||
button = cua_action.button
|
||||
if button != "left" and button != "right":
|
||||
button = "left"
|
||||
reasoning = reasoning or f"Click at: ({cua_action.x}, {cua_action.y})"
|
||||
action = ClickAction(
|
||||
element_id="",
|
||||
x=cua_action.x,
|
||||
y=cua_action.y,
|
||||
button=button,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=f"Click at: ({cua_action.x}, {cua_action.y})",
|
||||
)
|
||||
case "scroll":
|
||||
reasoning = reasoning or f"Scroll by: ({cua_action.x}, {cua_action.y})"
|
||||
action = ScrollAction(
|
||||
element_id="",
|
||||
x=cua_action.x,
|
||||
y=cua_action.y,
|
||||
scroll_x=cua_action.scroll_x,
|
||||
scroll_y=cua_action.scroll_y,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=f"Scroll by: ({cua_action.x}, {cua_action.y})",
|
||||
)
|
||||
case "keypress":
|
||||
reasoning_str = f"Press keys: {cua_action.keys}"
|
||||
if len(cua_action.keys) == 1:
|
||||
reasoning_str = f"Press the '{cua_action.keys[0]}' key"
|
||||
reasoning = reasoning or reasoning_str
|
||||
action = KeypressAction(
|
||||
element_id="",
|
||||
keys=cua_action.keys,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=str(cua_action.keys),
|
||||
)
|
||||
case "type":
|
||||
action = InputTextAction(
|
||||
element_id="",
|
||||
text=cua_action.text,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
response=cua_action.text,
|
||||
)
|
||||
case "wait":
|
||||
action = WaitAction(
|
||||
seconds=5,
|
||||
reasoning=reasoning,
|
||||
intention=reasoning,
|
||||
)
|
||||
case _:
|
||||
raise ValueError(f"Unsupported action type: {action_type}")
|
||||
action.organization_id = task.organization_id
|
||||
action.workflow_run_id = task.workflow_run_id
|
||||
action.task_id = task.task_id
|
||||
action.step_id = step.step_id
|
||||
action.step_order = step.order
|
||||
action.action_order = idx
|
||||
actions.append(action)
|
||||
except Exception:
|
||||
LOG.exception(
|
||||
"Failed to parse action",
|
||||
task_id=task.task_id,
|
||||
step_id=step.step_id,
|
||||
step_order=step.order,
|
||||
action_order=idx,
|
||||
)
|
||||
break
|
||||
if not actions:
|
||||
return [CompleteAction(reasoning="No actions generated", verified=True)]
|
||||
return actions
|
||||
|
||||
@@ -326,13 +326,14 @@ class ScrapedPage(BaseModel):
|
||||
element["children"] = new_children
|
||||
return element
|
||||
|
||||
async def refresh(self, draw_boxes: bool = True) -> Self:
|
||||
async def refresh(self, draw_boxes: bool = True, scroll: bool = True) -> Self:
|
||||
refreshed_page = await scrape_website(
|
||||
browser_state=self._browser_state,
|
||||
url=self.url,
|
||||
cleanup_element_tree=self._clean_up_func,
|
||||
scrape_exclude=self._scrape_exclude,
|
||||
draw_boxes=draw_boxes,
|
||||
scroll=scroll,
|
||||
)
|
||||
self.elements = refreshed_page.elements
|
||||
self.id_to_css_dict = refreshed_page.id_to_css_dict
|
||||
@@ -366,6 +367,8 @@ async def scrape_website(
|
||||
scrape_exclude: ScrapeExcludeFunc | None = None,
|
||||
take_screenshots: bool = True,
|
||||
draw_boxes: bool = True,
|
||||
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||
scroll: bool = True,
|
||||
) -> ScrapedPage:
|
||||
"""
|
||||
************************************************************************************************
|
||||
@@ -397,6 +400,8 @@ async def scrape_website(
|
||||
scrape_exclude=scrape_exclude,
|
||||
take_screenshots=take_screenshots,
|
||||
draw_boxes=draw_boxes,
|
||||
max_screenshot_number=max_screenshot_number,
|
||||
scroll=scroll,
|
||||
)
|
||||
except Exception as e:
|
||||
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
|
||||
@@ -420,6 +425,8 @@ async def scrape_website(
|
||||
scrape_exclude=scrape_exclude,
|
||||
take_screenshots=take_screenshots,
|
||||
draw_boxes=draw_boxes,
|
||||
max_screenshot_number=max_screenshot_number,
|
||||
scroll=scroll,
|
||||
)
|
||||
|
||||
|
||||
@@ -469,6 +476,8 @@ async def scrape_web_unsafe(
|
||||
scrape_exclude: ScrapeExcludeFunc | None = None,
|
||||
take_screenshots: bool = True,
|
||||
draw_boxes: bool = True,
|
||||
max_screenshot_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||
scroll: bool = True,
|
||||
) -> ScrapedPage:
|
||||
"""
|
||||
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
|
||||
@@ -503,7 +512,6 @@ async def scrape_web_unsafe(
|
||||
json_to_html(element, need_skyvern_attrs=False) for element in element_tree_trimmed
|
||||
)
|
||||
token_count = count_tokens(element_tree_trimmed_html_str)
|
||||
max_screenshot_number = settings.MAX_NUM_SCREENSHOTS
|
||||
if token_count > DEFAULT_MAX_TOKENS:
|
||||
max_screenshot_number = min(max_screenshot_number, 1)
|
||||
|
||||
@@ -512,6 +520,7 @@ async def scrape_web_unsafe(
|
||||
url=url,
|
||||
draw_boxes=draw_boxes,
|
||||
max_number=max_screenshot_number,
|
||||
scroll=scroll,
|
||||
)
|
||||
id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids = build_element_dict(
|
||||
elements
|
||||
|
||||
@@ -98,8 +98,11 @@ class SkyvernFrame:
|
||||
url: str,
|
||||
draw_boxes: bool = False,
|
||||
max_number: int = settings.MAX_NUM_SCREENSHOTS,
|
||||
scroll: bool = True,
|
||||
) -> List[bytes]:
|
||||
skyvern_page = await SkyvernFrame.create_instance(frame=page)
|
||||
if not scroll:
|
||||
return [await SkyvernFrame.take_screenshot(page=skyvern_page.frame, full_page=False)]
|
||||
|
||||
# page is the main frame and the index must be 0
|
||||
assert isinstance(skyvern_page.frame, Page)
|
||||
|
||||
Reference in New Issue
Block a user