From 775da1887824822bb98efac8d948940cef0aee74 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Fri, 13 Jun 2025 23:59:50 -0700 Subject: [PATCH] current viewpoint screenshot and scrolling n screenshot (#2716) Co-authored-by: lawyzheng --- ...6b27e8e961_add_screenshot_scroll_number.py | 37 ++++++ skyvern-frontend/src/api/types.ts | 2 + .../routes/tasks/create/CreateNewTaskForm.tsx | 43 +++++++ .../tasks/create/CreateNewTaskFormPage.tsx | 2 + .../src/routes/tasks/create/PromptBox.tsx | 23 +++- .../routes/tasks/create/retry/RetryTask.tsx | 2 + .../src/routes/tasks/create/taskFormTypes.ts | 1 + .../src/routes/workflows/RunWorkflowForm.tsx | 62 ++++++++- .../src/routes/workflows/WorkflowRun.tsx | 3 + .../workflows/WorkflowRunParameters.tsx | 6 + .../routes/workflows/editor/FlowRenderer.tsx | 2 + .../workflows/editor/WorkflowEditor.tsx | 1 + .../editor/nodes/StartNode/StartNode.tsx | 24 ++++ .../workflows/editor/nodes/StartNode/types.ts | 1 + .../editor/nodes/Taskv2Node/types.ts | 3 + .../workflows/editor/workflowEditorUtils.ts | 5 + .../routes/workflows/types/workflowTypes.ts | 2 + .../workflows/types/workflowYamlTypes.ts | 1 + skyvern/constants.py | 1 + skyvern/forge/agent.py | 26 +++- skyvern/forge/sdk/core/skyvern_context.py | 1 + skyvern/forge/sdk/db/client.py | 8 ++ skyvern/forge/sdk/db/models.py | 4 + skyvern/forge/sdk/db/utils.py | 3 + skyvern/forge/sdk/executor/async_executor.py | 1 + skyvern/forge/sdk/routes/agent_protocol.py | 6 + skyvern/forge/sdk/schemas/task_v2.py | 3 + skyvern/forge/sdk/schemas/tasks.py | 7 ++ skyvern/forge/sdk/workflow/models/block.py | 18 ++- skyvern/forge/sdk/workflow/models/workflow.py | 4 + skyvern/forge/sdk/workflow/models/yaml.py | 1 + skyvern/forge/sdk/workflow/service.py | 8 ++ skyvern/schemas/runs.py | 14 ++- skyvern/services/run_service.py | 1 + skyvern/services/task_v2_service.py | 9 +- skyvern/services/workflow_service.py | 1 + skyvern/webeye/browser_factory.py | 30 ++++- skyvern/webeye/scraper/domUtils.js | 3 +- 
skyvern/webeye/utils/page.py | 118 +++++++++++++++--- 39 files changed, 452 insertions(+), 35 deletions(-) create mode 100644 alembic/versions/2025_06_14_0649-2c6b27e8e961_add_screenshot_scroll_number.py diff --git a/alembic/versions/2025_06_14_0649-2c6b27e8e961_add_screenshot_scroll_number.py b/alembic/versions/2025_06_14_0649-2c6b27e8e961_add_screenshot_scroll_number.py new file mode 100644 index 00000000..10380859 --- /dev/null +++ b/alembic/versions/2025_06_14_0649-2c6b27e8e961_add_screenshot_scroll_number.py @@ -0,0 +1,37 @@ +"""add_screenshot_scroll_number + +Revision ID: 2c6b27e8e961 +Revises: 1517a4ba63fa +Create Date: 2025-06-14 06:49:43.628471+00:00 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "2c6b27e8e961" +down_revision: Union[str, None] = "1517a4ba63fa" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("observer_cruises", sa.Column("max_screenshot_scrolling_times", sa.Integer(), nullable=True)) + op.add_column("tasks", sa.Column("max_screenshot_scrolling_times", sa.Integer(), nullable=True)) + op.add_column("workflow_runs", sa.Column("max_screenshot_scrolling_times", sa.Integer(), nullable=True)) + op.add_column("workflows", sa.Column("max_screenshot_scrolling_times", sa.Integer(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! 
### + op.drop_column("workflows", "max_screenshot_scrolling_times") + op.drop_column("workflow_runs", "max_screenshot_scrolling_times") + op.drop_column("tasks", "max_screenshot_scrolling_times") + op.drop_column("observer_cruises", "max_screenshot_scrolling_times") + # ### end Alembic commands ### diff --git a/skyvern-frontend/src/api/types.ts b/skyvern-frontend/src/api/types.ts index cef3eb67..f5def2e6 100644 --- a/skyvern-frontend/src/api/types.ts +++ b/skyvern-frontend/src/api/types.ts @@ -141,6 +141,7 @@ export type CreateTaskRequest = { totp_identifier?: string | null; application?: string | null; include_action_history_in_verification?: boolean | null; + max_screenshot_scrolling_times?: number | null; }; export type User = { @@ -293,6 +294,7 @@ export type WorkflowRunStatusApiResponse = { total_cost: number | null; task_v2: TaskV2 | null; workflow_title: string | null; + max_screenshot_scrolling_times: number | null; }; export type TaskGenerationApiResponse = { diff --git a/skyvern-frontend/src/routes/tasks/create/CreateNewTaskForm.tsx b/skyvern-frontend/src/routes/tasks/create/CreateNewTaskForm.tsx index aa079dba..565bbbdc 100644 --- a/skyvern-frontend/src/routes/tasks/create/CreateNewTaskForm.tsx +++ b/skyvern-frontend/src/routes/tasks/create/CreateNewTaskForm.tsx @@ -39,6 +39,7 @@ import { } from "./taskFormTypes"; import { ProxySelector } from "@/components/ProxySelector"; import { Switch } from "@/components/ui/switch"; +import { MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT } from "@/routes/workflows/editor/nodes/Taskv2Node/types"; type Props = { initialValues: CreateNewTaskFormValues; }; @@ -80,6 +81,7 @@ function createTaskRequestObject( extracted_information_schema: extractedInformationSchema, totp_identifier: transform(formValues.totpIdentifier), error_code_mapping: errorCodeMapping, + max_screenshot_scrolling_times: formValues.maxScreenshotScrollingTimes, include_action_history_in_verification: formValues.includeActionHistoryInVerification, }; @@ -114,6 
+116,8 @@ function CreateNewTaskForm({ initialValues }: Props) { ...initialValues, maxStepsOverride: initialValues.maxStepsOverride ?? null, proxyLocation: initialValues.proxyLocation ?? ProxyLocation.Residential, + maxScreenshotScrollingTimes: + initialValues.maxScreenshotScrollingTimes ?? null, }, }); const { errors } = useFormState({ control: form.control }); @@ -557,6 +561,45 @@ function CreateNewTaskForm({ initialValues }: Props) { ); }} /> + ( + +
+ +
+

+ Max Scrolling Screenshots +

+

+ {`The maximum number of times to scroll down the page to take merged screenshots after action. Default is ${MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT}. If it's set to 0, it will take the current viewport screenshot.`} +

+
+
+
+ + { + const value = + event.target.value === "" + ? null + : Number(event.target.value); + field.onChange(value); + }} + /> + + +
+
+
+ )} + /> @@ -131,6 +132,7 @@ function CreateNewTaskFormPage() { includeActionHistoryInVerification: data.workflow_definition.blocks[0] .include_action_history_in_verification, + maxScreenshotScrollingTimes: data.max_screenshot_scrolling_times, }} /> diff --git a/skyvern-frontend/src/routes/tasks/create/PromptBox.tsx b/skyvern-frontend/src/routes/tasks/create/PromptBox.tsx index 21d8c466..3254973a 100644 --- a/skyvern-frontend/src/routes/tasks/create/PromptBox.tsx +++ b/skyvern-frontend/src/routes/tasks/create/PromptBox.tsx @@ -43,7 +43,10 @@ import { generateUniqueEmail, } from "../data/sampleTaskData"; import { ExampleCasePill } from "./ExampleCasePill"; -import { MAX_STEPS_DEFAULT } from "@/routes/workflows/editor/nodes/Taskv2Node/types"; +import { + MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT, + MAX_STEPS_DEFAULT, +} from "@/routes/workflows/editor/nodes/Taskv2Node/types"; function createTemplateTaskFromTaskGenerationParameters( values: TaskGenerationApiResponse, @@ -153,6 +156,8 @@ function PromptBox() { const [publishWorkflow, setPublishWorkflow] = useState(false); const [totpIdentifier, setTotpIdentifier] = useState(""); const [maxStepsOverride, setMaxStepsOverride] = useState(null); + const [maxScreenshotScrollingTimes, setMaxScreenshotScrollingTimes] = + useState(null); const [showAdvancedSettings, setShowAdvancedSettings] = useState(false); const [dataSchema, setDataSchema] = useState(null); @@ -167,6 +172,7 @@ function PromptBox() { proxy_location: proxyLocation, totp_identifier: totpIdentifier, publish_workflow: publishWorkflow, + max_screenshot_scrolling_times: maxScreenshotScrollingTimes, extracted_information_schema: dataSchema ? (() => { try { @@ -438,6 +444,21 @@ function PromptBox() { /> +
+
+
Max Scrolling Screenshots
+
+ {`The maximum number of times to scroll down the page to take merged screenshots after action. Default is ${MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT}. If it's set to 0, it will take the current viewport screenshot.`} +
+
+ { + setMaxScreenshotScrollingTimes(event.target.value); + }} + /> +
) : null} diff --git a/skyvern-frontend/src/routes/tasks/create/retry/RetryTask.tsx b/skyvern-frontend/src/routes/tasks/create/retry/RetryTask.tsx index 6755b5d1..74bcdd30 100644 --- a/skyvern-frontend/src/routes/tasks/create/retry/RetryTask.tsx +++ b/skyvern-frontend/src/routes/tasks/create/retry/RetryTask.tsx @@ -44,6 +44,8 @@ function RetryTask() { proxyLocation: task.request.proxy_location ?? null, includeActionHistoryInVerification: task.request.include_action_history_in_verification ?? false, + maxScreenshotScrollingTimes: + task.request.max_screenshot_scrolling_times ?? null, }} /> diff --git a/skyvern-frontend/src/routes/tasks/create/taskFormTypes.ts b/skyvern-frontend/src/routes/tasks/create/taskFormTypes.ts index efafcbcf..e58afc69 100644 --- a/skyvern-frontend/src/routes/tasks/create/taskFormTypes.ts +++ b/skyvern-frontend/src/routes/tasks/create/taskFormTypes.ts @@ -15,6 +15,7 @@ const createNewTaskFormSchemaBase = z.object({ errorCodeMapping: z.string().or(z.null()), proxyLocation: z.nativeEnum(ProxyLocation).or(z.null()), includeActionHistoryInVerification: z.boolean().or(z.null()).default(false), + maxScreenshotScrollingTimes: z.number().or(z.null()).default(null), }); const savedTaskFormSchemaBase = createNewTaskFormSchemaBase.extend({ diff --git a/skyvern-frontend/src/routes/workflows/RunWorkflowForm.tsx b/skyvern-frontend/src/routes/workflows/RunWorkflowForm.tsx index da36beb4..99648871 100644 --- a/skyvern-frontend/src/routes/workflows/RunWorkflowForm.tsx +++ b/skyvern-frontend/src/routes/workflows/RunWorkflowForm.tsx @@ -28,12 +28,14 @@ import { WorkflowParameter } from "./types/workflowTypes"; import { WorkflowParameterInput } from "./WorkflowParameterInput"; import { AxiosError } from "axios"; import { getLabelForWorkflowParameterType } from "./editor/workflowEditorUtils"; +import { MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT } from "./editor/nodes/Taskv2Node/types"; type Props = { workflowParameters: Array; initialValues: Record; initialSettings: 
{ proxyLocation: ProxyLocation; webhookCallbackUrl: string; + maxScreenshotScrollingTimes: number | null; }; }; @@ -73,14 +75,20 @@ type RunWorkflowRequestBody = { proxy_location: ProxyLocation | null; webhook_callback_url?: string | null; browser_session_id: string | null; + max_screenshot_scrolling_times?: number | null; }; function getRunWorkflowRequestBody( values: RunWorkflowFormType, workflowParameters: Array, ): RunWorkflowRequestBody { - const { webhookCallbackUrl, proxyLocation, browserSessionId, ...parameters } = - values; + const { + webhookCallbackUrl, + proxyLocation, + browserSessionId, + maxScreenshotScrollingTimes, + ...parameters + } = values; const parsedParameters = parseValuesForWorkflowRun( parameters, @@ -95,6 +103,10 @@ function getRunWorkflowRequestBody( browser_session_id: bsi, }; + if (typeof maxScreenshotScrollingTimes === "number") { + body.max_screenshot_scrolling_times = maxScreenshotScrollingTimes; + } + if (webhookCallbackUrl) { body.webhook_callback_url = webhookCallbackUrl; } @@ -106,6 +118,7 @@ type RunWorkflowFormType = Record & { webhookCallbackUrl: string; proxyLocation: ProxyLocation; browserSessionId: string | null; + maxScreenshotScrollingTimes: number | null; }; function RunWorkflowForm({ @@ -127,6 +140,7 @@ function RunWorkflowForm({ webhookCallbackUrl: initialSettings.webhookCallbackUrl, proxyLocation: initialSettings.proxyLocation, browserSessionId: browserSessionIdDefault, + maxScreenshotScrollingTimes: initialSettings.maxScreenshotScrollingTimes, }, }); const apiCredential = useApiCredential(); @@ -177,6 +191,7 @@ function RunWorkflowForm({ webhookCallbackUrl, proxyLocation, browserSessionId, + maxScreenshotScrollingTimes, ...parameters } = values; @@ -189,6 +204,7 @@ function RunWorkflowForm({ webhookCallbackUrl, proxyLocation, browserSessionId, + maxScreenshotScrollingTimes, }); } @@ -392,6 +408,48 @@ function RunWorkflowForm({ ); }} /> + { + return ( + 
+ +
+
+ Max Scrolling Screenshots +
+

+ {`The maximum number of times to scroll down the page to take merged screenshots after action. Default is ${MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT}. If it's set to 0, it will take the current viewport screenshot.`} +

+
+
+
+ + { + const value = + event.target.value === "" + ? null + : Number(event.target.value); + field.onChange(value); + }} + /> + + +
+
+
+ ); + }} + />
@@ -244,6 +246,7 @@ function WorkflowRun() { data: parameters, proxyLocation, webhookCallbackUrl: workflowRun?.webhook_callback_url ?? "", + maxScreenshotScrollingTimes, }} > diff --git a/skyvern-frontend/src/routes/workflows/WorkflowRunParameters.tsx b/skyvern-frontend/src/routes/workflows/WorkflowRunParameters.tsx index 8de6821f..2fe037d3 100644 --- a/skyvern-frontend/src/routes/workflows/WorkflowRunParameters.tsx +++ b/skyvern-frontend/src/routes/workflows/WorkflowRunParameters.tsx @@ -30,6 +30,8 @@ function WorkflowRunParameters() { const proxyLocation = location.state ? (location.state.proxyLocation as ProxyLocation) : null; + const maxScreenshotScrollingTimes = + location.state?.maxScreenshotScrollingTimes ?? null; const webhookCallbackUrl = location.state ? (location.state.webhookCallbackUrl as string) @@ -109,6 +111,10 @@ function WorkflowRunParameters() { ProxyLocation.Residential, webhookCallbackUrl: webhookCallbackUrl ?? workflow.webhook_callback_url ?? "", + maxScreenshotScrollingTimes: + maxScreenshotScrollingTimes ?? + workflow.max_screenshot_scrolling_times ?? + null, }} />
diff --git a/skyvern-frontend/src/routes/workflows/editor/FlowRenderer.tsx b/skyvern-frontend/src/routes/workflows/editor/FlowRenderer.tsx index 357add8c..a67653ef 100644 --- a/skyvern-frontend/src/routes/workflows/editor/FlowRenderer.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/FlowRenderer.tsx @@ -289,6 +289,8 @@ function FlowRenderer({ webhook_callback_url: data.settings.webhookCallbackUrl, persist_browser_session: data.settings.persistBrowserSession, model: data.settings.model, + max_screenshot_scrolling_times: + data.settings.maxScreenshotScrollingTimes, totp_verification_url: workflow.totp_verification_url, workflow_definition: { parameters: data.parameters, diff --git a/skyvern-frontend/src/routes/workflows/editor/WorkflowEditor.tsx b/skyvern-frontend/src/routes/workflows/editor/WorkflowEditor.tsx index 144ac78d..322a3ef2 100644 --- a/skyvern-frontend/src/routes/workflows/editor/WorkflowEditor.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/WorkflowEditor.tsx @@ -60,6 +60,7 @@ function WorkflowEditor() { proxyLocation: workflow.proxy_location, webhookCallbackUrl: workflow.webhook_callback_url, model: workflow.model, + maxScreenshotScrollingTimes: workflow.max_screenshot_scrolling_times, }; const elements = getElements( diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/StartNode.tsx b/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/StartNode.tsx index b2a2774d..1e0a97f5 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/StartNode.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/StartNode.tsx @@ -20,6 +20,7 @@ import { Separator } from "@/components/ui/separator"; import { ModelsResponse } from "@/api/types"; import { ModelSelector } from "@/components/ModelSelector"; import { WorkflowModel } from "@/routes/workflows/types/workflowTypes"; +import { MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT } from "../Taskv2Node/types"; function StartNode({ id, data }: NodeProps) { 
const credentialGetter = useCredentialGetter(); @@ -51,6 +52,9 @@ function StartNode({ id, data }: NodeProps) { ? data.persistBrowserSession : false, model: data.withWorkflowSettings ? data.model : workflowModel, + maxScreenshotScrollingTimes: data.withWorkflowSettings + ? data.maxScreenshotScrollingTimes + : null, }); function handleChange(key: string, value: unknown) { @@ -130,6 +134,26 @@ function StartNode({ id, data }: NodeProps) { /> +
+
+ + +
+ { + const value = + event.target.value === "" + ? null + : Number(event.target.value); + + handleChange("maxScreenshotScrollingTimes", value); + }} + /> +
diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/types.ts b/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/types.ts index 11f54be4..6fa5ae31 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/types.ts +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/StartNode/types.ts @@ -9,6 +9,7 @@ export type WorkflowStartNodeData = { proxyLocation: ProxyLocation; persistBrowserSession: boolean; model: WorkflowModel | null; + maxScreenshotScrollingTimes: number | null; editable: boolean; }; diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/Taskv2Node/types.ts b/skyvern-frontend/src/routes/workflows/editor/nodes/Taskv2Node/types.ts index cafdea7d..f3cc549b 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/Taskv2Node/types.ts +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/Taskv2Node/types.ts @@ -2,6 +2,7 @@ import { Node } from "@xyflow/react"; import { NodeBaseData } from "../types"; export const MAX_STEPS_DEFAULT = 25; +export const MAX_SCREENSHOT_SCROLLING_TIMES_DEFAULT = 3; export type Taskv2NodeData = NodeBaseData & { prompt: string; @@ -9,6 +10,7 @@ export type Taskv2NodeData = NodeBaseData & { totpVerificationUrl: string | null; totpIdentifier: string | null; maxSteps: number | null; + maxScreenshotScrollingTimes: number | null; }; export type Taskv2Node = Node; @@ -23,6 +25,7 @@ export const taskv2NodeDefaultData: Taskv2NodeData = { totpVerificationUrl: null, maxSteps: MAX_STEPS_DEFAULT, model: null, + maxScreenshotScrollingTimes: null, }; export function isTaskV2Node(node: Node): node is Taskv2Node { diff --git a/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts b/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts index cc239295..ee2c0e2a 100644 --- a/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts +++ b/skyvern-frontend/src/routes/workflows/editor/workflowEditorUtils.ts @@ -251,6 +251,7 @@ function 
convertToNode( maxSteps: block.max_steps, totpIdentifier: block.totp_identifier, totpVerificationUrl: block.totp_verification_url, + maxScreenshotScrollingTimes: null, }, }; } @@ -662,6 +663,7 @@ function getElements( proxyLocation: settings.proxyLocation ?? ProxyLocation.Residential, webhookCallbackUrl: settings.webhookCallbackUrl ?? "", model: settings.model, + maxScreenshotScrollingTimes: settings.maxScreenshotScrollingTimes, editable, }), ); @@ -1322,6 +1324,7 @@ function getWorkflowSettings(nodes: Array): WorkflowSettings { proxyLocation: ProxyLocation.Residential, webhookCallbackUrl: null, model: null, + maxScreenshotScrollingTimes: null, }; const startNodes = nodes.filter(isStartNode); const startNodeWithWorkflowSettings = startNodes.find( @@ -1337,6 +1340,7 @@ function getWorkflowSettings(nodes: Array): WorkflowSettings { proxyLocation: data.proxyLocation, webhookCallbackUrl: data.webhookCallbackUrl, model: data.model, + maxScreenshotScrollingTimes: data.maxScreenshotScrollingTimes, }; } return defaultSettings; @@ -1992,6 +1996,7 @@ function convert(workflow: WorkflowApiResponse): WorkflowCreateYAMLRequest { persist_browser_session: workflow.persist_browser_session, model: workflow.model, totp_verification_url: workflow.totp_verification_url, + max_screenshot_scrolling_times: workflow.max_screenshot_scrolling_times, workflow_definition: { parameters: convertParametersToParameterYAML(userParameters), blocks: convertBlocksToBlockYAML(workflow.workflow_definition.blocks), diff --git a/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts b/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts index 430b72ad..feef5c59 100644 --- a/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts +++ b/skyvern-frontend/src/routes/workflows/types/workflowTypes.ts @@ -470,6 +470,7 @@ export type WorkflowApiResponse = { model: WorkflowModel | null; totp_verification_url: string | null; totp_identifier: string | null; + max_screenshot_scrolling_times: 
number | null; created_at: string; modified_at: string; deleted_at: string | null; @@ -480,6 +481,7 @@ export type WorkflowSettings = { webhookCallbackUrl: string | null; persistBrowserSession: boolean; model: WorkflowModel | null; + maxScreenshotScrollingTimes: number | null; }; export type WorkflowModel = JsonObjectExtendable<{ model_name: string }>; diff --git a/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts b/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts index 9df7811d..e7a40553 100644 --- a/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts +++ b/skyvern-frontend/src/routes/workflows/types/workflowYamlTypes.ts @@ -12,6 +12,7 @@ export type WorkflowCreateYAMLRequest = { totp_verification_url?: string | null; workflow_definition: WorkflowDefinitionYAML; is_saved_task?: boolean; + max_screenshot_scrolling_times?: number | null; }; export type WorkflowDefinitionYAML = { diff --git a/skyvern/constants.py b/skyvern/constants.py index 52de64b1..d1a69a9b 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -20,6 +20,7 @@ AUTO_COMPLETION_POTENTIAL_VALUES_COUNT = 3 DROPDOWN_MENU_MAX_DISTANCE = 100 BROWSER_DOWNLOADING_SUFFIX = ".crdownload" MAX_UPLOAD_FILE_COUNT = 50 +DEFAULT_MAX_SCREENSHOT_SCROLLING_TIMES = 3 # reserved fields for navigation payload SPECIAL_FIELD_VERIFICATION_CODE = "verification_code" diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 8dec9606..73ffd6fe 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -19,6 +19,7 @@ from skyvern import analytics from skyvern.config import settings from skyvern.constants import ( BROWSER_DOWNLOADING_SUFFIX, + DEFAULT_MAX_SCREENSHOT_SCROLLING_TIMES, GET_DOWNLOADED_FILES_TIMEOUT, SAVE_DOWNLOADED_FILES_TIMEOUT, SCRAPE_TYPE_ORDER, @@ -181,6 +182,7 @@ class ForgeAgent: error_code_mapping=task_block.error_code_mapping, include_action_history_in_verification=task_block.include_action_history_in_verification, model=task_block.model, 
+ max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times, ) LOG.info( "Created a new task for workflow run", @@ -237,6 +239,7 @@ class ForgeAgent: application=task_request.application, include_action_history_in_verification=task_request.include_action_history_in_verification, model=task_request.model, + max_screenshot_scrolling_times=task_request.max_screenshot_scrolling_times, ) LOG.info( "Created new task", @@ -1650,12 +1653,22 @@ class ForgeAgent: if not working_page: raise BrowserStateMissingPage() - fullpage_screenshot = True + context = skyvern_context.ensure_context() + scrolling_number = context.max_screenshot_scrolling_times + if scrolling_number is None: + scrolling_number = DEFAULT_MAX_SCREENSHOT_SCROLLING_TIMES + if engine in CUA_ENGINES: - fullpage_screenshot = False + scrolling_number = 0 try: - screenshot = await browser_state.take_screenshot(full_page=fullpage_screenshot) + screenshot = await browser_state.take_post_action_screenshot( + scrolling_number=scrolling_number, + use_playwright_fullpage=app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached( + "ENABLE_PLAYWRIGHT_FULLPAGE", + str(task.organization_id), + ), + ) await app.ARTIFACT_MANAGER.create_artifact( step=step, artifact_type=ArtifactType.SCREENSHOT_ACTION, @@ -2135,7 +2148,12 @@ class ForgeAgent: browser_state = app.BROWSER_MANAGER.get_for_task(task.task_id) if browser_state is not None and await browser_state.get_working_page() is not None: try: - screenshot = await browser_state.take_screenshot(full_page=True) + screenshot = await browser_state.take_fullpage_screenshot( + use_playwright_fullpage=app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached( + "ENABLE_PLAYWRIGHT_FULLPAGE", + str(task.organization_id), + ) + ) await app.ARTIFACT_MANAGER.create_artifact( step=last_step, artifact_type=ArtifactType.SCREENSHOT_FINAL, diff --git a/skyvern/forge/sdk/core/skyvern_context.py b/skyvern/forge/sdk/core/skyvern_context.py index 819cb3ab..d51b9fa8 100644 --- 
a/skyvern/forge/sdk/core/skyvern_context.py +++ b/skyvern/forge/sdk/core/skyvern_context.py @@ -23,6 +23,7 @@ class SkyvernContext: hashed_href_map: dict[str, str] = field(default_factory=dict) refresh_working_page: bool = False frame_index_map: dict[Frame, int] = field(default_factory=dict) + max_screenshot_scrolling_times: int | None = None def __repr__(self) -> str: return f"SkyvernContext(request_id={self.request_id}, organization_id={self.organization_id}, task_id={self.task_id}, workflow_id={self.workflow_id}, workflow_run_id={self.workflow_run_id}, task_v2_id={self.task_v2_id}, max_steps_override={self.max_steps_override})" diff --git a/skyvern/forge/sdk/db/client.py b/skyvern/forge/sdk/db/client.py index 0f035b8b..a3697c6a 100644 --- a/skyvern/forge/sdk/db/client.py +++ b/skyvern/forge/sdk/db/client.py @@ -149,6 +149,7 @@ class AgentDB: application: str | None = None, include_action_history_in_verification: bool | None = None, model: dict[str, Any] | None = None, + max_screenshot_scrolling_times: int | None = None, ) -> Task: try: async with self.Session() as session: @@ -176,6 +177,7 @@ class AgentDB: application=application, include_action_history_in_verification=include_action_history_in_verification, model=model, + max_screenshot_scrolling_times=max_screenshot_scrolling_times, ) session.add(new_task) await session.commit() @@ -1217,6 +1219,7 @@ class AgentDB: description: str | None = None, proxy_location: ProxyLocation | None = None, webhook_callback_url: str | None = None, + max_screenshot_scrolling_times: int | None = None, totp_verification_url: str | None = None, totp_identifier: str | None = None, persist_browser_session: bool = False, @@ -1236,6 +1239,7 @@ class AgentDB: webhook_callback_url=webhook_callback_url, totp_verification_url=totp_verification_url, totp_identifier=totp_identifier, + max_screenshot_scrolling_times=max_screenshot_scrolling_times, persist_browser_session=persist_browser_session, model=model, is_saved_task=is_saved_task, @@ 
-1479,6 +1483,7 @@ class AgentDB: totp_verification_url: str | None = None, totp_identifier: str | None = None, parent_workflow_run_id: str | None = None, + max_screenshot_scrolling_times: int | None = None, ) -> WorkflowRun: try: async with self.Session() as session: @@ -1492,6 +1497,7 @@ class AgentDB: totp_verification_url=totp_verification_url, totp_identifier=totp_identifier, parent_workflow_run_id=parent_workflow_run_id, + max_screenshot_scrolling_times=max_screenshot_scrolling_times, ) session.add(workflow_run) await session.commit() @@ -2436,6 +2442,7 @@ class AgentDB: extracted_information_schema: dict | list | str | None = None, error_code_mapping: dict | None = None, model: dict[str, Any] | None = None, + max_screenshot_scrolling_times: int | None = None, ) -> TaskV2: async with self.Session() as session: new_task_v2 = TaskV2Model( @@ -2452,6 +2459,7 @@ class AgentDB: error_code_mapping=error_code_mapping, organization_id=organization_id, model=model, + max_screenshot_scrolling_times=max_screenshot_scrolling_times, ) session.add(new_task_v2) await session.commit() diff --git a/skyvern/forge/sdk/db/models.py b/skyvern/forge/sdk/db/models.py index a5cbf913..b41c1b42 100644 --- a/skyvern/forge/sdk/db/models.py +++ b/skyvern/forge/sdk/db/models.py @@ -88,6 +88,7 @@ class TaskModel(Base): queued_at = Column(DateTime, nullable=True) started_at = Column(DateTime, nullable=True) finished_at = Column(DateTime, nullable=True) + max_screenshot_scrolling_times = Column(Integer, nullable=True) created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False, index=True) modified_at = Column( DateTime, @@ -218,6 +219,7 @@ class WorkflowModel(Base): workflow_definition = Column(JSON, nullable=False) proxy_location = Column(String) webhook_callback_url = Column(String) + max_screenshot_scrolling_times = Column(Integer, nullable=True) totp_verification_url = Column(String) totp_identifier = Column(String) persist_browser_session = Column(Boolean, 
default=False, nullable=False) @@ -254,6 +256,7 @@ class WorkflowRunModel(Base): webhook_callback_url = Column(String) totp_verification_url = Column(String) totp_identifier = Column(String) + max_screenshot_scrolling_times = Column(Integer, nullable=True) queued_at = Column(DateTime, nullable=True) started_at = Column(DateTime, nullable=True) @@ -621,6 +624,7 @@ class TaskV2Model(Base): extracted_information_schema = Column(JSON, nullable=True) error_code_mapping = Column(JSON, nullable=True) max_steps = Column(Integer, nullable=True) + max_screenshot_scrolling_times = Column(Integer, nullable=True) queued_at = Column(DateTime, nullable=True) started_at = Column(DateTime, nullable=True) diff --git a/skyvern/forge/sdk/db/utils.py b/skyvern/forge/sdk/db/utils.py index 5bfe3454..441a07e3 100644 --- a/skyvern/forge/sdk/db/utils.py +++ b/skyvern/forge/sdk/db/utils.py @@ -142,6 +142,7 @@ def convert_to_task(task_obj: TaskModel, debug_enabled: bool = False, workflow_p queued_at=task_obj.queued_at, started_at=task_obj.started_at, finished_at=task_obj.finished_at, + max_screenshot_scrolling_times=task_obj.max_screenshot_scrolling_times, ) return task @@ -238,6 +239,7 @@ def convert_to_workflow(workflow_model: WorkflowModel, debug_enabled: bool = Fal persist_browser_session=workflow_model.persist_browser_session, model=workflow_model.model, proxy_location=(ProxyLocation(workflow_model.proxy_location) if workflow_model.proxy_location else None), + max_screenshot_scrolling_times=workflow_model.max_screenshot_scrolling_times, version=workflow_model.version, is_saved_task=workflow_model.is_saved_task, description=workflow_model.description, @@ -278,6 +280,7 @@ def convert_to_workflow_run( created_at=workflow_run_model.created_at, modified_at=workflow_run_model.modified_at, workflow_title=workflow_title, + max_screenshot_scrolling_times=workflow_run_model.max_screenshot_scrolling_times, ) diff --git a/skyvern/forge/sdk/executor/async_executor.py 
b/skyvern/forge/sdk/executor/async_executor.py index a13aa161..b579fafd 100644 --- a/skyvern/forge/sdk/executor/async_executor.py +++ b/skyvern/forge/sdk/executor/async_executor.py @@ -107,6 +107,7 @@ class BackgroundTaskExecutor(AsyncExecutor): context.task_id = task.task_id context.organization_id = organization_id context.max_steps_override = max_steps_override + context.max_screenshot_scrolling_times = task.max_screenshot_scrolling_times if background_tasks: await initialize_skyvern_state_file(task_id=task_id, organization_id=organization_id) diff --git a/skyvern/forge/sdk/routes/agent_protocol.py b/skyvern/forge/sdk/routes/agent_protocol.py index e1441796..99a92282 100644 --- a/skyvern/forge/sdk/routes/agent_protocol.py +++ b/skyvern/forge/sdk/routes/agent_protocol.py @@ -166,6 +166,7 @@ async def run_task( totp_identifier=run_request.totp_identifier, include_action_history_in_verification=run_request.include_action_history_in_verification, model=run_request.model, + max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times, ) task_v1_response = await task_v1_service.run_task( task=task_v1_request, @@ -203,6 +204,7 @@ async def run_task( data_extraction_schema=task_v1_response.extracted_information_schema, error_code_mapping=task_v1_response.error_code_mapping, browser_session_id=run_request.browser_session_id, + max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times, ), ) if run_request.engine == RunEngine.skyvern_v2: @@ -221,6 +223,7 @@ async def run_task( error_code_mapping=run_request.error_code_mapping, create_task_run=True, model=run_request.model, + max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times, ) except MissingBrowserAddressError as e: raise HTTPException(status_code=400, detail=str(e)) from e @@ -263,6 +266,7 @@ async def run_task( error_code_mapping=task_v2.error_code_mapping, data_extraction_schema=task_v2.extracted_information_schema, publish_workflow=run_request.publish_workflow, + 
max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times, ), ) LOG.error("Invalid agent engine", engine=run_request.engine, organization_id=current_org.organization_id) @@ -318,6 +322,7 @@ async def run_workflow( totp_identifier=workflow_run_request.totp_identifier, totp_url=workflow_run_request.totp_url, browser_session_id=workflow_run_request.browser_session_id, + max_screenshot_scrolling_times=workflow_run_request.max_screenshot_scrolling_times, ) try: @@ -1765,6 +1770,7 @@ async def run_task_v2( create_task_run=True, extracted_information_schema=data.extracted_information_schema, error_code_mapping=data.error_code_mapping, + max_screenshot_scrolling_times=data.max_screenshot_scrolling_times, ) except MissingBrowserAddressError as e: raise HTTPException(status_code=400, detail=str(e)) from e diff --git a/skyvern/forge/sdk/schemas/task_v2.py b/skyvern/forge/sdk/schemas/task_v2.py index 1978242a..8cbf418a 100644 --- a/skyvern/forge/sdk/schemas/task_v2.py +++ b/skyvern/forge/sdk/schemas/task_v2.py @@ -48,6 +48,8 @@ class TaskV2(BaseModel): queued_at: datetime | None = None started_at: datetime | None = None finished_at: datetime | None = None + max_screenshot_scrolling_times: int | None = None + created_at: datetime modified_at: datetime @@ -147,6 +149,7 @@ class TaskV2Request(BaseModel): publish_workflow: bool = False extracted_information_schema: dict | list | str | None = None error_code_mapping: dict[str, str] | None = None + max_screenshot_scrolling_times: int | None = None @field_validator("url", "webhook_callback_url", "totp_verification_url") @classmethod diff --git a/skyvern/forge/sdk/schemas/tasks.py b/skyvern/forge/sdk/schemas/tasks.py index a8268f9f..31b0a7e5 100644 --- a/skyvern/forge/sdk/schemas/tasks.py +++ b/skyvern/forge/sdk/schemas/tasks.py @@ -96,6 +96,11 @@ class TaskBase(BaseModel): description="Whether to include the action history when verifying the task is complete", examples=[True, False], ) + 
max_screenshot_scrolling_times: int | None = Field( + default=None, + description="Scroll down n times to get the merged screenshot of the page after taking an action. When it's None or 0, it takes the current viewpoint screenshot.", + examples=[10], + ) class TaskRequest(TaskBase): @@ -314,6 +319,7 @@ class Task(TaskBase): errors=self.errors, max_steps_per_run=self.max_steps_per_run, workflow_run_id=self.workflow_run_id, + max_screenshot_scrolling_times=self.max_screenshot_scrolling_times, ) @@ -337,6 +343,7 @@ class TaskResponse(BaseModel): queued_at: datetime | None = None started_at: datetime | None = None finished_at: datetime | None = None + max_screenshot_scrolling_times: int | None = None class TaskOutput(BaseModel): diff --git a/skyvern/forge/sdk/workflow/models/block.py b/skyvern/forge/sdk/workflow/models/block.py index 6b173a3a..ed16d452 100644 --- a/skyvern/forge/sdk/workflow/models/block.py +++ b/skyvern/forge/sdk/workflow/models/block.py @@ -307,7 +307,12 @@ class Block(BaseModel, abc.ABC): if not browser_state: LOG.warning("No browser state found when creating workflow_run_block", workflow_run_id=workflow_run_id) else: - screenshot = await browser_state.take_screenshot(full_page=True) + screenshot = await browser_state.take_fullpage_screenshot( + use_playwright_fullpage=app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached( + "ENABLE_PLAYWRIGHT_FULLPAGE", + str(organization_id), + ) + ) if screenshot: await app.ARTIFACT_MANAGER.create_workflow_run_block_artifact( workflow_run_block=workflow_run_block, @@ -569,8 +574,15 @@ class BaseTaskBlock(Block): browser_state = await app.BROWSER_MANAGER.get_or_create_for_workflow_run( workflow_run=workflow_run, url=self.url, browser_session_id=browser_session_id ) + # assert that the browser state is not None, otherwise we can't go through typing + assert browser_state is not None # add screenshot artifact for the first task - screenshot = await browser_state.take_screenshot(full_page=True) + screenshot = await 
browser_state.take_fullpage_screenshot( + use_playwright_fullpage=app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached( + "ENABLE_PLAYWRIGHT_FULLPAGE", + str(organization_id), + ) + ) if screenshot: await app.ARTIFACT_MANAGER.create_workflow_run_block_artifact( workflow_run_block=workflow_run_block, @@ -2486,6 +2498,7 @@ class TaskV2Block(Block): proxy_location=workflow_run.proxy_location, totp_identifier=self.totp_identifier, totp_verification_url=self.totp_verification_url, + max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times, ) await app.DATABASE.update_task_v2( task_v2.observer_cruise_id, status=TaskV2Status.queued, organization_id=organization_id @@ -2517,6 +2530,7 @@ class TaskV2Block(Block): workflow_permanent_id=workflow_run.workflow_permanent_id, workflow_run_id=workflow_run_id, browser_session_id=browser_session_id, + max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times, ) ) result_dict = None diff --git a/skyvern/forge/sdk/workflow/models/workflow.py b/skyvern/forge/sdk/workflow/models/workflow.py index 1880446c..0c9002dc 100644 --- a/skyvern/forge/sdk/workflow/models/workflow.py +++ b/skyvern/forge/sdk/workflow/models/workflow.py @@ -22,6 +22,7 @@ class WorkflowRequestBody(BaseModel): totp_verification_url: str | None = None totp_identifier: str | None = None browser_session_id: str | None = None + max_screenshot_scrolling_times: int | None = None @field_validator("webhook_callback_url", "totp_verification_url") @classmethod @@ -76,6 +77,7 @@ class Workflow(BaseModel): persist_browser_session: bool = False model: dict[str, Any] | None = None status: WorkflowStatus = WorkflowStatus.published + max_screenshot_scrolling_times: int | None = None created_at: datetime modified_at: datetime @@ -115,6 +117,7 @@ class WorkflowRun(BaseModel): failure_reason: str | None = None parent_workflow_run_id: str | None = None workflow_title: str | None = None + max_screenshot_scrolling_times: int | None = None queued_at: 
datetime | None = None started_at: datetime | None = None @@ -162,3 +165,4 @@ class WorkflowRunResponseBase(BaseModel): task_v2: TaskV2 | None = None workflow_title: str | None = None browser_session_id: str | None = None + max_screenshot_scrolling_times: int | None = None diff --git a/skyvern/forge/sdk/workflow/models/yaml.py b/skyvern/forge/sdk/workflow/models/yaml.py index a96cb38a..d2ce6ae4 100644 --- a/skyvern/forge/sdk/workflow/models/yaml.py +++ b/skyvern/forge/sdk/workflow/models/yaml.py @@ -424,4 +424,5 @@ class WorkflowCreateYAMLRequest(BaseModel): model: dict[str, Any] | None = None workflow_definition: WorkflowDefinitionYAML is_saved_task: bool = False + max_screenshot_scrolling_times: int | None = None status: WorkflowStatus = WorkflowStatus.published diff --git a/skyvern/forge/sdk/workflow/service.py b/skyvern/forge/sdk/workflow/service.py index 844d2a55..45819c27 100644 --- a/skyvern/forge/sdk/workflow/service.py +++ b/skyvern/forge/sdk/workflow/service.py @@ -169,6 +169,7 @@ class WorkflowService: organization_id=workflow.organization_id, proxy_location=workflow_request.proxy_location, webhook_callback_url=workflow_request.webhook_callback_url, + max_screenshot_scrolling_times=workflow_request.max_screenshot_scrolling_times, ) skyvern_context.set( SkyvernContext( @@ -178,6 +179,7 @@ class WorkflowService: workflow_id=workflow_id, workflow_run_id=workflow_run.workflow_run_id, max_steps_override=max_steps_override, + max_screenshot_scrolling_times=workflow_request.max_screenshot_scrolling_times, ) ) @@ -577,6 +579,7 @@ class WorkflowService: workflow_definition: WorkflowDefinition, description: str | None = None, proxy_location: ProxyLocation | None = None, + max_screenshot_scrolling_times: int | None = None, webhook_callback_url: str | None = None, totp_verification_url: str | None = None, totp_identifier: str | None = None, @@ -594,6 +597,7 @@ class WorkflowService: description=description, proxy_location=proxy_location, 
webhook_callback_url=webhook_callback_url, + max_screenshot_scrolling_times=max_screenshot_scrolling_times, totp_verification_url=totp_verification_url, totp_identifier=totp_identifier, persist_browser_session=persist_browser_session, @@ -767,6 +771,7 @@ class WorkflowService: totp_verification_url=workflow_request.totp_verification_url, totp_identifier=workflow_request.totp_identifier, parent_workflow_run_id=parent_workflow_run_id, + max_screenshot_scrolling_times=workflow_request.max_screenshot_scrolling_times, ) async def mark_workflow_run_as_completed(self, workflow_run_id: str) -> WorkflowRun: @@ -1180,6 +1185,7 @@ class WorkflowService: total_steps=total_steps, total_cost=total_cost, workflow_title=workflow.title, + max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times, ) async def clean_up_workflow( @@ -1453,6 +1459,7 @@ class WorkflowService: totp_identifier=request.totp_identifier, persist_browser_session=request.persist_browser_session, model=request.model, + max_screenshot_scrolling_times=request.max_screenshot_scrolling_times, workflow_permanent_id=workflow_permanent_id, version=existing_version + 1, is_saved_task=request.is_saved_task, @@ -1470,6 +1477,7 @@ class WorkflowService: totp_identifier=request.totp_identifier, persist_browser_session=request.persist_browser_session, model=request.model, + max_screenshot_scrolling_times=request.max_screenshot_scrolling_times, is_saved_task=request.is_saved_task, status=request.status, ) diff --git a/skyvern/schemas/runs.py b/skyvern/schemas/runs.py index 116f1c67..dd809eb7 100644 --- a/skyvern/schemas/runs.py +++ b/skyvern/schemas/runs.py @@ -279,6 +279,10 @@ class TaskRunRequest(BaseModel): include_action_history_in_verification: bool | None = Field( default=False, description="Whether to include action history when verifying that the task is complete" ) + max_screenshot_scrolling_times: int | None = Field( + default=None, + description="Scroll down n times to get the merged screenshot of 
the page after taking an action. When it's None or 0, it takes the current viewpoint screenshot.", + ) @field_validator("url", "webhook_url", "totp_url") @classmethod @@ -326,6 +330,10 @@ class WorkflowRunRequest(BaseModel): default=None, description="ID of a Skyvern browser session to reuse, having it continue from the current screen state", ) + max_screenshot_scrolling_times: int | None = Field( + default=None, + description="Scroll down n times to get the merged screenshot of the page after taking an action. When it's None or 0, it takes the current viewpoint screenshot.", + ) @field_validator("webhook_url", "totp_url") @classmethod @@ -368,9 +376,11 @@ class BaseRunResponse(BaseModel): examples=["https://app.skyvern.com/tasks/tsk_123", "https://app.skyvern.com/workflows/wpid_123/wr_123"], ) browser_session_id: str | None = Field( + default=None, description="ID of the Skyvern persistent browser session used for this run", examples=["pbs_123"] + ) + max_screenshot_scrolling_times: int | None = Field( default=None, - description="ID of the Skyvern persistent browser session used for this run", - examples=["pbs_123"], + description="Scroll down n times to get the merged screenshot of the page after taking an action. 
When it's NONE or 0, it takes the current view point screenshot.", ) diff --git a/skyvern/services/run_service.py b/skyvern/services/run_service.py index a301dd24..66f8d257 100644 --- a/skyvern/services/run_service.py +++ b/skyvern/services/run_service.py @@ -67,6 +67,7 @@ async def get_run_response(run_id: str, organization_id: str | None = None) -> R max_steps=task_v1_response.max_steps_per_run, data_extraction_schema=task_v1_response.request.extracted_information_schema, error_code_mapping=task_v1_response.request.error_code_mapping, + max_screenshot_scrolling_times=task_v1_response.request.max_screenshot_scrolling_times, ), ) elif run.task_run_type == RunType.task_v2: diff --git a/skyvern/services/task_v2_service.py b/skyvern/services/task_v2_service.py index 1b6bc705..e11cdfa4 100644 --- a/skyvern/services/task_v2_service.py +++ b/skyvern/services/task_v2_service.py @@ -165,6 +165,7 @@ async def initialize_task_v2( error_code_mapping: dict | None = None, create_task_run: bool = False, model: dict[str, Any] | None = None, + max_screenshot_scrolling_times: int | None = None, ) -> TaskV2: task_v2 = await app.DATABASE.create_task_v2( prompt=user_prompt, @@ -176,11 +177,13 @@ async def initialize_task_v2( extracted_information_schema=extracted_information_schema, error_code_mapping=error_code_mapping, model=model, + max_screenshot_scrolling_times=max_screenshot_scrolling_times, ) # set task_v2_id in context context = skyvern_context.current() if context: context.task_v2_id = task_v2.observer_cruise_id + context.max_screenshot_scrolling_times = max_screenshot_scrolling_times thought = await app.DATABASE.create_thought( task_v2_id=task_v2.observer_cruise_id, @@ -221,7 +224,9 @@ async def initialize_task_v2( ) workflow_run = await app.WORKFLOW_SERVICE.setup_workflow_run( request_id=None, - workflow_request=WorkflowRequestBody(), + workflow_request=WorkflowRequestBody( + max_screenshot_scrolling_times=max_screenshot_scrolling_times, + ), 
workflow_permanent_id=new_workflow.workflow_permanent_id, organization=organization, version=None, @@ -454,6 +459,7 @@ async def run_task_v2_helper( request_id=request_id, task_v2_id=task_v2_id, browser_session_id=browser_session_id, + max_screenshot_scrolling_times=task_v2.max_screenshot_scrolling_times, ) ) @@ -771,6 +777,7 @@ async def run_task_v2_helper( proxy_location=task_v2.proxy_location or ProxyLocation.RESIDENTIAL, workflow_definition=workflow_definition_yaml, status=workflow.status, + max_screenshot_scrolling_times=task_v2.max_screenshot_scrolling_times, ) LOG.info("Creating workflow from request", workflow_create_request=workflow_create_request) workflow = await app.WORKFLOW_SERVICE.create_workflow_from_request( diff --git a/skyvern/services/workflow_service.py b/skyvern/services/workflow_service.py index fff09c9b..47fd5ec0 100644 --- a/skyvern/services/workflow_service.py +++ b/skyvern/services/workflow_service.py @@ -97,6 +97,7 @@ async def get_workflow_run_response( webhook_url=workflow_run.webhook_callback_url or None, totp_url=workflow_run.totp_verification_url or None, totp_identifier=workflow_run.totp_identifier, + max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times, # TODO: add browser session id ), ) diff --git a/skyvern/webeye/browser_factory.py b/skyvern/webeye/browser_factory.py index d5ea7d10..76c0cc4b 100644 --- a/skyvern/webeye/browser_factory.py +++ b/skyvern/webeye/browser_factory.py @@ -35,7 +35,7 @@ from skyvern.exceptions import ( from skyvern.forge.sdk.api.files import get_download_dir, make_temp_directory from skyvern.forge.sdk.core.skyvern_context import current, ensure_context from skyvern.schemas.runs import ProxyLocation, get_tzinfo_from_proxy -from skyvern.webeye.utils.page import SkyvernFrame +from skyvern.webeye.utils.page import ScreenshotMode, SkyvernFrame LOG = structlog.get_logger() @@ -865,6 +865,30 @@ class BrowserState: except asyncio.TimeoutError: LOG.error("Timeout to close playwright, might 
leave the broswer opening forever") - async def take_screenshot(self, full_page: bool = False, file_path: str | None = None) -> bytes: + async def take_fullpage_screenshot( + self, + file_path: str | None = None, + use_playwright_fullpage: bool = False, # TODO: THIS IS ONLY FOR EXPERIMENT. will be removed after experiment. + ) -> bytes: page = await self.__assert_page() - return await SkyvernFrame.take_screenshot(page=page, full_page=full_page, file_path=file_path) + return await SkyvernFrame.take_scrolling_screenshot( + page=page, + file_path=file_path, + mode=ScreenshotMode.LITE, + use_playwright_fullpage=use_playwright_fullpage, + ) + + async def take_post_action_screenshot( + self, + scrolling_number: int, + file_path: str | None = None, + use_playwright_fullpage: bool = False, # TODO: THIS IS ONLY FOR EXPERIMENT. will be removed after experiment. + ) -> bytes: + page = await self.__assert_page() + return await SkyvernFrame.take_scrolling_screenshot( + page=page, + file_path=file_path, + mode=ScreenshotMode.LITE, + scrolling_number=scrolling_number, + use_playwright_fullpage=use_playwright_fullpage, + ) diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 98b8f716..5871b29a 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -2115,13 +2115,14 @@ async function scrollToNextPage( draw_boxes, frame = "main.frame", frame_index = undefined, + need_overlap = true, ) { // remove bounding boxes, scroll to next page with 200px overlap, then draw bounding boxes again // return true if there is a next page, false otherwise removeBoundingBoxes(); window.scrollBy({ left: 0, - top: window.innerHeight - 200, + top: need_overlap ? 
window.innerHeight - 200 : window.innerHeight, behavior: "instant", }); if (draw_boxes) { diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index d1f9546d..7bf42530 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -2,9 +2,12 @@ from __future__ import annotations import asyncio import time +from enum import StrEnum +from io import BytesIO from typing import Any import structlog +from PIL import Image from playwright._impl._errors import TimeoutError from playwright.async_api import ElementHandle, Frame, Page @@ -31,17 +34,24 @@ def load_js_script() -> str: JS_FUNCTION_DEFS = load_js_script() +class ScreenshotMode(StrEnum): + LITE = "lite" + DETAILED = "detailed" + + async def _current_viewpoint_screenshot_helper( page: Page, file_path: str | None = None, full_page: bool = False, timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS, + mode: ScreenshotMode = ScreenshotMode.DETAILED, ) -> bytes: if page.is_closed(): raise FailedToTakeScreenshot(error_message="Page is closed") try: - await page.wait_for_load_state(timeout=settings.BROWSER_LOADING_TIMEOUT_MS) - LOG.debug("Page is fully loaded, agent is about to take screenshots") + if mode == ScreenshotMode.DETAILED: + await page.wait_for_load_state(timeout=settings.BROWSER_LOADING_TIMEOUT_MS) + LOG.debug("Page is fully loaded, agent is about to take screenshots") start_time = time.time() screenshot: bytes = b"" if file_path: @@ -77,6 +87,7 @@ async def _scrolling_screenshots_helper( url: str | None = None, draw_boxes: bool = False, max_number: int = settings.MAX_NUM_SCREENSHOTS, + mode: ScreenshotMode = ScreenshotMode.DETAILED, ) -> list[bytes]: skyvern_page = await SkyvernFrame.create_instance(frame=page) # page is the main frame and the index must be 0 @@ -84,6 +95,11 @@ async def _scrolling_screenshots_helper( frame = "main.frame" frame_index = 0 + # when mode is lite, we don't draw bounding boxes + # since draw_boxes impacts the performance of processing + if 
mode == ScreenshotMode.LITE: + draw_boxes = False + screenshots: list[bytes] = [] if await skyvern_page.is_window_scrollable(): scroll_y_px_old = -30.0 @@ -92,12 +108,15 @@ async def _scrolling_screenshots_helper( # We are checking the difference between the old and new scroll_y_px to determine if we have reached the end of the # page. If the difference is less than 25, we assume we have reached the end of the page. while abs(scroll_y_px_old - scroll_y_px) > 25 and len(screenshots) < max_number: - screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame) + screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame, mode=mode) screenshots.append(screenshot) scroll_y_px_old = scroll_y_px LOG.debug("Scrolling to next page", url=url, num_screenshots=len(screenshots)) scroll_y_px = await skyvern_page.scroll_to_next_page( - draw_boxes=draw_boxes, frame=frame, frame_index=frame_index + draw_boxes=draw_boxes, + frame=frame, + frame_index=frame_index, + need_overlap=(mode == ScreenshotMode.DETAILED), ) LOG.debug( "Scrolled to next page", @@ -107,15 +126,17 @@ async def _scrolling_screenshots_helper( if draw_boxes: await skyvern_page.remove_bounding_boxes() await skyvern_page.scroll_to_top(draw_boxes=False, frame=frame, frame_index=frame_index) - # wait until animation ends, which is triggered by scrolling - LOG.debug("Waiting for 2 seconds until animation ends.") - await asyncio.sleep(2) + + if mode == ScreenshotMode.DETAILED: + # wait until animation ends, which is triggered by scrolling + LOG.debug("Waiting for 2 seconds until animation ends.") + await asyncio.sleep(2) else: if draw_boxes: await skyvern_page.build_elements_and_draw_bounding_boxes(frame=frame, frame_index=frame_index) LOG.debug("Page is not scrollable", url=url, num_screenshots=len(screenshots)) - screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame) + screenshot = await _current_viewpoint_screenshot_helper(page=skyvern_page.frame, 
mode=mode) screenshots.append(screenshot) if draw_boxes: @@ -144,28 +165,85 @@ class SkyvernFrame: return await SkyvernFrame.evaluate(frame=frame, expression="() => document.location.href") @staticmethod - async def take_screenshot( + async def take_scrolling_screenshot( page: Page, - full_page: bool = False, file_path: str | None = None, timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS, + mode: ScreenshotMode = ScreenshotMode.DETAILED, + scrolling_number: int = settings.MAX_NUM_SCREENSHOTS, + use_playwright_fullpage: bool = False, # TODO: THIS IS ONLY FOR EXPERIMENT. will be removed after experiment. ) -> bytes: - return await _current_viewpoint_screenshot_helper( - page=page, file_path=file_path, full_page=full_page, timeout=timeout - ) + if scrolling_number <= 0: + return await _current_viewpoint_screenshot_helper( + page=page, file_path=file_path, timeout=timeout, mode=mode + ) + + if use_playwright_fullpage: + return await _current_viewpoint_screenshot_helper( + page=page, file_path=file_path, timeout=timeout, full_page=True + ) + + if scrolling_number > settings.MAX_NUM_SCREENSHOTS: + LOG.warning( + "scrolling_number is greater than the max number of screenshots, setting it to the max number of screenshots", + scrolling_number=scrolling_number, + max_number=settings.MAX_NUM_SCREENSHOTS, + ) + scrolling_number = settings.MAX_NUM_SCREENSHOTS + + # use spilt screenshot with lite mode, isntead of fullpage screenshot from playwright + LOG.debug("Page is fully loaded, agent is about to generate the full page screenshot") + start_time = time.time() + async with asyncio.timeout(timeout): + screenshots = await _scrolling_screenshots_helper(page=page, mode=mode, max_number=scrolling_number) + images = [] + + for screenshot in screenshots: + with Image.open(BytesIO(screenshot)) as img: + img.load() + images.append(img) + + total_height = sum(img.height for img in images) + max_width = max(img.width for img in images) + + merged_img = Image.new("RGB", (max_width, 
total_height), color=(255, 255, 255)) + + current_y = 0 + for img in images: + merged_img.paste(img, (0, current_y)) + current_y += img.height + + buffer = BytesIO() + merged_img.save(buffer, format="PNG") + buffer.seek(0) + + img_data = buffer.read() + if file_path is not None: + with open(file_path, "wb") as f: + f.write(img_data) + + end_time = time.time() + LOG.debug( + "Full page screenshot taking time", + screenshot_time=end_time - start_time, + file_path=file_path, + ) + return img_data @staticmethod async def take_split_screenshots( page: Page, - url: str, + url: str | None = None, draw_boxes: bool = False, max_number: int = settings.MAX_NUM_SCREENSHOTS, scroll: bool = True, ) -> list[bytes]: if not scroll: - return [await _current_viewpoint_screenshot_helper(page=page)] + return [await _current_viewpoint_screenshot_helper(page=page, mode=ScreenshotMode.DETAILED)] - return await _scrolling_screenshots_helper(page=page, url=url, max_number=max_number, draw_boxes=draw_boxes) + return await _scrolling_screenshots_helper( + page=page, url=url, max_number=max_number, draw_boxes=draw_boxes, mode=ScreenshotMode.DETAILED + ) @classmethod async def create_instance(cls, frame: Page | Frame) -> SkyvernFrame: @@ -235,19 +313,21 @@ class SkyvernFrame: ) return scroll_y_px - async def scroll_to_next_page(self, draw_boxes: bool, frame: str, frame_index: int) -> float: + async def scroll_to_next_page( + self, draw_boxes: bool, frame: str, frame_index: int, need_overlap: bool = True + ) -> float: """ Scroll to the next page and take a screenshot. :param drow_boxes: If True, draw bounding boxes around the elements. :param page: Page instance to take the screenshot from. :return: Screenshot of the page. 
""" - js_script = "async ([draw_boxes, frame, frame_index]) => await scrollToNextPage(draw_boxes, frame, frame_index)" + js_script = "async ([draw_boxes, frame, frame_index, need_overlap]) => await scrollToNextPage(draw_boxes, frame, frame_index, need_overlap)" scroll_y_px = await self.evaluate( frame=self.frame, expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS, - arg=[draw_boxes, frame, frame_index], + arg=[draw_boxes, frame, frame_index, need_overlap], ) return scroll_y_px