current viewport screenshot and scrolling n screenshot (#2716)

Co-authored-by: lawyzheng <lawyzheng1106@gmail.com>
This commit is contained in:
Shuchang Zheng
2025-06-13 23:59:50 -07:00
committed by GitHub
parent 11288817af
commit 775da18878
39 changed files with 452 additions and 35 deletions

View File

@@ -23,6 +23,7 @@ class SkyvernContext:
hashed_href_map: dict[str, str] = field(default_factory=dict)
refresh_working_page: bool = False
frame_index_map: dict[Frame, int] = field(default_factory=dict)
max_screenshot_scrolling_times: int | None = None
# Debug representation of the context. Only the request, organization, task,
# workflow, workflow-run and task-v2 identifiers plus max_steps_override are
# rendered; the other fields visible on the class (hashed_href_map,
# refresh_working_page, frame_index_map, max_screenshot_scrolling_times) are
# not part of the output.
def __repr__(self) -> str:
return f"SkyvernContext(request_id={self.request_id}, organization_id={self.organization_id}, task_id={self.task_id}, workflow_id={self.workflow_id}, workflow_run_id={self.workflow_run_id}, task_v2_id={self.task_v2_id}, max_steps_override={self.max_steps_override})"

View File

@@ -149,6 +149,7 @@ class AgentDB:
application: str | None = None,
include_action_history_in_verification: bool | None = None,
model: dict[str, Any] | None = None,
max_screenshot_scrolling_times: int | None = None,
) -> Task:
try:
async with self.Session() as session:
@@ -176,6 +177,7 @@ class AgentDB:
application=application,
include_action_history_in_verification=include_action_history_in_verification,
model=model,
max_screenshot_scrolling_times=max_screenshot_scrolling_times,
)
session.add(new_task)
await session.commit()
@@ -1217,6 +1219,7 @@ class AgentDB:
description: str | None = None,
proxy_location: ProxyLocation | None = None,
webhook_callback_url: str | None = None,
max_screenshot_scrolling_times: int | None = None,
totp_verification_url: str | None = None,
totp_identifier: str | None = None,
persist_browser_session: bool = False,
@@ -1236,6 +1239,7 @@ class AgentDB:
webhook_callback_url=webhook_callback_url,
totp_verification_url=totp_verification_url,
totp_identifier=totp_identifier,
max_screenshot_scrolling_times=max_screenshot_scrolling_times,
persist_browser_session=persist_browser_session,
model=model,
is_saved_task=is_saved_task,
@@ -1479,6 +1483,7 @@ class AgentDB:
totp_verification_url: str | None = None,
totp_identifier: str | None = None,
parent_workflow_run_id: str | None = None,
max_screenshot_scrolling_times: int | None = None,
) -> WorkflowRun:
try:
async with self.Session() as session:
@@ -1492,6 +1497,7 @@ class AgentDB:
totp_verification_url=totp_verification_url,
totp_identifier=totp_identifier,
parent_workflow_run_id=parent_workflow_run_id,
max_screenshot_scrolling_times=max_screenshot_scrolling_times,
)
session.add(workflow_run)
await session.commit()
@@ -2436,6 +2442,7 @@ class AgentDB:
extracted_information_schema: dict | list | str | None = None,
error_code_mapping: dict | None = None,
model: dict[str, Any] | None = None,
max_screenshot_scrolling_times: int | None = None,
) -> TaskV2:
async with self.Session() as session:
new_task_v2 = TaskV2Model(
@@ -2452,6 +2459,7 @@ class AgentDB:
error_code_mapping=error_code_mapping,
organization_id=organization_id,
model=model,
max_screenshot_scrolling_times=max_screenshot_scrolling_times,
)
session.add(new_task_v2)
await session.commit()

View File

@@ -88,6 +88,7 @@ class TaskModel(Base):
queued_at = Column(DateTime, nullable=True)
started_at = Column(DateTime, nullable=True)
finished_at = Column(DateTime, nullable=True)
max_screenshot_scrolling_times = Column(Integer, nullable=True)
created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False, index=True)
modified_at = Column(
DateTime,
@@ -218,6 +219,7 @@ class WorkflowModel(Base):
workflow_definition = Column(JSON, nullable=False)
proxy_location = Column(String)
webhook_callback_url = Column(String)
max_screenshot_scrolling_times = Column(Integer, nullable=True)
totp_verification_url = Column(String)
totp_identifier = Column(String)
persist_browser_session = Column(Boolean, default=False, nullable=False)
@@ -254,6 +256,7 @@ class WorkflowRunModel(Base):
webhook_callback_url = Column(String)
totp_verification_url = Column(String)
totp_identifier = Column(String)
max_screenshot_scrolling_times = Column(Integer, nullable=True)
queued_at = Column(DateTime, nullable=True)
started_at = Column(DateTime, nullable=True)
@@ -621,6 +624,7 @@ class TaskV2Model(Base):
extracted_information_schema = Column(JSON, nullable=True)
error_code_mapping = Column(JSON, nullable=True)
max_steps = Column(Integer, nullable=True)
max_screenshot_scrolling_times = Column(Integer, nullable=True)
queued_at = Column(DateTime, nullable=True)
started_at = Column(DateTime, nullable=True)

View File

@@ -142,6 +142,7 @@ def convert_to_task(task_obj: TaskModel, debug_enabled: bool = False, workflow_p
queued_at=task_obj.queued_at,
started_at=task_obj.started_at,
finished_at=task_obj.finished_at,
max_screenshot_scrolling_times=task_obj.max_screenshot_scrolling_times,
)
return task
@@ -238,6 +239,7 @@ def convert_to_workflow(workflow_model: WorkflowModel, debug_enabled: bool = Fal
persist_browser_session=workflow_model.persist_browser_session,
model=workflow_model.model,
proxy_location=(ProxyLocation(workflow_model.proxy_location) if workflow_model.proxy_location else None),
max_screenshot_scrolling_times=workflow_model.max_screenshot_scrolling_times,
version=workflow_model.version,
is_saved_task=workflow_model.is_saved_task,
description=workflow_model.description,
@@ -278,6 +280,7 @@ def convert_to_workflow_run(
created_at=workflow_run_model.created_at,
modified_at=workflow_run_model.modified_at,
workflow_title=workflow_title,
max_screenshot_scrolling_times=workflow_run_model.max_screenshot_scrolling_times,
)

View File

@@ -107,6 +107,7 @@ class BackgroundTaskExecutor(AsyncExecutor):
context.task_id = task.task_id
context.organization_id = organization_id
context.max_steps_override = max_steps_override
context.max_screenshot_scrolling_times = task.max_screenshot_scrolling_times
if background_tasks:
await initialize_skyvern_state_file(task_id=task_id, organization_id=organization_id)

View File

@@ -166,6 +166,7 @@ async def run_task(
totp_identifier=run_request.totp_identifier,
include_action_history_in_verification=run_request.include_action_history_in_verification,
model=run_request.model,
max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times,
)
task_v1_response = await task_v1_service.run_task(
task=task_v1_request,
@@ -203,6 +204,7 @@ async def run_task(
data_extraction_schema=task_v1_response.extracted_information_schema,
error_code_mapping=task_v1_response.error_code_mapping,
browser_session_id=run_request.browser_session_id,
max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times,
),
)
if run_request.engine == RunEngine.skyvern_v2:
@@ -221,6 +223,7 @@ async def run_task(
error_code_mapping=run_request.error_code_mapping,
create_task_run=True,
model=run_request.model,
max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times,
)
except MissingBrowserAddressError as e:
raise HTTPException(status_code=400, detail=str(e)) from e
@@ -263,6 +266,7 @@ async def run_task(
error_code_mapping=task_v2.error_code_mapping,
data_extraction_schema=task_v2.extracted_information_schema,
publish_workflow=run_request.publish_workflow,
max_screenshot_scrolling_times=run_request.max_screenshot_scrolling_times,
),
)
LOG.error("Invalid agent engine", engine=run_request.engine, organization_id=current_org.organization_id)
@@ -318,6 +322,7 @@ async def run_workflow(
totp_identifier=workflow_run_request.totp_identifier,
totp_url=workflow_run_request.totp_url,
browser_session_id=workflow_run_request.browser_session_id,
max_screenshot_scrolling_times=workflow_run_request.max_screenshot_scrolling_times,
)
try:
@@ -1765,6 +1770,7 @@ async def run_task_v2(
create_task_run=True,
extracted_information_schema=data.extracted_information_schema,
error_code_mapping=data.error_code_mapping,
max_screenshot_scrolling_times=data.max_screenshot_scrolling_times,
)
except MissingBrowserAddressError as e:
raise HTTPException(status_code=400, detail=str(e)) from e

View File

@@ -48,6 +48,8 @@ class TaskV2(BaseModel):
queued_at: datetime | None = None
started_at: datetime | None = None
finished_at: datetime | None = None
max_screenshot_scrolling_times: int | None = None
created_at: datetime
modified_at: datetime
@@ -147,6 +149,7 @@ class TaskV2Request(BaseModel):
publish_workflow: bool = False
extracted_information_schema: dict | list | str | None = None
error_code_mapping: dict[str, str] | None = None
max_screenshot_scrolling_times: int | None = None
@field_validator("url", "webhook_callback_url", "totp_verification_url")
@classmethod

View File

@@ -96,6 +96,11 @@ class TaskBase(BaseModel):
description="Whether to include the action history when verifying the task is complete",
examples=[True, False],
)
max_screenshot_scrolling_times: int | None = Field(
default=None,
description="Scroll down n times to get the merged screenshot of the page after taking an action. When it's None or 0, it takes the current viewpoint screenshot.",
examples=[10],
)
class TaskRequest(TaskBase):
@@ -314,6 +319,7 @@ class Task(TaskBase):
errors=self.errors,
max_steps_per_run=self.max_steps_per_run,
workflow_run_id=self.workflow_run_id,
max_screenshot_scrolling_times=self.max_screenshot_scrolling_times,
)
@@ -337,6 +343,7 @@ class TaskResponse(BaseModel):
queued_at: datetime | None = None
started_at: datetime | None = None
finished_at: datetime | None = None
max_screenshot_scrolling_times: int | None = None
class TaskOutput(BaseModel):

View File

@@ -307,7 +307,12 @@ class Block(BaseModel, abc.ABC):
if not browser_state:
LOG.warning("No browser state found when creating workflow_run_block", workflow_run_id=workflow_run_id)
else:
screenshot = await browser_state.take_screenshot(full_page=True)
screenshot = await browser_state.take_fullpage_screenshot(
use_playwright_fullpage=app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
"ENABLE_PLAYWRIGHT_FULLPAGE",
str(organization_id),
)
)
if screenshot:
await app.ARTIFACT_MANAGER.create_workflow_run_block_artifact(
workflow_run_block=workflow_run_block,
@@ -569,8 +574,15 @@ class BaseTaskBlock(Block):
browser_state = await app.BROWSER_MANAGER.get_or_create_for_workflow_run(
workflow_run=workflow_run, url=self.url, browser_session_id=browser_session_id
)
# assert that the browser state is not None, otherwise we can't go through typing
assert browser_state is not None
# add screenshot artifact for the first task
screenshot = await browser_state.take_screenshot(full_page=True)
screenshot = await browser_state.take_fullpage_screenshot(
use_playwright_fullpage=app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached(
"ENABLE_PLAYWRIGHT_FULLPAGE",
str(organization_id),
)
)
if screenshot:
await app.ARTIFACT_MANAGER.create_workflow_run_block_artifact(
workflow_run_block=workflow_run_block,
@@ -2486,6 +2498,7 @@ class TaskV2Block(Block):
proxy_location=workflow_run.proxy_location,
totp_identifier=self.totp_identifier,
totp_verification_url=self.totp_verification_url,
max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times,
)
await app.DATABASE.update_task_v2(
task_v2.observer_cruise_id, status=TaskV2Status.queued, organization_id=organization_id
@@ -2517,6 +2530,7 @@ class TaskV2Block(Block):
workflow_permanent_id=workflow_run.workflow_permanent_id,
workflow_run_id=workflow_run_id,
browser_session_id=browser_session_id,
max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times,
)
)
result_dict = None

View File

@@ -22,6 +22,7 @@ class WorkflowRequestBody(BaseModel):
totp_verification_url: str | None = None
totp_identifier: str | None = None
browser_session_id: str | None = None
max_screenshot_scrolling_times: int | None = None
@field_validator("webhook_callback_url", "totp_verification_url")
@classmethod
@@ -76,6 +77,7 @@ class Workflow(BaseModel):
persist_browser_session: bool = False
model: dict[str, Any] | None = None
status: WorkflowStatus = WorkflowStatus.published
max_screenshot_scrolling_times: int | None = None
created_at: datetime
modified_at: datetime
@@ -115,6 +117,7 @@ class WorkflowRun(BaseModel):
failure_reason: str | None = None
parent_workflow_run_id: str | None = None
workflow_title: str | None = None
max_screenshot_scrolling_times: int | None = None
queued_at: datetime | None = None
started_at: datetime | None = None
@@ -162,3 +165,4 @@ class WorkflowRunResponseBase(BaseModel):
task_v2: TaskV2 | None = None
workflow_title: str | None = None
browser_session_id: str | None = None
max_screenshot_scrolling_times: int | None = None

View File

@@ -424,4 +424,5 @@ class WorkflowCreateYAMLRequest(BaseModel):
model: dict[str, Any] | None = None
workflow_definition: WorkflowDefinitionYAML
is_saved_task: bool = False
max_screenshot_scrolling_times: int | None = None
status: WorkflowStatus = WorkflowStatus.published

View File

@@ -169,6 +169,7 @@ class WorkflowService:
organization_id=workflow.organization_id,
proxy_location=workflow_request.proxy_location,
webhook_callback_url=workflow_request.webhook_callback_url,
max_screenshot_scrolling_times=workflow_request.max_screenshot_scrolling_times,
)
skyvern_context.set(
SkyvernContext(
@@ -178,6 +179,7 @@ class WorkflowService:
workflow_id=workflow_id,
workflow_run_id=workflow_run.workflow_run_id,
max_steps_override=max_steps_override,
max_screenshot_scrolling_times=workflow_request.max_screenshot_scrolling_times,
)
)
@@ -577,6 +579,7 @@ class WorkflowService:
workflow_definition: WorkflowDefinition,
description: str | None = None,
proxy_location: ProxyLocation | None = None,
max_screenshot_scrolling_times: int | None = None,
webhook_callback_url: str | None = None,
totp_verification_url: str | None = None,
totp_identifier: str | None = None,
@@ -594,6 +597,7 @@ class WorkflowService:
description=description,
proxy_location=proxy_location,
webhook_callback_url=webhook_callback_url,
max_screenshot_scrolling_times=max_screenshot_scrolling_times,
totp_verification_url=totp_verification_url,
totp_identifier=totp_identifier,
persist_browser_session=persist_browser_session,
@@ -767,6 +771,7 @@ class WorkflowService:
totp_verification_url=workflow_request.totp_verification_url,
totp_identifier=workflow_request.totp_identifier,
parent_workflow_run_id=parent_workflow_run_id,
max_screenshot_scrolling_times=workflow_request.max_screenshot_scrolling_times,
)
async def mark_workflow_run_as_completed(self, workflow_run_id: str) -> WorkflowRun:
@@ -1180,6 +1185,7 @@ class WorkflowService:
total_steps=total_steps,
total_cost=total_cost,
workflow_title=workflow.title,
max_screenshot_scrolling_times=workflow_run.max_screenshot_scrolling_times,
)
async def clean_up_workflow(
@@ -1453,6 +1459,7 @@ class WorkflowService:
totp_identifier=request.totp_identifier,
persist_browser_session=request.persist_browser_session,
model=request.model,
max_screenshot_scrolling_times=request.max_screenshot_scrolling_times,
workflow_permanent_id=workflow_permanent_id,
version=existing_version + 1,
is_saved_task=request.is_saved_task,
@@ -1470,6 +1477,7 @@ class WorkflowService:
totp_identifier=request.totp_identifier,
persist_browser_session=request.persist_browser_session,
model=request.model,
max_screenshot_scrolling_times=request.max_screenshot_scrolling_times,
is_saved_task=request.is_saved_task,
status=request.status,
)