From 058a9178aa300b945373d8d49a1956081211e98b Mon Sep 17 00:00:00 2001 From: Celal Zamanoglu <95054566+celalzamanoglu@users.noreply.github.com> Date: Wed, 7 Jan 2026 02:12:22 +0300 Subject: [PATCH] link actions to their screenshots - backend (#4404) --- ...6_add_screenshot_artifact_id_to_actions.py | 27 ++++++++++++ skyvern/forge/agent.py | 43 ++++++++++++++----- skyvern/forge/sdk/db/agent_db.py | 12 ++++++ skyvern/forge/sdk/db/models.py | 1 + skyvern/forge/sdk/db/utils.py | 1 + skyvern/webeye/actions/actions.py | 1 + skyvern/webeye/actions/handler.py | 6 ++- 7 files changed, 79 insertions(+), 12 deletions(-) create mode 100644 alembic/versions/2026_01_06_2254-3545c24f02f6_add_screenshot_artifact_id_to_actions.py diff --git a/alembic/versions/2026_01_06_2254-3545c24f02f6_add_screenshot_artifact_id_to_actions.py b/alembic/versions/2026_01_06_2254-3545c24f02f6_add_screenshot_artifact_id_to_actions.py new file mode 100644 index 00000000..ef916858 --- /dev/null +++ b/alembic/versions/2026_01_06_2254-3545c24f02f6_add_screenshot_artifact_id_to_actions.py @@ -0,0 +1,27 @@ +"""add screenshot_artifact_id to actions + +Revision ID: 3545c24f02f6 +Revises: db8667f8ce63 +Create Date: 2026-01-06 22:54:15.401625+00:00 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "3545c24f02f6" +down_revision: Union[str, None] = "db8667f8ce63" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + op.add_column("actions", sa.Column("screenshot_artifact_id", sa.String(), nullable=True)) + + +def downgrade() -> None: + op.drop_column("actions", "screenshot_artifact_id") diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index fff5a6d4..39038e82 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -1231,8 +1231,8 @@ class ForgeAgent: action_order=action_idx, ) detailed_agent_step_output.actions_and_results[action_idx] = (action, [action_result]) - await app.DATABASE.create_action(action=action) - await self.record_artifacts_after_action(task, step, browser_state, engine) + action.action_id = (await app.DATABASE.create_action(action=action)).action_id + await self.record_artifacts_after_action(task, step, browser_state, engine, action) break action = action_node.action @@ -1319,7 +1319,7 @@ class ForgeAgent: ) await asyncio.sleep(wait_time) - await self.record_artifacts_after_action(task, step, browser_state, engine) + await self.record_artifacts_after_action(task, step, browser_state, engine, action) for result in results: result.step_retry_number = step.retry_index result.step_order = step.order @@ -2192,6 +2192,7 @@ class ForgeAgent: step: Step, browser_state: BrowserState, engine: RunEngine, + action: Action, ) -> None: working_page = await browser_state.get_working_page() if not working_page: @@ -2213,6 +2214,7 @@ class ForgeAgent: scrolling_number = 0 artifacts: list[BulkArtifactCreationRequest | None] = [] + screenshot_artifact_id: str | None = None try: # get current x, y position of the page x: int | None = None @@ -2230,13 +2232,17 @@ class ForgeAgent: if skyvern_frame and x is not None and y is not None: await skyvern_frame.safe_scroll_to_x_y(x, y) LOG.debug("Scrolled back to the original x, y position of the page after taking screenshot", x=x, y=y) - artifacts.append( - await app.ARTIFACT_MANAGER.prepare_llm_artifact( - data=screenshot, - artifact_type=ArtifactType.SCREENSHOT_ACTION, - step=step, - ) + screenshot_request = await app.ARTIFACT_MANAGER.prepare_llm_artifact( + data=screenshot, + artifact_type=ArtifactType.SCREENSHOT_ACTION, + step=step, ) + if screenshot_request: + artifacts.append(screenshot_request) + for artifact_data in screenshot_request.artifacts: + if artifact_data.artifact_model.artifact_type == ArtifactType.SCREENSHOT_ACTION: + screenshot_artifact_id = artifact_data.artifact_model.artifact_id + break except Exception: LOG.error( "Failed to record screenshot after action", @@ -2261,6 +2267,23 @@ class ForgeAgent: await app.ARTIFACT_MANAGER.bulk_create_artifacts(artifacts) except Exception: LOG.warning("Failed to bulk create artifacts after action", exc_info=True) + else: + if screenshot_artifact_id and action.action_id and action.organization_id: + try: + # TODO: consider batching screenshot artifact updates to reduce per-action DB writes. + await app.DATABASE.update_action_screenshot_artifact_id( + organization_id=action.organization_id, + action_id=action.action_id, + screenshot_artifact_id=screenshot_artifact_id, + ) + action.screenshot_artifact_id = screenshot_artifact_id + except Exception: + LOG.warning( + "Failed to update action with screenshot artifact id", + action_id=action.action_id, + screenshot_artifact_id=screenshot_artifact_id, + exc_info=True, + ) try: video_artifacts = await app.BROWSER_MANAGER.get_video_artifacts( @@ -3757,7 +3780,7 @@ class ForgeAgent: persisted_action.action_order = len(step.output.actions_and_results) action_results = await ActionHandler.handle_action(scraped_page, task, step, working_page, persisted_action) - await self.record_artifacts_after_action(task, step, browser_state, engine) + await self.record_artifacts_after_action(task, step, browser_state, engine, persisted_action) step.output.action_results.extend(action_results) step.output.actions_and_results.append((persisted_action, action_results)) if isinstance(persisted_action, DecisiveAction) and persisted_action.errors: diff --git a/skyvern/forge/sdk/db/agent_db.py b/skyvern/forge/sdk/db/agent_db.py index 033eef09..8c8d1bfd 100644 --- a/skyvern/forge/sdk/db/agent_db.py +++ b/skyvern/forge/sdk/db/agent_db.py @@ -3871,6 +3871,7 @@ class AgentDB(BaseAlchemyDB): element_id=action.element_id, skyvern_element_hash=action.skyvern_element_hash, skyvern_element_data=action.skyvern_element_data, + screenshot_artifact_id=action.screenshot_artifact_id, action_json=action.model_dump(), confidence_float=action.confidence_float, created_by=action.created_by, @@ -3880,6 +3881,17 @@ class AgentDB(BaseAlchemyDB): await session.refresh(new_action) return Action.model_validate(new_action) + async def update_action_screenshot_artifact_id( + self, *, organization_id: str, action_id: str, screenshot_artifact_id: str + ) -> None: + async with self.Session() as session: + await session.execute( + update(ActionModel) + .where(ActionModel.action_id == action_id, ActionModel.organization_id == organization_id) + .values(screenshot_artifact_id=screenshot_artifact_id) + ) + await session.commit() + async def update_action_reasoning( self, organization_id: str, diff --git a/skyvern/forge/sdk/db/models.py b/skyvern/forge/sdk/db/models.py index 1a23f0ad..6596b6fc 100644 --- a/skyvern/forge/sdk/db/models.py +++ b/skyvern/forge/sdk/db/models.py @@ -680,6 +680,7 @@ class ActionModel(Base): action_json = Column(JSON, nullable=True) input_or_select_context = Column(JSON, nullable=True) confidence_float = Column(Numeric, nullable=True) + screenshot_artifact_id = Column(String, nullable=True) created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False) modified_at = Column(DateTime, default=datetime.datetime.utcnow, onupdate=datetime.datetime.utcnow, nullable=False) diff --git a/skyvern/forge/sdk/db/utils.py b/skyvern/forge/sdk/db/utils.py index 45c2a892..6f768160 100644 --- a/skyvern/forge/sdk/db/utils.py +++ b/skyvern/forge/sdk/db/utils.py @@ -709,6 +709,7 @@ def hydrate_action(action_model: ActionModel, empty_element_id: bool = False) -> "element_id": element_id, "skyvern_element_hash": action_model.skyvern_element_hash, "skyvern_element_data": action_model.skyvern_element_data, + "screenshot_artifact_id": action_model.screenshot_artifact_id, "created_at": action_model.created_at, "modified_at": action_model.modified_at, } diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index 5253ffe7..c3e110f8 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -113,6 +113,7 @@ class Action(BaseModel): element_id: Annotated[str, Field(coerce_numbers_to_str=True)] | None = None skyvern_element_hash: str | None = None skyvern_element_data: dict[str, Any] | None = None + screenshot_artifact_id: str | None = None tool_call_id: str | None = None xpath: str | None = None diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index f5c64588..88aad5eb 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -409,7 +409,8 @@ class ActionHandler: page=page, action=action, ) - await app.DATABASE.create_action(action=action) + persisted_action = await app.DATABASE.create_action(action=action) + action.action_id = persisted_action.action_id return results context = skyvern_context.current() @@ -529,7 +530,8 @@ class ActionHandler: # close the extra page await pages_after_download[-1].close() - await app.DATABASE.create_action(action=action) + persisted_action = await app.DATABASE.create_action(action=action) + action.action_id = persisted_action.action_id @staticmethod async def _handle_action(