Add periodic cleanup cron job for temp data and stale processes (#4781)

Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
LawyZheng
2026-02-18 23:10:00 +08:00
committed by GitHub
parent 4db25ec04f
commit f6e79781c1
4 changed files with 401 additions and 0 deletions

View File

@@ -968,6 +968,99 @@ class AgentDB(BaseAlchemyDB):
LOG.error("UnexpectedError", exc_info=True)
raise
async def get_running_tasks_info_globally(
    self,
    stale_threshold_hours: int = 24,
) -> tuple[int, int]:
    """Count in-flight tasks across all organizations, split active vs stale.

    Used by the cleanup service to decide whether cleanup should be skipped.

    Args:
        stale_threshold_hours: Tasks whose `modified_at` is older than this
            many hours are counted as stale rather than active.

    Returns:
        Tuple of (active_task_count, stale_task_count). Active tasks were
        updated within the threshold; stale tasks are still in a running
        status but have not been updated within the threshold.

    Raises:
        SQLAlchemyError: Re-raised after logging if the database query fails.
    """
    # Statuses considered "in flight" for cleanup purposes.
    in_flight = [TaskStatus.created, TaskStatus.queued, TaskStatus.running]
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; kept
    # here because modified_at is presumably stored as naive UTC — confirm
    # before migrating to timezone-aware datetimes.
    cutoff = datetime.utcnow() - timedelta(hours=stale_threshold_hours)

    def _count_stmt(recent: bool):
        # Build a COUNT(*) statement over in-flight tasks on one side of
        # the staleness cutoff.
        time_filter = (
            TaskModel.modified_at >= cutoff
            if recent
            else TaskModel.modified_at < cutoff
        )
        return (
            select(func.count())
            .select_from(TaskModel)
            .filter(TaskModel.status.in_(in_flight))
            .filter(time_filter)
        )

    try:
        async with self.Session() as session:
            active_count = (await session.execute(_count_stmt(recent=True))).scalar_one()
            stale_count = (await session.execute(_count_stmt(recent=False))).scalar_one()
            return (active_count, stale_count)
    except SQLAlchemyError:
        LOG.error("SQLAlchemyError in get_running_tasks_info_globally", exc_info=True)
        raise
async def get_running_workflow_runs_info_globally(
    self,
    stale_threshold_hours: int = 24,
) -> tuple[int, int]:
    """Count in-flight workflow runs across all organizations, split active vs stale.

    Used by the cleanup service to decide whether cleanup should be skipped.

    Args:
        stale_threshold_hours: Workflow runs whose `modified_at` is older
            than this many hours are counted as stale rather than active.

    Returns:
        Tuple of (active_workflow_count, stale_workflow_count). Active runs
        were updated within the threshold; stale runs are still in a running
        status but have not been updated within the threshold.

    Raises:
        SQLAlchemyError: Re-raised after logging if the database query fails.
    """
    # Statuses considered "in flight" for cleanup purposes (paused runs
    # count too, matching the original behavior).
    in_flight = [
        WorkflowRunStatus.created,
        WorkflowRunStatus.queued,
        WorkflowRunStatus.running,
        WorkflowRunStatus.paused,
    ]
    # NOTE(review): datetime.utcnow() is deprecated since Python 3.12; kept
    # here because modified_at is presumably stored as naive UTC — confirm
    # before migrating to timezone-aware datetimes.
    cutoff = datetime.utcnow() - timedelta(hours=stale_threshold_hours)

    def _count_stmt(recent: bool):
        # Build a COUNT(*) statement over in-flight workflow runs on one
        # side of the staleness cutoff.
        time_filter = (
            WorkflowRunModel.modified_at >= cutoff
            if recent
            else WorkflowRunModel.modified_at < cutoff
        )
        return (
            select(func.count())
            .select_from(WorkflowRunModel)
            .filter(WorkflowRunModel.status.in_(in_flight))
            .filter(time_filter)
        )

    try:
        async with self.Session() as session:
            active_count = (await session.execute(_count_stmt(recent=True))).scalar_one()
            stale_count = (await session.execute(_count_stmt(recent=False))).scalar_one()
            return (active_count, stale_count)
    except SQLAlchemyError:
        LOG.error("SQLAlchemyError in get_running_workflow_runs_info_globally", exc_info=True)
        raise
async def get_all_organizations(self) -> list[Organization]:
try:
async with self.Session() as session: