Browser recording: events to blocks (#4195)

This commit is contained in:
Jonathan Dobson
2025-12-04 12:04:38 -05:00
committed by GitHub
parent bc8b20a742
commit b30f3b09c8
23 changed files with 1788 additions and 100 deletions

View File

@@ -32,6 +32,7 @@ from skyvern.forge.sdk.services.credential.custom_credential_vault_service impor
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.forge.sdk.workflow.context_manager import WorkflowContextManager
from skyvern.forge.sdk.workflow.service import WorkflowService
from skyvern.services.browser_recording.service import BrowserSessionRecordingService
from skyvern.webeye.browser_manager import BrowserManager
from skyvern.webeye.persistent_sessions_manager import PersistentSessionsManager
from skyvern.webeye.real_browser_manager import RealBrowserManager
@@ -69,6 +70,7 @@ class ForgeApp:
WORKFLOW_SERVICE: WorkflowService
AGENT_FUNCTION: AgentFunction
PERSISTENT_SESSIONS_MANAGER: PersistentSessionsManager
BROWSER_SESSION_RECORDING_SERVICE: BrowserSessionRecordingService
BITWARDEN_CREDENTIAL_VAULT_SERVICE: BitwardenCredentialVaultService
AZURE_CREDENTIAL_VAULT_SERVICE: AzureCredentialVaultService | None
CUSTOM_CREDENTIAL_VAULT_SERVICE: CustomCredentialVaultService | None
@@ -177,6 +179,7 @@ def create_forge_app() -> ForgeApp:
app.WORKFLOW_SERVICE = WorkflowService()
app.AGENT_FUNCTION = AgentFunction()
app.PERSISTENT_SESSIONS_MANAGER = PersistentSessionsManager(database=app.DATABASE)
app.BROWSER_SESSION_RECORDING_SERVICE = BrowserSessionRecordingService()
app.AZURE_CLIENT_FACTORY = RealAzureClientFactory()
app.BITWARDEN_CREDENTIAL_VAULT_SERVICE = BitwardenCredentialVaultService()

View File

@@ -0,0 +1,21 @@
Given a browser action, come up with a templated one-line prompt suitable for a browser agent, a block label, and a title.
The templated prompt should have one jinja variable in it. Come up with a good name for the variable that is
lower case, no spaces, underscores permitted.
Example: "Enter {{ address }} into the address field."
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Reply in JSON format with the following keys:
{
"prompt": str, // A templated, one-line prompt suitable for a browser agent describing the user action.
"title": str, // A descriptive and informative title for the goal. Use no more than 5 words
"block_label": str, // A label for the block. Lower case. Based off of the "title". Underscores are permitted.
"parameter_name": { "key": str } // The name of the parameter being input. Lower case, no spaces. Underscores are permitted.
}
User action:
```
{{ action }}
```

View File

@@ -0,0 +1,15 @@
Given a browser action, come up with a one-line prompt suitable for a browser agent, a block label, and a title.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Reply in JSON format with the following keys:
{
"prompt": str, // A one-line prompt suitable for a browser agent describing the user action.
"title": str, // A descriptive and informative title for the goal. Use no more than 5 words
"block_label": str // A label for the block. Lower case. Based off of the "title". Underscores are permitted.
}
User action:
```
{{ action }}
```

View File

@@ -0,0 +1,13 @@
Given a "go to URL" action, come up with a block label suitable for a browser agent. It should include a short version of a url, if the url exists.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Reply in JSON format with the following keys:
{
"block_label": str // A label for the block. Lower case. Underscores are permitted.
}
go to URL action:
```
{{ action }}
```

View File

@@ -0,0 +1,15 @@
Given a wait action, come up with a block label. It should include the duration in seconds, and a short version of a url, if the url exists.
The duration for the wait action may be given in milliseconds; if so, be sure to convert it to seconds.
MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc.
Reply in JSON format with the following keys:
{
"block_label": str // A label for the block. Lower case. Underscores are permitted.
}
Wait action:
```
{{ action }}
```

View File

@@ -18,7 +18,11 @@ from skyvern.forge.sdk.routes.code_samples import (
from skyvern.forge.sdk.routes.routers import base_router
from skyvern.forge.sdk.schemas.organizations import Organization
from skyvern.forge.sdk.services import org_auth_service
from skyvern.schemas.browser_sessions import CreateBrowserSessionRequest
from skyvern.schemas.browser_sessions import (
CreateBrowserSessionRequest,
ProcessBrowserSessionRecordingRequest,
ProcessBrowserSessionRecordingResponse,
)
from skyvern.webeye.schemas import BrowserSessionResponse
@@ -217,3 +221,29 @@ async def get_browser_sessions(
for browser_session in browser_sessions
]
)
@base_router.post(
    "/browser_sessions/{browser_session_id}/process_recording",
    include_in_schema=False,
)
async def process_recording(
    browser_session_id: str = Path(..., description="The ID of the browser session.", examples=["pbs_123456"]),
    recording_request: ProcessBrowserSessionRecordingRequest = ProcessBrowserSessionRecordingRequest(),
    current_org: Organization = Depends(org_auth_service.get_current_org),
) -> ProcessBrowserSessionRecordingResponse:
    """Process a browser session recording and return the resulting blocks and parameters.

    The session is looked up for the calling organization first; the recording
    payload is then handed to the browser-session recording service.

    Raises:
        HTTPException: 404 when the browser session is not found for the organization.
    """
    organization_id = current_org.organization_id

    # Guard: the session must exist for the caller's organization.
    session = await app.PERSISTENT_SESSIONS_MANAGER.get_session(browser_session_id, organization_id)
    if not session:
        raise HTTPException(status_code=404, detail=f"Browser session {browser_session_id} not found")

    processed = await app.BROWSER_SESSION_RECORDING_SERVICE.process_recording(
        organization_id=organization_id,
        browser_session_id=browser_session_id,
        compressed_chunks=recording_request.compressed_chunks,
        workflow_permanent_id=recording_request.workflow_permanent_id,
    )
    blocks, parameters = processed

    return ProcessBrowserSessionRecordingResponse(blocks=blocks, parameters=parameters)

View File

@@ -14,6 +14,7 @@ import asyncio
import dataclasses
import enum
import json
import time
import typing as t
import structlog
@@ -39,6 +40,7 @@ class ExfiltratedEvent:
# TODO(jdo): improve typing for params
params: dict = dataclasses.field(default_factory=dict)
source: ExfiltratedEventSource = ExfiltratedEventSource.NOT_SPECIFIED
timestamp: float = dataclasses.field(default_factory=lambda: time.time()) # seconds since epoch
OnExfiltrationEvent = t.Callable[[list[ExfiltratedEvent]], None]
@@ -68,6 +70,7 @@ class ExfiltrationChannel(CdpChannel):
event_name="user_interaction",
params=event_data,
source=ExfiltratedEventSource.CONSOLE,
timestamp=time.time(),
),
]
@@ -84,20 +87,25 @@ class ExfiltrationChannel(CdpChannel):
event_name=event_name,
params=params,
source=ExfiltratedEventSource.CDP,
timestamp=time.time(),
),
]
self.on_event(messages)
if event_name in ("frame_navigated", "navigated_within_document"):
# optimistically re-apply exfiltration and decoration on navigation
# (these operations should be idempotent)
pages = self.browser_context.pages if self.browser_context else []
LOG.info(f"{self.class_name} re-applying exfiltration and decoration on navigation.", event_name=event_name)
async def adorn(self, page: Page) -> t.Self:
"""
Run the "adorn" script on the page.

Pages whose URL starts with "devtools:" are skipped. The script is
evaluated on the page right away and also registered as an init script.
"""
if page.url.startswith("devtools:"):
return self
for page in pages:
asyncio.create_task(self.exfiltrate(page))
asyncio.create_task(self.decorate(page))
LOG.info(f"{self.class_name} adorning page.", url=page.url)
(await page.evaluate(self.js("adorn")),)
(await page.add_init_script(self.js("adorn")),)
LOG.info(f"{self.class_name} adornment complete on page.", url=page.url)
return self
async def connect(self, cdp_url: str | None = None) -> t.Self:
if self.browser and self.browser.is_connected() and self.cdp_session:
@@ -121,12 +129,18 @@ class ExfiltrationChannel(CdpChannel):
async def exfiltrate(self, page: Page) -> t.Self:
"""
Track user interactions and send to console for CDP to capture.
Uses add_init_script to ensure the exfiltration script is re-injected
on every navigation (including address bar navigations).
"""
if page.url.startswith("devtools:"):
return self
LOG.info(f"{self.class_name} setting up exfiltration on new page.", url=page.url)
page.on("console", self._handle_console_event)
await page.add_init_script(self.js("exfiltrate"))
await page.evaluate(self.js("exfiltrate"))
LOG.info(f"{self.class_name} setup complete on page.", url=page.url)
@@ -135,8 +149,12 @@ class ExfiltrationChannel(CdpChannel):
async def decorate(self, page: Page) -> t.Self:
"""Add a mouse-following follower to the page."""
if page.url.startswith("devtools:"):
return self
LOG.info(f"{self.class_name} adding decoration to page.", url=page.url)
await page.add_init_script(self.js("decorate"))
await page.evaluate(self.js("decorate"))
LOG.info(f"{self.class_name} decoration setup complete on page.", url=page.url)
@@ -145,8 +163,12 @@ class ExfiltrationChannel(CdpChannel):
async def undecorate(self, page: Page) -> t.Self:
"""Remove the mouse-following follower from the page."""
if page.url.startswith("devtools:"):
return self
LOG.info(f"{self.class_name} removing decoration from page.", url=page.url)
await page.add_init_script(self.js("undecorate"))
await page.evaluate(self.js("undecorate"))
LOG.info(f"{self.class_name} decoration removed from page.", url=page.url)
@@ -174,10 +196,35 @@ class ExfiltrationChannel(CdpChannel):
cdp_session.on("Target.targetCreated", lambda params: self._handle_cdp_event("target_created", params))
cdp_session.on("Target.targetDestroyed", lambda params: self._handle_cdp_event("target_destroyed", params))
cdp_session.on("Target.targetInfoChanged", lambda params: self._handle_cdp_event("target_info_changed", params))
cdp_session.on("Page.frameNavigated", lambda params: self._handle_cdp_event("frame_navigated", params))
cdp_session.on(
"Page.navigatedWithinDocument", lambda params: self._handle_cdp_event("navigated_within_document", params)
"Page.frameRequestedNavigation",
lambda params: self._handle_cdp_event("nav:frame_requested_navigation", params),
)
cdp_session.on(
"Page.frameStartedNavigating", lambda params: self._handle_cdp_event("nav:frame_started_navigating", params)
)
cdp_session.on("Page.frameNavigated", lambda params: self._handle_cdp_event("nav:frame_navigated", params))
cdp_session.on(
"Page.navigatedWithinDocument",
lambda params: self._handle_cdp_event("nav:navigated_within_document", params),
)
return self
async def enable_adornment(self) -> t.Self:
    """Adorn every currently open page and arrange for new pages to be adorned.

    When there is no browser context an error is logged and nothing else happens.
    """
    context = self.browser_context

    if not context:
        LOG.error(f"{self.class_name} no browser context to enable adornment.")
        return self

    # Adorn all pages that are already open, concurrently, and wait for them.
    pending: list[asyncio.Task] = [asyncio.create_task(self.adorn(page)) for page in context.pages]
    await asyncio.gather(*pending)

    def _adorn_new_page(new_page):  # type: ignore[no-untyped-def]
        # Return the task, exactly as the listener did before, so the event
        # emitter receives the same value from the handler.
        return asyncio.create_task(self.adorn(new_page))

    context.on("page", _adorn_new_page)

    return self
@@ -214,6 +261,8 @@ class ExfiltrationChannel(CdpChannel):
await self.enable_cdp_events()
await self.enable_adornment()
self.enable_console_events()
self.enable_decoration()
@@ -236,7 +285,10 @@ class ExfiltrationChannel(CdpChannel):
pages = self.browser_context.pages if self.browser_context else []
for page in pages:
page.remove_listener("console", self._handle_console_event)
try:
page.remove_listener("console", self._handle_console_event)
except KeyError:
pass # listener not found
await self.undecorate(page)
LOG.info(f"{self.class_name} stopped.")

View File

@@ -0,0 +1,97 @@
/**
 * DOM-adornment: assign stable identifiers to all DOM elements.
 *
 * Every element under <body> receives a unique `data-sky-id` attribute
 * (`dataset.skyId`), and a MutationObserver keeps assigning ids to elements
 * that are added later. State is kept on `window` so that evaluating this
 * script more than once in the same document reuses the counter and the
 * visited set, and replaces (rather than stacks) the observer.
 */
(function () {
  console.log("[SYS] adornment evaluated");

  window.__skyvern_assignedEls = window.__skyvern_assignedEls ?? 0;

  // WeakSet (not Set): elements that are removed from the document can still
  // be garbage-collected instead of being pinned by this bookkeeping.
  const visited = (window.__skyvern_visited =
    window.__skyvern_visited ?? new WeakSet());

  /** Build an id of the form `sky-<base36 timestamp>-<base36 random>`. */
  function __skyvern_generateUniqueId() {
    const timestamp = Date.now().toString(36);
    const randomPart = Math.random().toString(36).substring(2);
    return `sky-${timestamp}-${randomPart}`;
  }

  window.__skyvern_generateUniqueId = __skyvern_generateUniqueId;

  /** Give one element an id if it has none; nodes without `dataset` are skipped. */
  function tagElement(el) {
    const data = el.dataset;

    if (data && !data.skyId) {
      window.__skyvern_assignedEls += 1;
      data.skyId = __skyvern_generateUniqueId();
    }
  }

  /**
   * Assign ids to `node` (if it is an element) and to all of its descendants.
   * A subtree whose root was already visited is not walked again.
   */
  function __skyvern_assignSkyIds(node) {
    if (!node || node.nodeType !== 1) {
      return;
    }

    tagElement(node);

    if (visited.has(node)) {
      return;
    }

    visited.add(node);

    // querySelectorAll("*") already returns every descendant, so a single
    // flat pass is enough; recursing into each child would re-query every
    // nested subtree again.
    node.querySelectorAll("*").forEach((child) => {
      tagElement(child);
      visited.add(child);
    });
  }

  if (document.body) {
    __skyvern_assignSkyIds(document.body);
    console.log(
      "[SYS] adornment: initially assigned skyIds to elements:",
      window.__skyvern_assignedEls,
    );
  }

  document.addEventListener("DOMContentLoaded", () => {
    __skyvern_assignSkyIds(document.body);
    console.log(
      "[SYS] adornment: assigned skyIds to elements on DOMContentLoaded:",
      window.__skyvern_assignedEls,
    );
  });

  const observerConfig = {
    childList: true,
    subtree: true,
  };

  // If an earlier evaluation in this document installed an observer, stop it
  // so that only one observer reacts to each mutation.
  const previousObserver = window.__skyvern_adornment_observer;

  if (previousObserver) {
    previousObserver.disconnect();
  }

  const observer = new MutationObserver(function (mutationsList) {
    let sawAddedNodes = false;

    for (const mutation of mutationsList) {
      if (mutation.type !== "childList") {
        continue;
      }

      mutation.addedNodes.forEach((node) => {
        sawAddedNodes = true;
        __skyvern_assignSkyIds(node);
      });
    }

    // Log once per mutation batch rather than once per added node.
    if (sawAddedNodes) {
      console.log(
        "[SYS] adornment: assigned skyIds to new elements:",
        window.__skyvern_assignedEls,
      );
    }
  });

  /** Start observing <body>, waiting for DOMContentLoaded if it does not exist yet. */
  function observeWhenReady() {
    if (document.body) {
      observer.observe(document.body, observerConfig);
    } else {
      document.addEventListener("DOMContentLoaded", () => {
        if (document.body) {
          observer.observe(document.body, observerConfig);
        }
      });
    }
  }

  observeWhenReady();

  window.__skyvern_adornment_observer = observer;
})();

View File

@@ -1,109 +1,118 @@
(function () {
if (!window.__skyvern_decoration_initialized) {
window.__skyvern_decoration_initialized = true;
console.log("[SYS] decorate: evaluated");
window.__skyvern_create_mouse_follower = function () {
// create the circle element
const existingCircle = document.getElementById(
"__skyvern_mouse_follower",
);
function initiate() {
if (!window.__skyvern_decoration_initialized) {
console.log("[SYS] decorate: initializing");
if (existingCircle) {
return false;
}
window.__skyvern_decoration_initialized = true;
const circle = document.createElement("div");
window.__skyvern_decoration_mouse_follower = circle;
circle.id = "__skyvern_mouse_follower";
circle.style.position = "fixed";
circle.style.left = "0";
circle.style.top = "0";
circle.style.width = "30px";
circle.style.height = "30px";
circle.style.borderRadius = "50%";
circle.style.backgroundColor = "rgba(255, 0, 0, 0.2)";
circle.style.pointerEvents = "none";
circle.style.zIndex = "999999";
circle.style.willChange = "transform";
document.body.appendChild(circle);
window.__skyvern_create_mouse_follower = function () {
const preexistingCircles = document.querySelectorAll(
"#__skyvern_mouse_follower",
);
return true;
};
if (preexistingCircles.length > 0) {
for (const circle of preexistingCircles) {
circle.remove();
}
}
const wasCreated = window.__skyvern_create_mouse_follower();
const circle = document.createElement("div");
window.__skyvern_decoration_mouse_follower = circle;
circle.id = "__skyvern_mouse_follower";
circle.style.position = "fixed";
circle.style.left = "0";
circle.style.top = "0";
circle.style.width = "30px";
circle.style.height = "30px";
circle.style.borderRadius = "50%";
circle.style.backgroundColor = "rgba(255, 0, 0, 0.2)";
circle.style.pointerEvents = "none";
circle.style.zIndex = "999999";
circle.style.willChange = "transform";
document.body.appendChild(circle);
};
if (!wasCreated) {
return;
}
window.__skyvern_create_mouse_follower();
let scale = 1;
let targetScale = 1;
let mouseX = 0;
let mouseY = 0;
let scale = 1;
let targetScale = 1;
let mouseX = 0;
let mouseY = 0;
// smooth scale animation
function animate() {
if (!window.__skyvern_decoration_mouse_follower) {
return;
}
const follower = window.__skyvern_decoration_mouse_follower;
scale += (targetScale - scale) * 0.2;
if (Math.abs(targetScale - scale) > 0.001) {
requestAnimationFrame(animate);
} else {
scale = targetScale;
}
follower.style.transform = `translate(${mouseX - 15}px, ${mouseY - 15}px) scale(${scale})`;
}
// update follower position on mouse move
document.addEventListener(
"mousemove",
(e) => {
// smooth scale animation
function animate() {
if (!window.__skyvern_decoration_mouse_follower) {
return;
}
const follower = window.__skyvern_decoration_mouse_follower;
mouseX = e.clientX;
mouseY = e.clientY;
scale += (targetScale - scale) * 0.2;
if (Math.abs(targetScale - scale) > 0.001) {
requestAnimationFrame(animate);
} else {
scale = targetScale;
}
follower.style.transform = `translate(${mouseX - 15}px, ${mouseY - 15}px) scale(${scale})`;
},
true,
);
}
// expand follower on mouse down
document.addEventListener(
"mousedown",
() => {
if (!window.__skyvern_decoration_mouse_follower) {
return;
}
// update follower position on mouse move
document.addEventListener(
"mousemove",
(e) => {
if (!window.__skyvern_decoration_mouse_follower) {
return;
}
targetScale = 50 / 30;
requestAnimationFrame(animate);
},
true,
);
const follower = window.__skyvern_decoration_mouse_follower;
mouseX = e.clientX;
mouseY = e.clientY;
follower.style.transform = `translate(${mouseX - 15}px, ${mouseY - 15}px) scale(${scale})`;
},
true,
);
// return follower to original size on mouse up
document.addEventListener(
"mouseup",
() => {
if (!window.__skyvern_decoration_mouse_follower) {
return;
}
// expand follower on mouse down
document.addEventListener(
"mousedown",
() => {
if (!window.__skyvern_decoration_mouse_follower) {
return;
}
targetScale = 1;
requestAnimationFrame(animate);
},
true,
);
targetScale = 50 / 30;
requestAnimationFrame(animate);
},
true,
);
// return follower to original size on mouse up
document.addEventListener(
"mouseup",
() => {
if (!window.__skyvern_decoration_mouse_follower) {
return;
}
targetScale = 1;
requestAnimationFrame(animate);
},
true,
);
} else {
window.__skyvern_create_mouse_follower();
}
}
if (document.body) {
console.log("[SYS] decorate: document already loaded, initiating");
initiate();
} else {
window.__skyvern_create_mouse_follower();
console.log("[SYS] decorate: waiting for DOMContentLoaded to initiate");
document.addEventListener("DOMContentLoaded", initiate);
}
})();

View File

@@ -1,5 +1,7 @@
(function () {
console.log("[SYS] exfiltration: evaluated");
if (!window.__skyvern_exfiltration_initialized) {
console.log("[SYS] exfiltration: initializing");
window.__skyvern_exfiltration_initialized = true;
[
@@ -55,6 +57,10 @@
const getElementText = (element) => {
const textSources = [];
if (!element.getAttribute) {
return textSources;
}
if (element.getAttribute("aria-label")) {
textSources.push(element.getAttribute("aria-label"));
}
@@ -96,6 +102,52 @@
return textSources.length > 0 ? textSources : [];
};
const skyId = e.target?.dataset?.skyId || null;
if (!skyId && e.target?.tagName !== "HTML") {
console.log("[SYS] exfiltration: target element has no skyId.");
if (window.__skyvern_generateUniqueId && e.target?.dataset) {
const newSkyId = window.__skyvern_generateUniqueId();
e.target.dataset.skyId = newSkyId;
console.log(
`[SYS] exfiltration: assigned new skyId to target element: ${newSkyId}`,
);
} else {
console.log(
"[SYS] exfiltration: cannot assign skyId, generator not found.",
);
const info = {
tagName: e.target?.tagName,
target: e.target,
targetType: typeof e.target,
eventType,
id: e.target?.id,
className: e.target?.className,
value: e.target?.value,
text: getElementText(e.target),
labels: getAssociatedLabels(e.target),
skyId: e.target?.dataset?.skyId,
};
try {
const infoS = JSON.stringify(info, null, 2);
console.log(
`[SYS] exfiltration: target element info: ${infoS}`,
);
} catch (err) {
console.log(
"[SYS] exfiltration: target element info: [unserializable]",
);
}
}
}
// The event target is not guaranteed to have `classList` or `getAttribute`
// (the rest of this handler already guards for that), so every step is
// optional and the class text falls back to an empty string.
const classText = String(
  e.target?.classList?.value ?? e.target?.getAttribute?.("class") ?? "",
);
const eventData = {
url: window.location.href,
type: eventType,
@@ -103,10 +155,13 @@
target: {
tagName: e.target?.tagName,
id: e.target?.id,
className: e.target?.className,
isHtml: e.target instanceof HTMLElement,
isSvg: e.target instanceof SVGElement,
className: classText,
value: e.target?.value,
text: getElementText(e.target),
labels: getAssociatedLabels(e.target),
skyId: e.target?.dataset?.skyId,
},
inputValue: ["input", "focus", "blur"].includes(eventType)
? e.target?.value

View File

@@ -1,4 +1,6 @@
(function () {
console.log("[SYS] undecorate: evaluated");
const followers = document.querySelectorAll("#__skyvern_mouse_follower");
for (const follower of followers) {

View File

@@ -122,6 +122,7 @@ class MessageOutExfiltratedEvent(Message):
# TODO(jdo): improve typing for params
params: dict = dataclasses.field(default_factory=dict)
source: ExfiltratedEventSource = ExfiltratedEventSource.NOT_SPECIFIED
timestamp: float = dataclasses.field(default_factory=lambda: 0.0) # seconds since epoch
@dataclasses.dataclass
@@ -433,6 +434,7 @@ async def loop_stream_messages(message_channel: MessageChannel) -> None:
event_name=event.event_name,
params=event.params,
source=t.cast(ExfiltratedEventSource, event.source or ExfiltratedEventSource.NOT_SPECIFIED),
timestamp=event.timestamp,
)
message_channel.send_nowait(messages=[message_out_exfiltrated_event])