192 lines
6.4 KiB
Python
192 lines
6.4 KiB
Python
"""
A lightweight "agent" for interacting with the streaming browser over CDP.
"""
|
|
|
|
import typing
|
|
from contextlib import asynccontextmanager
|
|
|
|
import structlog
|
|
from playwright.async_api import Browser, BrowserContext, Page, Playwright, async_playwright
|
|
|
|
import skyvern.forge.sdk.routes.streaming.clients as sc
|
|
from skyvern.config import settings
|
|
|
|
# Module-level structlog logger used by StreamingAgent and connected_agent below.
LOG = structlog.get_logger()
|
|
|
|
|
|
class StreamingAgent:
    """
    A minimal agent that can connect to a browser via CDP and execute JavaScript.

    Specifically for operations during streaming sessions (like copy/pasting selected text, etc.).

    Lifecycle: call `connect()`, then any of `evaluate_js()` / `get_selected_text()` /
    `paste_text()`, and finally `close()`.
    """

    def __init__(self, streaming: sc.Streaming) -> None:
        """Remember the streaming client; nothing is connected until `connect()` is called."""
        self.streaming = streaming

        # All of these are populated by connect() and reset to None by close().
        self.browser: Browser | None = None
        self.browser_context: BrowserContext | None = None
        self.page: Page | None = None
        self.pw: Playwright | None = None

    async def connect(self, cdp_url: str | None = None) -> None:
        """
        Attach to a browser over CDP and pick a context and page to operate on.

        Args:
            cdp_url: CDP endpoint to connect to. Falls back to
                `settings.BROWSER_REMOTE_DEBUGGING_URL` when omitted or empty.

        The first existing browser context is reused (a new one is created if there is
        none), and the first page of that context becomes `self.page`. If the context has
        no pages, `self.page` stays `None` and a warning is logged.
        """
        url = cdp_url or settings.BROWSER_REMOTE_DEBUGGING_URL

        LOG.info("StreamingAgent connecting to CDP", cdp_url=url)

        # Reuse an already-started Playwright driver if this agent has one.
        pw = self.pw or await async_playwright().start()

        self.pw = pw

        headers = {
            "x-api-key": self.streaming.x_api_key,
        }

        self.browser = await pw.chromium.connect_over_cdp(url, headers=headers)

        org_id = self.streaming.organization_id
        browser_session_id = (
            self.streaming.browser_session.persistent_browser_session_id if self.streaming.browser_session else None
        )

        if browser_session_id:
            # Point downloads at the per-organization / per-session directory.
            cdp_session = await self.browser.new_browser_cdp_session()
            await cdp_session.send(
                "Browser.setDownloadBehavior",
                {
                    "behavior": "allow",
                    "downloadPath": f"/app/downloads/{org_id}/{browser_session_id}",
                    "eventsEnabled": True,
                },
            )

        contexts = self.browser.contexts
        if contexts:
            LOG.info("StreamingAgent using existing browser context")
            self.browser_context = contexts[0]
        else:
            LOG.warning("No existing browser context found, creating new one")
            self.browser_context = await self.browser.new_context()

        pages = self.browser_context.pages
        if pages:
            self.page = pages[0]
            LOG.info("StreamingAgent connected to page", url=self.page.url)
        else:
            LOG.warning("No pages found in browser context")

        LOG.info("StreamingAgent connected successfully")

    async def evaluate_js(
        self, expression: str, arg: str | int | float | bool | list | dict | None = None
    ) -> str | int | float | bool | list | dict | None:
        """
        Evaluate `expression` on the connected page and return its result.

        Args:
            expression: JavaScript source handed to `Page.evaluate`.
            arg: Optional argument forwarded to `Page.evaluate`.

        Raises:
            RuntimeError: if no page is attached (i.e. `connect()` has not succeeded in finding one).
            Exception: whatever `Page.evaluate` raises is logged and re-raised unchanged.
        """
        if not self.page:
            raise RuntimeError("StreamingAgent is not connected to a page. Call connect() first.")

        # Only a prefix is logged on the happy path to keep log lines short.
        LOG.info("StreamingAgent evaluating JS", expression=expression[:100])

        try:
            result = await self.page.evaluate(expression, arg)
            LOG.info("StreamingAgent JS evaluation successful")
            return result
        except Exception as ex:
            LOG.exception("StreamingAgent JS evaluation failed", expression=expression, ex=str(ex))
            raise

    async def get_selected_text(self) -> str:
        """
        Return the page's current text selection, or an empty string when there is none.

        Raises:
            RuntimeError: if the page returns something that is neither a string nor `None`,
                or if no page is attached.
        """
        LOG.info("StreamingAgent getting selected text")

        js_expression = """
        () => {
            const selection = window.getSelection();
            return selection ? selection.toString() : '';
        }
        """

        selected_text = await self.evaluate_js(js_expression)

        if isinstance(selected_text, str) or selected_text is None:
            LOG.info("StreamingAgent got selected text", length=len(selected_text) if selected_text else 0)
            return selected_text or ""

        raise RuntimeError(f"StreamingAgent selected text is not a string, but a(n) '{type(selected_text)}'")

    async def paste_text(self, text: str) -> None:
        """
        Insert `text` into the page's focused editable element, replacing any selection.

        Nothing happens when the focused element is not an input, textarea, or
        contenteditable element.

        Raises:
            RuntimeError: if no page is attached.
        """
        LOG.info("StreamingAgent pasting text")

        # Two separate paths, because the DOM APIs differ:
        #   * <input>/<textarea>: splice into `.value`. The caret is only moved when the
        #     element reports numeric selection offsets; input types such as email/number
        #     report null and throw from setSelectionRange.
        #   * contenteditable: these elements have neither `.value` nor setSelectionRange,
        #     so the current selection range is replaced with a text node instead.
        js_expression = """
        (text) => {
            const activeElement = document.activeElement;
            if (!activeElement) {
                return;
            }

            if (activeElement.tagName === 'INPUT' || activeElement.tagName === 'TEXTAREA') {
                const supportsSelection = typeof activeElement.selectionStart === 'number';
                const start = activeElement.selectionStart || 0;
                const end = activeElement.selectionEnd || 0;
                const value = activeElement.value || '';
                activeElement.value = value.slice(0, start) + text + value.slice(end);
                if (supportsSelection) {
                    const newCursorPos = start + text.length;
                    activeElement.setSelectionRange(newCursorPos, newCursorPos);
                }
                return;
            }

            if (activeElement.isContentEditable) {
                const selection = window.getSelection();
                if (!selection || selection.rangeCount === 0) {
                    return;
                }
                const range = selection.getRangeAt(0);
                range.deleteContents();
                const textNode = document.createTextNode(text);
                range.insertNode(textNode);
                range.setStartAfter(textNode);
                range.collapse(true);
                selection.removeAllRanges();
                selection.addRange(range);
            }
        }
        """

        await self.evaluate_js(js_expression, text)

        LOG.info("StreamingAgent pasted text successfully")

    async def close(self) -> None:
        """
        Close the browser connection and stop Playwright, resetting all handles to `None`.

        Cleanup is exhaustive: the Playwright driver is stopped and the handles are cleared
        even if closing the browser raises. Any such error still propagates to the caller.
        """
        LOG.info("StreamingAgent closing connection")

        try:
            if self.browser:
                await self.browser.close()
        finally:
            # Drop the handles regardless of how the browser close went.
            self.browser = None
            self.browser_context = None
            self.page = None

            if self.pw:
                try:
                    await self.pw.stop()
                finally:
                    self.pw = None

        LOG.info("StreamingAgent closed")
|
|
|
|
|
|
@asynccontextmanager
async def connected_agent(streaming: sc.Streaming | None) -> typing.AsyncIterator[StreamingAgent]:
    """
    Yield a freshly connected `StreamingAgent` and close it again on exit.

    For now every operation pays for the whole cycle:
        - build a new agent
        - connect it
        - [run the operation]
        - close the agent

    That may cost some latency, but locally it is pretty fast, and it keeps things stateless.

    Should it turn out to be too slow, this can be refactored to keep one persistent agent
    per streaming client.

    Raises:
        Exception: if `streaming` is falsy, or it has no browser session / browser address.
    """
    if not streaming:
        failure = "connected_agent: no streaming client provided."
        LOG.error(failure)
        raise Exception(failure)

    session = streaming.browser_session

    if not (session and session.browser_address):
        failure = "connected_agent: no browser session or browser address found for streaming client."
        LOG.error(failure, client_id=streaming.client_id, organization_id=streaming.organization_id)
        raise Exception(failure)

    agent = StreamingAgent(streaming=streaming)

    try:
        # NOTE(jdo:streaming-local-dev): to use BROWSER_REMOTE_DEBUGGING_URL from settings,
        # call `agent.connect()` with no argument instead.
        await agent.connect(session.browser_address)

        yield agent
    finally:
        # Runs whether connect() failed or the caller's body raised.
        await agent.close()
|