Files
Dorod-Sky/skyvern/webeye/utils/page.py
2025-06-26 14:08:40 +00:00

430 lines
17 KiB
Python

from __future__ import annotations
import asyncio
import time
from enum import StrEnum
from io import BytesIO
from typing import Any
import structlog
from PIL import Image
from playwright._impl._errors import TimeoutError
from playwright.async_api import ElementHandle, Frame, Page
from skyvern.config import settings
from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, PAGE_CONTENT_TIMEOUT, SKYVERN_DIR
from skyvern.exceptions import FailedToTakeScreenshot
# Module-level structlog logger shared by the functions and methods below.
LOG = structlog.get_logger()
def load_js_script() -> str:
    """Read the raw ``domUtils.js`` helper script and return its source text.

    The script is looked up at ``{SKYVERN_DIR}/webeye/scraper/domUtils.js``.

    :return: The complete JavaScript source as a string.
    :raises FileNotFoundError: If the script does not exist. The failure is
        logged and the original exception is re-raised unchanged.
    """
    # TODO: Handle file location better. This is a hacky way to find the file location.
    path = f"{SKYVERN_DIR}/webeye/scraper/domUtils.js"
    try:
        # TODO: Implement TS of domUtils.js and use the compiled JS file instead of the raw JS file.
        # This will allow our code to be type safe.
        # Decode explicitly as UTF-8 so the result does not depend on the host's locale encoding.
        with open(path, encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        LOG.exception("Failed to load the JS script", path=path)
        # Bare raise keeps the original traceback intact.
        raise
# Source text of domUtils.js, read once when this module is imported.
# SkyvernFrame.create_instance evaluates it in the wrapped frame.
JS_FUNCTION_DEFS = load_js_script()
class ScreenshotMode(StrEnum):
    """Capture mode consumed by the screenshot helpers in this module."""

    # LITE: the scrolling helper forces draw_boxes off and requests no overlap between pages.
    LITE = "lite"
    # DETAILED: the viewport helper waits for the page load state before capturing.
    DETAILED = "detailed"
async def _page_screenshot_helper(
    page: Page,
    file_path: str | None = None,
    full_page: bool = False,
    timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS,
) -> bytes:
    """Take a screenshot of ``page``, retrying once if the first attempt times out.

    The first attempt asks Playwright to disable animations. If that attempt
    raises ``TimeoutError``, a second attempt is made with animations allowed.

    :param page: Page to capture.
    :param file_path: Optional path forwarded to ``page.screenshot``.
    :param full_page: Forwarded to ``page.screenshot``.
    :param timeout: Forwarded to ``page.screenshot`` for both attempts.
    :return: The screenshot bytes returned by ``page.screenshot``.
    """
    # Everything except the animation policy is identical between the two attempts.
    capture_kwargs: dict[str, Any] = {
        "path": file_path,
        "timeout": timeout,
        "full_page": full_page,
    }
    try:
        return await page.screenshot(animations="disabled", **capture_kwargs)
    except TimeoutError as timeout_error:
        LOG.info(
            f"Timeout error while taking screenshot: {str(timeout_error)}. Going to take a screenshot again with animation allowed."
        )
        # Retry inside the handler so a second failure keeps the first as its context.
        return await page.screenshot(animations="allow", **capture_kwargs)
async def _current_viewpoint_screenshot_helper(
    page: Page,
    file_path: str | None = None,
    full_page: bool = False,
    timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS,
    mode: ScreenshotMode = ScreenshotMode.DETAILED,
) -> bytes:
    """Take one screenshot of ``page`` and normalise every failure to ``FailedToTakeScreenshot``.

    :param page: Page to capture. A closed page is rejected up front.
    :param file_path: Optional path to also save the screenshot to. An empty
        string is treated the same as ``None``.
    :param full_page: Forwarded to the underlying screenshot call.
    :param timeout: Forwarded to the underlying screenshot call.
    :param mode: In ``DETAILED`` mode the page load state is awaited before capturing.
    :return: The screenshot bytes.
    :raises FailedToTakeScreenshot: If the page is closed, or if waiting for the
        load state or taking the screenshot raises any exception.
    """
    if page.is_closed():
        raise FailedToTakeScreenshot(error_message="Page is closed")

    try:
        if mode == ScreenshotMode.DETAILED:
            await page.wait_for_load_state(timeout=settings.BROWSER_LOADING_TIMEOUT_MS)
            LOG.debug("Page is fully loaded, agent is about to take screenshots")

        # Monotonic clock: the measured duration cannot be skewed by wall-clock adjustments.
        start_time = time.perf_counter()
        screenshot = await _page_screenshot_helper(
            page=page,
            file_path=file_path or None,
            full_page=full_page,
            timeout=timeout,
        )
        LOG.debug(
            "Screenshot taking time",
            screenshot_time=time.perf_counter() - start_time,
            file_path=file_path,
        )
        return screenshot
    except TimeoutError as e:
        LOG.exception(f"Timeout error while taking screenshot: {str(e)}")
        raise FailedToTakeScreenshot(error_message=str(e)) from e
    except Exception as e:
        LOG.exception(f"Unknown error while taking screenshot: {str(e)}")
        raise FailedToTakeScreenshot(error_message=str(e)) from e
async def _scrolling_screenshots_helper(
    page: Page,
    url: str | None = None,
    draw_boxes: bool = False,
    max_number: int = settings.MAX_NUM_SCREENSHOTS,
    mode: ScreenshotMode = ScreenshotMode.DETAILED,
) -> tuple[list[bytes], list[int]]:
    """Capture the page viewport by viewport while scrolling down.

    :param page: Page to capture; it is treated as the main frame (index 0).
    :param url: Only used as logging context.
    :param draw_boxes: Whether bounding boxes are drawn while capturing.
        Always disabled in ``LITE`` mode.
    :param max_number: Upper bound on the number of screenshots taken while scrolling.
    :param mode: Capture mode forwarded to the per-viewport screenshot helper.
    :return: ``(screenshots, positions)`` where ``positions[i]`` is the scroll
        offset recorded for ``screenshots[i]``.
    """
    # page is the main frame and the index must be 0
    main_frame = "main.frame"
    main_frame_index = 0

    wrapper = await SkyvernFrame.create_instance(frame=page)
    assert isinstance(wrapper.frame, Page)

    # when mode is lite, we don't draw bounding boxes
    # since draw_boxes impacts the performance of processing
    if mode == ScreenshotMode.LITE:
        draw_boxes = False

    captured: list[bytes] = []
    offsets: list[int] = []

    # Non-scrollable window: a single capture at offset 0 is all there is.
    if not await wrapper.is_window_scrollable():
        if draw_boxes:
            await wrapper.build_elements_and_draw_bounding_boxes(frame=main_frame, frame_index=main_frame_index)

        LOG.debug("Page is not scrollable", url=url, num_screenshots=len(captured))
        captured.append(await _current_viewpoint_screenshot_helper(page=wrapper.frame, mode=mode))
        offsets.append(0)

        if draw_boxes:
            await wrapper.remove_bounding_boxes()
        return captured, offsets

    # Sentinel far enough from any real offset that the first iteration always runs.
    previous_y = -30.0
    current_y = await wrapper.scroll_to_top(draw_boxes=draw_boxes, frame=main_frame, frame_index=main_frame_index)

    # Checking max number of screenshots to prevent infinite loop.
    # A scroll that moved 25px or less is taken to mean the end of the page was reached.
    while abs(previous_y - current_y) > 25 and len(captured) < max_number:
        captured.append(await _current_viewpoint_screenshot_helper(page=wrapper.frame, mode=mode))
        offsets.append(int(current_y))
        previous_y = current_y

        LOG.debug("Scrolling to next page", url=url, num_screenshots=len(captured))
        current_y = await wrapper.scroll_to_next_page(
            draw_boxes=draw_boxes,
            frame=main_frame,
            frame_index=main_frame_index,
            need_overlap=(mode == ScreenshotMode.DETAILED),
        )
        LOG.debug(
            "Scrolled to next page",
            scroll_y_px=current_y,
            scroll_y_px_old=previous_y,
        )

    if draw_boxes:
        await wrapper.remove_bounding_boxes()
    await wrapper.scroll_to_top(draw_boxes=False, frame=main_frame, frame_index=main_frame_index)

    if mode == ScreenshotMode.DETAILED:
        # wait until animation ends, which is triggered by scrolling
        LOG.debug("Waiting for 2 seconds until animation ends.")
        await asyncio.sleep(2)

    return captured, offsets
def _merge_images_by_position(images: list[Image.Image], positions: list[int]) -> Image.Image:
    """Merge screenshots vertically using scroll positions to remove overlaps.

    ``positions[i]`` is the scroll offset at which ``images[i]`` was captured.
    The canvas is as tall as the first image plus the total distance scrolled.
    For each later image, the rows that overlap the previous capture are
    cropped from its top. If the scroll step is larger than the image, the
    uncovered rows are left white so the image stays at its true offset.

    :param images: Screenshots in capture order.
    :param positions: Scroll offset for each screenshot, same length as ``images``.
    :return: The merged image, or ``images[0]`` itself when only one image is given.
    :raises ValueError: If ``images`` is empty or the lengths differ.
    """
    if not images:
        raise ValueError("no images to merge")
    if len(images) != len(positions):
        raise ValueError("images and positions length mismatch")
    if len(images) == 1:
        return images[0]

    max_width = max(img.width for img in images)
    # The per-step differences telescope to the distance between first and last position.
    merged_height = images[0].height + (positions[-1] - positions[0])
    merged_img = Image.new("RGB", (max_width, merged_height), color=(255, 255, 255))

    merged_img.paste(images[0], (0, 0))
    current_y = images[0].height

    for img, prev_position, position in zip(images[1:], positions, positions[1:]):
        step = position - prev_position
        overlap = img.height - step
        if overlap > 0:
            # Drop the rows already present at the bottom of the previous capture.
            piece = img.crop((0, overlap, img.width, img.height))
            paste_y = current_y
        else:
            # Scrolled by at least a full image: nothing to crop. Skip the
            # uncovered rows (if any) so the canvas height and content stay aligned.
            piece = img
            paste_y = current_y - overlap
        merged_img.paste(piece, (0, paste_y))
        current_y = paste_y + piece.height

    return merged_img
class SkyvernFrame:
    """Wrapper around a Playwright ``Page`` or ``Frame`` that calls the ``domUtils.js`` helpers.

    ``create_instance`` evaluates the JavaScript helper definitions in the
    wrapped frame; the instance methods then invoke those helpers by name.
    """

    @staticmethod
    async def evaluate(
        frame: Page | Frame,
        expression: str,
        arg: Any | None = None,
        timeout_ms: float = settings.BROWSER_ACTION_TIMEOUT_MS,
    ) -> Any:
        """Evaluate a JavaScript expression in ``frame`` under an overall deadline.

        :param frame: Page or frame to evaluate in.
        :param expression: JavaScript expression passed to ``frame.evaluate``.
        :param arg: Optional argument passed to the expression.
        :param timeout_ms: Deadline in milliseconds.
        :return: Whatever ``frame.evaluate`` returns.
        :raises TimeoutError: The ``TimeoutError`` imported at the top of this
            module, raised when the deadline elapses.
        """
        try:
            # asyncio.timeout expects seconds.
            async with asyncio.timeout(timeout_ms / 1000):
                return await frame.evaluate(expression=expression, arg=arg)
        except asyncio.TimeoutError:
            LOG.exception("Skyvern timed out trying to analyze the page", expression=expression)
            raise TimeoutError("Skyvern timed out trying to analyze the page")

    @staticmethod
    async def get_url(frame: Page | Frame) -> str:
        """Return ``document.location.href`` as evaluated inside ``frame``."""
        return await SkyvernFrame.evaluate(frame=frame, expression="() => document.location.href")

    @staticmethod
    async def take_scrolling_screenshot(
        page: Page,
        file_path: str | None = None,
        timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS,
        mode: ScreenshotMode = ScreenshotMode.DETAILED,
        scrolling_number: int = settings.MAX_NUM_SCREENSHOTS,
        use_playwright_fullpage: bool = False,  # TODO: THIS IS ONLY FOR EXPERIMENT. will be removed after experiment.
    ) -> bytes:
        """Produce one PNG of the page by stitching per-viewport screenshots together.

        :param page: Page to capture.
        :param file_path: Optional path the resulting image is also written to.
        :param timeout: Timeout in milliseconds. It is forwarded to the
            single-screenshot helper and also bounds the whole scrolling capture.
        :param mode: Capture mode forwarded to the screenshot helpers.
        :param scrolling_number: Maximum number of viewport screenshots to stitch.
            Values ``<= 0`` yield a single screenshot of the current viewport;
            values above ``settings.MAX_NUM_SCREENSHOTS`` are clamped to it.
        :param use_playwright_fullpage: If true, delegate to Playwright's own
            full-page screenshot instead of stitching.
        :return: The image bytes.
        """
        if scrolling_number <= 0:
            return await _current_viewpoint_screenshot_helper(
                page=page, file_path=file_path, timeout=timeout, mode=mode
            )

        if use_playwright_fullpage:
            return await _current_viewpoint_screenshot_helper(
                page=page, file_path=file_path, timeout=timeout, full_page=True
            )

        if scrolling_number > settings.MAX_NUM_SCREENSHOTS:
            LOG.warning(
                "scrolling_number is greater than the max number of screenshots, setting it to the max number of screenshots",
                scrolling_number=scrolling_number,
                max_number=settings.MAX_NUM_SCREENSHOTS,
            )
            scrolling_number = settings.MAX_NUM_SCREENSHOTS

        # Stitch split screenshots ourselves instead of using the full-page screenshot from playwright.
        LOG.debug("Page is fully loaded, agent is about to generate the full page screenshot")
        start_time = time.perf_counter()

        # `timeout` is in milliseconds (it is also handed to Playwright), while
        # asyncio.timeout expects seconds -- same conversion as in `evaluate`.
        async with asyncio.timeout(timeout / 1000):
            screenshots, positions = await _scrolling_screenshots_helper(
                page=page, mode=mode, max_number=scrolling_number
            )

        images = []
        for screenshot in screenshots:
            with Image.open(BytesIO(screenshot)) as img:
                # Force the pixel data to be read before the context manager exits.
                img.load()
                images.append(img)

        merged_img = _merge_images_by_position(images, positions)

        buffer = BytesIO()
        merged_img.save(buffer, format="PNG")
        img_data = buffer.getvalue()

        if file_path is not None:
            with open(file_path, "wb") as f:
                f.write(img_data)

        LOG.debug(
            "Full page screenshot taking time",
            screenshot_time=time.perf_counter() - start_time,
            file_path=file_path,
        )
        return img_data

    @staticmethod
    async def take_split_screenshots(
        page: Page,
        url: str | None = None,
        draw_boxes: bool = False,
        max_number: int = settings.MAX_NUM_SCREENSHOTS,
        scroll: bool = True,
    ) -> list[bytes]:
        """Return per-viewport screenshots of ``page`` in ``DETAILED`` mode.

        :param page: Page to capture.
        :param url: Only used as logging context by the scrolling helper.
        :param draw_boxes: Whether bounding boxes are drawn while capturing.
        :param max_number: Maximum number of screenshots when scrolling.
        :param scroll: When false, only the current viewport is captured.
        :return: The screenshots in capture order.
        """
        if not scroll:
            return [await _current_viewpoint_screenshot_helper(page=page, mode=ScreenshotMode.DETAILED)]

        screenshots, _ = await _scrolling_screenshots_helper(
            page=page, url=url, max_number=max_number, draw_boxes=draw_boxes, mode=ScreenshotMode.DETAILED
        )
        return screenshots

    @classmethod
    async def create_instance(cls, frame: Page | Frame) -> SkyvernFrame:
        """Wrap ``frame`` and evaluate the JS helper definitions in it."""
        instance = cls(frame=frame)
        await cls.evaluate(frame=instance.frame, expression=JS_FUNCTION_DEFS)
        return instance

    def __init__(self, frame: Page | Frame) -> None:
        """Store ``frame``; no JavaScript is evaluated here (see ``create_instance``)."""
        self.frame = frame

    def get_frame(self) -> Page | Frame:
        """Return the wrapped page or frame."""
        return self.frame

    async def get_content(self, timeout: float = PAGE_CONTENT_TIMEOUT) -> str:
        """Return ``frame.content()``, bounded by ``timeout``.

        ``timeout`` is passed to ``asyncio.timeout`` unchanged, i.e. it is
        interpreted as seconds.
        """
        async with asyncio.timeout(timeout):
            return await self.frame.content()

    async def get_scroll_x_y(self) -> tuple[int, int]:
        """Call the ``getScrollXY`` JS helper and return its result."""
        js_script = "() => getScrollXY()"
        return await self.evaluate(frame=self.frame, expression=js_script)

    async def scroll_to_x_y(self, x: int, y: int) -> None:
        """Call the ``scrollToXY`` JS helper with ``x`` and ``y``."""
        js_script = "([x, y]) => scrollToXY(x, y)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=[x, y])

    async def scroll_to_element_bottom(self, element: ElementHandle, page_by_page: bool = False) -> None:
        """Call the ``scrollToElementBottom`` JS helper for ``element``."""
        js_script = "([element, page_by_page]) => scrollToElementBottom(element, page_by_page)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=[element, page_by_page])

    async def scroll_to_element_top(self, element: ElementHandle) -> None:
        """Call the ``scrollToElementTop`` JS helper for ``element``."""
        js_script = "(element) => scrollToElementTop(element)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=element)

    async def parse_element_from_html(self, frame: str, element: ElementHandle, interactable: bool) -> dict:
        """Call the ``buildElementObject`` JS helper and return its result."""
        js_script = "async ([frame, element, interactable]) => await buildElementObject(frame, element, interactable)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=[frame, element, interactable])

    async def get_element_scrollable(self, element: ElementHandle) -> bool:
        """Return the result of the ``isScrollable`` JS helper for ``element``."""
        js_script = "(element) => isScrollable(element)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=element)

    async def get_element_visible(self, element: ElementHandle) -> bool:
        """Return whether ``isElementVisible`` holds and ``isHidden`` does not, per the JS helpers."""
        js_script = "(element) => isElementVisible(element) && !isHidden(element)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=element)

    async def get_disabled_from_style(self, element: ElementHandle) -> bool:
        """Return the result of the ``checkDisabledFromStyle`` JS helper for ``element``."""
        js_script = "(element) => checkDisabledFromStyle(element)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=element)

    async def get_blocking_element_id(self, element: ElementHandle) -> tuple[str, bool]:
        """Return the result of the ``getBlockElementUniqueID`` JS helper for ``element``."""
        js_script = "(element) => getBlockElementUniqueID(element)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=element)

    async def scroll_to_top(self, draw_boxes: bool, frame: str, frame_index: int) -> float:
        """
        Scroll to the top of the page via the ``safeScrollToTop`` JS helper.

        :param draw_boxes: Forwarded to the JS helper; if True, bounding boxes are requested.
        :param frame: Frame identifier forwarded to the JS helper.
        :param frame_index: Frame index forwarded to the JS helper.
        :return: The value reported by the JS helper, used by callers as the vertical scroll offset in pixels.
        """
        js_script = "async ([draw_boxes, frame, frame_index]) => await safeScrollToTop(draw_boxes, frame, frame_index)"
        return await self.evaluate(
            frame=self.frame,
            expression=js_script,
            timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS,
            arg=[draw_boxes, frame, frame_index],
        )

    async def scroll_to_next_page(
        self, draw_boxes: bool, frame: str, frame_index: int, need_overlap: bool = True
    ) -> float:
        """
        Scroll down by one page via the ``scrollToNextPage`` JS helper.

        :param draw_boxes: Forwarded to the JS helper; if True, bounding boxes are requested.
        :param frame: Frame identifier forwarded to the JS helper.
        :param frame_index: Frame index forwarded to the JS helper.
        :param need_overlap: Forwarded to the JS helper.
        :return: The value reported by the JS helper, used by callers as the vertical scroll offset in pixels.
        """
        js_script = "async ([draw_boxes, frame, frame_index, need_overlap]) => await scrollToNextPage(draw_boxes, frame, frame_index, need_overlap)"
        return await self.evaluate(
            frame=self.frame,
            expression=js_script,
            timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS,
            arg=[draw_boxes, frame, frame_index, need_overlap],
        )

    async def remove_bounding_boxes(self) -> None:
        """
        Remove the bounding boxes from the wrapped frame via the ``removeBoundingBoxes`` JS helper.
        """
        js_script = "() => removeBoundingBoxes()"
        await self.evaluate(frame=self.frame, expression=js_script, timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS)

    async def build_elements_and_draw_bounding_boxes(self, frame: str, frame_index: int) -> None:
        """Call the ``buildElementsAndDrawBoundingBoxes`` JS helper for the given frame identifier and index."""
        js_script = "async ([frame, frame_index]) => await buildElementsAndDrawBoundingBoxes(frame, frame_index)"
        await self.evaluate(
            frame=self.frame,
            expression=js_script,
            timeout_ms=BUILDING_ELEMENT_TREE_TIMEOUT_MS,
            arg=[frame, frame_index],
        )

    async def is_window_scrollable(self) -> bool:
        """Return the result of the ``isWindowScrollable`` JS helper."""
        js_script = "() => isWindowScrollable()"
        return await self.evaluate(frame=self.frame, expression=js_script)

    async def is_parent(self, parent: ElementHandle, child: ElementHandle) -> bool:
        """Return the result of the ``isParent`` JS helper for the two elements."""
        js_script = "([parent, child]) => isParent(parent, child)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=[parent, child])

    async def is_sibling(self, el1: ElementHandle, el2: ElementHandle) -> bool:
        """Return the result of the ``isSibling`` JS helper for the two elements."""
        js_script = "([el1, el2]) => isSibling(el1, el2)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=[el1, el2])

    async def has_ASP_client_control(self) -> bool:
        """Return the result of the ``hasASPClientControl`` JS helper."""
        js_script = "() => hasASPClientControl()"
        return await self.evaluate(frame=self.frame, expression=js_script)

    async def click_element_in_javascript(self, element: ElementHandle) -> None:
        """Invoke ``element.click()`` inside the frame."""
        js_script = "(element) => element.click()"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=element)

    async def remove_target_attr(self, element: ElementHandle) -> None:
        """Remove the ``target`` attribute from ``element`` inside the frame."""
        js_script = "(element) => element.removeAttribute('target')"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=element)

    async def get_select_options(self, element: ElementHandle) -> tuple[list, str]:
        """Return the result of the ``getSelectOptions`` JS helper for ``element``."""
        js_script = "([element]) => getSelectOptions(element)"
        return await self.evaluate(frame=self.frame, expression=js_script, arg=[element])