add experiment to force all textual elements to be interactable (#2936)

This commit is contained in:
LawyZheng
2025-07-14 13:09:40 +08:00
committed by GitHub
parent 49590a51a5
commit dd9710eb9f
3 changed files with 28 additions and 13 deletions

View File

@@ -7,6 +7,9 @@ from skyvern.constants import SKYVERN_DIR
class Settings(BaseSettings): class Settings(BaseSettings):
model_config = SettingsConfigDict(env_file=(".env", ".env.staging", ".env.prod"), extra="ignore") model_config = SettingsConfigDict(env_file=(".env", ".env.staging", ".env.prod"), extra="ignore")
# settings for experimentation
ENABLE_EXP_ALL_TEXTUAL_ELEMENTS_INTERACTABLE: bool = False
ADDITIONAL_MODULES: list[str] = [] ADDITIONAL_MODULES: list[str] = []
BROWSER_TYPE: str = "chromium-headful" BROWSER_TYPE: str = "chromium-headful"

View File

@@ -1463,6 +1463,10 @@ async function buildElementTree(
hoverStylesMap = await getHoverStylesMap(); hoverStylesMap = await getHoverStylesMap();
} }
if (window.GlobalEnableAllTextualElements === undefined) {
window.GlobalEnableAllTextualElements = false;
}
var elements = []; var elements = [];
var resultArray = []; var resultArray = [];
@@ -1509,7 +1513,7 @@ async function buildElementTree(
} }
const isVisible = isElementVisible(element); const isVisible = isElementVisible(element);
if (isVisible && !isHidden(element) && !isScriptOrStyle(element)) { if (isVisible && !isHidden(element) && !isScriptOrStyle(element)) {
const interactable = isInteractable(element, hoverStylesMap); let interactable = isInteractable(element, hoverStylesMap);
let elementObj = null; let elementObj = null;
let isParentSVG = null; let isParentSVG = null;
if (interactable) { if (interactable) {
@@ -1542,6 +1546,10 @@ async function buildElementTree(
getElementText(element).length > 0 && getElementText(element).length > 0 &&
getElementText(element).length <= 5000 getElementText(element).length <= 5000
) { ) {
if (window.GlobalEnableAllTextualElements) {
// force all textual elements to be interactable
interactable = true;
}
elementObj = await buildElementObject(frame, element, interactable); elementObj = await buildElementObject(frame, element, interactable);
} else if (full_tree) { } else if (full_tree) {
// when building full tree, we only get text from element itself // when building full tree, we only get text from element itself

View File

@@ -11,9 +11,9 @@ from PIL import Image
from playwright._impl._errors import TimeoutError from playwright._impl._errors import TimeoutError
from playwright.async_api import ElementHandle, Frame, Page from playwright.async_api import ElementHandle, Frame, Page
from skyvern.config import settings
from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, PAGE_CONTENT_TIMEOUT, SKYVERN_DIR from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, PAGE_CONTENT_TIMEOUT, SKYVERN_DIR
from skyvern.exceptions import FailedToTakeScreenshot from skyvern.exceptions import FailedToTakeScreenshot
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.forge.sdk.trace import TraceManager from skyvern.forge.sdk.trace import TraceManager
LOG = structlog.get_logger() LOG = structlog.get_logger()
@@ -44,7 +44,7 @@ async def _page_screenshot_helper(
page: Page, page: Page,
file_path: str | None = None, file_path: str | None = None,
full_page: bool = False, full_page: bool = False,
timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS, timeout: float = SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS,
) -> bytes: ) -> bytes:
try: try:
return await page.screenshot( return await page.screenshot(
@@ -69,14 +69,14 @@ async def _current_viewpoint_screenshot_helper(
page: Page, page: Page,
file_path: str | None = None, file_path: str | None = None,
full_page: bool = False, full_page: bool = False,
timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS, timeout: float = SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS,
mode: ScreenshotMode = ScreenshotMode.DETAILED, mode: ScreenshotMode = ScreenshotMode.DETAILED,
) -> bytes: ) -> bytes:
if page.is_closed(): if page.is_closed():
raise FailedToTakeScreenshot(error_message="Page is closed") raise FailedToTakeScreenshot(error_message="Page is closed")
try: try:
if mode == ScreenshotMode.DETAILED: if mode == ScreenshotMode.DETAILED:
await page.wait_for_load_state(timeout=settings.BROWSER_LOADING_TIMEOUT_MS) await page.wait_for_load_state(timeout=SettingsManager.get_settings().BROWSER_LOADING_TIMEOUT_MS)
LOG.debug("Page is fully loaded, agent is about to take screenshots") LOG.debug("Page is fully loaded, agent is about to take screenshots")
start_time = time.time() start_time = time.time()
screenshot: bytes = b"" screenshot: bytes = b""
@@ -105,7 +105,7 @@ async def _scrolling_screenshots_helper(
skyvern_page: SkyvernFrame, skyvern_page: SkyvernFrame,
url: str | None = None, url: str | None = None,
draw_boxes: bool = False, draw_boxes: bool = False,
max_number: int = settings.MAX_NUM_SCREENSHOTS, max_number: int = SettingsManager.get_settings().MAX_NUM_SCREENSHOTS,
mode: ScreenshotMode = ScreenshotMode.DETAILED, mode: ScreenshotMode = ScreenshotMode.DETAILED,
) -> tuple[list[bytes], list[int]]: ) -> tuple[list[bytes], list[int]]:
# page is the main frame and the index must be 0 # page is the main frame and the index must be 0
@@ -208,7 +208,7 @@ class SkyvernFrame:
frame: Page | Frame, frame: Page | Frame,
expression: str, expression: str,
arg: Any | None = None, arg: Any | None = None,
timeout_ms: float = settings.BROWSER_ACTION_TIMEOUT_MS, timeout_ms: float = SettingsManager.get_settings().BROWSER_ACTION_TIMEOUT_MS,
) -> Any: ) -> Any:
try: try:
async with asyncio.timeout(timeout_ms / 1000): async with asyncio.timeout(timeout_ms / 1000):
@@ -226,9 +226,9 @@ class SkyvernFrame:
async def take_scrolling_screenshot( async def take_scrolling_screenshot(
page: Page, page: Page,
file_path: str | None = None, file_path: str | None = None,
timeout: float = settings.BROWSER_SCREENSHOT_TIMEOUT_MS, timeout: float = SettingsManager.get_settings().BROWSER_SCREENSHOT_TIMEOUT_MS,
mode: ScreenshotMode = ScreenshotMode.DETAILED, mode: ScreenshotMode = ScreenshotMode.DETAILED,
scrolling_number: int = settings.MAX_NUM_SCREENSHOTS, scrolling_number: int = SettingsManager.get_settings().MAX_NUM_SCREENSHOTS,
use_playwright_fullpage: bool = False, # TODO: THIS IS ONLY FOR EXPERIMENT. will be removed after experiment. use_playwright_fullpage: bool = False, # TODO: THIS IS ONLY FOR EXPERIMENT. will be removed after experiment.
) -> bytes: ) -> bytes:
if scrolling_number <= 0: if scrolling_number <= 0:
@@ -241,13 +241,13 @@ class SkyvernFrame:
page=page, file_path=file_path, timeout=timeout, full_page=True page=page, file_path=file_path, timeout=timeout, full_page=True
) )
if scrolling_number > settings.MAX_NUM_SCREENSHOTS: if scrolling_number > SettingsManager.get_settings().MAX_NUM_SCREENSHOTS:
LOG.warning( LOG.warning(
"scrolling_number is greater than the max number of screenshots, setting it to the max number of screenshots", "scrolling_number is greater than the max number of screenshots, setting it to the max number of screenshots",
scrolling_number=scrolling_number, scrolling_number=scrolling_number,
max_number=settings.MAX_NUM_SCREENSHOTS, max_number=SettingsManager.get_settings().MAX_NUM_SCREENSHOTS,
) )
scrolling_number = settings.MAX_NUM_SCREENSHOTS scrolling_number = SettingsManager.get_settings().MAX_NUM_SCREENSHOTS
# use split screenshots with lite mode, instead of the full-page screenshot from playwright # use split screenshots with lite mode, instead of the full-page screenshot from playwright
LOG.debug("Page is fully loaded, agent is about to generate the full page screenshot") LOG.debug("Page is fully loaded, agent is about to generate the full page screenshot")
@@ -293,7 +293,7 @@ class SkyvernFrame:
page: Page, page: Page,
url: str | None = None, url: str | None = None,
draw_boxes: bool = False, draw_boxes: bool = False,
max_number: int = settings.MAX_NUM_SCREENSHOTS, max_number: int = SettingsManager.get_settings().MAX_NUM_SCREENSHOTS,
scroll: bool = True, scroll: bool = True,
) -> list[bytes]: ) -> list[bytes]:
if not scroll: if not scroll:
@@ -313,6 +313,10 @@ class SkyvernFrame:
async def create_instance(cls, frame: Page | Frame) -> SkyvernFrame: async def create_instance(cls, frame: Page | Frame) -> SkyvernFrame:
instance = cls(frame=frame) instance = cls(frame=frame)
await cls.evaluate(frame=instance.frame, expression=JS_FUNCTION_DEFS) await cls.evaluate(frame=instance.frame, expression=JS_FUNCTION_DEFS)
if SettingsManager.get_settings().ENABLE_EXP_ALL_TEXTUAL_ELEMENTS_INTERACTABLE:
await instance.evaluate(
frame=instance.frame, expression="() => window.GlobalEnableAllTextualElements = true"
)
return instance return instance
def __init__(self, frame: Page | Frame) -> None: def __init__(self, frame: Page | Frame) -> None: