diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index f842dc42..d7bfea43 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -741,7 +741,8 @@ async def handle_sequential_click_for_dropdown( reasoning=action.reasoning, intention=action.intention, element_id=action.element_id ), step=step, - scraped_page=scraped_page, + element_tree_builder=scraped_page, + skyvern_element=anchor_element, ) if dropdown_select_context.is_date_related: @@ -934,7 +935,8 @@ async def handle_input_text_action( input_or_select_context = await _get_input_or_select_context( action=action, - scraped_page=scraped_page, + element_tree_builder=scraped_page, + skyvern_element=skyvern_element, step=step, ) @@ -1538,7 +1540,7 @@ async def handle_select_option_action( ) input_or_select_context = await _get_input_or_select_context( - action=action, scraped_page=scraped_page, step=step + action=action, element_tree_builder=scraped_page, step=step, skyvern_element=skyvern_element ) if len(incremental_element) == 0: @@ -3332,26 +3334,9 @@ async def normal_select( action_result: List[ActionResult] = [] is_success = False locator = skyvern_element.get_locator() - - prompt = load_prompt_with_elements( - element_tree_builder=builder, - prompt_engine=prompt_engine, - template_name="parse-input-or-select-context", - action_reasoning=action.reasoning, - element_id=action.element_id, + input_or_select_context = await _get_input_or_select_context( + action=action, element_tree_builder=builder, step=step, skyvern_element=skyvern_element ) - json_response = await app.SECONDARY_LLM_API_HANDLER( - prompt=prompt, step=step, prompt_name="parse-input-or-select-context" - ) - json_response["intention"] = action.intention - input_or_select_context = InputOrSelectContext.model_validate(json_response) - LOG.info( - "Parsed input/select context", - context=input_or_select_context, - task_id=task.task_id, - step_id=step.step_id, - ) - await skyvern_element.refresh_select_options() options_html = skyvern_element.build_HTML() field_information = ( @@ -3694,10 +3679,46 @@ class AbstractActionForContextParse(BaseModel): async def _get_input_or_select_context( - action: InputTextAction | SelectOptionAction | AbstractActionForContextParse, scraped_page: ScrapedPage, step: Step + action: InputTextAction | SelectOptionAction | AbstractActionForContextParse, + skyvern_element: SkyvernElement, + element_tree_builder: ElementTreeBuilder, + step: Step, + ancestor_depth: int = 5, ) -> InputOrSelectContext: + skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame()) + try: + depth = await skyvern_frame.get_element_dom_depth(await skyvern_element.get_element_handler()) + except Exception: + LOG.warning("Failed to get element depth, using the original element tree", exc_info=True) + depth = 0 + + if depth > ancestor_depth: + # use ancestor to build the context + path = "/".join([".."] * ancestor_depth) + locator = skyvern_element.get_locator().locator(path) + try: + element_handle = await locator.element_handle(timeout=settings.BROWSER_ACTION_TIMEOUT_MS) + if element_handle is not None: + elements, element_tree = await skyvern_frame.build_tree_from_element( + starter=element_handle, + frame=skyvern_element.get_frame_id(), + ) + clean_up_func = app.AGENT_FUNCTION.cleanup_element_tree_factory() + element_tree = await clean_up_func(skyvern_element.get_frame(), "", copy.deepcopy(element_tree)) + element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree)) + element_tree_builder = ScrapedPage( + elements=elements, + element_tree=element_tree, + element_tree_trimmed=element_tree_trimmed, + _browser_state=None, + _clean_up_func=None, + _scrape_exclude=None, + ) + except Exception: + LOG.warning("Failed to get sub element tree, using the original element tree", exc_info=True, path=path) + prompt = load_prompt_with_elements( - element_tree_builder=scraped_page, + element_tree_builder=element_tree_builder, prompt_engine=prompt_engine, template_name="parse-input-or-select-context", action_reasoning=action.reasoning, diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 79751139..cb364ad8 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -257,16 +257,16 @@ class ScrapedPage(BaseModel, ElementTreeBuilder): elements: list[dict] id_to_element_dict: dict[str, dict] = {} id_to_frame_dict: dict[str, str] = {} - id_to_css_dict: dict[str, str] - id_to_element_hash: dict[str, str] - hash_to_element_ids: dict[str, list[str]] + id_to_css_dict: dict[str, str] = {} + id_to_element_hash: dict[str, str] = {} + hash_to_element_ids: dict[str, list[str]] = {} element_tree: list[dict] element_tree_trimmed: list[dict] economy_element_tree: list[dict] | None = None last_used_element_tree: list[dict] | None = None - screenshots: list[bytes] - url: str - html: str + screenshots: list[bytes] = [] + url: str = "" + html: str = "" extracted_text: str | None = None window_dimension: dict[str, int] | None = None _browser_state: BrowserState = PrivateAttr() diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 04ac68d5..122bb108 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -477,6 +477,10 @@ class SkyvernFrame: js_script = "([element]) => getSelectOptions(element)" return await self.evaluate(frame=self.frame, expression=js_script, arg=[element]) + async def get_element_dom_depth(self, element: ElementHandle) -> int: + js_script = "([element]) => getElementDomDepth(element)" + return await self.evaluate(frame=self.frame, expression=js_script, arg=[element]) + @TraceManager.traced_async() async def build_tree_from_body( self, @@ -500,6 +504,19 @@ class SkyvernFrame: frame=self.frame, expression=js_script, timeout_ms=timeout_ms, arg=[wait_until_finished] ) + @TraceManager.traced_async() + async def build_tree_from_element( + self, + starter: ElementHandle, + frame: str, + full_tree: bool = False, + timeout_ms: float = SettingsManager.get_settings().BROWSER_SCRAPING_BUILDING_ELEMENT_TREE_TIMEOUT_MS, + ) -> tuple[list[dict], list[dict]]: + js_script = "async ([starter, frame, full_tree]) => await buildElementTree(starter, frame, full_tree)" + return await self.evaluate( + frame=self.frame, expression=js_script, timeout_ms=timeout_ms, arg=[starter, frame, full_tree] + ) + async def safe_wait_for_animation_end(self, timeout_ms: float = 3000) -> None: try: async with asyncio.timeout(timeout_ms / 1000):