Decrease the token count of the parse-input prompt (#3210)

This commit is contained in:
LawyZheng
2025-08-16 10:05:38 +08:00
committed by GitHub
parent 618fba5371
commit 9a359ebfde
3 changed files with 68 additions and 30 deletions

View File

@@ -741,7 +741,8 @@ async def handle_sequential_click_for_dropdown(
reasoning=action.reasoning, intention=action.intention, element_id=action.element_id
),
step=step,
scraped_page=scraped_page,
element_tree_builder=scraped_page,
skyvern_element=anchor_element,
)
if dropdown_select_context.is_date_related:
@@ -934,7 +935,8 @@ async def handle_input_text_action(
input_or_select_context = await _get_input_or_select_context(
action=action,
scraped_page=scraped_page,
element_tree_builder=scraped_page,
skyvern_element=skyvern_element,
step=step,
)
@@ -1538,7 +1540,7 @@ async def handle_select_option_action(
)
input_or_select_context = await _get_input_or_select_context(
action=action, scraped_page=scraped_page, step=step
action=action, element_tree_builder=scraped_page, step=step, skyvern_element=skyvern_element
)
if len(incremental_element) == 0:
@@ -3332,26 +3334,9 @@ async def normal_select(
action_result: List[ActionResult] = []
is_success = False
locator = skyvern_element.get_locator()
prompt = load_prompt_with_elements(
element_tree_builder=builder,
prompt_engine=prompt_engine,
template_name="parse-input-or-select-context",
action_reasoning=action.reasoning,
element_id=action.element_id,
input_or_select_context = await _get_input_or_select_context(
action=action, element_tree_builder=builder, step=step, skyvern_element=skyvern_element
)
json_response = await app.SECONDARY_LLM_API_HANDLER(
prompt=prompt, step=step, prompt_name="parse-input-or-select-context"
)
json_response["intention"] = action.intention
input_or_select_context = InputOrSelectContext.model_validate(json_response)
LOG.info(
"Parsed input/select context",
context=input_or_select_context,
task_id=task.task_id,
step_id=step.step_id,
)
await skyvern_element.refresh_select_options()
options_html = skyvern_element.build_HTML()
field_information = (
@@ -3694,10 +3679,46 @@ class AbstractActionForContextParse(BaseModel):
async def _get_input_or_select_context(
action: InputTextAction | SelectOptionAction | AbstractActionForContextParse, scraped_page: ScrapedPage, step: Step
action: InputTextAction | SelectOptionAction | AbstractActionForContextParse,
skyvern_element: SkyvernElement,
element_tree_builder: ElementTreeBuilder,
step: Step,
ancestor_depth: int = 5,
) -> InputOrSelectContext:
skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame())
try:
depth = await skyvern_frame.get_element_dom_depth(await skyvern_element.get_element_handler())
except Exception:
LOG.warning("Failed to get element depth, using the original element tree", exc_info=True)
depth = 0
if depth > ancestor_depth:
# use ancestor to build the context
path = "/".join([".."] * ancestor_depth)
locator = skyvern_element.get_locator().locator(path)
try:
element_handle = await locator.element_handle(timeout=settings.BROWSER_ACTION_TIMEOUT_MS)
if element_handle is not None:
elements, element_tree = await skyvern_frame.build_tree_from_element(
starter=element_handle,
frame=skyvern_element.get_frame_id(),
)
clean_up_func = app.AGENT_FUNCTION.cleanup_element_tree_factory()
element_tree = await clean_up_func(skyvern_element.get_frame(), "", copy.deepcopy(element_tree))
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
element_tree_builder = ScrapedPage(
elements=elements,
element_tree=element_tree,
element_tree_trimmed=element_tree_trimmed,
_browser_state=None,
_clean_up_func=None,
_scrape_exclude=None,
)
except Exception:
LOG.warning("Failed to get sub element tree, using the original element tree", exc_info=True, path=path)
prompt = load_prompt_with_elements(
element_tree_builder=scraped_page,
element_tree_builder=element_tree_builder,
prompt_engine=prompt_engine,
template_name="parse-input-or-select-context",
action_reasoning=action.reasoning,

View File

@@ -257,16 +257,16 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
elements: list[dict]
id_to_element_dict: dict[str, dict] = {}
id_to_frame_dict: dict[str, str] = {}
id_to_css_dict: dict[str, str]
id_to_element_hash: dict[str, str]
hash_to_element_ids: dict[str, list[str]]
id_to_css_dict: dict[str, str] = {}
id_to_element_hash: dict[str, str] = {}
hash_to_element_ids: dict[str, list[str]] = {}
element_tree: list[dict]
element_tree_trimmed: list[dict]
economy_element_tree: list[dict] | None = None
last_used_element_tree: list[dict] | None = None
screenshots: list[bytes]
url: str
html: str
screenshots: list[bytes] = []
url: str = ""
html: str = ""
extracted_text: str | None = None
window_dimension: dict[str, int] | None = None
_browser_state: BrowserState = PrivateAttr()

View File

@@ -477,6 +477,10 @@ class SkyvernFrame:
js_script = "([element]) => getSelectOptions(element)"
return await self.evaluate(frame=self.frame, expression=js_script, arg=[element])
async def get_element_dom_depth(self, element: ElementHandle) -> int:
    """Run the in-page ``getElementDomDepth`` helper against ``element``.

    The call is delegated to ``self.evaluate`` on ``self.frame``; the element
    handle is passed as the single entry of the script's argument list.

    Args:
        element: Handle of the element handed to the in-page script.

    Returns:
        The value produced by ``self.evaluate`` for the script (annotated as
        ``int``; presumably the element's depth in the DOM -- confirm against
        the page-side ``getElementDomDepth`` implementation).
    """
    return await self.evaluate(
        frame=self.frame,
        expression="([element]) => getElementDomDepth(element)",
        arg=[element],
    )
@TraceManager.traced_async()
async def build_tree_from_body(
self,
@@ -500,6 +504,19 @@ class SkyvernFrame:
frame=self.frame, expression=js_script, timeout_ms=timeout_ms, arg=[wait_until_finished]
)
@TraceManager.traced_async()
async def build_tree_from_element(
    self,
    starter: ElementHandle,
    frame: str,
    full_tree: bool = False,
    timeout_ms: float = SettingsManager.get_settings().BROWSER_SCRAPING_BUILDING_ELEMENT_TREE_TIMEOUT_MS,
) -> tuple[list[dict], list[dict]]:
    """Run the in-page ``buildElementTree`` helper starting at ``starter``.

    The script is evaluated through ``self.evaluate`` on ``self.frame`` and
    receives ``starter``, ``frame`` and ``full_tree`` in that order.

    Args:
        starter: Handle of the element forwarded as the first script argument.
        frame: Frame identifier string forwarded to the script.
        full_tree: Flag forwarded unchanged to ``buildElementTree``.
        timeout_ms: Timeout passed to ``self.evaluate``. The default is read
            from the settings once, when this function is defined.

    Returns:
        The result of ``self.evaluate`` for the script, annotated as a pair of
        ``list[dict]`` values.
    """
    # Positional order must match the destructuring in the script's parameter list.
    script_args = [starter, frame, full_tree]
    return await self.evaluate(
        frame=self.frame,
        expression="async ([starter, frame, full_tree]) => await buildElementTree(starter, frame, full_tree)",
        timeout_ms=timeout_ms,
        arg=script_args,
    )
async def safe_wait_for_animation_end(self, timeout_ms: float = 3000) -> None:
try:
async with asyncio.timeout(timeout_ms / 1000):