extend auto completion agent logic (#3010)

This commit is contained in:
LawyZheng
2025-07-22 14:25:47 +08:00
committed by GitHub
parent 95c0ac4deb
commit 03cc8a5a52
3 changed files with 43 additions and 8 deletions

View File

@@ -23,7 +23,7 @@ Reply in JSON format with the following keys:
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence.
"relevance_float": float, // The relevance between the selected element and the provided information. You should consider how much the selected option is related to the user goal, the user details and the context. Pick a number between 0.00 and 1.00. 0.00 means no relevance, 1.00 means full relevance; the precision is 0.01.
"value": str, // The value to select.
"id": str, // The id of the most relevant and interactable element to take the action. The id must be from "HTML elements". It should be null if no element is relative or there's no auto completion suggestion.
"id": str, // The id of the most relevant and interactable element to take the action. The id must be from {{'"IDs for emerging HTML elements"' if new_elements_ids else '"HTML elements"'}}. It should be null if no element is relevant or there's no auto completion suggestion.
}
Context:
@@ -45,7 +45,12 @@ User details:
```
{{ navigation_payload_str }}
```
{% if new_elements_ids %}
IDs for emerging HTML elements:
```
{{ new_elements_ids }}
```
{% endif %}
HTML elements:
```
{{ elements }}

View File

@@ -13,7 +13,7 @@ async def wait_on_persistent_browser_address(
session_id: str,
organization_id: str,
timeout: int = 600,
poll_interval: int = 2,
poll_interval: float = 2,
) -> str | None:
persistent_browser_session = await await_browser_session(db, session_id, organization_id, timeout, poll_interval)
return persistent_browser_session.browser_address if persistent_browser_session else None
@@ -24,7 +24,7 @@ async def await_browser_session(
session_id: str,
organization_id: str,
timeout: int = 600,
poll_interval: int = 2,
poll_interval: float = 2,
) -> PersistentBrowserSession | None:
try:
async with asyncio.timeout(timeout):

View File

@@ -1096,6 +1096,7 @@ async def handle_input_text_action(
if await skyvern_element.is_auto_completion_input() or input_or_select_context.is_location_input:
if result := await input_or_auto_complete_input(
input_or_select_context=input_or_select_context,
scraped_page=scraped_page,
page=page,
dom=dom,
text=text,
@@ -2132,6 +2133,7 @@ async def chain_click(
async def choose_auto_completion_dropdown(
context: InputOrSelectContext,
page: Page,
scraped_page: ScrapedPage,
dom: DomUtil,
text: str,
skyvern_element: SkyvernElement,
@@ -2190,11 +2192,34 @@ async def choose_auto_completion_dropdown(
incremental_element.extend(confirmed_preserved_list)
result.incremental_elements = copy.deepcopy(incremental_element)
if len(incremental_element) == 0:
raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text)
html = ""
new_interactable_element_ids = []
if len(incremental_element) > 0:
cleaned_incremental_element = remove_duplicated_HTML_element(incremental_element)
html = incremental_scraped.build_html_tree(cleaned_incremental_element)
else:
scraped_page_after_open = await scraped_page.generate_scraped_page_without_screenshots()
new_element_ids = set(scraped_page_after_open.id_to_css_dict.keys()) - set(
scraped_page.id_to_css_dict.keys()
)
dom_after_open = DomUtil(scraped_page=scraped_page_after_open, page=page)
new_interactable_element_ids = [
element_id
for element_id in new_element_ids
if (await dom_after_open.get_skyvern_element_by_id(element_id)).is_interactable()
]
if len(new_interactable_element_ids) == 0:
raise NoIncrementalElementFoundForAutoCompletion(element_id=skyvern_element.get_id(), text=text)
LOG.info(
"New elements detected after the input",
new_elements_ids=new_interactable_element_ids,
)
result.incremental_elements = copy.deepcopy(
[scraped_page_after_open.id_to_element_dict[element_id] for element_id in new_interactable_element_ids]
)
html = scraped_page_after_open.build_element_tree()
cleaned_incremental_element = remove_duplicated_HTML_element(incremental_element)
html = incremental_scraped.build_html_tree(cleaned_incremental_element)
auto_completion_confirm_prompt = prompt_engine.load_prompt(
"auto-completion-choose-option",
is_search=context.is_search_bar,
@@ -2203,6 +2228,7 @@ async def choose_auto_completion_dropdown(
navigation_goal=task.navigation_goal,
navigation_payload_str=json.dumps(task.navigation_payload),
elements=html,
new_elements_ids=new_interactable_element_ids,
local_datetime=datetime.now(skyvern_context.ensure_context().tz_info).isoformat(),
)
LOG.info(
@@ -2257,6 +2283,7 @@ async def choose_auto_completion_dropdown(
await locator.click(timeout=settings.BROWSER_ACTION_TIMEOUT_MS)
clear_input = False
return result
except Exception as e:
LOG.info(
"Failed to choose the auto completion dropdown",
@@ -2287,6 +2314,7 @@ def remove_duplicated_HTML_element(elements: list[dict]) -> list[dict]:
async def input_or_auto_complete_input(
input_or_select_context: InputOrSelectContext,
scraped_page: ScrapedPage,
page: Page,
dom: DomUtil,
text: str,
@@ -2326,6 +2354,7 @@ async def input_or_auto_complete_input(
result = await choose_auto_completion_dropdown(
context=input_or_select_context,
page=page,
scraped_page=scraped_page,
dom=dom,
text=current_value,
preserved_elements=result.incremental_elements,
@@ -2395,6 +2424,7 @@ async def input_or_auto_complete_input(
result = await choose_auto_completion_dropdown(
context=input_or_select_context,
page=page,
scraped_page=scraped_page,
dom=dom,
text=value,
preserved_elements=result.incremental_elements,