From b7515919d7b601b4a2a667582b848d5b35985821 Mon Sep 17 00:00:00 2001 From: Shuchang Zheng Date: Thu, 11 Sep 2025 13:49:05 -0700 Subject: [PATCH] Pedro/migrate context call to main prompt (#3400) --- ..._add_input_or_select_context_column_to_.py | 31 +++++++++++++++++++ skyvern/forge/agent.py | 2 ++ .../forge/prompts/skyvern/extract-action.j2 | 10 +++++- .../prompts/skyvern/single-input-action.j2 | 10 +++++- .../prompts/skyvern/single-select-action.j2 | 10 +++++- skyvern/forge/sdk/core/skyvern_context.py | 1 + skyvern/forge/sdk/db/models.py | 1 + skyvern/services/task_v2_service.py | 6 ++++ skyvern/webeye/actions/actions.py | 5 +-- skyvern/webeye/actions/handler.py | 22 +++++++++++-- skyvern/webeye/actions/parse_actions.py | 15 ++++++++- 11 files changed, 104 insertions(+), 9 deletions(-) create mode 100644 alembic/versions/2025_09_11_2038-bbcc08ba09f5_add_input_or_select_context_column_to_.py diff --git a/alembic/versions/2025_09_11_2038-bbcc08ba09f5_add_input_or_select_context_column_to_.py b/alembic/versions/2025_09_11_2038-bbcc08ba09f5_add_input_or_select_context_column_to_.py new file mode 100644 index 00000000..0e6b7d95 --- /dev/null +++ b/alembic/versions/2025_09_11_2038-bbcc08ba09f5_add_input_or_select_context_column_to_.py @@ -0,0 +1,31 @@ +"""add input_or_select_context column to the actions table + +Revision ID: bbcc08ba09f5 +Revises: 8de03b8cb83a +Create Date: 2025-09-11 20:38:47.648188+00:00 + +""" + +from typing import Sequence, Union + +import sqlalchemy as sa + +from alembic import op + +# revision identifiers, used by Alembic. +revision: str = "bbcc08ba09f5" +down_revision: Union[str, None] = "8de03b8cb83a" +branch_labels: Union[str, Sequence[str], None] = None +depends_on: Union[str, Sequence[str], None] = None + + +def upgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.add_column("actions", sa.Column("input_or_select_context", sa.JSON(), nullable=True)) + # ### end Alembic commands ### + + +def downgrade() -> None: + # ### commands auto generated by Alembic - please adjust! ### + op.drop_column("actions", "input_or_select_context") + # ### end Alembic commands ### diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index 9e7732c8..14251213 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -2052,6 +2052,7 @@ class ForgeAgent: raise UnsupportedTaskType(task_type=task_type) context = skyvern_context.ensure_context() + return load_prompt_with_elements( element_tree_builder=scraped_page, prompt_engine=prompt_engine, @@ -2067,6 +2068,7 @@ class ForgeAgent: verification_code_check=verification_code_check, complete_criterion=task.complete_criterion.strip() if task.complete_criterion else None, terminate_criterion=task.terminate_criterion.strip() if task.terminate_criterion else None, + parse_select_feature_enabled=context.enable_parse_select_in_extract, ) def _build_navigation_payload( diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 5cd4066a..0f859852 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -27,7 +27,15 @@ Reply in JSON format with the following keys: "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE "index": int, // the index corresponding to the option index under the select element. "value": str // the value of the option. MAKE SURE YOU USE THIS VALUE TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION VALUE HERE - } + }{% if parse_select_feature_enabled %}, + "context": { // The context for INPUT_TEXT or SELECT_OPTION action only. null if not INPUT_TEXT or SELECT_OPTION action. Extract the following detailed information from the "reasoning", and double-check the information by analysing the HTML elements. + "thought": str, // A string to describe how you double-check the context information to ensure the accuracy. + "field": str, // Which field is this action intended to fill out? + "is_required": bool, // True if this is a required field, otherwise false. + "is_search_bar": bool, // True if the element to take the action is a search bar, otherwise false. + "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. Output False if it only requires ZIP code or postal code. + "is_date_related": bool, // True if the field is related to date input or select, otherwise false. + }{% endif %} }],{% if verification_code_check %} "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet. "place_to_enter_verification_code": bool, // Whether there is a place on the current page to enter the verification code now. diff --git a/skyvern/forge/prompts/skyvern/single-input-action.j2 b/skyvern/forge/prompts/skyvern/single-input-action.j2 index e7ff09b7..18f58a8e 100644 --- a/skyvern/forge/prompts/skyvern/single-input-action.j2 +++ b/skyvern/forge/prompts/skyvern/single-input-action.j2 @@ -14,7 +14,15 @@ Reply in JSON format with the following keys: "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence "action_type": str, // It's a string enum: "INPUT_TEXT". "INPUT_TEXT" is an element you'd like to input text into. "id": str, // The id of the element to take action on. The id has to be one from the elements list. - "text": str, // The text to input. + "text": str, // The text to input.{% if parse_select_feature_enabled %} + "context": { // Extract the following detailed information from the "reasoning", and double-check the information by analysing the HTML elements. + "thought": str, // A string to describe how you double-check the context information to ensure the accuracy. + "field": str, // Which field is this action intended to fill out? + "is_required": bool, // True if this is a required field, otherwise false. + "is_search_bar": bool, // True if the element to take the action is a search bar, otherwise false. + "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. Output False if it only requires ZIP code or postal code. + "is_date_related": bool, // True if the field is related to date input or select, otherwise false. + }{% endif %} }]{% if verification_code_check %} "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet. "place_to_enter_verification_code": bool, // Whether there is a place on the current page to enter the verification code now. diff --git a/skyvern/forge/prompts/skyvern/single-select-action.j2 b/skyvern/forge/prompts/skyvern/single-select-action.j2 index eea267ad..6388c45c 100644 --- a/skyvern/forge/prompts/skyvern/single-select-action.j2 +++ b/skyvern/forge/prompts/skyvern/single-select-action.j2 @@ -18,7 +18,15 @@ Reply in JSON format with the following keys: "label": str, // the label of the option if any. MAKE SURE YOU USE THIS LABEL TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION LABEL HERE "index": int, // the index corresponding to the option index under the select element. "value": str // the value of the option. MAKE SURE YOU USE THIS VALUE TO SELECT THE OPTION. DO NOT PUT ANYTHING OTHER THAN A VALID OPTION VALUE HERE - }, + }{% if parse_select_feature_enabled %}, + "context": { // Extract the following detailed information from the "reasoning", and double-check the information by analysing the HTML elements. + "thought": str, // A string to describe how you double-check the context information to ensure the accuracy. + "field": str, // Which field is this action intended to fill out? + "is_required": bool, // True if this is a required field, otherwise false. + "is_search_bar": bool, // True if the element to take the action is a search bar, otherwise false. + "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. Output False if it only requires ZIP code or postal code. + "is_date_related": bool, // True if the field is related to date input or select, otherwise false. + }{% endif %} }] } diff --git a/skyvern/forge/sdk/core/skyvern_context.py b/skyvern/forge/sdk/core/skyvern_context.py index 298f600b..cc09240f 100644 --- a/skyvern/forge/sdk/core/skyvern_context.py +++ b/skyvern/forge/sdk/core/skyvern_context.py @@ -31,6 +31,7 @@ class SkyvernContext: script_revision_id: str | None = None action_order: int = 0 prompt: str | None = None + enable_parse_select_in_extract: bool = False def __repr__(self) -> str: return f"SkyvernContext(request_id={self.request_id}, organization_id={self.organization_id}, task_id={self.task_id}, step_id={self.step_id}, workflow_id={self.workflow_id}, workflow_run_id={self.workflow_run_id}, task_v2_id={self.task_v2_id}, max_steps_override={self.max_steps_override}, run_id={self.run_id})" diff --git a/skyvern/forge/sdk/db/models.py b/skyvern/forge/sdk/db/models.py index 04485df1..e5e5191d 100644 --- a/skyvern/forge/sdk/db/models.py +++ b/skyvern/forge/sdk/db/models.py @@ -580,6 +580,7 @@ class ActionModel(Base): skyvern_element_hash = Column(String, nullable=True) skyvern_element_data = Column(JSON, nullable=True) action_json = Column(JSON, nullable=True) + input_or_select_context = Column(JSON, nullable=True) confidence_float = Column(Numeric, nullable=True) created_at = Column(DateTime, default=datetime.datetime.utcnow, nullable=False) diff --git a/skyvern/services/task_v2_service.py b/skyvern/services/task_v2_service.py index 007c38a2..f588745a 100644 --- a/skyvern/services/task_v2_service.py +++ b/skyvern/services/task_v2_service.py @@ -466,6 +466,11 @@ async def run_task_v2_helper( context: skyvern_context.SkyvernContext | None = skyvern_context.current() current_run_id = context.run_id if context and context.run_id else task_v2_id + enable_parse_select_in_extract = app.EXPERIMENTATION_PROVIDER.is_feature_enabled_cached( + "ENABLE_PARSE_SELECT_IN_EXTRACT", + organization_id, + properties={"organization_id": organization_id, "task_url": task_v2.url}, + ) skyvern_context.set( SkyvernContext( organization_id=organization_id, @@ -476,6 +481,7 @@ async def run_task_v2_helper( run_id=current_run_id, browser_session_id=browser_session_id, max_screenshot_scrolls=task_v2.max_screenshot_scrolls, + enable_parse_select_in_extract=bool(enable_parse_select_in_extract), ) ) diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index d51a2b1b..b07aa748 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -83,6 +83,7 @@ class Action(BaseModel): download: bool | None = None is_upload_file_tag: bool | None = None text: str | None = None + input_or_select_context: InputOrSelectContext | None = None option: SelectOption | None = None is_checked: bool | None = None verified: bool = False @@ -165,7 +166,7 @@ class InputTextAction(WebAction): text: str def __repr__(self) -> str: - return f"InputTextAction(element_id={self.element_id}, text={self.text}, tool_call_id={self.tool_call_id})" + return f"InputTextAction(element_id={self.element_id}, text={self.text}, context={self.input_or_select_context}, tool_call_id={self.tool_call_id})" class UploadFileAction(WebAction): @@ -199,7 +200,7 @@ class SelectOptionAction(WebAction): option: SelectOption def __repr__(self) -> str: - return f"SelectOptionAction(element_id={self.element_id}, option={self.option})" + return f"SelectOptionAction(element_id={self.element_id}, option={self.option}, context={self.input_or_select_context})" ### diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 9676c726..194c4857 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -784,9 +784,9 @@ async def handle_sequential_click_for_dropdown( action=AbstractActionForContextParse( reasoning=action.reasoning, intention=action.intention, element_id=action.element_id ), - step=step, - element_tree_builder=scraped_page, skyvern_element=anchor_element, + element_tree_builder=scraped_page, + step=step, ) if dropdown_select_context.is_date_related: @@ -3425,9 +3425,20 @@ async def normal_select( action_result: List[ActionResult] = [] is_success = False locator = skyvern_element.get_locator() + input_or_select_context = await _get_input_or_select_context( - action=action, element_tree_builder=builder, step=step, skyvern_element=skyvern_element + action=action, + element_tree_builder=builder, + step=step, + skyvern_element=skyvern_element, ) + LOG.info( + "Parsed input/select context", + context=input_or_select_context, + task_id=task.task_id, + step_id=step.step_id, + ) + await skyvern_element.refresh_select_options() options_html = skyvern_element.build_HTML() field_information = ( @@ -3766,6 +3777,11 @@ async def _get_input_or_select_context( step: Step, ancestor_depth: int = 5, ) -> InputOrSelectContext: + # Early return optimization: if action already has input_or_select_context, use it + if not isinstance(action, AbstractActionForContextParse) and action.input_or_select_context is not None: + return action.input_or_select_context + + # Ancestor depth optimization: use ancestor element for deep DOM structures skyvern_frame = await SkyvernFrame.create_instance(skyvern_element.get_frame()) try: depth = await skyvern_frame.get_element_dom_depth(await skyvern_element.get_element_handler()) diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index af98c71f..b6c53764 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -21,6 +21,7 @@ from skyvern.webeye.actions.actions import ( CompleteAction, DownloadFileAction, DragAction, + InputOrSelectContext, InputTextAction, KeypressAction, LeftMouseAction, @@ -68,6 +69,7 @@ def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extract "intention": intention, "response": response, } + input_or_select_context: InputOrSelectContext | None = None if "action_type" not in action or action["action_type"] is None: return NullAction(**base_action_dict) @@ -89,7 +91,11 @@ def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extract return ClickAction(**base_action_dict, file_url=file_url, download=action.get("download", False)) if action_type == ActionType.INPUT_TEXT: - return InputTextAction(**base_action_dict, text=action["text"]) + context_dict = action.get("context", {}) + if context_dict and len(context_dict) > 0: + context_dict["intention"] = intention + input_or_select_context = InputOrSelectContext.model_validate(context_dict) + return InputTextAction(**base_action_dict, text=action["text"], input_or_select_context=input_or_select_context) if action_type == ActionType.UPLOAD_FILE: # TODO: see if the element is a file input element. if it's not, convert this action into a click action @@ -106,6 +112,12 @@ def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extract option = action["option"] if option is None: raise ValueError("SelectOptionAction requires an 'option' field") + + context_dict = action.get("context", {}) + if context_dict and len(context_dict) > 0: + context_dict["intention"] = intention + input_or_select_context = InputOrSelectContext.model_validate(context_dict) + label = option.get("label") value = option.get("value") index = option.get("index") @@ -118,6 +130,7 @@ def parse_action(action: Dict[str, Any], scraped_page: ScrapedPage, data_extract value=value, index=index, ), + input_or_select_context=input_or_select_context, ) if action_type == ActionType.CHECKBOX: