diff --git a/skyvern/constants.py b/skyvern/constants.py index 9bc63156..f228a1fc 100644 --- a/skyvern/constants.py +++ b/skyvern/constants.py @@ -11,7 +11,6 @@ PAGE_CONTENT_TIMEOUT = 300 # 5 mins # reserved fields for navigation payload SPECIAL_FIELD_VERIFICATION_CODE = "verification_code" -VERIFICATION_CODE_PLACEHOLDER = "REAL_TOTP_CODE" VERIFICATION_CODE_POLLING_TIMEOUT_MINS = 10 diff --git a/skyvern/forge/agent.py b/skyvern/forge/agent.py index d720a01b..255d91a0 100644 --- a/skyvern/forge/agent.py +++ b/skyvern/forge/agent.py @@ -11,12 +11,7 @@ from playwright._impl._errors import TargetClosedError from playwright.async_api import Page from skyvern import analytics -from skyvern.constants import ( - SCRAPE_TYPE_ORDER, - SPECIAL_FIELD_VERIFICATION_CODE, - VERIFICATION_CODE_PLACEHOLDER, - ScrapeType, -) +from skyvern.constants import SCRAPE_TYPE_ORDER, SPECIAL_FIELD_VERIFICATION_CODE, ScrapeType from skyvern.exceptions import ( BrowserStateMissingPage, EmptyScrapePage, @@ -53,7 +48,7 @@ from skyvern.webeye.actions.actions import ( WebAction, parse_actions, ) -from skyvern.webeye.actions.handler import ActionHandler +from skyvern.webeye.actions.handler import ActionHandler, poll_verification_code from skyvern.webeye.actions.models import AgentStepOutput, DetailedAgentStepOutput from skyvern.webeye.actions.responses import ActionResult from skyvern.webeye.browser_factory import BrowserState @@ -548,6 +543,13 @@ class ForgeAgent: step=step, screenshots=scraped_page.screenshots, ) + json_response = await self.handle_potential_verification_code( + task, + step, + scraped_page, + browser_state, + json_response, + ) detailed_agent_step_output.llm_response = json_response actions = parse_actions(task, json_response["actions"]) @@ -951,16 +953,6 @@ class ForgeAgent: num_elements=len(scraped_page.elements), url=task.url, ) - - actions_and_results_str = await self._get_action_results(task) - - # Generate the extract action prompt - navigation_goal = task.navigation_goal - starting_url = task.url - current_url = ( - await browser_state.page.evaluate("() => document.location.href") if browser_state.page else starting_url - ) - # TODO: we only use HTML element for now, introduce a way to switch in the future element_tree_format = ElementTreeFormat.HTML LOG.info( @@ -971,18 +963,12 @@ class ForgeAgent: ) element_tree_in_prompt: str = scraped_page.build_element_tree(element_tree_format) - final_navigation_payload = self._build_navigation_payload(task) - extract_action_prompt = prompt_engine.load_prompt( - "extract-action", - navigation_goal=navigation_goal, - navigation_payload_str=json.dumps(final_navigation_payload), - starting_url=starting_url, - current_url=current_url, - elements=element_tree_in_prompt, - data_extraction_goal=task.data_extraction_goal, - action_history=actions_and_results_str, - error_code_mapping_str=(json.dumps(task.error_code_mapping) if task.error_code_mapping else None), - utc_datetime=datetime.utcnow().strftime("%Y-%m-%d %H:%M"), + extract_action_prompt = await self._build_extract_action_prompt( + task, + browser_state, + element_tree_in_prompt, + verification_code_check=bool(task.totp_verification_url), + expire_verification_code=True, ) await app.ARTIFACT_MANAGER.create_artifact( @@ -1013,26 +999,62 @@ class ForgeAgent: return scraped_page, extract_action_prompt + async def _build_extract_action_prompt( + self, + task: Task, + browser_state: BrowserState, + element_tree_in_prompt: str, + verification_code_check: bool = False, + expire_verification_code: bool = False, + ) -> str: + actions_and_results_str = await self._get_action_results(task) + + # Generate the extract action prompt + navigation_goal = task.navigation_goal + starting_url = task.url + current_url = ( + await browser_state.page.evaluate("() => document.location.href") if browser_state.page else starting_url + ) + final_navigation_payload = self._build_navigation_payload( + task, expire_verification_code=expire_verification_code + ) + return prompt_engine.load_prompt( + "extract-action", + navigation_goal=navigation_goal, + navigation_payload_str=json.dumps(final_navigation_payload), + starting_url=starting_url, + current_url=current_url, + elements=element_tree_in_prompt, + data_extraction_goal=task.data_extraction_goal, + action_history=actions_and_results_str, + error_code_mapping_str=(json.dumps(task.error_code_mapping) if task.error_code_mapping else None), + utc_datetime=datetime.utcnow().strftime("%Y-%m-%d %H:%M"), + verification_code_check=verification_code_check, + ) + def _build_navigation_payload( self, task: Task, + expire_verification_code: bool = False, ) -> dict[str, Any] | list | str | None: final_navigation_payload = task.navigation_payload - if task.totp_verification_url: + current_context = skyvern_context.ensure_context() + verification_code = current_context.totp_codes.get(task.task_id) + if task.totp_verification_url and verification_code: if ( isinstance(final_navigation_payload, dict) and SPECIAL_FIELD_VERIFICATION_CODE not in final_navigation_payload ): - final_navigation_payload[SPECIAL_FIELD_VERIFICATION_CODE] = VERIFICATION_CODE_PLACEHOLDER + final_navigation_payload[SPECIAL_FIELD_VERIFICATION_CODE] = verification_code elif ( isinstance(final_navigation_payload, str) and SPECIAL_FIELD_VERIFICATION_CODE not in final_navigation_payload ): final_navigation_payload = ( - final_navigation_payload - + "\n" - + str({SPECIAL_FIELD_VERIFICATION_CODE: VERIFICATION_CODE_PLACEHOLDER}) + final_navigation_payload + "\n" + str({SPECIAL_FIELD_VERIFICATION_CODE: verification_code}) ) + if expire_verification_code: + current_context.totp_codes.pop(task.task_id) return final_navigation_payload async def _get_action_results(self, task: Task) -> str: @@ -1552,6 +1574,40 @@ class ForgeAgent: ) return None, None, next_step + async def handle_potential_verification_code( + self, + task: Task, + step: Step, + scraped_page: ScrapedPage, + browser_state: BrowserState, + json_response: dict[str, Any], + ) -> dict[str, Any]: + # TODO: handle verifications and resend the request if needed + # parse the "need_verification_code" field from the response + need_verification_code = json_response.get("need_verification_code") + if need_verification_code and task.totp_verification_url and task.organization_id: + LOG.info("Need verification code", step_id=step.step_id) + verification_code = await poll_verification_code( + task.task_id, task.organization_id, url=task.totp_verification_url + ) + current_context = skyvern_context.ensure_context() + current_context.totp_codes[task.task_id] = verification_code + + element_tree_in_prompt: str = scraped_page.build_element_tree(ElementTreeFormat.HTML) + extract_action_prompt = await self._build_extract_action_prompt( + task, + browser_state, + element_tree_in_prompt, + verification_code_check=False, + expire_verification_code=False, + ) + return await app.LLM_API_HANDLER( + prompt=extract_action_prompt, + step=step, + screenshots=scraped_page.screenshots, + ) + return json_response + @staticmethod async def get_task_errors(task: Task) -> list[UserDefinedError]: steps = await app.DATABASE.get_task_steps(task_id=task.task_id, organization_id=task.organization_id) diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 9fe04fa1..818d05d8 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -9,6 +9,7 @@ If you see a popup in the page screenshot, prioritize actions on the popup. Reply in JSON format with the following keys: { +{% if verification_code_check %} "need_verification_code": bool, // Whether a verification code is needed to proceed.{% endif %} "user_goal_achieved": str, // A string that describes if user goal has been completed with reasoning. "action_plan": str, // A string that describes the plan of actions you're going to take. Be specific and to the point. Use this as a quick summary of the actions you're going to take, and what order you're going to take them in, and how that moves you towards your overall goal. Output "COMPLETE" action in the "actions" if user goal has been achieved. "actions": array // An array of actions. Here's the format of each action: @@ -36,10 +37,8 @@ Reply in JSON format with the following keys: }], } {% if action_history %} - Consider the action history from the last step and the screenshot together, if actions from the last step don't yield positive impact, try other actions or other action combinations. {% endif %} - Clickable elements from `{{ current_url }}`: ``` {{ elements }} @@ -52,12 +51,10 @@ User goal: {{ navigation_goal }} ``` {% if error_code_mapping_str %} - Use the error codes and their descriptions to surface user-defined errors. Do not return any error that's not defined by the user. User defined errors: {{ error_code_mapping_str }} {% endif %} {% if data_extraction_goal %} - User Data Extraction Goal: ``` {{ data_extraction_goal }} @@ -69,11 +66,9 @@ User details: {{ navigation_payload_str }} ``` {% if action_history %} - Action results from previous steps: (note: even if the action history suggests goal is achieved, check the screenshot and the DOM elements to make sure the goal is achieved) {{ action_history }} {% endif %} - Current datetime in UTC, YYYY-MM-DD HH:MM format: ``` {{ utc_datetime }} diff --git a/skyvern/forge/sdk/core/skyvern_context.py b/skyvern/forge/sdk/core/skyvern_context.py index f6375d27..11a615fc 100644 --- a/skyvern/forge/sdk/core/skyvern_context.py +++ b/skyvern/forge/sdk/core/skyvern_context.py @@ -1,5 +1,5 @@ from contextvars import ContextVar -from dataclasses import dataclass +from dataclasses import dataclass, field @dataclass @@ -10,6 +10,7 @@ class SkyvernContext: workflow_id: str | None = None workflow_run_id: str | None = None max_steps_override: int | None = None + totp_codes: dict[str, str | None] = field(default_factory=dict) def __repr__(self) -> str: return f"SkyvernContext(request_id={self.request_id}, organization_id={self.organization_id}, task_id={self.task_id}, workflow_id={self.workflow_id}, workflow_run_id={self.workflow_run_id}, max_steps_override={self.max_steps_override})" diff --git a/skyvern/webeye/actions/handler.py b/skyvern/webeye/actions/handler.py index 164fdc2d..6be5bc1f 100644 --- a/skyvern/webeye/actions/handler.py +++ b/skyvern/webeye/actions/handler.py @@ -10,7 +10,7 @@ import structlog from deprecation import deprecated from playwright.async_api import FileChooser, Locator, Page, TimeoutError -from skyvern.constants import REPO_ROOT_DIR, VERIFICATION_CODE_PLACEHOLDER, VERIFICATION_CODE_POLLING_TIMEOUT_MINS +from skyvern.constants import REPO_ROOT_DIR, VERIFICATION_CODE_POLLING_TIMEOUT_MINS from skyvern.exceptions import ( EmptySelect, ErrFoundSelectableElement, @@ -711,13 +711,6 @@ async def get_actual_value_of_parameter_if_secret(task: Task, parameter: str) -> This is only used for InputTextAction, UploadFileAction, and ClickAction (if it has a file_url). """ - if task.totp_verification_url and task.organization_id and VERIFICATION_CODE_PLACEHOLDER == parameter: - # if parameter is the secret code in the navigation playload, - # fetch the real verification from totp_verification_url - # do polling every 10 seconds to fetch the verification code - verification_code = await poll_verification_code(task.task_id, task.organization_id, task.totp_verification_url) - return verification_code - if task.workflow_run_id is None: return parameter