diff --git a/skyvern/forge/prompts/skyvern/extract-action-static.j2 b/skyvern/forge/prompts/skyvern/extract-action-static.j2 index a2936ae6..34ceb295 100644 --- a/skyvern/forge/prompts/skyvern/extract-action-static.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action-static.j2 @@ -20,6 +20,7 @@ Reply in JSON format with the following keys: "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. 
Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. The id has to be one from the elements list + "captcha_type": str, // The type of captcha for SOLVE_CAPTCHA action only. null if not SOLVE_CAPTCHA action. It's a string enum: "TEXT_CAPTCHA", "RECAPTCHA", "HCAPTCHA", "MTCAPTCHA", "FUNCAPTCHA", "CLOUDFLARE", "OTHER". "text": str, // Text for INPUT_TEXT action only "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. "download": bool, // Can only be true for CLICK or SELECT_OPTION actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download. @@ -40,6 +41,7 @@ Reply in JSON format with the following keys: "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. Output False if it only requires ZIP code or postal code. "is_date_related": bool, // True if the field is related to date input or select, otherwise false. "date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null. + "is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers. 
}{% endif %} }],{% if verification_code_check %} "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet. diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 6f32e1db..da1e0430 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -20,6 +20,7 @@ Reply in JSON format with the following keys: "confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence "action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. 
"COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}} "id": str, // The id of the element to take action on. The id has to be one from the elements list + "captcha_type": str, // The type of captcha for SOLVE_CAPTCHA action only. null if not SOLVE_CAPTCHA action. It's a string enum: "TEXT_CAPTCHA", "RECAPTCHA", "HCAPTCHA", "MTCAPTCHA", "FUNCAPTCHA", "CLOUDFLARE", "OTHER". "text": str, // Text for INPUT_TEXT action only "file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise. "download": bool, // Can only be true for CLICK or SELECT_OPTION actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download. @@ -40,6 +41,7 @@ Reply in JSON format with the following keys: "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. Output False if it only requires ZIP code or postal code. 
"is_date_related": bool, // True if the field is related to date input or select, otherwise false. "date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null. + "is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers. }{% endif %} }],{% if verification_code_check %} "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet. diff --git a/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 b/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 index 81e341b3..9c15d6f4 100644 --- a/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 +++ b/skyvern/forge/prompts/skyvern/parse-input-or-select-context.j2 @@ -11,6 +11,7 @@ Reply in the following JSON format: "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. Output False if it only requires ZIP code or postal code. "is_date_related": bool, // True if the field is related to date input or select, otherwise false. "date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null. 
+ "is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers. } Existing reasoning context: diff --git a/skyvern/forge/prompts/skyvern/single-input-action.j2 b/skyvern/forge/prompts/skyvern/single-input-action.j2 index c1305ae8..73785ae8 100644 --- a/skyvern/forge/prompts/skyvern/single-input-action.j2 +++ b/skyvern/forge/prompts/skyvern/single-input-action.j2 @@ -23,6 +23,7 @@ Reply in JSON format with the following keys: "is_location_input": bool, // True if the element is asking user to input where he lives, otherwise false. For example, it is asking for location, or address, or other similar information. Output False if it only requires ZIP code or postal code. "is_date_related": bool, // True if the field is related to date input or select, otherwise false. "date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null. + "is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers. }{% endif %} }]{% if verification_code_check %} "verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet. 
diff --git a/skyvern/webeye/actions/actions.py b/skyvern/webeye/actions/actions.py index 477c4d78..d7f62260 100644 --- a/skyvern/webeye/actions/actions.py +++ b/skyvern/webeye/actions/actions.py @@ -12,6 +12,16 @@ LOG = structlog.get_logger() T = TypeVar("T", bound="Action") +class CaptchaType(StrEnum): + TEXT_CAPTCHA = "text_captcha" + RECAPTCHA = "recaptcha" + HCAPTCHA = "hcaptcha" + MTCAPTCHA = "mtcaptcha" + FUNCAPTCHA = "funcaptcha" + CLOUDFLARE = "cloudflare" + OTHER = "other" + + class ActionStatus(StrEnum): pending = "pending" skipped = "skipped" @@ -82,9 +92,10 @@ class InputOrSelectContext(BaseModel): is_location_input: bool | None = None # address input usually requires auto completion is_date_related: bool | None = None # date picker mini agent requires some special logic date_format: str | None = None + is_text_captcha: bool | None = None def __repr__(self) -> str: - return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})" + return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, is_date_related={self.is_date_related}, date_format={self.date_format}, is_text_captcha={self.is_text_captcha}, intention={self.intention})" class ClickContext(BaseModel): @@ -266,6 +277,7 @@ class NullAction(Action): class SolveCaptchaAction(Action): action_type: ActionType = ActionType.SOLVE_CAPTCHA + captcha_type: CaptchaType | None = None class SelectOptionAction(WebAction): diff --git a/skyvern/webeye/actions/parse_actions.py b/skyvern/webeye/actions/parse_actions.py index 2c27ede1..b81e580c 100644 --- a/skyvern/webeye/actions/parse_actions.py +++ b/skyvern/webeye/actions/parse_actions.py @@ -19,6 +19,7 @@ from skyvern.utils.image_resizer import Resolution, scale_coordinates from skyvern.webeye.actions.action_types import ActionType from 
skyvern.webeye.actions.actions import ( Action, + CaptchaType, CheckboxAction, ClickAction, ClickContext, @@ -181,7 +182,14 @@ def parse_action( return NullAction(**base_action_dict) if action_type == ActionType.SOLVE_CAPTCHA: - return SolveCaptchaAction(**base_action_dict) + raw_captcha_type = action.get("captcha_type") + # The LLM may answer with a value outside the enum; fall back to OTHER instead of raising KeyError. + captcha_type = ( + CaptchaType.__members__.get(str(raw_captcha_type).strip().upper(), CaptchaType.OTHER) + if raw_captcha_type + else None + ) + return SolveCaptchaAction(**base_action_dict, captcha_type=captcha_type) if action_type == ActionType.CLOSE_PAGE: return ClosePageAction(**base_action_dict) diff --git a/skyvern/webeye/browser_state.py b/skyvern/webeye/browser_state.py index 68a02e7d..fd0c8f84 100644 --- a/skyvern/webeye/browser_state.py +++ b/skyvern/webeye/browser_state.py @@ -83,4 +83,5 @@ class BrowserState(Protocol): scroll: bool = True, support_empty_page: bool = False, wait_seconds: float = 0, + must_included_tags: list[str] | None = None, ) -> ScrapedPage: ... diff --git a/skyvern/webeye/real_browser_state.py b/skyvern/webeye/real_browser_state.py index be5a6e95..3ce0d8e2 100644 --- a/skyvern/webeye/real_browser_state.py +++ b/skyvern/webeye/real_browser_state.py @@ -394,6 +394,7 @@ class RealBrowserState(BrowserState): scroll: bool = True, support_empty_page: bool = False, wait_seconds: float = 0, + must_included_tags: list[str] | None = None, ) -> ScrapedPage: return await scraper.scrape_website( browser_state=self, @@ -408,6 +409,7 @@ class RealBrowserState(BrowserState): scroll=scroll, support_empty_page=support_empty_page, wait_seconds=wait_seconds, + must_included_tags=must_included_tags, ) async def close(self, close_browser_on_completion: bool = True) -> None: diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 65c55c2b..3c3993c7 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -1683,6 +1683,7 @@ async function buildElementObject( async function buildTreeFromBody( frame = "main.frame", frame_index = undefined, +
must_included_tags = [], ) { if ( window.GlobalSkyvernFrameIndex === undefined && @@ -1697,6 +1698,7 @@ async function buildTreeFromBody( false, undefined, maxElementNumber, + must_included_tags, ); DomUtils.elementListCache = elementsAndResultArray[0]; return elementsAndResultArray; @@ -1708,12 +1710,19 @@ async function buildElementTree( full_tree = false, hoverStylesMap = undefined, maxElementNumber = 0, + must_included_tags = [], ) { // Generate hover styles map at the start if (hoverStylesMap === undefined) { hoverStylesMap = await getHoverStylesMap(); } + if (must_included_tags.length > 0) { + _jsConsoleLog( + "full tree will be enabled as the must_included_tags is not empty", + ); + full_tree = true; + } if (window.GlobalEnableAllTextualElements === undefined) { window.GlobalEnableAllTextualElements = false; } @@ -1835,6 +1844,10 @@ async function buildElementTree( if (elementObj.text.length > 0) { elementObj.purgeable = false; } + if (must_included_tags.includes(tagName)) { + elementObj.purgeable = false; + elementObj.interactable = true; + } } if (elementObj) { diff --git a/skyvern/webeye/scraper/scraped_page.py b/skyvern/webeye/scraper/scraped_page.py index 0ab48feb..bc4fe530 100644 --- a/skyvern/webeye/scraper/scraped_page.py +++ b/skyvern/webeye/scraper/scraped_page.py @@ -306,6 +306,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder): scroll: bool = True, take_screenshots: bool = True, max_retries: int = 0, + must_included_tags: list[str] | None = None, ) -> Self: return await self._browser_state.scrape_website( url=self.url, @@ -315,6 +316,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder): take_screenshots=take_screenshots, draw_boxes=draw_boxes, scroll=scroll, + must_included_tags=must_included_tags, ) async def generate_scraped_page_without_screenshots(self, max_retries: int = 0) -> Self: diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index 0caed0c3..0f8ab054 100644 --- a/skyvern/webeye/scraper/scraper.py 
+++ b/skyvern/webeye/scraper/scraper.py @@ -150,6 +150,7 @@ async def scrape_website( scroll: bool = True, support_empty_page: bool = False, wait_seconds: float = 0, + must_included_tags: list[str] | None = None, ) -> ScrapedPage: """ ************************************************************************************************ @@ -186,6 +187,7 @@ async def scrape_website( scroll=scroll, support_empty_page=support_empty_page, wait_seconds=wait_seconds, + must_included_tags=must_included_tags, ) except ScrapingFailedBlankPage: raise @@ -216,6 +218,7 @@ async def scrape_website( draw_boxes=draw_boxes, max_screenshot_number=max_screenshot_number, scroll=scroll, + must_included_tags=must_included_tags, ) @@ -269,6 +272,7 @@ async def scrape_web_unsafe( scroll: bool = True, support_empty_page: bool = False, wait_seconds: float = 0, + must_included_tags: list[str] | None = None, ) -> ScrapedPage: """ Asynchronous function that performs web scraping without any built-in error handling. This function is intended @@ -301,11 +305,11 @@ async def scrape_web_unsafe( LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds) await asyncio.sleep(wait_seconds) - elements, element_tree = await get_interactable_element_tree(page, scrape_exclude) + elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags) if not elements and not support_empty_page: LOG.warning("No elements found on the page, wait and retry") await empty_page_retry_wait() - elements, element_tree = await get_interactable_element_tree(page, scrape_exclude) + elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags) element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree)) element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree)) @@ -415,6 +419,7 @@ async def add_frame_interactable_elements( frame_index: int, elements: list[dict], element_tree: 
list[dict], + must_included_tags: list[str] | None = None, ) -> tuple[list[dict], list[dict]]: """ Add the interactable element of the frame to the elements and element_tree. @@ -444,7 +449,7 @@ async def add_frame_interactable_elements( await skyvern_frame.safe_wait_for_animation_end() frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body( - frame_name=skyvern_id, frame_index=frame_index + frame_name=skyvern_id, frame_index=frame_index, must_included_tags=must_included_tags ) for element in elements: @@ -460,6 +465,7 @@ async def add_frame_interactable_elements( async def get_interactable_element_tree( page: Page, scrape_exclude: ScrapeExcludeFunc | None = None, + must_included_tags: list[str] | None = None, ) -> tuple[list[dict], list[dict]]: """ Get the element tree of the page, including all the elements that are interactable. @@ -468,7 +474,9 @@ async def get_interactable_element_tree( """ # main page index is 0 skyvern_page = await SkyvernFrame.create_instance(page) - elements, element_tree = await skyvern_page.build_tree_from_body(frame_name="main.frame", frame_index=0) + elements, element_tree = await skyvern_page.build_tree_from_body( + frame_name="main.frame", frame_index=0, must_included_tags=must_included_tags + ) context = skyvern_context.ensure_context() frames = await get_all_children_frames(page) @@ -487,6 +495,7 @@ async def get_interactable_element_tree( frame_index, elements, element_tree, + must_included_tags, ) return elements, element_tree diff --git a/skyvern/webeye/utils/page.py b/skyvern/webeye/utils/page.py index 97dea9c9..34b07f6c 100644 --- a/skyvern/webeye/utils/page.py +++ b/skyvern/webeye/utils/page.py @@ -489,11 +489,17 @@ class SkyvernFrame: self, frame_name: str | None, frame_index: int, timeout_ms: float = SettingsManager.get_settings().BROWSER_SCRAPING_BUILDING_ELEMENT_TREE_TIMEOUT_MS, + # Declared after timeout_ms so an existing positional timeout_ms argument keeps binding to timeout_ms. + must_included_tags: list[str] | None = None, ) -> tuple[list[dict], list[dict]]: - js_script = "async ([frame_name,
frame_index]) => await buildTreeFromBody(frame_name, frame_index)" + must_included_tags = must_included_tags or [] + js_script = "async ([frame_name, frame_index, must_included_tags]) => await buildTreeFromBody(frame_name, frame_index, must_included_tags)" return await self.evaluate( - frame=self.frame, expression=js_script, timeout_ms=timeout_ms, arg=[frame_name, frame_index] + frame=self.frame, + expression=js_script, + timeout_ms=timeout_ms, + arg=[frame_name, frame_index, must_included_tags], ) @TraceManager.traced_async()