general text captcha solution (#4517)

2026-01-23 00:23:57 +08:00
parent b5ff547a3a
commit 2c60d8e180
12 changed files with 62 additions and 8 deletions
--- a/skyvern/webeye/actions/actions.py
+++ b/skyvern/webeye/actions/actions.py
@@ -12,6 +12,16 @@ LOG = structlog.get_logger()
 T = TypeVar("T", bound="Action")


+class CaptchaType(StrEnum):  # captcha categories attached to SolveCaptchaAction.captcha_type
+    TEXT_CAPTCHA = "text_captcha"  # parse_action looks members up by upper-cased NAME, e.g. "text_captcha" -> TEXT_CAPTCHA
+    RECAPTCHA = "recaptcha"
+    HCAPTCHA = "hcaptcha"
+    MTCAPTCHA = "mtcaptcha"
+    FUNCAPTCHA = "funcaptcha"
+    CLOUDFLARE = "cloudflare"
+    OTHER = "other"
+
+
 class ActionStatus(StrEnum):
    pending = "pending"
    skipped = "skipped"
@@ -82,9 +92,10 @@ class InputOrSelectContext(BaseModel):
    is_location_input: bool | None = None  # address input usually requires auto completion
    is_date_related: bool | None = None  # date picker mini agent requires some special logic
    date_format: str | None = None
+    is_text_captcha: bool | None = None

    def __repr__(self) -> str:
-        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"
+        return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, is_date_related={self.is_date_related}, date_format={self.date_format}, is_text_captcha={self.is_text_captcha}, intention={self.intention})"


 class ClickContext(BaseModel):
@@ -266,6 +277,7 @@ class NullAction(Action):

 class SolveCaptchaAction(Action):
    action_type: ActionType = ActionType.SOLVE_CAPTCHA
+    captcha_type: CaptchaType | None = None  # stays None when the parsed action carries no captcha_type


 class SelectOptionAction(WebAction):
--- a/skyvern/webeye/actions/parse_actions.py
+++ b/skyvern/webeye/actions/parse_actions.py
@@ -19,6 +19,7 @@ from skyvern.utils.image_resizer import Resolution, scale_coordinates
 from skyvern.webeye.actions.action_types import ActionType
 from skyvern.webeye.actions.actions import (
    Action,
+    CaptchaType,
    CheckboxAction,
    ClickAction,
    ClickContext,
@@ -181,7 +182,10 @@ def parse_action(
        return NullAction(**base_action_dict)

    if action_type == ActionType.SOLVE_CAPTCHA:
-        return SolveCaptchaAction(**base_action_dict)
+        captcha_type: str | None = action.get("captcha_type")  # optional hint; an unrecognized value must not abort parsing
+        return SolveCaptchaAction(  # name lookup via __members__.get: unknown or non-string values resolve to None, no KeyError
+            **base_action_dict, captcha_type=CaptchaType.__members__.get(str(captcha_type).upper()) if captcha_type else None
+        )

    if action_type == ActionType.CLOSE_PAGE:
        return ClosePageAction(**base_action_dict)
--- a/skyvern/webeye/browser_state.py
+++ b/skyvern/webeye/browser_state.py
@@ -83,4 +83,5 @@ class BrowserState(Protocol):
        scroll: bool = True,
        support_empty_page: bool = False,
        wait_seconds: float = 0,
+        must_included_tags: list[str] | None = None,
    ) -> ScrapedPage: ...
--- a/skyvern/webeye/real_browser_state.py
+++ b/skyvern/webeye/real_browser_state.py
@@ -394,6 +394,7 @@ class RealBrowserState(BrowserState):
        scroll: bool = True,
        support_empty_page: bool = False,
        wait_seconds: float = 0,
+        must_included_tags: list[str] | None = None,
    ) -> ScrapedPage:
        return await scraper.scrape_website(
            browser_state=self,
@@ -408,6 +409,7 @@ class RealBrowserState(BrowserState):
            scroll=scroll,
            support_empty_page=support_empty_page,
            wait_seconds=wait_seconds,
+            must_included_tags=must_included_tags,
        )

    async def close(self, close_browser_on_completion: bool = True) -> None:
--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -1683,6 +1683,7 @@ async function buildElementObject(
 async function buildTreeFromBody(
  frame = "main.frame",
  frame_index = undefined,
+  must_included_tags = [],
 ) {
  if (
    window.GlobalSkyvernFrameIndex === undefined &&
@@ -1697,6 +1698,7 @@ async function buildTreeFromBody(
    false,
    undefined,
    maxElementNumber,
+    must_included_tags,
  );
  DomUtils.elementListCache = elementsAndResultArray[0];
  return elementsAndResultArray;
@@ -1708,12 +1710,19 @@ async function buildElementTree(
  full_tree = false,
  hoverStylesMap = undefined,
  maxElementNumber = 0,
+  must_included_tags = [],
 ) {
  // Generate hover styles map at the start
  if (hoverStylesMap === undefined) {
    hoverStylesMap = await getHoverStylesMap();
  }

+  if (must_included_tags.length > 0) {
+    _jsConsoleLog(
+      "full tree will be enabled as the must_included_tags is not empty",
+    );
+    full_tree = true;
+  }
  if (window.GlobalEnableAllTextualElements === undefined) {
    window.GlobalEnableAllTextualElements = false;
  }
@@ -1835,6 +1844,10 @@ async function buildElementTree(
        if (elementObj.text.length > 0) {
          elementObj.purgeable = false;
        }
+        if (must_included_tags.includes(tagName)) {
+          elementObj.purgeable = false;
+          elementObj.interactable = true;
+        }
      }

      if (elementObj) {
--- a/skyvern/webeye/scraper/scraped_page.py
+++ b/skyvern/webeye/scraper/scraped_page.py
@@ -306,6 +306,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
        scroll: bool = True,
        take_screenshots: bool = True,
        max_retries: int = 0,
+        must_included_tags: list[str] | None = None,
    ) -> Self:
        return await self._browser_state.scrape_website(
            url=self.url,
@@ -315,6 +316,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
            take_screenshots=take_screenshots,
            draw_boxes=draw_boxes,
            scroll=scroll,
+            must_included_tags=must_included_tags,
        )

    async def generate_scraped_page_without_screenshots(self, max_retries: int = 0) -> Self:
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -150,6 +150,7 @@ async def scrape_website(
    scroll: bool = True,
    support_empty_page: bool = False,
    wait_seconds: float = 0,
+    must_included_tags: list[str] | None = None,
 ) -> ScrapedPage:
    """
    ************************************************************************************************
@@ -186,6 +187,7 @@ async def scrape_website(
            scroll=scroll,
            support_empty_page=support_empty_page,
            wait_seconds=wait_seconds,
+            must_included_tags=must_included_tags,
        )
    except ScrapingFailedBlankPage:
        raise
@@ -216,6 +218,7 @@ async def scrape_website(
            draw_boxes=draw_boxes,
            max_screenshot_number=max_screenshot_number,
            scroll=scroll,
+            must_included_tags=must_included_tags,
        )


@@ -269,6 +272,7 @@ async def scrape_web_unsafe(
    scroll: bool = True,
    support_empty_page: bool = False,
    wait_seconds: float = 0,
+    must_included_tags: list[str] | None = None,
 ) -> ScrapedPage:
    """
    Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -301,11 +305,11 @@ async def scrape_web_unsafe(
        LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
        await asyncio.sleep(wait_seconds)

-    elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
+    elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags)
    if not elements and not support_empty_page:
        LOG.warning("No elements found on the page, wait and retry")
        await empty_page_retry_wait()
-        elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
+        elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags)

    element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
    element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
@@ -415,6 +419,7 @@ async def add_frame_interactable_elements(
    frame_index: int,
    elements: list[dict],
    element_tree: list[dict],
+    must_included_tags: list[str] | None = None,
 ) -> tuple[list[dict], list[dict]]:
    """
    Add the interactable element of the frame to the elements and element_tree.
@@ -444,7 +449,7 @@ async def add_frame_interactable_elements(
    await skyvern_frame.safe_wait_for_animation_end()

    frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body(
-        frame_name=skyvern_id, frame_index=frame_index
+        frame_name=skyvern_id, frame_index=frame_index, must_included_tags=must_included_tags
    )

    for element in elements:
@@ -460,6 +465,7 @@ async def add_frame_interactable_elements(
 async def get_interactable_element_tree(
    page: Page,
    scrape_exclude: ScrapeExcludeFunc | None = None,
+    must_included_tags: list[str] | None = None,
 ) -> tuple[list[dict], list[dict]]:
    """
    Get the element tree of the page, including all the elements that are interactable.
@@ -468,7 +474,9 @@ async def get_interactable_element_tree(
    """
    # main page index is 0
    skyvern_page = await SkyvernFrame.create_instance(page)
-    elements, element_tree = await skyvern_page.build_tree_from_body(frame_name="main.frame", frame_index=0)
+    elements, element_tree = await skyvern_page.build_tree_from_body(
+        frame_name="main.frame", frame_index=0, must_included_tags=must_included_tags
+    )

    context = skyvern_context.ensure_context()
    frames = await get_all_children_frames(page)
@@ -487,6 +495,7 @@ async def get_interactable_element_tree(
            frame_index,
            elements,
            element_tree,
+            must_included_tags,
        )

    return elements, element_tree
--- a/skyvern/webeye/utils/page.py
+++ b/skyvern/webeye/utils/page.py
@@ -489,11 +489,16 @@ class SkyvernFrame:
        self,
        frame_name: str | None,
        frame_index: int,
+        must_included_tags: list[str] | None = None,
        timeout_ms: float = SettingsManager.get_settings().BROWSER_SCRAPING_BUILDING_ELEMENT_TREE_TIMEOUT_MS,
    ) -> tuple[list[dict], list[dict]]:
-        js_script = "async ([frame_name, frame_index]) => await buildTreeFromBody(frame_name, frame_index)"
+        must_included_tags = must_included_tags or []
+        js_script = "async ([frame_name, frame_index, must_included_tags]) => await buildTreeFromBody(frame_name, frame_index, must_included_tags)"
        return await self.evaluate(
-            frame=self.frame, expression=js_script, timeout_ms=timeout_ms, arg=[frame_name, frame_index]
+            frame=self.frame,
+            expression=js_script,
+            timeout_ms=timeout_ms,
+            arg=[frame_name, frame_index, must_included_tags],
        )

    @TraceManager.traced_async()