general text captcha solution (#4517)

This commit is contained in:
LawyZheng
2026-01-23 00:23:57 +08:00
committed by GitHub
parent b5ff547a3a
commit 2c60d8e180
12 changed files with 62 additions and 8 deletions

View File

@@ -20,6 +20,7 @@ Reply in JSON format with the following keys:
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}}
"id": str, // The id of the element to take action on. The id has to be one from the elements list
"captcha_type": str, // The type of captcha for SOLVE_CAPTCHA action only. null if not SOLVE_CAPTCHA action. It's a string enum: "TEXT_CAPTCHA", "RECAPTCHA", "HCAPTCHA", "MTCAPTCHA", "FUNCAPTCHA", "CLOUDFLARE", "OTHER".
"text": str, // Text for INPUT_TEXT action only
"file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
"download": bool, // Can only be true for CLICK or SELECT_OPTION actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download.
@@ -40,6 +41,7 @@ Reply in JSON format with the following keys:
"is_location_input": bool, // True if the element is asking the user to input where they live, otherwise false. For example, it is asking for a location, an address, or other similar information. Output false if it only requires a ZIP code or postal code.
"is_date_related": bool, // True if the field is related to date input or select, otherwise false.
"date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null.
"is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers.
}{% endif %}
}],{% if verification_code_check %}
"verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet.

View File

@@ -20,6 +20,7 @@ Reply in JSON format with the following keys:
"confidence_float": float, // The confidence of the action. Pick a number between 0.0 and 1.0. 0.0 means no confidence, 1.0 means full confidence
"action_type": str, // It's a string enum: "CLICK", "HOVER", "INPUT_TEXT", "UPLOAD_FILE", "SELECT_OPTION", "WAIT", "SOLVE_CAPTCHA", "COMPLETE", "TERMINATE"{{', "CLOSE_PAGE"' if has_magic_link_page else ""}}. "CLICK" is an element you'd like to click. "HOVER" is used to move the mouse over an element without clicking, particularly when revealing hover-only menus or buttons before clicking, or when the UI hints that a control (like a CTA button) only appears after hovering a card, tile, or model name. "INPUT_TEXT" is an element you'd like to input text into. "UPLOAD_FILE" is an element you'd like to upload a file into. "SELECT_OPTION" is an element you'd like to select an option from. "WAIT" action should be used if there are no actions to take and there is some indication on screen that waiting could yield more actions. "WAIT" should not be used if there are actions to take. "SOLVE_CAPTCHA" should be used if there's a captcha to solve on the screen. "COMPLETE" is used when the {{"complete criterion has been met" if complete_criterion else "user goal has been achieved"}} AND if there's any data extraction goal, you should be able to get data from the page. Never return a COMPLETE action unless the {{ "complete criterion is met" if complete_criterion else "user goal is achieved" }}. "TERMINATE" is used to terminate the whole task with a failure when it doesn't seem like the user goal can be achieved. Do not use "TERMINATE" if waiting could lead the user towards the goal. Only return "TERMINATE" if you are on a page where the user goal cannot be achieved. All other actions are ignored when "TERMINATE" is returned.{{' "CLOSE_PAGE" is used to close the current page when it is impossible to achieve the user goal on the current page.' if has_magic_link_page else ''}}
"id": str, // The id of the element to take action on. The id has to be one from the elements list
"captcha_type": str, // The type of captcha for SOLVE_CAPTCHA action only. null if not SOLVE_CAPTCHA action. It's a string enum: "TEXT_CAPTCHA", "RECAPTCHA", "HCAPTCHA", "MTCAPTCHA", "FUNCAPTCHA", "CLOUDFLARE", "OTHER".
"text": str, // Text for INPUT_TEXT action only
"file_url": str, // The url of the file to upload if applicable. This field must be present for UPLOAD_FILE but can also be present for CLICK only if the click is to upload the file. It should be null otherwise.
"download": bool, // Can only be true for CLICK or SELECT_OPTION actions. If true, the browser will trigger a download by clicking the element. If false, the browser will click the element without triggering a download.
@@ -40,6 +41,7 @@ Reply in JSON format with the following keys:
"is_location_input": bool, // True if the element is asking the user to input where they live, otherwise false. For example, it is asking for a location, an address, or other similar information. Output false if it only requires a ZIP code or postal code.
"is_date_related": bool, // True if the field is related to date input or select, otherwise false.
"date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null.
"is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers.
}{% endif %}
}],{% if verification_code_check %}
"verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet.

View File

@@ -11,6 +11,7 @@ Reply in the following JSON format:
"is_location_input": bool, // True if the element is asking the user to input where they live, otherwise false. For example, it is asking for a location, an address, or other similar information. Output false if it only requires a ZIP code or postal code.
"is_date_related": bool, // True if the field is related to date input or select, otherwise false.
"date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null.
"is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers.
}
Existing reasoning context:

View File

@@ -23,6 +23,7 @@ Reply in JSON format with the following keys:
"is_location_input": bool, // True if the element is asking the user to input where they live, otherwise false. For example, it is asking for a location, an address, or other similar information. Output false if it only requires a ZIP code or postal code.
"is_date_related": bool, // True if the field is related to date input or select, otherwise false.
"date_format": str, // The format of the date or datetime to be input. For example YYYY-MM-DD, YYYY-MM-DD HH:MM:SS, DD.MM.YYYY, MM/DD/YYYY, etc. If the field is not related to date input or select, this should be null.
"is_text_captcha": bool, // True if the field is asking for a text captcha, otherwise false. Do not confuse it with the verification code. Text CAPTCHAs are typically displayed alongside an image of distorted letters or numbers.
}{% endif %}
}]{% if verification_code_check %},
"verification_code_reasoning": str, // Let's think step by step. Describe what you see and think if there is somewhere on the current page where you must enter the verification code now for login or any verification step. Explain why you believe a verification code needs to be entered somewhere or not. Do not imagine any place to enter the code if the code has not been sent yet.

View File

@@ -12,6 +12,16 @@ LOG = structlog.get_logger()
T = TypeVar("T", bound="Action")
# String enum of the captcha kinds that can be attached to a SOLVE_CAPTCHA action.
# Member names are upper-case; member values are the same words in lower-case.
class CaptchaType(StrEnum):
# Captcha answered by typing text into a field.
TEXT_CAPTCHA = "text_captcha"
RECAPTCHA = "recaptcha"
HCAPTCHA = "hcaptcha"
MTCAPTCHA = "mtcaptcha"
FUNCAPTCHA = "funcaptcha"
CLOUDFLARE = "cloudflare"
# NOTE(review): presumably the catch-all for captchas matching none of the kinds above - confirm with callers.
OTHER = "other"
class ActionStatus(StrEnum):
pending = "pending"
skipped = "skipped"
@@ -82,9 +92,10 @@ class InputOrSelectContext(BaseModel):
is_location_input: bool | None = None # address input usually requires auto completion
is_date_related: bool | None = None # date picker mini agent requires some special logic
date_format: str | None = None
is_text_captcha: bool | None = None
def __repr__(self) -> str:
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, is_date_related={self.is_date_related}, date_format={self.date_format}, is_text_captcha={self.is_text_captcha}, intention={self.intention})"
class ClickContext(BaseModel):
@@ -266,6 +277,7 @@ class NullAction(Action):
# Action model for the SOLVE_CAPTCHA action type.
class SolveCaptchaAction(Action):
action_type: ActionType = ActionType.SOLVE_CAPTCHA
# Kind of captcha this action refers to; stays None when no type is supplied.
captcha_type: CaptchaType | None = None
class SelectOptionAction(WebAction):

View File

@@ -19,6 +19,7 @@ from skyvern.utils.image_resizer import Resolution, scale_coordinates
from skyvern.webeye.actions.action_types import ActionType
from skyvern.webeye.actions.actions import (
Action,
CaptchaType,
CheckboxAction,
ClickAction,
ClickContext,
@@ -181,7 +182,10 @@ def parse_action(
return NullAction(**base_action_dict)
if action_type == ActionType.SOLVE_CAPTCHA:
return SolveCaptchaAction(**base_action_dict)
captcha_type: str | None = action.get("captcha_type")
return SolveCaptchaAction(
**base_action_dict, captcha_type=CaptchaType[captcha_type.upper()] if captcha_type else None
)
if action_type == ActionType.CLOSE_PAGE:
return ClosePageAction(**base_action_dict)

View File

@@ -83,4 +83,5 @@ class BrowserState(Protocol):
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage: ...

View File

@@ -394,6 +394,7 @@ class RealBrowserState(BrowserState):
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage:
return await scraper.scrape_website(
browser_state=self,
@@ -408,6 +409,7 @@ class RealBrowserState(BrowserState):
scroll=scroll,
support_empty_page=support_empty_page,
wait_seconds=wait_seconds,
must_included_tags=must_included_tags,
)
async def close(self, close_browser_on_completion: bool = True) -> None:

View File

@@ -1683,6 +1683,7 @@ async function buildElementObject(
async function buildTreeFromBody(
frame = "main.frame",
frame_index = undefined,
must_included_tags = [],
) {
if (
window.GlobalSkyvernFrameIndex === undefined &&
@@ -1697,6 +1698,7 @@ async function buildTreeFromBody(
false,
undefined,
maxElementNumber,
must_included_tags,
);
DomUtils.elementListCache = elementsAndResultArray[0];
return elementsAndResultArray;
@@ -1708,12 +1710,19 @@ async function buildElementTree(
full_tree = false,
hoverStylesMap = undefined,
maxElementNumber = 0,
must_included_tags = [],
) {
// Generate hover styles map at the start
if (hoverStylesMap === undefined) {
hoverStylesMap = await getHoverStylesMap();
}
if (must_included_tags.length > 0) {
_jsConsoleLog(
"full tree will be enabled as the must_included_tags is not empty",
);
full_tree = true;
}
if (window.GlobalEnableAllTextualElements === undefined) {
window.GlobalEnableAllTextualElements = false;
}
@@ -1835,6 +1844,10 @@ async function buildElementTree(
if (elementObj.text.length > 0) {
elementObj.purgeable = false;
}
if (must_included_tags.includes(tagName)) {
elementObj.purgeable = false;
elementObj.interactable = true;
}
}
if (elementObj) {

View File

@@ -306,6 +306,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
scroll: bool = True,
take_screenshots: bool = True,
max_retries: int = 0,
must_included_tags: list[str] | None = None,
) -> Self:
return await self._browser_state.scrape_website(
url=self.url,
@@ -315,6 +316,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
take_screenshots=take_screenshots,
draw_boxes=draw_boxes,
scroll=scroll,
must_included_tags=must_included_tags,
)
async def generate_scraped_page_without_screenshots(self, max_retries: int = 0) -> Self:

View File

@@ -150,6 +150,7 @@ async def scrape_website(
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage:
"""
************************************************************************************************
@@ -186,6 +187,7 @@ async def scrape_website(
scroll=scroll,
support_empty_page=support_empty_page,
wait_seconds=wait_seconds,
must_included_tags=must_included_tags,
)
except ScrapingFailedBlankPage:
raise
@@ -216,6 +218,7 @@ async def scrape_website(
draw_boxes=draw_boxes,
max_screenshot_number=max_screenshot_number,
scroll=scroll,
must_included_tags=must_included_tags,
)
@@ -269,6 +272,7 @@ async def scrape_web_unsafe(
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -301,11 +305,11 @@ async def scrape_web_unsafe(
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
await asyncio.sleep(wait_seconds)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags)
if not elements and not support_empty_page:
LOG.warning("No elements found on the page, wait and retry")
await empty_page_retry_wait()
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags)
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
@@ -415,6 +419,7 @@ async def add_frame_interactable_elements(
frame_index: int,
elements: list[dict],
element_tree: list[dict],
must_included_tags: list[str] | None = None,
) -> tuple[list[dict], list[dict]]:
"""
Add the interactable element of the frame to the elements and element_tree.
@@ -444,7 +449,7 @@ async def add_frame_interactable_elements(
await skyvern_frame.safe_wait_for_animation_end()
frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body(
frame_name=skyvern_id, frame_index=frame_index
frame_name=skyvern_id, frame_index=frame_index, must_included_tags=must_included_tags
)
for element in elements:
@@ -460,6 +465,7 @@ async def add_frame_interactable_elements(
async def get_interactable_element_tree(
page: Page,
scrape_exclude: ScrapeExcludeFunc | None = None,
must_included_tags: list[str] | None = None,
) -> tuple[list[dict], list[dict]]:
"""
Get the element tree of the page, including all the elements that are interactable.
@@ -468,7 +474,9 @@ async def get_interactable_element_tree(
"""
# main page index is 0
skyvern_page = await SkyvernFrame.create_instance(page)
elements, element_tree = await skyvern_page.build_tree_from_body(frame_name="main.frame", frame_index=0)
elements, element_tree = await skyvern_page.build_tree_from_body(
frame_name="main.frame", frame_index=0, must_included_tags=must_included_tags
)
context = skyvern_context.ensure_context()
frames = await get_all_children_frames(page)
@@ -487,6 +495,7 @@ async def get_interactable_element_tree(
frame_index,
elements,
element_tree,
must_included_tags,
)
return elements, element_tree

View File

@@ -489,11 +489,16 @@ class SkyvernFrame:
self,
frame_name: str | None,
frame_index: int,
must_included_tags: list[str] | None = None,
timeout_ms: float = SettingsManager.get_settings().BROWSER_SCRAPING_BUILDING_ELEMENT_TREE_TIMEOUT_MS,
) -> tuple[list[dict], list[dict]]:
js_script = "async ([frame_name, frame_index]) => await buildTreeFromBody(frame_name, frame_index)"
must_included_tags = must_included_tags or []
js_script = "async ([frame_name, frame_index, must_included_tags]) => await buildTreeFromBody(frame_name, frame_index, must_included_tags)"
return await self.evaluate(
frame=self.frame, expression=js_script, timeout_ms=timeout_ms, arg=[frame_name, frame_index]
frame=self.frame,
expression=js_script,
timeout_ms=timeout_ms,
arg=[frame_name, frame_index, must_included_tags],
)
@TraceManager.traced_async()