general text captcha solution (#4517)

This commit is contained in:
LawyZheng
2026-01-23 00:23:57 +08:00
committed by GitHub
parent b5ff547a3a
commit 2c60d8e180
12 changed files with 62 additions and 8 deletions

View File

@@ -12,6 +12,16 @@ LOG = structlog.get_logger()
T = TypeVar("T", bound="Action")
# Closed set of captcha kinds that can be attached to a SolveCaptchaAction
# (see its `captcha_type` field).
#
# Every member name is the upper-cased form of its string value. parse_action
# relies on this: it upper-cases the incoming "captcha_type" string and looks
# the member up by NAME (CaptchaType[...]), not by value, so names and values
# must stay in sync when members are added.
# NOTE(review): a name lookup raises KeyError for a string that matches no
# member; confirm whether unrecognized kinds should map to OTHER instead.
class CaptchaType(StrEnum):
TEXT_CAPTCHA = "text_captcha"
RECAPTCHA = "recaptcha"
HCAPTCHA = "hcaptcha"
MTCAPTCHA = "mtcaptcha"
FUNCAPTCHA = "funcaptcha"
CLOUDFLARE = "cloudflare"
# Catch-all member; presumably for captcha kinds not listed above - TODO confirm.
OTHER = "other"
class ActionStatus(StrEnum):
pending = "pending"
skipped = "skipped"
@@ -82,9 +92,10 @@ class InputOrSelectContext(BaseModel):
is_location_input: bool | None = None # address input usually requires auto completion
is_date_related: bool | None = None # date picker mini agent requires some special logic
date_format: str | None = None
is_text_captcha: bool | None = None
def __repr__(self) -> str:
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, intention={self.intention})"
return f"InputOrSelectContext(field={self.field}, is_required={self.is_required}, is_search_bar={self.is_search_bar}, is_location_input={self.is_location_input}, is_date_related={self.is_date_related}, date_format={self.date_format}, is_text_captcha={self.is_text_captcha}, intention={self.intention})"
class ClickContext(BaseModel):
@@ -266,6 +277,7 @@ class NullAction(Action):
# Action whose action_type is fixed to ActionType.SOLVE_CAPTCHA.
class SolveCaptchaAction(Action):
action_type: ActionType = ActionType.SOLVE_CAPTCHA
# Kind of captcha this action targets. Defaults to None; parse_action also
# passes None when the parsed action has no (or an empty) "captcha_type" entry.
captcha_type: CaptchaType | None = None
class SelectOptionAction(WebAction):

View File

@@ -19,6 +19,7 @@ from skyvern.utils.image_resizer import Resolution, scale_coordinates
from skyvern.webeye.actions.action_types import ActionType
from skyvern.webeye.actions.actions import (
Action,
CaptchaType,
CheckboxAction,
ClickAction,
ClickContext,
@@ -181,7 +182,10 @@ def parse_action(
return NullAction(**base_action_dict)
if action_type == ActionType.SOLVE_CAPTCHA:
return SolveCaptchaAction(**base_action_dict)
captcha_type: str | None = action.get("captcha_type")
return SolveCaptchaAction(
**base_action_dict, captcha_type=CaptchaType[captcha_type.upper()] if captcha_type else None
)
if action_type == ActionType.CLOSE_PAGE:
return ClosePageAction(**base_action_dict)

View File

@@ -83,4 +83,5 @@ class BrowserState(Protocol):
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage: ...

View File

@@ -394,6 +394,7 @@ class RealBrowserState(BrowserState):
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage:
return await scraper.scrape_website(
browser_state=self,
@@ -408,6 +409,7 @@ class RealBrowserState(BrowserState):
scroll=scroll,
support_empty_page=support_empty_page,
wait_seconds=wait_seconds,
must_included_tags=must_included_tags,
)
async def close(self, close_browser_on_completion: bool = True) -> None:

View File

@@ -1683,6 +1683,7 @@ async function buildElementObject(
async function buildTreeFromBody(
frame = "main.frame",
frame_index = undefined,
must_included_tags = [],
) {
if (
window.GlobalSkyvernFrameIndex === undefined &&
@@ -1697,6 +1698,7 @@ async function buildTreeFromBody(
false,
undefined,
maxElementNumber,
must_included_tags,
);
DomUtils.elementListCache = elementsAndResultArray[0];
return elementsAndResultArray;
@@ -1708,12 +1710,19 @@ async function buildElementTree(
full_tree = false,
hoverStylesMap = undefined,
maxElementNumber = 0,
must_included_tags = [],
) {
// Generate hover styles map at the start
if (hoverStylesMap === undefined) {
hoverStylesMap = await getHoverStylesMap();
}
if (must_included_tags.length > 0) {
_jsConsoleLog(
"full tree will be enabled as the must_included_tags is not empty",
);
full_tree = true;
}
if (window.GlobalEnableAllTextualElements === undefined) {
window.GlobalEnableAllTextualElements = false;
}
@@ -1835,6 +1844,10 @@ async function buildElementTree(
if (elementObj.text.length > 0) {
elementObj.purgeable = false;
}
if (must_included_tags.includes(tagName)) {
elementObj.purgeable = false;
elementObj.interactable = true;
}
}
if (elementObj) {

View File

@@ -306,6 +306,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
scroll: bool = True,
take_screenshots: bool = True,
max_retries: int = 0,
must_included_tags: list[str] | None = None,
) -> Self:
return await self._browser_state.scrape_website(
url=self.url,
@@ -315,6 +316,7 @@ class ScrapedPage(BaseModel, ElementTreeBuilder):
take_screenshots=take_screenshots,
draw_boxes=draw_boxes,
scroll=scroll,
must_included_tags=must_included_tags,
)
async def generate_scraped_page_without_screenshots(self, max_retries: int = 0) -> Self:

View File

@@ -150,6 +150,7 @@ async def scrape_website(
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage:
"""
************************************************************************************************
@@ -186,6 +187,7 @@ async def scrape_website(
scroll=scroll,
support_empty_page=support_empty_page,
wait_seconds=wait_seconds,
must_included_tags=must_included_tags,
)
except ScrapingFailedBlankPage:
raise
@@ -216,6 +218,7 @@ async def scrape_website(
draw_boxes=draw_boxes,
max_screenshot_number=max_screenshot_number,
scroll=scroll,
must_included_tags=must_included_tags,
)
@@ -269,6 +272,7 @@ async def scrape_web_unsafe(
scroll: bool = True,
support_empty_page: bool = False,
wait_seconds: float = 0,
must_included_tags: list[str] | None = None,
) -> ScrapedPage:
"""
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
@@ -301,11 +305,11 @@ async def scrape_web_unsafe(
LOG.info(f"Waiting for {wait_seconds} seconds before scraping the website.", wait_seconds=wait_seconds)
await asyncio.sleep(wait_seconds)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags)
if not elements and not support_empty_page:
LOG.warning("No elements found on the page, wait and retry")
await empty_page_retry_wait()
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude, must_included_tags)
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
element_tree_trimmed = trim_element_tree(copy.deepcopy(element_tree))
@@ -415,6 +419,7 @@ async def add_frame_interactable_elements(
frame_index: int,
elements: list[dict],
element_tree: list[dict],
must_included_tags: list[str] | None = None,
) -> tuple[list[dict], list[dict]]:
"""
Add the interactable element of the frame to the elements and element_tree.
@@ -444,7 +449,7 @@ async def add_frame_interactable_elements(
await skyvern_frame.safe_wait_for_animation_end()
frame_elements, frame_element_tree = await skyvern_frame.build_tree_from_body(
frame_name=skyvern_id, frame_index=frame_index
frame_name=skyvern_id, frame_index=frame_index, must_included_tags=must_included_tags
)
for element in elements:
@@ -460,6 +465,7 @@ async def add_frame_interactable_elements(
async def get_interactable_element_tree(
page: Page,
scrape_exclude: ScrapeExcludeFunc | None = None,
must_included_tags: list[str] | None = None,
) -> tuple[list[dict], list[dict]]:
"""
Get the element tree of the page, including all the elements that are interactable.
@@ -468,7 +474,9 @@ async def get_interactable_element_tree(
"""
# main page index is 0
skyvern_page = await SkyvernFrame.create_instance(page)
elements, element_tree = await skyvern_page.build_tree_from_body(frame_name="main.frame", frame_index=0)
elements, element_tree = await skyvern_page.build_tree_from_body(
frame_name="main.frame", frame_index=0, must_included_tags=must_included_tags
)
context = skyvern_context.ensure_context()
frames = await get_all_children_frames(page)
@@ -487,6 +495,7 @@ async def get_interactable_element_tree(
frame_index,
elements,
element_tree,
must_included_tags,
)
return elements, element_tree

View File

@@ -489,11 +489,16 @@ class SkyvernFrame:
self,
frame_name: str | None,
frame_index: int,
must_included_tags: list[str] | None = None,
timeout_ms: float = SettingsManager.get_settings().BROWSER_SCRAPING_BUILDING_ELEMENT_TREE_TIMEOUT_MS,
) -> tuple[list[dict], list[dict]]:
js_script = "async ([frame_name, frame_index]) => await buildTreeFromBody(frame_name, frame_index)"
must_included_tags = must_included_tags or []
js_script = "async ([frame_name, frame_index, must_included_tags]) => await buildTreeFromBody(frame_name, frame_index, must_included_tags)"
return await self.evaluate(
frame=self.frame, expression=js_script, timeout_ms=timeout_ms, arg=[frame_name, frame_index]
frame=self.frame,
expression=js_script,
timeout_ms=timeout_ms,
arg=[frame_name, frame_index, must_included_tags],
)
@TraceManager.traced_async()