From 474de0e5763f4458f3815b7bf29e6cd75b9c51a4 Mon Sep 17 00:00:00 2001
From: LawyZheng
Date: Tue, 9 Jul 2024 11:36:25 +0800
Subject: [PATCH] exclude base64 data (#567)

---
 skyvern/webeye/scraper/scraper.py | 27 +++++++++++++++++++++++++++
 1 file changed, 27 insertions(+)

diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py
index 6019fa79..7d59d498 100644
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -44,6 +44,15 @@ RESERVED_ATTRIBUTES = {
     "value",
 }
 
+BASE64_INCLUDE_ATTRIBUTES = {
+    "href",
+    "src",
+    "poster",
+    "srcset",
+    "icon",
+}
+
+
 ELEMENT_NODE_ATTRIBUTES = {
     "id",
 }
@@ -474,6 +483,13 @@ def trim_element_tree(elements: list[dict]) -> list[dict]:
         if not queue_ele.get("interactable"):
             del queue_ele["id"]
 
+        if "attributes" in queue_ele:
+            new_attributes = _trimmed_base64_data(queue_ele["attributes"])
+            if new_attributes:
+                queue_ele["attributes"] = new_attributes
+            else:
+                del queue_ele["attributes"]
+
         if "attributes" in queue_ele and not queue_ele.get("keepAllAttr", False):
             tag_name = queue_ele["tagName"] if "tagName" in queue_ele else ""
             new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"])
@@ -495,6 +511,17 @@ def trim_element_tree(elements: list[dict]) -> list[dict]:
     return elements
 
 
+def _trimmed_base64_data(attributes: dict) -> dict:
+    new_attributes: dict = {}
+
+    for key in attributes:
+        if key in BASE64_INCLUDE_ATTRIBUTES and "data:" in attributes.get(key, ""):
+            continue
+        new_attributes[key] = attributes[key]
+
+    return new_attributes
+
+
 def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
     new_attributes: dict = {}
 