add actions db model and caching V0 (#980)

This commit is contained in:
Shuchang Zheng
2024-10-15 12:06:50 -07:00
committed by GitHub
parent e7583ac878
commit 9048cdfa73
19 changed files with 731 additions and 90 deletions

View File

@@ -11,6 +11,7 @@ from pydantic import BaseModel
from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
from skyvern.forge.sdk.api.crypto import calculate_sha256
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.browser_factory import BrowserState
from skyvern.webeye.utils.page import SkyvernFrame
@@ -127,10 +128,34 @@ def json_to_html(element: dict, need_skyvern_attrs: bool = True) -> str:
return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>{text}{children_html+option_html}</{tag}>'
def build_element_dict(elements: list[dict]) -> tuple[dict[str, str], dict[str, dict], dict[str, str]]:
def clean_element_before_hashing(element: dict) -> dict:
    """Return a cleaned deep copy of ``element`` suitable for hashing.

    The ``"id"`` and ``"rect"`` keys, and the ``SKYVERN_ID_ATTR`` entry of
    ``"attributes"``, are removed from the element and from every element
    nested under ``"children"``. The input is never mutated.

    NOTE(review): presumably these fields are dropped because they differ
    between scrapes of the same page -- confirm against the caching logic.

    Args:
        element: An element dict, optionally holding ``"attributes"`` (a
            dict) and ``"children"`` (a list of element dicts).

    Returns:
        A new dict with the fields above stripped at every level.
    """
    # Copy the whole tree exactly once. Deep-copying again inside a recursive
    # call would re-copy each subtree once per ancestor level.
    element_copy = copy.deepcopy(element)

    # Walk the copy with an explicit stack and strip the fields in place;
    # this is safe because nothing outside this function references the copy.
    pending = [element_copy]
    while pending:
        node = pending.pop()
        node.pop("id", None)
        node.pop("rect", None)
        if "attributes" in node:
            node["attributes"].pop(SKYVERN_ID_ATTR, None)
        if "children" in node:
            pending.extend(node["children"])
    return element_copy
def hash_element(element: dict) -> str:
    """Return the hash of ``element`` as computed by ``calculate_sha256``.

    The element is first passed through ``clean_element_before_hashing`` and
    then serialized to JSON, and that string is what gets hashed.

    Args:
        element: The element dict to hash.

    Returns:
        The value returned by ``calculate_sha256`` for the serialized element.
    """
    # sort_keys=True keeps the serialization independent of key order.
    canonical_json = json.dumps(clean_element_before_hashing(element), sort_keys=True)
    return calculate_sha256(canonical_json)
def build_element_dict(
elements: list[dict],
) -> tuple[dict[str, str], dict[str, dict], dict[str, str], dict[str, str], dict[str, list[str]]]:
id_to_css_dict: dict[str, str] = {}
id_to_element_dict: dict[str, dict] = {}
id_to_frame_dict: dict[str, str] = {}
id_to_element_hash: dict[str, str] = {}
hash_to_element_ids: dict[str, list[str]] = {}
for element in elements:
element_id: str = element.get("id", "")
@@ -138,8 +163,11 @@ def build_element_dict(elements: list[dict]) -> tuple[dict[str, str], dict[str,
id_to_css_dict[element_id] = f"[{SKYVERN_ID_ATTR}='{element_id}']"
id_to_element_dict[element_id] = element
id_to_frame_dict[element_id] = element["frame"]
element_hash = hash_element(element)
id_to_element_hash[element_id] = element_hash
hash_to_element_ids[element_hash] = hash_to_element_ids.get(element_hash, []) + [element_id]
return id_to_css_dict, id_to_element_dict, id_to_frame_dict
return id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids
class ElementTreeFormat(StrEnum):
@@ -163,6 +191,8 @@ class ScrapedPage(BaseModel):
id_to_element_dict: dict[str, dict] = {}
id_to_frame_dict: dict[str, str] = {}
id_to_css_dict: dict[str, str]
id_to_element_hash: dict[str, str]
hash_to_element_ids: dict[str, list[str]]
element_tree: list[dict]
element_tree_trimmed: list[dict]
screenshots: list[bytes]
@@ -309,7 +339,13 @@ async def scrape_web_unsafe(
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
element_tree = await cleanup_element_tree(url, copy.deepcopy(element_tree))
id_to_css_dict, id_to_element_dict, id_to_frame_dict = build_element_dict(elements)
id_to_css_dict, id_to_element_dict, id_to_frame_dict, id_to_element_hash, hash_to_element_ids = build_element_dict(
elements
)
# if there are no elements, fail the scraping
if not elements:
raise Exception("No elements found on the page")
text_content = await get_frame_text(page.main_frame)
@@ -329,6 +365,8 @@ async def scrape_web_unsafe(
id_to_css_dict=id_to_css_dict,
id_to_element_dict=id_to_element_dict,
id_to_frame_dict=id_to_frame_dict,
id_to_element_hash=id_to_element_hash,
hash_to_element_ids=hash_to_element_ids,
element_tree=element_tree,
element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
screenshots=screenshots,
@@ -434,7 +472,7 @@ class IncrementalScrapePage:
js_script = "() => getIncrementElements()"
incremental_elements, incremental_tree = await frame.evaluate(js_script)
# we listen to the incremental elements separated by frame, so all elements will be in the same SkyvernFrame
self.id_to_css_dict, self.id_to_element_dict, _ = build_element_dict(incremental_elements)
self.id_to_css_dict, self.id_to_element_dict, _, _, _ = build_element_dict(incremental_elements)
self.elements = incremental_elements