Support parsing ::before/::after pseudo-element content (#1069)

2024-10-28 19:30:11 +08:00
parent d64fd263fe
commit 9d93280302
2 changed files with 46 additions and 2 deletions
--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -616,6 +616,30 @@ const isAngularDropdown = (element) => {
  return false;
 };

+function getPseudoContent(element, pseudo) {
+  const pseudoStyle = getElementComputedStyle(element, pseudo);
+  if (!pseudoStyle) {
+    return null;
+  }
+  const content = pseudoStyle
+    .getPropertyValue("content")
+    .replace(/"/g, "")
+    .trim();
+
+  if (content === "none" || !content) {
+    return null;
+  }
+
+  return content;
+}
+
+function hasBeforeOrAfterPseudoContent(element) {
+  return (
+    getPseudoContent(element, "::before") != null ||
+    getPseudoContent(element, "::after") != null
+  );
+}
+
 const checkParentClass = (className) => {
  const targetParentClasses = ["field", "entry"];
  for (let i = 0; i < targetParentClasses.length; i++) {
@@ -876,7 +900,9 @@ function buildElementObject(frame, element, interactable, purgeable = false) {
    interactable: interactable,
    tagName: elementTagNameLower,
    attributes: attrs,
+    beforePseudoText: getPseudoContent(element, "::before"),
    text: getElementContent(element),
+    afterPseudoText: getPseudoContent(element, "::after"),
    children: [],
    rect: DomUtils.getVisibleClientRect(element, true),
    // if purgeable is True, which means this element is only used for building the tree relationship
@@ -1020,6 +1046,8 @@ function buildElementTree(starter = document.body, frame, full_tree = false) {
          // build all table related elements into skyvern element
          // we need these elements to preserve the DOM structure
          elementObj = buildElementObject(frame, element, false);
+        } else if (hasBeforeOrAfterPseudoContent(element)) {
+          elementObj = buildElementObject(frame, element, false);
        } else if (full_tree) {
          // when building full tree, we only get text from element itself
          // elements without text are purgeable
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -122,11 +122,20 @@ def json_to_html(element: dict, need_skyvern_attrs: bool = True) -> str:
    if element.get("purgeable", False):
        return children_html + option_html

+    before_pseudo_text = element.get("beforePseudoText", "")
+    after_pseudo_text = element.get("afterPseudoText", "")
+
    # Check if the element is self-closing
-    if tag in ["img", "input", "br", "hr", "meta", "link"] and not option_html and not children_html:
+    if (
+        tag in ["img", "input", "br", "hr", "meta", "link"]
+        and not option_html
+        and not children_html
+        and not before_pseudo_text
+        and not after_pseudo_text
+    ):
        return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>'
    else:
-        return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>{text}{children_html+option_html}</{tag}>'
+        return f'<{tag}{attributes_html if not attributes_html else " "+attributes_html}>{before_pseudo_text}{text}{children_html+option_html}{after_pseudo_text}</{tag}>'


 def clean_element_before_hashing(element: dict) -> dict:
@@ -602,6 +611,13 @@ def trim_element(element: dict) -> dict:
            element_text = str(queue_ele["text"]).strip()
            if not element_text:
                del queue_ele["text"]
+
+        if "beforePseudoText" in queue_ele and not queue_ele.get("beforePseudoText"):
+            del queue_ele["beforePseudoText"]
+
+        if "afterPseudoText" in queue_ele and not queue_ele.get("afterPseudoText"):
+            del queue_ele["afterPseudoText"]
+
    return element