Move the code over from private repository (#3)

2024-03-01 10:09:30 -08:00
parent 32dd6d92a5
commit 9eddb3d812
93 changed files with 16798 additions and 0 deletions
--- a/skyvern/webeye/scraper/init.py
+++ b/skyvern/webeye/scraper/init.py
--- a/skyvern/webeye/scraper/domUtils.js
+++ b/skyvern/webeye/scraper/domUtils.js
@@ -0,0 +1,806 @@
+// Commands for manipulating rects.
+class Rect {
+  // Create a rect given the top left and bottom right corners.
+  static create(x1, y1, x2, y2) {
+    return {
+      bottom: y2,
+      top: y1,
+      left: x1,
+      right: x2,
+      width: x2 - x1,
+      height: y2 - y1,
+    };
+  }
+
+  static copy(rect) {
+    return {
+      bottom: rect.bottom,
+      top: rect.top,
+      left: rect.left,
+      right: rect.right,
+      width: rect.width,
+      height: rect.height,
+    };
+  }
+
+  // Translate a rect by x horizontally and y vertically.
+  static translate(rect, x, y) {
+    if (x == null) x = 0;
+    if (y == null) y = 0;
+    return {
+      bottom: rect.bottom + y,
+      top: rect.top + y,
+      left: rect.left + x,
+      right: rect.right + x,
+      width: rect.width,
+      height: rect.height,
+    };
+  }
+
+  // Determine whether two rects overlap.
+  static intersects(rect1, rect2) {
+    return (
+      rect1.right > rect2.left &&
+      rect1.left < rect2.right &&
+      rect1.bottom > rect2.top &&
+      rect1.top < rect2.bottom
+    );
+  }
+
+  static equals(rect1, rect2) {
+    for (const property of [
+      "top",
+      "bottom",
+      "left",
+      "right",
+      "width",
+      "height",
+    ]) {
+      if (rect1[property] !== rect2[property]) return false;
+    }
+    return true;
+  }
+}
+
+// Static helpers that compute where (and whether) an element is drawn in the current viewport.
+class DomUtils {
+  //
+  // Clamps the rect's left/top edges to >= 0 (right/bottom are left untouched) and returns the
+  // resulting rect. Returns null when the clamped top/left lies within 4px of, or beyond, the
+  // bottom/right edge of the viewport.
+  // NOTE: no minimum-size check happens here; callers apply their own `< 3` width/height test.
+  //
+  static cropRectToVisible(rect) {
+    const boundedRect = Rect.create(
+      Math.max(rect.left, 0),
+      Math.max(rect.top, 0),
+      rect.right,
+      rect.bottom,
+    );
+    if (
+      boundedRect.top >= window.innerHeight - 4 ||
+      boundedRect.left >= window.innerWidth - 4
+    ) {
+      return null;
+    } else {
+      return boundedRect;
+    }
+  }
+
+  //
+  // Returns the first client rect of `element` that is usable (cropped to the viewport, at least
+  // 3px wide and tall, computed visibility "visible"), or null if there is none.
+  // When `testChildren` is truthy and a client rect has zero width or height, the element's
+  // children are searched recursively for a usable rect instead.
+  //
+  static getVisibleClientRect(element, testChildren) {
+    // Note: this call will be expensive if we modify the DOM in between calls.
+    let clientRect;
+    if (testChildren == null) testChildren = false;
+    // Snapshot the client rects as plain objects so they can be reassigned/cropped below.
+    const clientRects = (() => {
+      const result = [];
+      for (clientRect of element.getClientRects()) {
+        result.push(Rect.copy(clientRect));
+      }
+      return result;
+    })();
+
+    // Inline elements with font-size: 0px; will declare a height of zero, even if a child with
+    // non-zero font-size contains text.
+    // The computed-style lookup runs at most once: the first call replaces this function with a
+    // constant-returning closure.
+    let isInlineZeroHeight = function () {
+      const elementComputedStyle = window.getComputedStyle(element, null);
+      const isInlineZeroFontSize =
+        0 ===
+          elementComputedStyle.getPropertyValue("display").indexOf("inline") &&
+        elementComputedStyle.getPropertyValue("font-size") === "0px";
+      // Override the function to return this value for the rest of this context.
+      isInlineZeroHeight = () => isInlineZeroFontSize;
+      return isInlineZeroFontSize;
+    };
+
+    for (clientRect of clientRects) {
+      // If the link has zero dimensions, it may be wrapping visible but floated elements. Check for
+      // this.
+      let computedStyle;
+      if ((clientRect.width === 0 || clientRect.height === 0) && testChildren) {
+        for (const child of Array.from(element.children)) {
+          computedStyle = window.getComputedStyle(child, null);
+          // Ignore child elements which are not floated and not absolutely positioned for parent
+          // elements with zero width/height, as long as the case described at isInlineZeroHeight
+          // does not apply.
+          // NOTE(mrmr1993): This ignores floated/absolutely positioned descendants nested within
+          // inline children.
+          const position = computedStyle.getPropertyValue("position");
+          if (
+            computedStyle.getPropertyValue("float") === "none" &&
+            !["absolute", "fixed"].includes(position) &&
+            !(
+              clientRect.height === 0 &&
+              isInlineZeroHeight() &&
+              0 === computedStyle.getPropertyValue("display").indexOf("inline")
+            )
+          ) {
+            continue;
+          }
+          // The first child with a rect of at least 3x3 wins.
+          const childClientRect = this.getVisibleClientRect(child, true);
+          if (
+            childClientRect === null ||
+            childClientRect.width < 3 ||
+            childClientRect.height < 3
+          )
+            continue;
+          return childClientRect;
+        }
+      } else {
+        clientRect = this.cropRectToVisible(clientRect);
+
+        // Skip rects that are off-screen (null) or smaller than 3px in either dimension.
+        if (
+          clientRect === null ||
+          clientRect.width < 3 ||
+          clientRect.height < 3
+        )
+          continue;
+
+        // eliminate invisible elements (see test_harnesses/visibility_test.html)
+        computedStyle = window.getComputedStyle(element, null);
+        if (computedStyle.getPropertyValue("visibility") !== "visible")
+          continue;
+
+        return clientRect;
+      }
+    }
+
+    return null;
+  }
+
+  //
+  // Returns {top, left} offsets derived from the root element's bounding client rect: the negated
+  // rect position, adjusted by the root's margin (static position without content/paint/strict
+  // containment) or by its clientTop/clientLeft (all other cases).
+  //
+  static getViewportTopLeft() {
+    const box = document.documentElement;
+    const style = getComputedStyle(box);
+    const rect = box.getBoundingClientRect();
+    if (
+      style.position === "static" &&
+      !/content|paint|strict/.test(style.contain || "")
+    ) {
+      // The margin is included in the client rect, so we need to subtract it back out.
+      const marginTop = parseInt(style.marginTop);
+      const marginLeft = parseInt(style.marginLeft);
+      return {
+        top: -rect.top + marginTop,
+        left: -rect.left + marginLeft,
+      };
+    } else {
+      const { clientTop, clientLeft } = box;
+      return {
+        top: -rect.top - clientTop,
+        left: -rect.left - clientLeft,
+      };
+    }
+  }
+}
+
+// from playwright
+function getElementComputedStyle(element, pseudo) {
+  return element.ownerDocument && element.ownerDocument.defaultView
+    ? element.ownerDocument.defaultView.getComputedStyle(element, pseudo)
+    : undefined;
+}
+
+// from playwright
+function isElementStyleVisibilityVisible(element, style) {
+  style = style ?? getElementComputedStyle(element);
+  if (!style) return true;
+  if (
+    !element.checkVisibility({ checkOpacity: false, checkVisibilityCSS: false })
+  )
+    return false;
+  if (style.visibility !== "visible") return false;
+  return true;
+}
+
+// from playwright
+function isElementVisible(element) {
+  // TODO: This is a hack to not check visibility for option elements
+  // because they are not visible by default. We check their parent instead for visibility.
+  if (element.tagName.toLowerCase() === "option")
+    return element.parentElement && isElementVisible(element.parentElement);
+
+  const style = getElementComputedStyle(element);
+  if (!style) return true;
+  if (style.display === "contents") {
+    // display:contents is not rendered itself, but its child nodes are.
+    for (let child = element.firstChild; child; child = child.nextSibling) {
+      if (
+        child.nodeType === 1 /* Node.ELEMENT_NODE */ &&
+        isElementVisible(child)
+      )
+        return true;
+      // skipping other nodes including text
+    }
+    return false;
+  }
+  if (!isElementStyleVisibilityVisible(element, style)) return false;
+  const rect = element.getBoundingClientRect();
+  return rect.width > 0 && rect.height > 0;
+}
+
+function isHiddenOrDisabled(element) {
+  const style = getElementComputedStyle(element);
+  return style?.display === "none" || element.hidden || element.disabled;
+}
+
+function isScriptOrStyle(element) {
+  const tagName = element.tagName.toLowerCase();
+  return tagName === "script" || tagName === "style";
+}
+
+function hasWidgetRole(element) {
+  const role = element.getAttribute("role");
+  if (!role) {
+    return false;
+  }
+  // https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles#2._widget_roles
+  // Not all roles make sense for the time being so we only check for the ones that do
+  const widgetRoles = [
+    "button",
+    "link",
+    "checkbox",
+    "menuitem",
+    "menuitemcheckbox",
+    "menuitemradio",
+    "radio",
+    "tab",
+    "combobox",
+    "textbox",
+    "searchbox",
+    "slider",
+    "spinbutton",
+    "switch",
+    "gridcell",
+  ];
+  return widgetRoles.includes(role.toLowerCase().trim());
+}
+
+function isInteractableInput(element) {
+  const tagName = element.tagName.toLowerCase();
+  const type = element.getAttribute("type");
+  if (tagName !== "input" || !type) {
+    // let other checks decide
+    return false;
+  }
+  const clickableTypes = [
+    "button",
+    "checkbox",
+    "date",
+    "datetime-local",
+    "email",
+    "file",
+    "image",
+    "month",
+    "number",
+    "password",
+    "radio",
+    "range",
+    "reset",
+    "search",
+    "submit",
+    "tel",
+    "text",
+    "time",
+    "url",
+    "week",
+  ];
+  return clickableTypes.includes(type.toLowerCase().trim());
+}
+
+function isInteractable(element) {
+  if (!isElementVisible(element)) {
+    return false;
+  }
+
+  if (isHiddenOrDisabled(element)) {
+    return false;
+  }
+
+  if (isScriptOrStyle(element)) {
+    return false;
+  }
+
+  if (hasWidgetRole(element)) {
+    return true;
+  }
+
+  if (isInteractableInput(element)) {
+    return true;
+  }
+
+  const tagName = element.tagName.toLowerCase();
+
+  if (tagName === "a" && element.href) {
+    return true;
+  }
+
+  if (
+    tagName === "button" ||
+    tagName === "select" ||
+    tagName === "option" ||
+    tagName === "textarea"
+  ) {
+    return true;
+  }
+
+  if (tagName === "label" && element.control && !element.control.disabled) {
+    return true;
+  }
+
+  if (
+    element.hasAttribute("onclick") ||
+    element.isContentEditable ||
+    element.hasAttribute("jsaction")
+  ) {
+    return true;
+  }
+
+  if (tagName === "div" || tagName === "img" || tagName === "span") {
+    const computedStyle = window.getComputedStyle(element);
+    const hasPointer = computedStyle.cursor === "pointer";
+    const hasCursor = computedStyle.cursor === "cursor";
+    return hasPointer || hasCursor;
+  }
+
+  return false;
+}
+
+function removeMultipleSpaces(str) {
+  if (!str) {
+    return str;
+  }
+  return str.replace(/\s+/g, " ");
+}
+
+function cleanupText(text) {
+  return removeMultipleSpaces(
+    text.replace("SVGs not supported by this browser.", ""),
+  ).trim();
+}
+
+function getElementContext(element) {
+  // dfs to collect the non unique_id context
+  let fullContext = "";
+  if (element.childNodes.length === 0) {
+    return fullContext;
+  }
+  let childContextList = new Array();
+  for (var child of element.childNodes) {
+    let childContext = "";
+    if (child.nodeType === Node.TEXT_NODE) {
+      if (!element.hasAttribute("unique_id")) {
+        childContext = child.data.trim();
+      }
+    } else if (child.nodeType === Node.ELEMENT_NODE) {
+      if (!child.hasAttribute("unique_id")) {
+        childContext = getElementContext(child);
+      }
+    }
+    if (childContext.length > 0) {
+      childContextList.push(childContext);
+    }
+
+    if (childContextList.length > 0) {
+      fullContext = childContextList.join(";");
+    }
+
+    const charLimit = 1000;
+    if (fullContext.length > charLimit) {
+      fullContext = "";
+    }
+  }
+  return fullContext;
+}
+
+function getElementContent(element) {
+  // DFS to get all the text content from all the nodes under the element
+
+  let textContent = element.textContent;
+  let nodeContent = "";
+  // if element has children, then build a list of text and join with a semicolon
+  if (element.childNodes.length > 0) {
+    let childTextContentList = new Array();
+    let nodeTextContentList = new Array();
+    for (var child of element.childNodes) {
+      let childText = "";
+      if (child.nodeType === Node.TEXT_NODE) {
+        childText = child.data.trim();
+        nodeTextContentList.push(childText);
+      } else if (child.nodeType === Node.ELEMENT_NODE) {
+        // childText = child.textContent.trim();
+        childText = getElementContent(child);
+      } else {
+        console.log("Unhandled node type: ", child.nodeType);
+      }
+      if (childText.length > 0) {
+        childTextContentList.push(childText);
+      }
+    }
+    textContent = childTextContentList.join(";");
+    nodeContent = cleanupText(nodeTextContentList.join(";"));
+  }
+  let finalTextContent = cleanupText(textContent);
+
+  // Currently we don't support too much context. Character limit is 1000 per element.
+  // we don't think element context has to be that big
+  const charLimit = 1000;
+  if (finalTextContent.length > charLimit) {
+    if (nodeContent.length <= charLimit) {
+      finalTextContent = nodeContent;
+    } else {
+      finalTextContent = "";
+    }
+  }
+
+  return finalTextContent;
+}
+
+function getSelectOptions(element) {
+  const options = Array.from(element.options);
+  const selectOptions = [];
+  for (const option of options) {
+    selectOptions.push({
+      optionIndex: option.index,
+      text: removeMultipleSpaces(option.textContent),
+    });
+  }
+  return selectOptions;
+}
+
+function buildTreeFromBody() {
+  var elements = [];
+  var resultArray = [];
+
+  function buildElementObject(element) {
+    var element_id = elements.length;
+    var elementTagNameLower = element.tagName.toLowerCase();
+    element.setAttribute("unique_id", element_id);
+    // if element is an "a" tag and has a target="_blank" attribute, remove the target attribute
+    // We're doing this so that skyvern can do all the navigation in a single page/tab and not open new tab
+    if (element.tagName.toLowerCase() === "a") {
+      if (element.getAttribute("target") === "_blank") {
+        element.removeAttribute("target");
+      }
+    }
+    const attrs = {};
+    for (const attr of element.attributes) {
+      var attrValue = attr.value;
+      if (
+        attr.name === "required" ||
+        attr.name === "aria-required" ||
+        attr.name === "checked" ||
+        attr.name === "aria-checked" ||
+        attr.name === "selected" ||
+        attr.name === "aria-selected" ||
+        attr.name === "readonly" ||
+        attr.name === "aria-readonly"
+      ) {
+        attrValue = true;
+      }
+      attrs[attr.name] = attrValue;
+    }
+    if (elementTagNameLower === "input" || elementTagNameLower === "textarea") {
+      attrs["value"] = element.value;
+    }
+
+    let elementObj = {
+      id: element_id,
+      tagName: elementTagNameLower,
+      attributes: attrs,
+      text: getElementContent(element),
+      children: [],
+      rect: DomUtils.getVisibleClientRect(element, true),
+    };
+
+    // get options for select element or for listbox element
+    let selectOptions = null;
+    if (elementTagNameLower === "select") {
+      selectOptions = getSelectOptions(element);
+    }
+    if (selectOptions) {
+      elementObj.options = selectOptions;
+    }
+
+    return elementObj;
+  }
+
+  function getChildElements(element) {
+    if (element.childElementCount !== 0) {
+      return Array.from(element.children);
+    } else {
+      return [];
+    }
+  }
+  function processElement(element, interactableParentId) {
+    // Check if the element is interactable
+    if (isInteractable(element)) {
+      var elementObj = buildElementObject(element);
+      elements.push(elementObj);
+      // If the element is interactable but has no interactable parent,
+      // then it starts a new tree, so add it to the result array
+      // and set its id as the interactable parent id for the next elements
+      // under it
+      if (interactableParentId === null) {
+        resultArray.push(elementObj);
+      }
+      // If the element is interactable and has an interactable parent,
+      // then add it to the children of the parent
+      else {
+        elements[interactableParentId].children.push(elementObj);
+      }
+      // Recursively process the children of the element
+      getChildElements(element).forEach((child) => {
+        processElement(child, elementObj.id);
+      });
+      return elementObj;
+    } else {
+      // For a non-interactable element, process its children
+      // and check if any of them are interactable
+      let interactableChildren = [];
+      getChildElements(element).forEach((child) => {
+        let children = processElement(child, interactableParentId);
+      });
+    }
+  }
+
+  // TODO: Handle iframes
+
+  // Clear all the unique_id attributes so that there are no conflicts
+  removeAllUniqueIdAttributes();
+  processElement(document.body, null);
+
+  for (var element of elements) {
+    if (
+      ((element.tagName === "input" && element.attributes["type"] === "text") ||
+        element.tagName === "textarea") &&
+      (element.attributes["required"] || element.attributes["aria-required"]) &&
+      element.attributes.value === ""
+    ) {
+      // TODO (kerem): we may want to pass these elements to the LLM as empty but required fields in the future
+      console.log(
+        "input element with required attribute and no value",
+        element,
+      );
+    }
+
+    // for most elements, we're going 10 layers up to see if we can find "label" as a parent
+    // if found, most likely the context under label is relevant to this element
+    let targetParentElements = new Set(["label", "fieldset"]);
+
+    // look up for 10 levels to find the most contextual parent element
+    let targetContextualParent = null;
+    let currentEle = document.querySelector(`[unique_id="${element.id}"]`);
+    let parentEle = currentEle;
+    for (var i = 0; i < 10; i++) {
+      parentEle = parentEle.parentElement;
+      if (parentEle) {
+        if (targetParentElements.has(parentEle.tagName.toLowerCase())) {
+          targetContextualParent = parentEle;
+        }
+      } else {
+        break;
+      }
+    }
+    if (targetContextualParent) {
+      let context = "";
+      var lowerCaseTagName = targetContextualParent.tagName.toLowerCase();
+      if (lowerCaseTagName === "label") {
+        context = getElementContext(targetContextualParent);
+      } else if (lowerCaseTagName === "fieldset") {
+        // fieldset is usually within a form or another element that contains the whole context
+        targetContextualParent = targetContextualParent.parentElement;
+        if (targetContextualParent) {
+          context = getElementContext(targetContextualParent);
+        }
+      }
+      if (context.length > 0) {
+        element.context = context;
+      }
+    }
+  }
+
+  return [elements, resultArray];
+}
+
+function drawBoundingBoxes(elements) {
+  // draw a red border around the elements
+  var groups = groupElementsVisually(elements);
+  var hintMarkers = createHintMarkersForGroups(groups);
+  addHintMarkersToPage(hintMarkers);
+}
+
+function removeAllUniqueIdAttributes() {
+  var elementsWithUniqueId = document.querySelectorAll("[unique_id]");
+
+  elementsWithUniqueId.forEach(function (element) {
+    element.removeAttribute("unique_id");
+  });
+}
+
+function captchaSolvedCallback() {
+  console.log("captcha solved");
+  if (!window["captchaSolvedCounter"]) {
+    window["captchaSolvedCounter"] = 0;
+  }
+  // For some reason this isn't being called.. TODO figure out why
+  window["captchaSolvedCounter"] = window["captchaSolvedCounter"] + 1;
+}
+
+function getCaptchaSolves() {
+  if (!window["captchaSolvedCounter"]) {
+    window["captchaSolvedCounter"] = 0;
+  }
+  return window["captchaSolvedCounter"];
+}
+
+function groupElementsVisually(elements) {
+  const groups = [];
+  // o n^2
+  // go through each hint and see if it overlaps with any other hints, if it does, add it to the group of the other hint
+  // *** if we start from the bigger elements (top -> bottom) we can avoid merging groups
+  for (const element of elements) {
+    if (!element.rect) {
+      continue;
+    }
+    const group = groups.find((group) => {
+      for (const groupElement of group.elements) {
+        if (Rect.intersects(groupElement.rect, element.rect)) {
+          return true;
+        }
+      }
+      return false;
+    });
+    if (group) {
+      group.elements.push(element);
+    } else {
+      groups.push({
+        elements: [element],
+      });
+    }
+  }
+
+  // go through each group and create a rectangle that encompasses all the hints in the group
+  for (const group of groups) {
+    group.rect = createRectangleForGroup(group);
+  }
+
+  return groups;
+}
+
+function createRectangleForGroup(group) {
+  const rects = group.elements.map((element) => element.rect);
+  const top = Math.min(...rects.map((rect) => rect.top));
+  const left = Math.min(...rects.map((rect) => rect.left));
+  const bottom = Math.max(...rects.map((rect) => rect.bottom));
+  const right = Math.max(...rects.map((rect) => rect.right));
+  return Rect.create(left, top, right, bottom);
+}
+
+function generateHintStrings(count) {
+  const hintCharacters = "sadfjklewcmpgh";
+  let hintStrings = [""];
+  let offset = 0;
+
+  while (hintStrings.length - offset < count || hintStrings.length === 1) {
+    const hintString = hintStrings[offset++];
+    for (const ch of hintCharacters) {
+      hintStrings.push(ch + hintString);
+    }
+  }
+  hintStrings = hintStrings.slice(offset, offset + count);
+
+  // Shuffle the hints so that they're scattered; hints starting with the same character and short
+  // hints are spread evenly throughout the array.
+  return hintStrings.sort(); // .map((str) => str.reverse())
+}
+
+function createHintMarkersForGroups(groups) {
+  if (groups.length === 0) {
+    console.log("No groups found, not adding hint markers to page.");
+    return [];
+  }
+
+  const hintMarkers = groups.map((group) => createHintMarkerForGroup(group));
+
+  // fill in marker text
+  const hintStrings = generateHintStrings(hintMarkers.length);
+  for (let i = 0; i < hintMarkers.length; i++) {
+    const hintMarker = hintMarkers[i];
+    hintMarker.hintString = hintStrings[i];
+    hintMarker.element.innerHTML = hintMarker.hintString.toUpperCase();
+  }
+
+  return hintMarkers;
+}
+
+function createHintMarkerForGroup(group) {
+  const marker = {};
+  // yellow annotation box with string
+  const el = document.createElement("div");
+  el.style.left = group.rect.left + "px";
+  el.style.top = group.rect.top + "px";
+  // Each group is assigned a different incremental z-index, we use the same z-index for the
+  // bounding box and the hint marker
+  el.style.zIndex = this.currentZIndex;
+
+  // The bounding box around the group of hints.
+  const boundingBox = document.createElement("div");
+
+  // Calculate the position of the element relative to the document
+  var scrollTop = window.pageYOffset || document.documentElement.scrollTop;
+  var scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
+
+  // Set styles for the bounding box
+  boundingBox.style.position = "absolute";
+  boundingBox.style.display = "display";
+  boundingBox.style.left = group.rect.left + scrollLeft + "px";
+  boundingBox.style.top = group.rect.top + scrollTop + "px";
+  boundingBox.style.width = group.rect.width + "px";
+  boundingBox.style.height = group.rect.height + "px";
+  boundingBox.style.bottom = boundingBox.style.top + boundingBox.style.height;
+  boundingBox.style.right = boundingBox.style.left + boundingBox.style.width;
+  boundingBox.style.border = "2px solid blue"; // Change the border color as needed
+  boundingBox.style.pointerEvents = "none"; // Ensures the box doesn't interfere with other interactions
+  boundingBox.style.zIndex = this.currentZIndex++;
+
+  return Object.assign(marker, {
+    element: el,
+    boundingBox: boundingBox,
+    group: group,
+  });
+}
+
+function addHintMarkersToPage(hintMarkers) {
+  const parent = document.createElement("div");
+  parent.id = "boundingBoxContainer";
+  for (const hintMarker of hintMarkers) {
+    // parent.appendChild(hintMarker.element);
+    parent.appendChild(hintMarker.boundingBox);
+  }
+  document.documentElement.appendChild(parent);
+}
+
+function removeBoundingBoxes() {
+  var hintMarkerContainer = document.querySelector("#boundingBoxContainer");
+  if (hintMarkerContainer) {
+    hintMarkerContainer.remove();
+  }
+}
+
+function scrollToTop(draw_boxes) {
+  removeBoundingBoxes();
+  window.scrollTo(0, 0);
+  if (draw_boxes) {
+    var elementsAndResultArray = buildTreeFromBody();
+    drawBoundingBoxes(elementsAndResultArray[0]);
+  }
+  return window.scrollY;
+}
+
+function scrollToNextPage(draw_boxes) {
+  // remove bounding boxes, scroll to next page with 200px overlap, then draw bounding boxes again
+  // return true if there is a next page, false otherwise
+  removeBoundingBoxes();
+  window.scrollBy(0, window.innerHeight - 200);
+  if (draw_boxes) {
+    var elementsAndResultArray = buildTreeFromBody();
+    drawBoundingBoxes(elementsAndResultArray[0]);
+  }
+  return window.scrollY;
+}
--- a/skyvern/webeye/scraper/scraper.py
+++ b/skyvern/webeye/scraper/scraper.py
@@ -0,0 +1,316 @@
+import asyncio
+import copy
+
+import structlog
+from playwright.async_api import Page
+from pydantic import BaseModel
+
+from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
+from skyvern.forge.sdk.settings_manager import SettingsManager
+from skyvern.webeye.browser_factory import BrowserState
+
+LOG = structlog.get_logger()
+
+# Attribute names that _trimmed_attributes keeps when building the trimmed element tree;
+# any attribute not listed here is dropped (except "id" on input/textarea/select elements).
+RESERVED_ATTRIBUTES = {
+    "accept",  # for input file
+    "alt",
+    "aria-checked",  # for option tag
+    "aria-current",
+    "aria-label",
+    "aria-required",
+    "aria-role",
+    "aria-selected",  # for option tag
+    "checked",
+    "data-ui",
+    "for",
+    "href",  # For a tags
+    "maxlength",
+    "name",
+    "pattern",
+    "placeholder",
+    "readonly",
+    "required",
+    "selected",  # for option tag
+    "src",  # do we need this?
+    "text-value",
+    "title",
+    "type",
+    "value",
+}
+
+
+def load_js_script() -> str:
+    # TODO: Handle file location better. This is a hacky way to find the file location.
+    path = f"{SKYVERN_DIR}/webeye/scraper/domUtils.js"
+    try:
+        # TODO: Implement TS of domUtils.js and use the complied JS file instead of the raw JS file.
+        # This will allow our code to be type safe.
+        with open(path, "r") as f:
+            return f.read()
+    except FileNotFoundError as e:
+        LOG.exception("Failed to load the JS script", exc_info=True, path=path)
+        raise e
+
+
+# Source text of domUtils.js, read once at import time. The scraping helpers in this module evaluate
+# it in the page before calling the JS functions it defines.
+JS_FUNCTION_DEFS = load_js_script()
+
+
+class ScrapedPage(BaseModel):
+    """
+    Scraped response from a webpage, including:
+    1. elements: list of element dicts
+    2. id_to_xpath_dict: element ID to xpath map
+    3. element_tree: the element tree of the page (list of dicts). Each element has children and attributes.
+    4. element_tree_trimmed: a copy of the element tree after trim_element_tree has been applied
+    5. screenshots: the screenshots of the page, one per scroll position (raw bytes)
+    6. url: the URL of the page
+    7. html: the HTML of the page
+    8. extracted_text: the extracted text from the page (optional)
+    """
+
+    elements: list[dict]
+    id_to_xpath_dict: dict[int, str]
+    element_tree: list[dict]
+    element_tree_trimmed: list[dict]
+    screenshots: list[bytes]
+    url: str
+    html: str
+    extracted_text: str | None = None
+
+
+async def scrape_website(
+    browser_state: BrowserState,
+    url: str,
+    num_retry: int = 0,
+) -> ScrapedPage:
+    """
+    ************************************************************************************************
+    ************ NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production *************
+    ************************************************************************************************
+    High-level asynchronous function to scrape a web page. It sets up the Playwright environment, handles browser and
+    page initialization, and calls the safe scraping function. This function is ideal for general use where initial
+    setup and safety measures are required.
+
+    Asynchronous function that safely scrapes a web page. It handles exceptions and retries scraping up to a maximum
+    number of attempts. This function should be used when reliability and error handling are crucial, such as in
+    automated scraping tasks.
+
+    :param browser_context: BrowserContext instance used for scraping.
+    :param url: URL of the web page to be scraped.
+    :param page: Optional Page instance for scraping, a new page is created if None.
+    :param num_retry: Tracks number of retries if scraping fails, defaults to 0.
+
+    :return: Tuple containing Page instance, base64 encoded screenshot, and page elements.
+
+    :raises Exception: When scraping fails after maximum retries.
+    """
+    try:
+        num_retry += 1
+        return await scrape_web_unsafe(browser_state, url)
+    except Exception:
+        # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
+        if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
+            LOG.error(
+                "Scraping failed after max retries, aborting.",
+                max_retries=SettingsManager.get_settings().MAX_SCRAPING_RETRIES,
+                url=url,
+                exc_info=True,
+            )
+            raise Exception("Scraping failed.")
+        LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url)
+        return await scrape_website(
+            browser_state,
+            url,
+            num_retry=num_retry,
+        )
+
+
+async def get_all_visible_text(page: Page) -> str:
+    """
+    Get all the visible text on the page.
+    :param page: Page instance to get the text from.
+    :return: All the visible text on the page.
+    """
+    js_script = "() => document.body.innerText"
+    return await page.evaluate(js_script)
+
+
+async def scrape_web_unsafe(
+    browser_state: BrowserState,
+    url: str,
+) -> ScrapedPage:
+    """
+    Asynchronous function that performs web scraping without any built-in error handling. This function is intended
+    for use cases where the caller handles exceptions or in controlled environments. It takes screenshots while
+    scrolling through the page, then collects the element tree, the visible text and the HTML.
+
+    :param browser_state: BrowserState used to get (or create) the page to scrape.
+    :param url: URL of the web page to be scraped. Used only when creating a new page.
+    :return: ScrapedPage with the elements, ID to xpath map, element trees, screenshots, URL, HTML and text.
+    :note: This function does not handle exceptions. Ensure proper error handling in the calling context.
+    """
+    # We only create a new page if one does not exist. This is to allow keeping the same page since we want to
+    # continue working on the same page that we're taking actions on.
+    # *This also means URL is only used when creating a new page, and not when using an existing page.
+    page = await browser_state.get_or_create_page(url)
+    # Take screenshots of the page with the bounding boxes. We will remove the bounding boxes later.
+    # Scroll to the top of the page and take a screenshot.
+    # Scroll to the next page and take a screenshot until we reach the end of the page.
+    # We check if the scroll_y_px_old is the same as scroll_y_px to determine if we have reached the end of the page.
+    # This also solves the issue where we can't scroll due to a popup.(e.g. geico first popup on the homepage after
+    # clicking start my quote)
+
+    LOG.info("Waiting for 5 seconds before scraping the website.")
+    await asyncio.sleep(5)
+
+    screenshots: list[bytes] = []
+    # Start with a sentinel that differs from any scroll offset so the loop body runs at least once.
+    scroll_y_px_old = -1.0
+    scroll_y_px = await scroll_to_top(page, drow_boxes=True)
+    # Checking max number of screenshots to prevent infinite loop
+    while scroll_y_px_old != scroll_y_px and len(screenshots) < SettingsManager.get_settings().MAX_NUM_SCREENSHOTS:
+        screenshot = await page.screenshot(full_page=False)
+        screenshots.append(screenshot)
+        scroll_y_px_old = scroll_y_px
+        LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
+        scroll_y_px = await scroll_to_next_page(page, drow_boxes=True)
+        LOG.info("Scrolled to next page", scroll_y_px=scroll_y_px, scroll_y_px_old=scroll_y_px_old)
+    # Restore the page: drop the drawn boxes and return to the top without redrawing them.
+    await remove_bounding_boxes(page)
+    await scroll_to_top(page, drow_boxes=False)
+
+    elements, element_tree = await get_interactable_element_tree(page)
+    # Clean a deep copy of the tree; the flat `elements` list itself is not passed through cleanup_elements.
+    element_tree = cleanup_elements(copy.deepcopy(element_tree))
+
+    id_to_xpath_dict = {}
+    for element in elements:
+        element_id = element["id"]
+        # get_interactable_element_tree marks each interactable element with a unique_id attribute
+        id_to_xpath_dict[element_id] = f"//*[@{SKYVERN_ID_ATTR}='{element_id}']"
+
+    text_content = await get_all_visible_text(page)
+    return ScrapedPage(
+        elements=elements,
+        id_to_xpath_dict=id_to_xpath_dict,
+        element_tree=element_tree,
+        # Trim a separate deep copy so element_tree keeps its full attributes.
+        element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
+        screenshots=screenshots,
+        url=page.url,
+        html=await page.content(),
+        extracted_text=text_content,
+    )
+
+
+async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]:
+    """
+    Get the element tree of the page, including all the elements that are interactable.
+    :param page: Page instance to get the element tree from.
+    :return: Tuple containing the element tree and a map of element IDs to elements.
+    """
+    await page.evaluate(JS_FUNCTION_DEFS)
+    js_script = "() => buildTreeFromBody()"
+    elements, element_tree = await page.evaluate(js_script)
+    return elements, element_tree
+
+
+async def scroll_to_top(page: Page, drow_boxes: bool) -> float:
+    """
+    Scroll to the top of the page and take a screenshot.
+    :param drow_boxes: If True, draw bounding boxes around the elements.
+    :param page: Page instance to take the screenshot from.
+    :return: Screenshot of the page.
+    """
+    await page.evaluate(JS_FUNCTION_DEFS)
+    js_script = f"() => scrollToTop({str(drow_boxes).lower()})"
+    scroll_y_px = await page.evaluate(js_script)
+    return scroll_y_px
+
+
+async def scroll_to_next_page(page: Page, drow_boxes: bool) -> bool:
+    """
+    Scroll to the next page and take a screenshot.
+    :param drow_boxes: If True, draw bounding boxes around the elements.
+    :param page: Page instance to take the screenshot from.
+    :return: Screenshot of the page.
+    """
+    await page.evaluate(JS_FUNCTION_DEFS)
+    js_script = f"() => scrollToNextPage({str(drow_boxes).lower()})"
+    scroll_y_px = await page.evaluate(js_script)
+    return scroll_y_px
+
+
+async def remove_bounding_boxes(page: Page) -> None:
+    """
+    Remove the bounding boxes from the page.
+    :param page: Page instance to remove the bounding boxes from.
+    """
+    js_script = "() => removeBoundingBoxes()"
+    await page.evaluate(js_script)
+
+
+def cleanup_elements(elements: list[dict]) -> list[dict]:
+    """
+    Remove rect and attribute.unique_id from the elements.
+    The reason we're doing it is to
+    1. reduce unnecessary data so that llm get less distrction
+    # TODO later: 2. reduce tokens sent to llm to save money
+    :param elements: List of elements to remove xpaths from.
+    :return: List of elements without xpaths.
+    """
+    queue = []
+    for element in elements:
+        queue.append(element)
+    while queue:
+        queue_ele = queue.pop(0)
+        _remove_rect(queue_ele)
+        # TODO: we can come back to test removing the unique_id
+        # from element attributes to make sure this won't increase hallucination
+        # _remove_unique_id(queue_ele)
+        if "children" in queue_ele:
+            queue.extend(queue_ele["children"])
+    return elements
+
+
+def trim_element_tree(elements: list[dict]) -> list[dict]:
+    queue = []
+    for element in elements:
+        queue.append(element)
+    while queue:
+        queue_ele = queue.pop(0)
+        if "attributes" in queue_ele:
+            tag_name = queue_ele["tagName"] if "tagName" in queue_ele else ""
+            new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"])
+            if new_attributes:
+                queue_ele["attributes"] = new_attributes
+            else:
+                del queue_ele["attributes"]
+        if "children" in queue_ele:
+            queue.extend(queue_ele["children"])
+            if not queue_ele["children"]:
+                del queue_ele["children"]
+        if "text" in queue_ele:
+            element_text = str(queue_ele["text"]).strip()
+            if not element_text:
+                del queue_ele["text"]
+    return elements
+
+
+def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
+    new_attributes: dict = {}
+    for key in attributes:
+        if key == "id" and tag_name in ["input", "textarea", "select"]:
+            # We don't want to remove the id attribute any of these elements in case there's a label for it
+            new_attributes[key] = attributes[key]
+        if key in RESERVED_ATTRIBUTES:
+            new_attributes[key] = attributes[key]
+    return new_attributes
+
+
+def _remove_rect(element: dict) -> None:
+    if "rect" in element:
+        del element["rect"]
+
+
+def _remove_unique_id(element: dict) -> None:
+    if "attributes" not in element:
+        return
+    if SKYVERN_ID_ATTR in element["attributes"]:
+        del element["attributes"][SKYVERN_ID_ATTR]