Move the code over from private repository (#3)

This commit is contained in:
Kerem Yilmaz
2024-03-01 10:09:30 -08:00
committed by GitHub
parent 32dd6d92a5
commit 9eddb3d812
93 changed files with 16798 additions and 0 deletions

View File

View File

@@ -0,0 +1,806 @@
// Helpers for building and comparing plain rect objects of the shape
// { bottom, top, left, right, width, height }.
class Rect {
  // Build a rect from its top-left (x1, y1) and bottom-right (x2, y2) corners.
  static create(x1, y1, x2, y2) {
    const width = x2 - x1;
    const height = y2 - y1;
    return { bottom: y2, top: y1, left: x1, right: x2, width, height };
  }

  // Return a new plain object carrying the six rect fields read from `rect`.
  static copy(rect) {
    const { bottom, top, left, right, width, height } = rect;
    return { bottom, top, left, right, width, height };
  }

  // Return a new rect shifted by x horizontally and y vertically.
  // A null/undefined offset counts as 0; width and height are carried over unchanged.
  static translate(rect, x, y) {
    const dx = x ?? 0;
    const dy = y ?? 0;
    return {
      bottom: rect.bottom + dy,
      top: rect.top + dy,
      left: rect.left + dx,
      right: rect.right + dx,
      width: rect.width,
      height: rect.height,
    };
  }

  // True when the two rects overlap on both axes; rects that merely share an edge do not overlap.
  static intersects(rect1, rect2) {
    const overlapsHorizontally =
      rect1.right > rect2.left && rect1.left < rect2.right;
    const overlapsVertically =
      rect1.bottom > rect2.top && rect1.top < rect2.bottom;
    return overlapsHorizontally && overlapsVertically;
  }

  // True when all six rect fields are strictly equal.
  static equals(rect1, rect2) {
    const fields = ["top", "bottom", "left", "right", "width", "height"];
    return fields.every((field) => rect1[field] === rect2[field]);
  }
}
// DOM geometry helpers: visible-rect computation for elements and the viewport origin.
class DomUtils {
  //
  // Clamps the rect's left/top edges to the viewport origin (0, 0); the right and bottom edges are
  // left as they are. Returns null when the clamped rect starts within 4px of (or beyond) the
  // viewport's bottom or right edge, otherwise the clamped rect.
  // Note: no minimum-size check happens here. A rect lying above or to the left of the viewport
  // comes back with a small or negative width/height, which callers reject with their own
  // `width < 3 || height < 3` test.
  //
  static cropRectToVisible(rect) {
    const boundedRect = Rect.create(
      Math.max(rect.left, 0),
      Math.max(rect.top, 0),
      rect.right,
      rect.bottom,
    );
    if (
      boundedRect.top >= window.innerHeight - 4 ||
      boundedRect.left >= window.innerWidth - 4
    ) {
      return null;
    } else {
      return boundedRect;
    }
  }
  //
  // Returns the first client rect of `element` that is on screen, at least 3x3 after cropping, and
  // whose element has computed visibility "visible"; returns null when none qualifies.
  // When `testChildren` is truthy and a client rect has zero width or height, the element's direct
  // children are searched instead (recursively) and the first child rect of at least 3x3 is returned.
  //
  static getVisibleClientRect(element, testChildren) {
    // Note: this call will be expensive if we modify the DOM in between calls.
    let clientRect;
    if (testChildren == null) testChildren = false;
    // Snapshot the client rects as plain objects so they can be replaced by cropped copies below.
    const clientRects = (() => {
      const result = [];
      for (clientRect of element.getClientRects()) {
        result.push(Rect.copy(clientRect));
      }
      return result;
    })();
    // Inline elements with font-size: 0px; will declare a height of zero, even if a child with
    // non-zero font-size contains text.
    let isInlineZeroHeight = function () {
      const elementComputedStyle = window.getComputedStyle(element, null);
      const isInlineZeroFontSize =
        0 ===
          elementComputedStyle.getPropertyValue("display").indexOf("inline") &&
        elementComputedStyle.getPropertyValue("font-size") === "0px";
      // Memoize: replace this function so later calls return the computed value without
      // querying the style again.
      isInlineZeroHeight = () => isInlineZeroFontSize;
      return isInlineZeroFontSize;
    };
    for (clientRect of clientRects) {
      // If the link has zero dimensions, it may be wrapping visible but floated elements. Check for
      // this.
      let computedStyle;
      if ((clientRect.width === 0 || clientRect.height === 0) && testChildren) {
        for (const child of Array.from(element.children)) {
          computedStyle = window.getComputedStyle(child, null);
          // Ignore child elements which are not floated and not absolutely positioned for parent
          // elements with zero width/height, as long as the case described at isInlineZeroHeight
          // does not apply.
          // NOTE(mrmr1993): This ignores floated/absolutely positioned descendants nested within
          // inline children.
          const position = computedStyle.getPropertyValue("position");
          if (
            computedStyle.getPropertyValue("float") === "none" &&
            !["absolute", "fixed"].includes(position) &&
            !(
              clientRect.height === 0 &&
              isInlineZeroHeight() &&
              0 === computedStyle.getPropertyValue("display").indexOf("inline")
            )
          ) {
            continue;
          }
          const childClientRect = this.getVisibleClientRect(child, true);
          // Skip children that are invisible or smaller than 3x3.
          if (
            childClientRect === null ||
            childClientRect.width < 3 ||
            childClientRect.height < 3
          )
            continue;
          return childClientRect;
        }
      } else {
        // Crop to the viewport, then drop rects that are off screen or smaller than 3x3.
        clientRect = this.cropRectToVisible(clientRect);
        if (
          clientRect === null ||
          clientRect.width < 3 ||
          clientRect.height < 3
        )
          continue;
        // eliminate invisible elements (see test_harnesses/visibility_test.html)
        computedStyle = window.getComputedStyle(element, null);
        if (computedStyle.getPropertyValue("visibility") !== "visible")
          continue;
        return clientRect;
      }
    }
    return null;
  }
  //
  // Returns { top, left } derived from the root element's bounding client rect: the rect's
  // top/left are negated and then corrected by the root's margin (static position without CSS
  // containment) or by its clientTop/clientLeft (every other case).
  //
  static getViewportTopLeft() {
    const box = document.documentElement;
    const style = getComputedStyle(box);
    const rect = box.getBoundingClientRect();
    if (
      style.position === "static" &&
      !/content|paint|strict/.test(style.contain || "")
    ) {
      // The margin is included in the client rect, so we need to subtract it back out.
      const marginTop = parseInt(style.marginTop);
      const marginLeft = parseInt(style.marginLeft);
      return {
        top: -rect.top + marginTop,
        left: -rect.left + marginLeft,
      };
    } else {
      const { clientTop, clientLeft } = box;
      return {
        top: -rect.top - clientTop,
        left: -rect.left - clientLeft,
      };
    }
  }
}
// from playwright
// Resolve the computed style of `element` (optionally for a pseudo-element) through the window
// that owns the element. Yields undefined when the element has no owner document or that
// document has no window.
function getElementComputedStyle(element, pseudo) {
  const ownerWindow =
    element.ownerDocument && element.ownerDocument.defaultView;
  if (!ownerWindow) {
    return undefined;
  }
  return ownerWindow.getComputedStyle(element, pseudo);
}
// from playwright
// Decide whether `element` is visible as far as CSS visibility is concerned.
//   element: the element to test.
//   style:   optional computed style of the element; looked up when null/undefined.
// Returns true when no computed style can be obtained, false when the browser's
// checkVisibility() reports the element as not rendered or when the computed `visibility`
// is anything other than "visible", and true otherwise.
function isElementStyleVisibilityVisible(element, style) {
  style = style ?? getElementComputedStyle(element);
  if (!style) return true;
  // Element.checkVisibility() is not implemented by every engine; feature-detect it so that a
  // missing method falls back to the plain `visibility` check instead of throwing a TypeError.
  if (
    typeof element.checkVisibility === "function" &&
    !element.checkVisibility({ checkOpacity: false, checkVisibilityCSS: false })
  )
    return false;
  return style.visibility === "visible";
}
// from playwright
// Report whether `element` is rendered with a non-empty box.
// - <option> elements are judged by their parent element (the result is the falsy
//   parentElement itself when there is none).
// - Elements without a computed style count as visible.
// - display:contents elements are visible when at least one element child is visible.
function isElementVisible(element) {
  // TODO: This is a hack to not check visibility for option elements
  // because they are not visible by default. We check their parent instead for visibility.
  if (element.tagName.toLowerCase() === "option") {
    const parent = element.parentElement;
    return parent && isElementVisible(parent);
  }

  const style = getElementComputedStyle(element);
  if (!style) {
    return true;
  }

  if (style.display === "contents") {
    // display:contents is not rendered itself, but its child nodes are.
    // Only element nodes (nodeType 1) are considered; text and other nodes are skipped.
    for (const node of element.childNodes) {
      if (node.nodeType === 1 /* Node.ELEMENT_NODE */ && isElementVisible(node)) {
        return true;
      }
    }
    return false;
  }

  if (!isElementStyleVisibilityVisible(element, style)) {
    return false;
  }
  const { width, height } = element.getBoundingClientRect();
  return width > 0 && height > 0;
}
// Truthy when the element is display:none, carries the `hidden` property, or is `disabled`.
function isHiddenOrDisabled(element) {
  const computed = getElementComputedStyle(element);
  if (computed?.display === "none") {
    return true;
  }
  return element.hidden || element.disabled;
}
// True for <script> and <style> elements (tag name compared case-insensitively).
function isScriptOrStyle(element) {
  return ["script", "style"].includes(element.tagName.toLowerCase());
}
// True when the element's `role` attribute (trimmed, case-insensitive) is one of the ARIA
// widget roles this scraper treats as interactable. A missing or empty role yields false.
function hasWidgetRole(element) {
  // https://developer.mozilla.org/en-US/docs/Web/Accessibility/ARIA/Roles#2._widget_roles
  // Not all roles make sense for the time being so we only check for the ones that do
  const supportedWidgetRoles = new Set([
    "button",
    "link",
    "checkbox",
    "menuitem",
    "menuitemcheckbox",
    "menuitemradio",
    "radio",
    "tab",
    "combobox",
    "textbox",
    "searchbox",
    "slider",
    "spinbutton",
    "switch",
    "gridcell",
  ]);
  const rawRole = element.getAttribute("role");
  if (!rawRole) {
    return false;
  }
  const normalizedRole = rawRole.toLowerCase().trim();
  return supportedWidgetRoles.has(normalizedRole);
}
// True when `element` is an <input> whose type is one a user can click or type into.
// Non-input elements always yield false so that the other checks can decide.
// An <input> with a missing or empty `type` attribute is treated as type "text", which is how
// browsers render it; without this default, plain <input> fields were never reported as
// interactable. Types outside the list below (e.g. "hidden") yield false.
function isInteractableInput(element) {
  const tagName = element.tagName.toLowerCase();
  if (tagName !== "input") {
    // let other checks decide
    return false;
  }
  // `||` is deliberate: an empty or whitespace-only type falls back to "text" as well.
  const type =
    (element.getAttribute("type") ?? "").toLowerCase().trim() || "text";
  const clickableTypes = [
    "button",
    "checkbox",
    "date",
    "datetime-local",
    "email",
    "file",
    "image",
    "month",
    "number",
    "password",
    "radio",
    "range",
    "reset",
    "search",
    "submit",
    "tel",
    "text",
    "time",
    "url",
    "week",
  ];
  return clickableTypes.includes(type);
}
// Decide whether `element` should be reported as an interactable node.
// The checks run in order and the first decisive one wins:
//   1. invisible, hidden/disabled, and <script>/<style> elements are never interactable;
//   2. elements with a supported widget role or an interactable <input> type are;
//   3. links with an href, buttons, selects, options and textareas are;
//   4. labels bound to an enabled control are;
//   5. elements with an onclick/jsaction attribute or that are content-editable are;
//   6. <div>, <img> and <span> are only when their computed cursor is "pointer".
function isInteractable(element) {
  if (!isElementVisible(element)) {
    return false;
  }
  if (isHiddenOrDisabled(element)) {
    return false;
  }
  if (isScriptOrStyle(element)) {
    return false;
  }
  if (hasWidgetRole(element)) {
    return true;
  }
  if (isInteractableInput(element)) {
    return true;
  }
  const tagName = element.tagName.toLowerCase();
  if (tagName === "a" && element.href) {
    return true;
  }
  if (["button", "select", "option", "textarea"].includes(tagName)) {
    return true;
  }
  if (tagName === "label" && element.control && !element.control.disabled) {
    return true;
  }
  if (
    element.hasAttribute("onclick") ||
    element.isContentEditable ||
    element.hasAttribute("jsaction")
  ) {
    return true;
  }
  if (tagName === "div" || tagName === "img" || tagName === "span") {
    // Generic containers count only when styled as clickable. The former extra comparison
    // against the value "cursor" was dropped: "cursor" is not a CSS cursor keyword, so a
    // computed style can never report it and the branch was unreachable.
    return window.getComputedStyle(element).cursor === "pointer";
  }
  return false;
}
// Collapse every run of whitespace in `str` into a single space.
// Falsy inputs (null, undefined, "") are handed back untouched.
function removeMultipleSpaces(str) {
  return str ? str.replace(/\s+/g, " ") : str;
}
// Normalise text scraped from the page: drop every occurrence of the SVG fallback message,
// collapse whitespace runs into single spaces, and trim both ends.
//   text: string to clean (must not be null/undefined).
// Returns the cleaned string.
function cleanupText(text) {
  // A global regex is used because String.prototype.replace with a string pattern only
  // removes the first occurrence, leaving later fallback messages in the output.
  const withoutSvgFallback = text.replace(
    /SVGs not supported by this browser\./g,
    "",
  );
  return removeMultipleSpaces(withoutSvgFallback).trim();
}
// Depth-first collection of the text under `element` that does not belong to an element
// tagged with a `unique_id` attribute.
// - Text nodes directly under `element` are included (trimmed) only when `element` itself
//   has no `unique_id` attribute.
// - Child elements tagged with `unique_id` are skipped entirely; other child elements are
//   collected recursively.
// Non-empty pieces are joined with ";". Returns "" when there is nothing to collect or when
// the joined text is longer than 1000 characters.
function getElementContext(element) {
  if (element.childNodes.length === 0) {
    return "";
  }
  const charLimit = 1000;
  // Loop-invariant: whether this element's own text nodes may contribute.
  const includeOwnText = !element.hasAttribute("unique_id");
  const childContextList = [];
  for (const child of element.childNodes) {
    let childContext = "";
    if (child.nodeType === Node.TEXT_NODE) {
      if (includeOwnText) {
        childContext = child.data.trim();
      }
    } else if (child.nodeType === Node.ELEMENT_NODE) {
      if (!child.hasAttribute("unique_id")) {
        childContext = getElementContext(child);
      }
    }
    if (childContext.length > 0) {
      childContextList.push(childContext);
    }
  }
  // Join and apply the limit once, after the loop; redoing both on every iteration produced
  // the same final value at quadratic cost.
  const fullContext = childContextList.join(";");
  return fullContext.length > charLimit ? "" : fullContext;
}
// Depth-first collection of the text under `element`.
// For an element with child nodes, the trimmed text of each text node and the recursively
// collected content of each child element are joined with ";". A leaf element contributes its
// own textContent. The result is passed through cleanupText.
// Character limit: when the cleaned text is longer than 1000 characters, the text of the
// element's direct text nodes alone is returned if that fits within the limit, otherwise "".
function getElementContent(element) {
  let textContent = element.textContent;
  let nodeContent = "";
  // if element has children, then build a list of text and join with a semicolon
  if (element.childNodes.length > 0) {
    const childTextContentList = [];
    const nodeTextContentList = [];
    for (const child of element.childNodes) {
      let childText = "";
      if (child.nodeType === Node.TEXT_NODE) {
        childText = child.data.trim();
        // Only keep non-empty text; whitespace-only text nodes used to be pushed as "" and
        // produced stray ";" separators in the fallback content.
        if (childText.length > 0) {
          nodeTextContentList.push(childText);
        }
      } else if (child.nodeType === Node.ELEMENT_NODE) {
        childText = getElementContent(child);
      } else {
        console.log("Unhandled node type: ", child.nodeType);
      }
      if (childText.length > 0) {
        childTextContentList.push(childText);
      }
    }
    textContent = childTextContentList.join(";");
    nodeContent = cleanupText(nodeTextContentList.join(";"));
  }
  let finalTextContent = cleanupText(textContent);
  // Currently we don't support too much context. Character limit is 1000 per element.
  // we don't think element context has to be that big
  const charLimit = 1000;
  if (finalTextContent.length > charLimit) {
    finalTextContent = nodeContent.length <= charLimit ? nodeContent : "";
  }
  return finalTextContent;
}
// Describe every option of a <select>-like element as { optionIndex, text }, where `text` is
// the option's textContent with whitespace runs collapsed.
function getSelectOptions(element) {
  return Array.from(element.options, (option) => ({
    optionIndex: option.index,
    text: removeMultipleSpaces(option.textContent),
  }));
}
// Walk the DOM below document.body and describe every interactable element.
// Side effects on the page: previously assigned `unique_id` attributes are cleared, each
// interactable element receives a fresh `unique_id` (its index in the flat list), and
// target="_blank" is removed from links.
// Returns [elements, resultArray]:
//   elements    - flat list of element objects, in document order of discovery;
//   resultArray - the roots of the interactable tree (elements without an interactable
//                 ancestor); descendants hang off each object's `children` array.
// Each element object has: id, tagName, attributes, text, children, rect, plus `options` for
// <select> elements and `context` when a <label>/<fieldset> ancestor provides some.
function buildTreeFromBody() {
  const elements = [];
  const resultArray = [];

  // HTML boolean attributes: presence alone means "true", whatever the attribute value is.
  const htmlBooleanAttributes = new Set([
    "required",
    "checked",
    "selected",
    "readonly",
  ]);
  // ARIA state attributes: reported as booleans, but an explicit "false" value means false.
  const ariaBooleanAttributes = new Set([
    "aria-required",
    "aria-checked",
    "aria-selected",
    "aria-readonly",
  ]);

  function buildElementObject(element) {
    const elementId = elements.length;
    const elementTagNameLower = element.tagName.toLowerCase();
    element.setAttribute("unique_id", elementId);
    // if element is an "a" tag and has a target="_blank" attribute, remove the target attribute
    // We're doing this so that skyvern can do all the navigation in a single page/tab and not open new tab
    if (
      elementTagNameLower === "a" &&
      element.getAttribute("target") === "_blank"
    ) {
      element.removeAttribute("target");
    }

    const attrs = {};
    for (const attr of element.attributes) {
      let attrValue = attr.value;
      if (htmlBooleanAttributes.has(attr.name)) {
        attrValue = true;
      } else if (ariaBooleanAttributes.has(attr.name)) {
        // Previously aria-checked="false" (and friends) was reported as true.
        attrValue = attr.value.trim().toLowerCase() !== "false";
      }
      attrs[attr.name] = attrValue;
    }
    if (elementTagNameLower === "input" || elementTagNameLower === "textarea") {
      // Report the live value rather than the `value` attribute.
      attrs["value"] = element.value;
    }

    const elementObj = {
      id: elementId,
      tagName: elementTagNameLower,
      attributes: attrs,
      text: getElementContent(element),
      children: [],
      rect: DomUtils.getVisibleClientRect(element, true),
    };
    // get options for select element
    if (elementTagNameLower === "select") {
      elementObj.options = getSelectOptions(element);
    }
    return elementObj;
  }

  function getChildElements(element) {
    return element.childElementCount !== 0 ? Array.from(element.children) : [];
  }

  // Depth-first walk. `interactableParentId` is the id of the nearest interactable ancestor,
  // or null when there is none.
  function processElement(element, interactableParentId) {
    if (!isInteractable(element)) {
      // A non-interactable element is transparent: its children keep the same parent id.
      getChildElements(element).forEach((child) => {
        processElement(child, interactableParentId);
      });
      return undefined;
    }

    const elementObj = buildElementObject(element);
    elements.push(elementObj);
    if (interactableParentId === null) {
      // No interactable ancestor: this element starts a new tree.
      resultArray.push(elementObj);
    } else {
      elements[interactableParentId].children.push(elementObj);
    }
    getChildElements(element).forEach((child) => {
      processElement(child, elementObj.id);
    });
    return elementObj;
  }

  // TODO: Handle iframes
  // Clear all the unique_id attributes so that there are no conflicts
  removeAllUniqueIdAttributes();
  processElement(document.body, null);

  const targetParentElements = new Set(["label", "fieldset"]);
  for (const element of elements) {
    if (
      ((element.tagName === "input" && element.attributes["type"] === "text") ||
        element.tagName === "textarea") &&
      (element.attributes["required"] || element.attributes["aria-required"]) &&
      element.attributes.value === ""
    ) {
      // TODO (kerem): we may want to pass these elements to the LLM as empty but required fields in the future
      console.log(
        "input element with required attribute and no value",
        element,
      );
    }

    // Look at up to 10 ancestors for a <label> or <fieldset>; the context under it is most
    // likely relevant to this element. The last match found while walking up is kept, i.e.
    // the outermost one within those 10 levels.
    let targetContextualParent = null;
    const currentEle = document.querySelector(`[unique_id="${element.id}"]`);
    // Guard against a failed lookup instead of dereferencing null.
    let ancestor = currentEle ? currentEle.parentElement : null;
    for (let depth = 0; depth < 10 && ancestor; depth++) {
      if (targetParentElements.has(ancestor.tagName.toLowerCase())) {
        targetContextualParent = ancestor;
      }
      ancestor = ancestor.parentElement;
    }
    if (!targetContextualParent) {
      continue;
    }

    let context = "";
    const lowerCaseTagName = targetContextualParent.tagName.toLowerCase();
    if (lowerCaseTagName === "label") {
      context = getElementContext(targetContextualParent);
    } else if (lowerCaseTagName === "fieldset") {
      // fieldset is usually within a form or another element that contains the whole context
      targetContextualParent = targetContextualParent.parentElement;
      if (targetContextualParent) {
        context = getElementContext(targetContextualParent);
      }
    }
    if (context.length > 0) {
      element.context = context;
    }
  }
  return [elements, resultArray];
}
// Outline the given elements on the page: overlapping elements are grouped, one marker is
// built per group, and the markers' bounding boxes (blue borders) are attached to the document.
function drawBoundingBoxes(elements) {
  const visualGroups = groupElementsVisually(elements);
  addHintMarkersToPage(createHintMarkersForGroups(visualGroups));
}
// Strip the `unique_id` attribute from every element in the document that carries one.
function removeAllUniqueIdAttributes() {
  for (const tagged of document.querySelectorAll("[unique_id]")) {
    tagged.removeAttribute("unique_id");
  }
}
// Callback meant to run when a captcha is solved: logs the event and bumps the
// window-level `captchaSolvedCounter` (treated as 0 when unset or falsy).
function captchaSolvedCallback() {
  console.log("captcha solved");
  // For some reason this isn't being called.. TODO figure out why
  const solvedSoFar = window["captchaSolvedCounter"] || 0;
  window["captchaSolvedCounter"] = solvedSoFar + 1;
}
// Read the window-level `captchaSolvedCounter`, initialising it to 0 when unset or falsy.
function getCaptchaSolves() {
  const solved = window["captchaSolvedCounter"];
  if (solved) {
    return solved;
  }
  window["captchaSolvedCounter"] = 0;
  return window["captchaSolvedCounter"];
}
// Partition elements into groups of visually overlapping rects.
// Elements without a rect are ignored. Each element joins the first existing group that
// contains a member whose rect intersects its own, otherwise it starts a new group. Groups are
// never merged afterwards, so the outcome depends on the order of `elements`.
// Returns a list of { elements, rect } where `rect` encloses all members of the group.
// Cost is quadratic in the number of elements.
function groupElementsVisually(elements) {
  const groups = [];
  for (const element of elements) {
    if (!element.rect) {
      continue;
    }
    const overlappingGroup = groups.find((candidate) =>
      candidate.elements.some((member) =>
        Rect.intersects(member.rect, element.rect),
      ),
    );
    if (overlappingGroup) {
      overlappingGroup.elements.push(element);
    } else {
      groups.push({ elements: [element] });
    }
  }
  // Give every group the rectangle that encompasses all of its members.
  groups.forEach((group) => {
    group.rect = createRectangleForGroup(group);
  });
  return groups;
}
// Compute the smallest rect that encloses the rects of all elements in `group`.
function createRectangleForGroup(group) {
  let top = Infinity;
  let left = Infinity;
  let bottom = -Infinity;
  let right = -Infinity;
  for (const { rect } of group.elements) {
    top = Math.min(top, rect.top);
    left = Math.min(left, rect.left);
    bottom = Math.max(bottom, rect.bottom);
    right = Math.max(right, rect.right);
  }
  return Rect.create(left, top, right, bottom);
}
// Produce `count` distinct hint labels over the alphabet "sadfjklewcmpgh".
// Labels are generated breadth-first (each new label is a character prepended to an earlier
// one), the first `count` not-yet-extended labels are taken, and the result is returned in
// default (lexicographic) sort order.
function generateHintStrings(count) {
  const hintCharacters = "sadfjklewcmpgh";
  const pool = [""];
  let consumed = 0;
  // Keep extending until enough unconsumed labels exist; the `=== 1` clause forces at least
  // one round so the empty seed string is never handed out.
  while (pool.length - consumed < count || pool.length === 1) {
    const suffix = pool[consumed];
    consumed += 1;
    pool.push(...Array.from(hintCharacters, (ch) => ch + suffix));
  }
  const selected = pool.slice(consumed, consumed + count);
  selected.sort();
  return selected;
}
// Build one hint marker per group and label each with a generated hint string
// (stored as `hintString` and written upper-cased into the marker element).
// Returns [] (after logging) when there are no groups.
function createHintMarkersForGroups(groups) {
  if (groups.length === 0) {
    console.log("No groups found, not adding hint markers to page.");
    return [];
  }
  const hintMarkers = groups.map((group) => createHintMarkerForGroup(group));
  // fill in marker text
  const hintStrings = generateHintStrings(hintMarkers.length);
  hintMarkers.forEach((hintMarker, index) => {
    hintMarker.hintString = hintStrings[index];
    hintMarker.element.innerHTML = hintMarker.hintString.toUpperCase();
  });
  return hintMarkers;
}
// Build the marker for one group of overlapping elements.
//   group: { rect, elements } as produced by groupElementsVisually.
// Returns { element, boundingBox, group }:
//   element     - a <div> positioned at the group's viewport coordinates (label holder);
//   boundingBox - an absolutely positioned <div> with a blue border, placed at the group's
//                 document coordinates (viewport rect plus the current scroll offset).
// Z-index: when window.currentZIndex holds a finite number, both divs receive it and the
// counter is incremented, so each group gets its own layer. When it is not set, no z-index is
// applied. The previous code read `this.currentZIndex` inside a plain function, which only
// worked through sloppy-mode `this === window`, threw under strict mode, and wrote NaN into
// the global when the counter was unset.
function createHintMarkerForGroup(group) {
  // annotation box that holds the hint string
  const el = document.createElement("div");
  el.style.left = group.rect.left + "px";
  el.style.top = group.rect.top + "px";

  // The bounding box around the group of hints.
  const boundingBox = document.createElement("div");
  // Calculate the position of the element relative to the document
  const scrollTop = window.pageYOffset || document.documentElement.scrollTop;
  const scrollLeft = window.pageXOffset || document.documentElement.scrollLeft;
  // Set styles for the bounding box. The old `display = "display"` and the concatenated
  // `bottom`/`right` strings (e.g. "10px20px") were invalid CSS values that the browser
  // discarded, so they are not set any more.
  boundingBox.style.position = "absolute";
  boundingBox.style.left = group.rect.left + scrollLeft + "px";
  boundingBox.style.top = group.rect.top + scrollTop + "px";
  boundingBox.style.width = group.rect.width + "px";
  boundingBox.style.height = group.rect.height + "px";
  boundingBox.style.border = "2px solid blue"; // Change the border color as needed
  boundingBox.style.pointerEvents = "none"; // Ensures the box doesn't interfere with other interactions

  // Each group is assigned a different incremental z-index, shared by the bounding box and
  // the hint marker.
  const zIndex = window["currentZIndex"];
  if (typeof zIndex === "number" && Number.isFinite(zIndex)) {
    el.style.zIndex = String(zIndex);
    boundingBox.style.zIndex = String(zIndex);
    window["currentZIndex"] = zIndex + 1;
  }

  return {
    element: el,
    boundingBox: boundingBox,
    group: group,
  };
}
// Attach the bounding box of every marker to a fresh container <div> (id
// "boundingBoxContainer") and append that container to the document root.
// Only the bounding boxes are attached; the markers' label elements are left out.
function addHintMarkersToPage(hintMarkers) {
  const container = document.createElement("div");
  container.id = "boundingBoxContainer";
  hintMarkers.forEach((hintMarker) => {
    container.appendChild(hintMarker.boundingBox);
  });
  document.documentElement.appendChild(container);
}
// Remove the "#boundingBoxContainer" element from the document, if one exists.
function removeBoundingBoxes() {
  document.querySelector("#boundingBoxContainer")?.remove();
}
// Remove any bounding boxes, scroll the window to the top-left corner and, when `draw_boxes`
// is truthy, rebuild the element tree and draw bounding boxes for it.
// Returns the window's vertical scroll offset after scrolling.
function scrollToTop(draw_boxes) {
  removeBoundingBoxes();
  window.scrollTo(0, 0);
  if (draw_boxes) {
    const [elements] = buildTreeFromBody();
    drawBoundingBoxes(elements);
  }
  return window.scrollY;
}
// Remove any bounding boxes, scroll down by one viewport height minus a 200px overlap and,
// when `draw_boxes` is truthy, rebuild the element tree and draw bounding boxes for it.
// Returns the window's vertical scroll offset after scrolling; a caller can detect the end of
// the page by comparing it with the previous offset.
function scrollToNextPage(draw_boxes) {
  removeBoundingBoxes();
  const scrollStep = window.innerHeight - 200;
  window.scrollBy(0, scrollStep);
  if (draw_boxes) {
    const [elements] = buildTreeFromBody();
    drawBoundingBoxes(elements);
  }
  return window.scrollY;
}

View File

@@ -0,0 +1,316 @@
import asyncio
import copy
import structlog
from playwright.async_api import Page
from pydantic import BaseModel
from skyvern.constants import SKYVERN_DIR, SKYVERN_ID_ATTR
from skyvern.forge.sdk.settings_manager import SettingsManager
from skyvern.webeye.browser_factory import BrowserState
LOG = structlog.get_logger()
# Attribute names that _trimmed_attributes keeps when building the trimmed element tree.
# Attributes outside this set are dropped there (apart from "id" on input/textarea/select).
RESERVED_ATTRIBUTES = {
    "accept",  # for input file
    "alt",
    "aria-checked",  # for option tag
    "aria-current",
    "aria-label",
    "aria-required",
    # NOTE(review): the DOM scraper reads the "role" attribute, not "aria-role";
    # this entry is presumably meant to be "role" - confirm before changing.
    "aria-role",
    "aria-selected",  # for option tag
    "checked",
    "data-ui",
    "for",
    "href",  # For a tags
    "maxlength",
    "name",
    "pattern",
    "placeholder",
    "readonly",
    "required",
    "selected",  # for option tag
    "src",  # do we need this?
    "text-value",
    "title",
    "type",
    "value",
}
def load_js_script() -> str:
    """
    Read the DOM helper script (domUtils.js) that is injected into scraped pages.

    :return: The full source text of the script.
    :raises FileNotFoundError: When the script is not found under SKYVERN_DIR; the failure is
        logged before being re-raised.
    """
    # TODO: Handle file location better. This is a hacky way to find the file location.
    path = f"{SKYVERN_DIR}/webeye/scraper/domUtils.js"
    try:
        # TODO: Implement TS of domUtils.js and use the complied JS file instead of the raw JS file.
        # This will allow our code to be type safe.
        # Read with an explicit encoding so the result does not depend on the platform's
        # locale default.
        with open(path, "r", encoding="utf-8") as f:
            return f.read()
    except FileNotFoundError:
        LOG.exception("Failed to load the JS script", exc_info=True, path=path)
        # Bare raise keeps the original traceback untouched.
        raise
# Source of the DOM helper script, read once when this module is imported. Importing the module
# therefore fails with FileNotFoundError when the script file is missing.
JS_FUNCTION_DEFS = load_js_script()
class ScrapedPage(BaseModel):
    """
    Scraped response from a webpage, including:
    1. elements: flat list of the interactable elements found on the page
    2. id_to_xpath_dict: element ID to xpath map
    3. element_tree: the element tree of the page (list of dicts). Each element has children
       and attributes.
    4. element_tree_trimmed: copy of the element tree passed through trim_element_tree
    5. screenshots: the screenshots taken while scrolling through the page, as bytes
    6. url: the URL of the page
    7. html: the HTML of the page
    8. extracted_text: the extracted text from the page (optional, defaults to None)
    """

    elements: list[dict]
    id_to_xpath_dict: dict[int, str]
    element_tree: list[dict]
    element_tree_trimmed: list[dict]
    screenshots: list[bytes]
    url: str
    html: str
    extracted_text: str | None = None
async def scrape_website(
    browser_state: BrowserState,
    url: str,
    num_retry: int = 0,
) -> ScrapedPage:
    """
    ************************************************************************************************
    ************ NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production *************
    ************************************************************************************************
    Scrape a web page with retries. Delegates the actual work to scrape_web_unsafe and, when that
    raises, tries again until the attempt count exceeds the MAX_SCRAPING_RETRIES setting.

    :param browser_state: BrowserState that scrape_web_unsafe uses to get or create the page.
    :param url: URL of the web page to be scraped.
    :param num_retry: Number of attempts already made; callers normally leave this at 0.

    :return: ScrapedPage describing the page.
    :raises Exception: When scraping still fails after the maximum number of retries. The
        underlying error is attached as the exception's cause.
    """
    try:
        num_retry += 1
        return await scrape_web_unsafe(browser_state, url)
    except Exception as e:
        # NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
        if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
            LOG.error(
                "Scraping failed after max retries, aborting.",
                max_retries=SettingsManager.get_settings().MAX_SCRAPING_RETRIES,
                url=url,
                exc_info=True,
            )
            # Chain explicitly so callers can reach the root cause via __cause__.
            raise Exception("Scraping failed.") from e
        LOG.info("Scraping failed, will retry", num_retry=num_retry, url=url)
        return await scrape_website(
            browser_state,
            url,
            num_retry=num_retry,
        )
async def get_all_visible_text(page: Page) -> str:
    """
    Return the page's visible text, read from ``document.body.innerText``.

    :param page: Page instance to get the text from.

    :return: All the visible text on the page.
    """
    visible_text = await page.evaluate("() => document.body.innerText")
    return visible_text
async def scrape_web_unsafe(
    browser_state: BrowserState,
    url: str,
) -> ScrapedPage:
    """
    Scrape the current page without any error handling of its own; exceptions propagate to the
    caller.

    Steps: get (or create) the page, wait 5 seconds, screenshot the page viewport by viewport
    with bounding boxes drawn, remove the boxes, scroll back to the top, then collect the
    interactable element tree, the visible text and the HTML.

    :param browser_state: BrowserState used to get the existing page or create a new one.
    :param url: URL of the web page to be scraped. Used only when creating a new page.

    :return: ScrapedPage with the elements, id-to-xpath map, element trees, screenshots, URL,
        HTML and extracted text.

    :note: This function does not handle exceptions. Ensure proper error handling in the calling context.
    """
    # An existing page is reused so that scraping continues on the page actions are taken on.
    # *This also means URL is only used when creating a new page, and not when using an existing page.
    page = await browser_state.get_or_create_page(url)

    LOG.info("Waiting for 5 seconds before scraping the website.")
    await asyncio.sleep(5)

    # Screenshot loop: start at the top, then keep scrolling one viewport at a time. The loop
    # ends when the scroll offset stops changing (end of page, or scrolling blocked e.g. by a
    # popup) or when the screenshot cap is reached, which guards against an infinite loop.
    screenshots: list[bytes] = []
    previous_scroll_y = -1.0
    current_scroll_y = await scroll_to_top(page, drow_boxes=True)
    while previous_scroll_y != current_scroll_y:
        if len(screenshots) >= SettingsManager.get_settings().MAX_NUM_SCREENSHOTS:
            break
        screenshots.append(await page.screenshot(full_page=False))
        previous_scroll_y = current_scroll_y
        LOG.info("Scrolling to next page", url=url, num_screenshots=len(screenshots))
        current_scroll_y = await scroll_to_next_page(page, drow_boxes=True)
        LOG.info("Scrolled to next page", scroll_y_px=current_scroll_y, scroll_y_px_old=previous_scroll_y)

    await remove_bounding_boxes(page)
    await scroll_to_top(page, drow_boxes=False)

    elements, raw_element_tree = await get_interactable_element_tree(page)
    element_tree = cleanup_elements(copy.deepcopy(raw_element_tree))

    # get_interactable_element_tree marks each interactable element with a unique_id attribute
    element_ids = [element["id"] for element in elements]
    id_to_xpath_dict = {element_id: f"//*[@{SKYVERN_ID_ATTR}='{element_id}']" for element_id in element_ids}

    text_content = await get_all_visible_text(page)
    return ScrapedPage(
        elements=elements,
        id_to_xpath_dict=id_to_xpath_dict,
        element_tree=element_tree,
        element_tree_trimmed=trim_element_tree(copy.deepcopy(element_tree)),
        screenshots=screenshots,
        url=page.url,
        html=await page.content(),
        extracted_text=text_content,
    )
async def get_interactable_element_tree(page: Page) -> tuple[list[dict], list[dict]]:
    """
    Inject the DOM helper script into the page and run ``buildTreeFromBody()``.

    :param page: Page instance to get the element tree from.

    :return: Tuple of (flat list of interactable elements, element tree of the page).
    """
    await page.evaluate(JS_FUNCTION_DEFS)
    elements, element_tree = await page.evaluate("() => buildTreeFromBody()")
    return elements, element_tree
async def scroll_to_top(page: Page, drow_boxes: bool) -> float:
    """
    Inject the DOM helper script and scroll the page to the top.

    :param drow_boxes: If True, draw bounding boxes around the elements.
    :param page: Page instance to scroll.

    :return: Vertical scroll offset reported by the page after scrolling.
    """
    await page.evaluate(JS_FUNCTION_DEFS)
    # Python's True/False are rendered as the JS literals true/false.
    js_flag = str(drow_boxes).lower()
    return await page.evaluate(f"() => scrollToTop({js_flag})")
async def scroll_to_next_page(page: Page, drow_boxes: bool) -> float:
    """
    Inject the DOM helper script and scroll the page down to the next viewport.

    :param drow_boxes: If True, draw bounding boxes around the elements.
    :param page: Page instance to scroll.

    :return: Vertical scroll offset reported by the page after scrolling. The value is a number
        (not a bool, as the previous annotation claimed); callers compare it with the previous
        offset to detect the end of the page.
    """
    await page.evaluate(JS_FUNCTION_DEFS)
    js_script = f"() => scrollToNextPage({str(drow_boxes).lower()})"
    scroll_y_px = await page.evaluate(js_script)
    return scroll_y_px
async def remove_bounding_boxes(page: Page) -> None:
    """
    Remove the bounding boxes from the page.

    :param page: Page instance to remove the bounding boxes from.
    """
    # Inject the helper script first, like the other page helpers in this module do, so the
    # call also works on a page where removeBoundingBoxes has not been defined yet (for example
    # after a navigation).
    await page.evaluate(JS_FUNCTION_DEFS)
    js_script = "() => removeBoundingBoxes()"
    await page.evaluate(js_script)
def cleanup_elements(elements: list[dict]) -> list[dict]:
    """
    Remove the ``rect`` entry from every element in the tree, in place.

    The reason we're doing it is to
    1. reduce unnecessary data so that the llm gets less distraction
    # TODO later: 2. reduce tokens sent to llm to save money

    :param elements: Roots of the element tree; descendants are reached through "children".

    :return: The same list that was passed in, with every element cleaned.
    """
    # Traverse with a stack: list.pop() is O(1), whereas the former queue.pop(0) shifted the
    # whole list on every step. Visit order does not matter because each element is cleaned
    # independently.
    pending = list(elements)
    while pending:
        current = pending.pop()
        _remove_rect(current)
        # TODO: we can come back to test removing the unique_id
        # from element attributes to make sure this won't increase hallucination
        # _remove_unique_id(current)
        if "children" in current:
            pending.extend(current["children"])
    return elements
def trim_element_tree(elements: list[dict]) -> list[dict]:
    """
    Shrink every element in the tree, in place:
    - "attributes" is reduced via _trimmed_attributes and removed when nothing is left,
    - an empty "children" list is removed,
    - a "text" entry that is blank after stripping is removed.

    :param elements: Roots of the element tree; descendants are reached through "children".

    :return: The same list that was passed in, with every element trimmed.
    """
    # Traverse with a stack: list.pop() is O(1), whereas the former queue.pop(0) shifted the
    # whole list on every step. Visit order does not matter because each element is trimmed
    # independently.
    pending = list(elements)
    while pending:
        current = pending.pop()
        if "attributes" in current:
            tag_name = current["tagName"] if "tagName" in current else ""
            new_attributes = _trimmed_attributes(tag_name, current["attributes"])
            if new_attributes:
                current["attributes"] = new_attributes
            else:
                del current["attributes"]
        if "children" in current:
            # Schedule the children before the (possibly empty) list is dropped.
            pending.extend(current["children"])
            if not current["children"]:
                del current["children"]
        if "text" in current:
            element_text = str(current["text"]).strip()
            if not element_text:
                del current["text"]
    return elements
def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
    """
    Return a new dict holding only the attributes worth keeping: those named in
    RESERVED_ATTRIBUTES, plus "id" for input, textarea and select elements.
    """
    # We don't want to remove the id attribute of these elements in case there's a label for it
    keeps_id = tag_name in ["input", "textarea", "select"]
    return {
        key: value
        for key, value in attributes.items()
        if (key == "id" and keeps_id) or key in RESERVED_ATTRIBUTES
    }
def _remove_rect(element: dict) -> None:
if "rect" in element:
del element["rect"]
def _remove_unique_id(element: dict) -> None:
    """Drop the SKYVERN_ID_ATTR entry from the element's "attributes" dict, if both exist."""
    if "attributes" in element:
        element["attributes"].pop(SKYVERN_ID_ATTR, None)