diff --git a/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 b/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 index 1b133347..02fdf395 100644 --- a/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action-claude3-sonnet.j2 @@ -1,5 +1,6 @@ Identify actions to help user progress towards the user goal using the DOM elements given in the list and the screenshot of the website. Include only the elements that are relevant to the user goal, without altering or imagining new elements. +Accurately interpret and understand the functional significance of SVG elements based on their shapes and context within the webpage. Use the details from the user details to fill in necessary values. Always satisfy required fields if the field isn't already filled in. Don't return any action for the same field, if this field is already filled in and the value is the same as the one you would have filled in. MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. Each interactable element is tagged with an ID. diff --git a/skyvern/forge/prompts/skyvern/extract-action.j2 b/skyvern/forge/prompts/skyvern/extract-action.j2 index 53619cbd..a9351b06 100644 --- a/skyvern/forge/prompts/skyvern/extract-action.j2 +++ b/skyvern/forge/prompts/skyvern/extract-action.j2 @@ -1,5 +1,6 @@ Identify actions to help user progress towards the user goal using the DOM elements given in the list and the screenshot of the website. Include only the elements that are relevant to the user goal, without altering or imagining new elements. +Accurately interpret and understand the functional significance of SVG elements based on their shapes and context within the webpage. Use the details from the user details to fill in necessary values. Always satisfy required fields if the field isn't already filled in. 
Don't return any action for the same field, if this field is already filled in and the value is the same as the one you would have filled in. MAKE SURE YOU OUTPUT VALID JSON. No text before or after JSON, no trailing commas, no comments (//), no unnecessary quotes, etc. Each interactable element is tagged with an ID. diff --git a/skyvern/webeye/scraper/domUtils.js b/skyvern/webeye/scraper/domUtils.js index 63c2bac1..5a6a20cc 100644 --- a/skyvern/webeye/scraper/domUtils.js +++ b/skyvern/webeye/scraper/domUtils.js @@ -727,6 +727,9 @@ async function buildTreeFromBody(frame = "main.frame", open_select = false) { text: getElementContent(element), children: [], rect: DomUtils.getVisibleClientRect(element, true), + // don't trim any attr of this element if keepAllAttr=True + keepAllAttr: + elementTagNameLower === "svg" || element.closest("svg") !== null, }; // get options for select element or for listbox element @@ -859,19 +862,30 @@ async function buildTreeFromBody(frame = "main.frame", open_select = false) { !isHidden(element) && !isScriptOrStyle(element) ) { - let textContent = ""; - for (let i = 0; i < element.childNodes.length; i++) { - var node = element.childNodes[i]; - if (node.nodeType === Node.TEXT_NODE) { - textContent += getVisibleText(node).trim(); + let elementObj = null; + if (element.tagName.toLowerCase() === "svg") { + // if element is an <svg>, we save all attributes and its children + elementObj = buildElementObject(element, false); + } else if (element.closest("svg")) { + // if element is a descendant of an <svg> + elementObj = buildElementObject(element, false); + } else { + // character length limit for non-interactable elements should be 5000 + // we don't use element context in HTML format, + // so we need to make sure we parse all text node to avoid missing text in HTML. 
+ let textContent = ""; + for (let i = 0; i < element.childNodes.length; i++) { + var node = element.childNodes[i]; + if (node.nodeType === Node.TEXT_NODE) { + textContent += getVisibleText(node).trim(); + } + } + if (textContent && textContent.length <= 5000) { + elementObj = buildElementObject(element, false); } } - // character length limit for non-interactable elements should be 5000 - // we don't use element context in HTML format, - // so we need to make sure we parse all text node to avoid missing text in HTML. - if (textContent && textContent.length <= 5000) { - var elementObj = await buildElementObject(element, false); + if (elementObj !== null) { elements.push(elementObj); if (parentId === null) { resultArray.push(elementObj); diff --git a/skyvern/webeye/scraper/scraper.py b/skyvern/webeye/scraper/scraper.py index d8a6fd27..82421dc3 100644 --- a/skyvern/webeye/scraper/scraper.py +++ b/skyvern/webeye/scraper/scraper.py @@ -453,13 +453,16 @@ def trim_element_tree(elements: list[dict]) -> list[dict]: if not queue_ele.get("interactable"): del queue_ele["id"] - if "attributes" in queue_ele: + if "attributes" in queue_ele and not queue_ele.get("keepAllAttr", False): tag_name = queue_ele["tagName"] if "tagName" in queue_ele else "" new_attributes = _trimmed_attributes(tag_name, queue_ele["attributes"]) if new_attributes: queue_ele["attributes"] = new_attributes else: del queue_ele["attributes"] + # remove the tag, don't need it in the HTML tree + del queue_ele["keepAllAttr"] + if "children" in queue_ele: queue.extend(queue_ele["children"]) if not queue_ele["children"]: