add cleanup function (#631)
This commit is contained in:
@@ -899,6 +899,7 @@ class ForgeAgent:
|
|||||||
return await scrape_website(
|
return await scrape_website(
|
||||||
browser_state,
|
browser_state,
|
||||||
task.url,
|
task.url,
|
||||||
|
app.AGENT_FUNCTION.cleanup_element_tree,
|
||||||
scrape_exclude=app.scrape_exclude,
|
scrape_exclude=app.scrape_exclude,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
|||||||
@@ -1,3 +1,5 @@
|
|||||||
|
from typing import Dict, List
|
||||||
|
|
||||||
from playwright.async_api import Page
|
from playwright.async_api import Page
|
||||||
|
|
||||||
from skyvern.exceptions import StepUnableToExecuteError
|
from skyvern.exceptions import StepUnableToExecuteError
|
||||||
@@ -8,6 +10,11 @@ from skyvern.forge.sdk.schemas.tasks import Task, TaskStatus
|
|||||||
from skyvern.webeye.browser_factory import BrowserState
|
from skyvern.webeye.browser_factory import BrowserState
|
||||||
|
|
||||||
|
|
||||||
|
def _remove_rect(element: dict) -> None:
    """
    Remove the ``rect`` entry from an element dict, in place.

    :param element: Element dict to mutate; left untouched when it has no
        ``rect`` key.
    :return: None.
    """
    # pop() with a default deletes the key in a single lookup and is a no-op
    # when the key is absent, so no separate membership test is needed.
    element.pop("rect", None)
|
||||||
|
|
||||||
|
|
||||||
class AgentFunction:
|
class AgentFunction:
|
||||||
async def validate_step_execution(
|
async def validate_step_execution(
|
||||||
self,
|
self,
|
||||||
@@ -56,3 +63,29 @@ class AgentFunction:
|
|||||||
page: Page,
|
page: Page,
|
||||||
) -> list[AsyncOperation]:
|
) -> list[AsyncOperation]:
|
||||||
return []
|
return []
|
||||||
|
|
||||||
|
async def cleanup_element_tree(
    self,
    url: str,
    element_tree: List[Dict],
) -> List[Dict]:
    """
    Remove the ``rect`` entry from every element in the tree, in place.

    The tree is walked breadth-first: each element is passed to
    ``_remove_rect`` and, when it has a ``children`` key, its children are
    queued for the same treatment.

    The reason we're doing it is to
    1. reduce unnecessary data so that the LLM gets less distraction
    TODO later: 2. reduce tokens sent to the LLM to save money

    :param url: URL passed in by the scraper alongside the tree. Not used by
        this implementation.
    :param element_tree: List of root element dicts. Mutated in place.
    :return: The same ``element_tree`` list object, with ``rect`` removed
        from every visited element.
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the left, whereas list.pop(0) shifts the whole list every time.
    from collections import deque

    queue = deque(element_tree)
    while queue:
        queue_ele = queue.popleft()
        _remove_rect(queue_ele)
        # TODO: we can come back to test removing the unique_id from element
        # attributes (via _remove_unique_id) to make sure this won't increase
        # hallucination.
        if "children" in queue_ele:
            queue.extend(queue_ele["children"])
    return element_tree
|
||||||
|
|||||||
@@ -153,6 +153,7 @@ class ScrapedPage(BaseModel):
|
|||||||
async def scrape_website(
|
async def scrape_website(
|
||||||
browser_state: BrowserState,
|
browser_state: BrowserState,
|
||||||
url: str,
|
url: str,
|
||||||
|
cleanup_element_tree: Callable[[str, list[dict]], Awaitable[list[dict]]],
|
||||||
num_retry: int = 0,
|
num_retry: int = 0,
|
||||||
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
|
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
|
||||||
) -> ScrapedPage:
|
) -> ScrapedPage:
|
||||||
@@ -179,7 +180,7 @@ async def scrape_website(
|
|||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
num_retry += 1
|
num_retry += 1
|
||||||
return await scrape_web_unsafe(browser_state, url, scrape_exclude)
|
return await scrape_web_unsafe(browser_state, url, cleanup_element_tree, scrape_exclude)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
|
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
|
||||||
if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
|
if num_retry > SettingsManager.get_settings().MAX_SCRAPING_RETRIES:
|
||||||
@@ -197,6 +198,7 @@ async def scrape_website(
|
|||||||
return await scrape_website(
|
return await scrape_website(
|
||||||
browser_state,
|
browser_state,
|
||||||
url,
|
url,
|
||||||
|
cleanup_element_tree,
|
||||||
num_retry=num_retry,
|
num_retry=num_retry,
|
||||||
scrape_exclude=scrape_exclude,
|
scrape_exclude=scrape_exclude,
|
||||||
)
|
)
|
||||||
@@ -231,6 +233,7 @@ async def get_frame_text(iframe: Frame) -> str:
|
|||||||
async def scrape_web_unsafe(
|
async def scrape_web_unsafe(
|
||||||
browser_state: BrowserState,
|
browser_state: BrowserState,
|
||||||
url: str,
|
url: str,
|
||||||
|
cleanup_element_tree: Callable[[str, list[dict]], Awaitable[list[dict]]],
|
||||||
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
|
scrape_exclude: Callable[[Page, Frame], Awaitable[bool]] | None = None,
|
||||||
) -> ScrapedPage:
|
) -> ScrapedPage:
|
||||||
"""
|
"""
|
||||||
@@ -261,9 +264,7 @@ async def scrape_web_unsafe(
|
|||||||
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)
|
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)
|
||||||
|
|
||||||
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
||||||
element_tree = cleanup_elements(copy.deepcopy(element_tree))
|
element_tree = await cleanup_element_tree(url, copy.deepcopy(element_tree))
|
||||||
|
|
||||||
_build_element_links(elements)
|
|
||||||
|
|
||||||
id_to_css_dict = {}
|
id_to_css_dict = {}
|
||||||
id_to_element_dict = {}
|
id_to_element_dict = {}
|
||||||
@@ -377,29 +378,6 @@ async def get_interactable_element_tree(
|
|||||||
return elements, element_tree
|
return elements, element_tree
|
||||||
|
|
||||||
|
|
||||||
def cleanup_elements(elements: list[dict]) -> list[dict]:
    """
    Remove the ``rect`` entry from every element in the tree, in place.

    The tree is walked breadth-first: each element is passed to
    ``_remove_rect`` and, when it has a ``children`` key, its children are
    queued for the same treatment.

    The reason we're doing it is to
    1. reduce unnecessary data so that the LLM gets less distraction
    TODO later: 2. reduce tokens sent to the LLM to save money

    :param elements: List of root element dicts. Mutated in place.
    :return: The same ``elements`` list object, with ``rect`` removed from
        every visited element.
    """
    # Local import keeps this block self-contained; deque gives O(1) pops
    # from the left, whereas list.pop(0) shifts the whole list every time.
    from collections import deque

    queue = deque(elements)
    while queue:
        queue_ele = queue.popleft()
        _remove_rect(queue_ele)
        # TODO: we can come back to test removing the unique_id from element
        # attributes (via _remove_unique_id) to make sure this won't increase
        # hallucination.
        if "children" in queue_ele:
            queue.extend(queue_ele["children"])
    return elements
|
|
||||||
|
|
||||||
|
|
||||||
def trim_element_tree(elements: list[dict]) -> list[dict]:
|
def trim_element_tree(elements: list[dict]) -> list[dict]:
|
||||||
queue = []
|
queue = []
|
||||||
for element in elements:
|
for element in elements:
|
||||||
@@ -466,11 +444,6 @@ def _trimmed_attributes(tag_name: str, attributes: dict) -> dict:
|
|||||||
return new_attributes
|
return new_attributes
|
||||||
|
|
||||||
|
|
||||||
def _remove_rect(element: dict) -> None:
    """
    Remove the ``rect`` entry from an element dict, in place.

    :param element: Element dict to mutate; left untouched when it has no
        ``rect`` key.
    :return: None.
    """
    # pop() with a default deletes the key in a single lookup and is a no-op
    # when the key is absent, so no separate membership test is needed.
    element.pop("rect", None)
|
|
||||||
|
|
||||||
|
|
||||||
def _remove_unique_id(element: dict) -> None:
|
def _remove_unique_id(element: dict) -> None:
|
||||||
if "attributes" not in element:
|
if "attributes" not in element:
|
||||||
return
|
return
|
||||||
|
|||||||
Reference in New Issue
Block a user