Make sure we do a web scrape before data extraction at the end of the task to ensure the freshness of the scraped data (#1152)
This commit is contained in:
@@ -907,17 +907,19 @@ class ForgeAgent:
|
||||
step_id=step.step_id,
|
||||
workflow_run_id=task.workflow_run_id,
|
||||
)
|
||||
scraped_page_without_screenshots = await scraped_page.refresh(with_screenshot=False)
|
||||
scraped_page_refreshed = await scraped_page.refresh()
|
||||
|
||||
verification_prompt = prompt_engine.load_prompt(
|
||||
"check-user-goal",
|
||||
navigation_goal=task.navigation_goal,
|
||||
navigation_payload=task.navigation_payload,
|
||||
elements=scraped_page_without_screenshots.build_element_tree(ElementTreeFormat.HTML),
|
||||
elements=scraped_page_refreshed.build_element_tree(ElementTreeFormat.HTML),
|
||||
)
|
||||
|
||||
# this prompt is critical to our agent so let's use the primary LLM API handler
|
||||
verification_response = await app.LLM_API_HANDLER(prompt=verification_prompt, step=step, screenshots=None)
|
||||
verification_response = await app.LLM_API_HANDLER(
|
||||
prompt=verification_prompt, step=step, screenshots=scraped_page_refreshed.screenshots
|
||||
)
|
||||
if "user_goal_achieved" not in verification_response or "thoughts" not in verification_response:
|
||||
LOG.error(
|
||||
"Invalid LLM response for user goal success verification, skipping verification",
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
You are here to help the user determine if the user has completed their goal on the web. Use the content of the elements parsed from the page, the user goal and user details to determine whether the user goal has been completed or not.
|
||||
You are here to help the user determine if the user has completed their goal on the web. Use the content of the elements parsed from the page, the screenshots of the page, the user goal and user details to determine whether the user goal has been completed or not.
|
||||
|
||||
Make sure to ONLY return the JSON object in this format with no additional text before or after it:
|
||||
```json
|
||||
|
||||
@@ -2237,6 +2237,7 @@ async def extract_information_for_navigation_goal(
|
||||
element_tree_format = ElementTreeFormat.HTML
|
||||
element_tree_in_prompt: str = scraped_page.build_element_tree(element_tree_format)
|
||||
|
||||
scraped_page_refreshed = await scraped_page.refresh()
|
||||
extract_information_prompt = prompt_engine.load_prompt(
|
||||
prompt_template,
|
||||
navigation_goal=task.navigation_goal,
|
||||
@@ -2244,8 +2245,8 @@ async def extract_information_for_navigation_goal(
|
||||
elements=element_tree_in_prompt,
|
||||
data_extraction_goal=task.data_extraction_goal,
|
||||
extracted_information_schema=task.extracted_information_schema,
|
||||
current_url=scraped_page.url,
|
||||
extracted_text=scraped_page.extracted_text,
|
||||
current_url=scraped_page_refreshed.url,
|
||||
extracted_text=scraped_page_refreshed.extracted_text,
|
||||
error_code_mapping_str=(json.dumps(task.error_code_mapping) if task.error_code_mapping else None),
|
||||
utc_datetime=datetime.utcnow().strftime("%Y-%m-%d %H:%M"),
|
||||
)
|
||||
|
||||
@@ -241,13 +241,12 @@ class ScrapedPage(BaseModel):
|
||||
|
||||
raise UnknownElementTreeFormat(fmt=fmt)
|
||||
|
||||
async def refresh(self, with_screenshot: bool = True) -> Self:
|
||||
async def refresh(self) -> Self:
|
||||
refreshed_page = await scrape_website(
|
||||
browser_state=self._browser_state,
|
||||
url=self.url,
|
||||
cleanup_element_tree=self._clean_up_func,
|
||||
scrape_exclude=self._scrape_exclude,
|
||||
with_screenshot=with_screenshot,
|
||||
)
|
||||
self.elements = refreshed_page.elements
|
||||
self.id_to_css_dict = refreshed_page.id_to_css_dict
|
||||
@@ -260,6 +259,7 @@ class ScrapedPage(BaseModel):
|
||||
self.screenshots = refreshed_page.screenshots or self.screenshots
|
||||
self.html = refreshed_page.html
|
||||
self.extracted_text = refreshed_page.extracted_text
|
||||
self.url = refreshed_page.url
|
||||
return self
|
||||
|
||||
|
||||
@@ -269,7 +269,6 @@ async def scrape_website(
|
||||
cleanup_element_tree: CleanupElementTreeFunc,
|
||||
num_retry: int = 0,
|
||||
scrape_exclude: ScrapeExcludeFunc | None = None,
|
||||
with_screenshot: bool = True,
|
||||
) -> ScrapedPage:
|
||||
"""
|
||||
************************************************************************************************
|
||||
@@ -299,7 +298,6 @@ async def scrape_website(
|
||||
url=url,
|
||||
cleanup_element_tree=cleanup_element_tree,
|
||||
scrape_exclude=scrape_exclude,
|
||||
with_screenshot=with_screenshot,
|
||||
)
|
||||
except Exception as e:
|
||||
# NOTE: MAX_SCRAPING_RETRIES is set to 0 in both staging and production
|
||||
@@ -321,7 +319,6 @@ async def scrape_website(
|
||||
cleanup_element_tree,
|
||||
num_retry=num_retry,
|
||||
scrape_exclude=scrape_exclude,
|
||||
with_screenshot=with_screenshot,
|
||||
)
|
||||
|
||||
|
||||
@@ -369,7 +366,6 @@ async def scrape_web_unsafe(
|
||||
url: str,
|
||||
cleanup_element_tree: CleanupElementTreeFunc,
|
||||
scrape_exclude: ScrapeExcludeFunc | None = None,
|
||||
with_screenshot: bool = True,
|
||||
) -> ScrapedPage:
|
||||
"""
|
||||
Asynchronous function that performs web scraping without any built-in error handling. This function is intended
|
||||
@@ -394,11 +390,7 @@ async def scrape_web_unsafe(
|
||||
LOG.info("Waiting for 5 seconds before scraping the website.")
|
||||
await asyncio.sleep(5)
|
||||
|
||||
screenshots: list[bytes] = []
|
||||
|
||||
# TODO: do we need to scroll to the button when we scrape without screenshots?
|
||||
if with_screenshot:
|
||||
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)
|
||||
screenshots = await SkyvernFrame.take_split_screenshots(page=page, url=url, draw_boxes=True)
|
||||
|
||||
elements, element_tree = await get_interactable_element_tree(page, scrape_exclude)
|
||||
element_tree = await cleanup_element_tree(page, url, copy.deepcopy(element_tree))
|
||||
|
||||
Reference in New Issue
Block a user