hash long href link (#1500)
This commit is contained in:
@@ -9,6 +9,8 @@ Do not ever include anything other than the JSON object in your output, and do n
|
|||||||
|
|
||||||
If you are unable to extract the requested information for a specific field in the json schema, please output a null value for that field.
|
If you are unable to extract the requested information for a specific field in the json schema, please output a null value for that field.
|
||||||
|
|
||||||
|
If you are extracting href links that use the jinja style, like "{% raw %}{{}}{% endraw %}", please keep the original string unchanged.
|
||||||
|
|
||||||
User Data Extraction Goal: {{ data_extraction_goal }}
|
User Data Extraction Goal: {{ data_extraction_goal }}
|
||||||
|
|
||||||
{% if error_code_mapping_str %}
|
{% if error_code_mapping_str %}
|
||||||
|
|||||||
@@ -6,6 +6,7 @@ from typing import Any
|
|||||||
|
|
||||||
import litellm
|
import litellm
|
||||||
import structlog
|
import structlog
|
||||||
|
from jinja2 import Template
|
||||||
|
|
||||||
from skyvern.config import settings
|
from skyvern.config import settings
|
||||||
from skyvern.forge import app
|
from skyvern.forge import app
|
||||||
@@ -19,6 +20,7 @@ from skyvern.forge.sdk.api.llm.exceptions import (
|
|||||||
from skyvern.forge.sdk.api.llm.models import LLMAPIHandler, LLMConfig, LLMRouterConfig
|
from skyvern.forge.sdk.api.llm.models import LLMAPIHandler, LLMConfig, LLMRouterConfig
|
||||||
from skyvern.forge.sdk.api.llm.utils import llm_messages_builder, parse_api_response
|
from skyvern.forge.sdk.api.llm.utils import llm_messages_builder, parse_api_response
|
||||||
from skyvern.forge.sdk.artifact.models import ArtifactType
|
from skyvern.forge.sdk.artifact.models import ArtifactType
|
||||||
|
from skyvern.forge.sdk.core import skyvern_context
|
||||||
from skyvern.forge.sdk.models import Step
|
from skyvern.forge.sdk.models import Step
|
||||||
from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
|
from skyvern.forge.sdk.schemas.observers import ObserverCruise, ObserverThought
|
||||||
|
|
||||||
@@ -79,6 +81,16 @@ class LLMAPIHandlerFactory:
|
|||||||
if parameters is None:
|
if parameters is None:
|
||||||
parameters = LLMAPIHandlerFactory.get_api_parameters(llm_config)
|
parameters = LLMAPIHandlerFactory.get_api_parameters(llm_config)
|
||||||
|
|
||||||
|
context = skyvern_context.current()
|
||||||
|
if context and len(context.hashed_href_map) > 0:
|
||||||
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
|
data=json.dumps(context.hashed_href_map, indent=2).encode("utf-8"),
|
||||||
|
artifact_type=ArtifactType.HASHED_HREF_MAP,
|
||||||
|
step=step,
|
||||||
|
observer_cruise=observer_cruise,
|
||||||
|
observer_thought=observer_thought,
|
||||||
|
)
|
||||||
|
|
||||||
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
data=prompt.encode("utf-8"),
|
data=prompt.encode("utf-8"),
|
||||||
artifact_type=ArtifactType.LLM_PROMPT,
|
artifact_type=ArtifactType.LLM_PROMPT,
|
||||||
@@ -149,6 +161,19 @@ class LLMAPIHandlerFactory:
|
|||||||
observer_cruise=observer_cruise,
|
observer_cruise=observer_cruise,
|
||||||
observer_thought=observer_thought,
|
observer_thought=observer_thought,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if context and len(context.hashed_href_map) > 0:
|
||||||
|
llm_content = json.dumps(parsed_response)
|
||||||
|
rendered_content = Template(llm_content).render(context.hashed_href_map)
|
||||||
|
parsed_response = json.loads(rendered_content)
|
||||||
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
|
data=json.dumps(parsed_response, indent=2).encode("utf-8"),
|
||||||
|
artifact_type=ArtifactType.LLM_RESPONSE_RENDERED,
|
||||||
|
step=step,
|
||||||
|
observer_cruise=observer_cruise,
|
||||||
|
observer_thought=observer_thought,
|
||||||
|
)
|
||||||
|
|
||||||
return parsed_response
|
return parsed_response
|
||||||
|
|
||||||
return llm_api_handler_with_router_and_fallback
|
return llm_api_handler_with_router_and_fallback
|
||||||
@@ -178,6 +203,16 @@ class LLMAPIHandlerFactory:
|
|||||||
if llm_config.litellm_params: # type: ignore
|
if llm_config.litellm_params: # type: ignore
|
||||||
active_parameters.update(llm_config.litellm_params) # type: ignore
|
active_parameters.update(llm_config.litellm_params) # type: ignore
|
||||||
|
|
||||||
|
context = skyvern_context.current()
|
||||||
|
if context and len(context.hashed_href_map) > 0:
|
||||||
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
|
data=json.dumps(context.hashed_href_map, indent=2).encode("utf-8"),
|
||||||
|
artifact_type=ArtifactType.HASHED_HREF_MAP,
|
||||||
|
step=step,
|
||||||
|
observer_cruise=observer_cruise,
|
||||||
|
observer_thought=observer_thought,
|
||||||
|
)
|
||||||
|
|
||||||
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
data=prompt.encode("utf-8"),
|
data=prompt.encode("utf-8"),
|
||||||
artifact_type=ArtifactType.LLM_PROMPT,
|
artifact_type=ArtifactType.LLM_PROMPT,
|
||||||
@@ -261,6 +296,19 @@ class LLMAPIHandlerFactory:
|
|||||||
observer_cruise=observer_cruise,
|
observer_cruise=observer_cruise,
|
||||||
observer_thought=observer_thought,
|
observer_thought=observer_thought,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if context and len(context.hashed_href_map) > 0:
|
||||||
|
llm_content = json.dumps(parsed_response)
|
||||||
|
rendered_content = Template(llm_content).render(context.hashed_href_map)
|
||||||
|
parsed_response = json.loads(rendered_content)
|
||||||
|
await app.ARTIFACT_MANAGER.create_llm_artifact(
|
||||||
|
data=json.dumps(parsed_response, indent=2).encode("utf-8"),
|
||||||
|
artifact_type=ArtifactType.LLM_RESPONSE_RENDERED,
|
||||||
|
step=step,
|
||||||
|
observer_cruise=observer_cruise,
|
||||||
|
observer_thought=observer_thought,
|
||||||
|
)
|
||||||
|
|
||||||
return parsed_response
|
return parsed_response
|
||||||
|
|
||||||
return llm_api_handler
|
return llm_api_handler
|
||||||
|
|||||||
@@ -26,12 +26,15 @@ class ArtifactType(StrEnum):
|
|||||||
LLM_REQUEST = "llm_request"
|
LLM_REQUEST = "llm_request"
|
||||||
LLM_RESPONSE = "llm_response"
|
LLM_RESPONSE = "llm_response"
|
||||||
LLM_RESPONSE_PARSED = "llm_response_parsed"
|
LLM_RESPONSE_PARSED = "llm_response_parsed"
|
||||||
|
LLM_RESPONSE_RENDERED = "llm_response_rendered"
|
||||||
VISIBLE_ELEMENTS_ID_CSS_MAP = "visible_elements_id_css_map"
|
VISIBLE_ELEMENTS_ID_CSS_MAP = "visible_elements_id_css_map"
|
||||||
VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map"
|
VISIBLE_ELEMENTS_ID_FRAME_MAP = "visible_elements_id_frame_map"
|
||||||
VISIBLE_ELEMENTS_TREE = "visible_elements_tree"
|
VISIBLE_ELEMENTS_TREE = "visible_elements_tree"
|
||||||
VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed"
|
VISIBLE_ELEMENTS_TREE_TRIMMED = "visible_elements_tree_trimmed"
|
||||||
VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt"
|
VISIBLE_ELEMENTS_TREE_IN_PROMPT = "visible_elements_tree_in_prompt"
|
||||||
|
|
||||||
|
HASHED_HREF_MAP = "hashed_href_map"
|
||||||
|
|
||||||
# DEPRECATED. pls use VISIBLE_ELEMENTS_ID_CSS_MAP
|
# DEPRECATED. pls use VISIBLE_ELEMENTS_ID_CSS_MAP
|
||||||
VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
|
VISIBLE_ELEMENTS_ID_XPATH_MAP = "visible_elements_id_xpath_map"
|
||||||
|
|
||||||
|
|||||||
@@ -18,6 +18,7 @@ FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
|
|||||||
ArtifactType.LLM_REQUEST: "json",
|
ArtifactType.LLM_REQUEST: "json",
|
||||||
ArtifactType.LLM_RESPONSE: "json",
|
ArtifactType.LLM_RESPONSE: "json",
|
||||||
ArtifactType.LLM_RESPONSE_PARSED: "json",
|
ArtifactType.LLM_RESPONSE_PARSED: "json",
|
||||||
|
ArtifactType.LLM_RESPONSE_RENDERED: "json",
|
||||||
ArtifactType.VISIBLE_ELEMENTS_ID_CSS_MAP: "json",
|
ArtifactType.VISIBLE_ELEMENTS_ID_CSS_MAP: "json",
|
||||||
ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json",
|
ArtifactType.VISIBLE_ELEMENTS_ID_FRAME_MAP: "json",
|
||||||
ArtifactType.VISIBLE_ELEMENTS_TREE: "json",
|
ArtifactType.VISIBLE_ELEMENTS_TREE: "json",
|
||||||
@@ -27,6 +28,7 @@ FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
|
|||||||
ArtifactType.HTML_ACTION: "html",
|
ArtifactType.HTML_ACTION: "html",
|
||||||
ArtifactType.TRACE: "zip",
|
ArtifactType.TRACE: "zip",
|
||||||
ArtifactType.HAR: "har",
|
ArtifactType.HAR: "har",
|
||||||
|
ArtifactType.HASHED_HREF_MAP: "json",
|
||||||
# DEPRECATED: we're using CSS selector map now
|
# DEPRECATED: we're using CSS selector map now
|
||||||
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
|
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -15,6 +15,7 @@ class SkyvernContext:
|
|||||||
tz_info: ZoneInfo | None = None
|
tz_info: ZoneInfo | None = None
|
||||||
totp_codes: dict[str, str | None] = field(default_factory=dict)
|
totp_codes: dict[str, str | None] = field(default_factory=dict)
|
||||||
log: list[dict] = field(default_factory=list)
|
log: list[dict] = field(default_factory=list)
|
||||||
|
hashed_href_map: dict[str, str] = field(default_factory=dict)
|
||||||
|
|
||||||
def __repr__(self) -> str:
|
def __repr__(self) -> str:
|
||||||
return f"SkyvernContext(request_id={self.request_id}, organization_id={self.organization_id}, task_id={self.task_id}, workflow_id={self.workflow_id}, workflow_run_id={self.workflow_run_id}, max_steps_override={self.max_steps_override})"
|
return f"SkyvernContext(request_id={self.request_id}, organization_id={self.organization_id}, task_id={self.task_id}, workflow_id={self.workflow_id}, workflow_run_id={self.workflow_run_id}, max_steps_override={self.max_steps_override})"
|
||||||
|
|||||||
@@ -13,6 +13,7 @@ from skyvern.config import settings
|
|||||||
from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, SKYVERN_DIR, SKYVERN_ID_ATTR
|
from skyvern.constants import BUILDING_ELEMENT_TREE_TIMEOUT_MS, SKYVERN_DIR, SKYVERN_ID_ATTR
|
||||||
from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
|
from skyvern.exceptions import FailedToTakeScreenshot, UnknownElementTreeFormat
|
||||||
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
from skyvern.forge.sdk.api.crypto import calculate_sha256
|
||||||
|
from skyvern.forge.sdk.core import skyvern_context
|
||||||
from skyvern.webeye.browser_factory import BrowserState
|
from skyvern.webeye.browser_factory import BrowserState
|
||||||
from skyvern.webeye.utils.page import SkyvernFrame
|
from skyvern.webeye.utils.page import SkyvernFrame
|
||||||
|
|
||||||
@@ -96,8 +97,21 @@ def json_to_html(element: dict, need_skyvern_attrs: bool = True) -> str:
|
|||||||
if element.get("isDropped", False):
|
if element.get("isDropped", False):
|
||||||
return ""
|
return ""
|
||||||
|
|
||||||
|
tag = element["tagName"]
|
||||||
attributes: dict[str, Any] = copy.deepcopy(element.get("attributes", {}))
|
attributes: dict[str, Any] = copy.deepcopy(element.get("attributes", {}))
|
||||||
|
|
||||||
|
context = skyvern_context.ensure_context()
|
||||||
|
|
||||||
|
# FIXME: In theory, every href longer than 69 characters (64-char hash + 1 for the "_" prefix + 4 for the "{{" and "}}" braces) could be hashed,
|
||||||
|
# but for now only links longer than 300 characters are hashed, to confirm that the approach works well
|
||||||
|
if "href" in attributes and len(attributes.get("href", "")) > 300:
|
||||||
|
href = attributes.get("href", "")
|
||||||
|
# a jinja variable name cannot start with a digit, and the hash may begin with one;
|
||||||
|
# adding "_" to make sure the variable name is valid.
|
||||||
|
hashed_href = "_" + calculate_sha256(href)
|
||||||
|
context.hashed_href_map[hashed_href] = href
|
||||||
|
attributes["href"] = "{{" + hashed_href + "}}"
|
||||||
|
|
||||||
if need_skyvern_attrs:
|
if need_skyvern_attrs:
|
||||||
# adding the node attribute to attributes
|
# adding the node attribute to attributes
|
||||||
for attr in ELEMENT_NODE_ATTRIBUTES:
|
for attr in ELEMENT_NODE_ATTRIBUTES:
|
||||||
@@ -108,13 +122,14 @@ def json_to_html(element: dict, need_skyvern_attrs: bool = True) -> str:
|
|||||||
|
|
||||||
attributes_html = " ".join(build_attribute(key, value) for key, value in attributes.items())
|
attributes_html = " ".join(build_attribute(key, value) for key, value in attributes.items())
|
||||||
|
|
||||||
tag = element["tagName"]
|
|
||||||
if element.get("isSelectable", False):
|
if element.get("isSelectable", False):
|
||||||
tag = "select"
|
tag = "select"
|
||||||
|
|
||||||
text = element.get("text", "")
|
text = element.get("text", "")
|
||||||
# build children HTML
|
# build children HTML
|
||||||
children_html = "".join(json_to_html(child) for child in element.get("children", []))
|
children_html = "".join(
|
||||||
|
json_to_html(child, need_skyvern_attrs=need_skyvern_attrs) for child in element.get("children", [])
|
||||||
|
)
|
||||||
# build option HTML
|
# build option HTML
|
||||||
option_html = "".join(
|
option_html = "".join(
|
||||||
f'<option index="{option.get("optionIndex")}">{option.get("text")}</option>'
|
f'<option index="{option.get("optionIndex")}">{option.get("text")}</option>'
|
||||||
@@ -183,7 +198,7 @@ def build_element_dict(
|
|||||||
|
|
||||||
|
|
||||||
class ElementTreeFormat(StrEnum):
|
class ElementTreeFormat(StrEnum):
|
||||||
JSON = "json"
|
JSON = "json" # deprecate JSON format soon. please use HTML format
|
||||||
HTML = "html"
|
HTML = "html"
|
||||||
|
|
||||||
|
|
||||||
@@ -232,7 +247,7 @@ class ScrapedPage(BaseModel):
|
|||||||
self._clean_up_func = clean_up_func
|
self._clean_up_func = clean_up_func
|
||||||
self._scrape_exclude = scrape_exclude
|
self._scrape_exclude = scrape_exclude
|
||||||
|
|
||||||
def build_element_tree(self, fmt: ElementTreeFormat = ElementTreeFormat.JSON) -> str:
|
def build_element_tree(self, fmt: ElementTreeFormat = ElementTreeFormat.HTML) -> str:
|
||||||
if fmt == ElementTreeFormat.JSON:
|
if fmt == ElementTreeFormat.JSON:
|
||||||
return json.dumps(self.element_tree_trimmed)
|
return json.dumps(self.element_tree_trimmed)
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user