Files
Dorod-Sky/evaluation/script/create_webvoyager_workflow.py
2025-02-20 15:21:08 +08:00

76 lines
3.0 KiB
Python

import asyncio
import json
from datetime import datetime
from typing import Optional
from uuid import uuid4
import typer
from evaluation.core import Evaluator, SkyvernClient
from evaluation.core.utils import load_webvoyager_case_from_json
from skyvern.forge import app
from skyvern.forge.prompts import prompt_engine
from skyvern.forge.sdk.schemas.tasks import ProxyLocation
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRequestBody
async def create_workflow_run(
    base_url: str,
    cred: str,
    workflow_pid: str,
    proxy_location: ProxyLocation | None = None,
) -> None:
    """Queue one workflow run per WebVoyager case and record what was queued.

    Cases are loaded from ``evaluation/datasets/webvoyager_tasks.jsonl`` under
    a freshly generated group id. Each case's question is sent to the LLM
    handler with the ``check-evaluation-goal`` prompt, and the
    ``tweaked_user_goal`` field of the reply replaces the question before the
    workflow is queued. When the reply carries no such goal, the original
    question is kept unchanged.

    One JSON line per queued run (the dumped case plus its
    ``workflow_run_id``) is written to ``<group_id>-webvoyager-record.jsonl``,
    a path relative to the current working directory.

    Args:
        base_url: Base URL passed to ``SkyvernClient``.
        cred: Credentials passed to ``SkyvernClient``.
        workflow_pid: Workflow id handed to ``queue_skyvern_workflow``.
        proxy_location: Optional proxy location set on every request body.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)
    group_id = uuid4()

    cnt = 0
    record_file_path = f"{group_id}-webvoyager-record.jsonl"
    with open(record_file_path, "w", encoding="utf-8") as f:
        for case_data in load_webvoyager_case_from_json(
            file_path="evaluation/datasets/webvoyager_tasks.jsonl", group_id=str(group_id)
        ):
            prompt = prompt_engine.load_prompt(
                "check-evaluation-goal", user_goal=case_data.question, local_datetime=datetime.now().isoformat()
            )
            response = await app.LLM_API_HANDLER(prompt=prompt, prompt_name="check-evaluation-goal")

            tweaked_user_goal = response.get("tweaked_user_goal")
            if tweaked_user_goal:
                case_data.is_updated = tweaked_user_goal != case_data.question
                case_data.question = tweaked_user_goal
            else:
                # The reply carried no usable goal: keep the original question
                # instead of queueing a workflow whose instruction is None.
                case_data.is_updated = False
                print(f"No tweaked_user_goal returned for case {case_data.id}; keeping the original question")

            evaluator = Evaluator(client=client, artifact_folder=f"test/artifacts/{case_data.group_id}/{case_data.id}")
            request_body = WorkflowRequestBody(
                data={
                    "url": case_data.url,
                    "instruction": case_data.question,
                    "answer": case_data.answer,
                },
                proxy_location=proxy_location,
            )
            workflow_run_id = evaluator.queue_skyvern_workflow(
                workflow_pid=workflow_pid, workflow_request=request_body, max_step=case_data.max_steps
            )

            # Persist the case together with the run id so the queued runs can
            # be matched back to their cases from the record file.
            dumped_data = case_data.model_dump()
            dumped_data.update({"workflow_run_id": workflow_run_id})
            print(f"Queued {workflow_run_id} for {case_data.model_dump_json()}")
            f.write(json.dumps(dumped_data) + "\n")
            cnt += 1

    print(f"Queued {cnt} workflows to launch webvoyager evaluation test. saving the records file in {record_file_path}")
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
    workflow_pid: str = typer.Option(..., "--workflow-pid", help="workflow pid to execute the evaluation test"),
    proxy_location: Optional[ProxyLocation] = typer.Option(
        None, "--proxy-location", help="overwrite the workflow proxy location"
    ),
) -> None:
    # Command-line entry point: forwards the parsed options unchanged to the
    # async create_workflow_run coroutine and drives it to completion.
    # (Kept as comments rather than a docstring so the CLI help text that
    # Typer derives from this function stays as it was.)
    queue_all_cases = create_workflow_run(
        base_url=base_url,
        cred=cred,
        workflow_pid=workflow_pid,
        proxy_location=proxy_location,
    )
    asyncio.run(queue_all_cases)
# Run the Typer command only when this file is executed as a script, so that
# importing the module does not trigger CLI parsing.
if __name__ == "__main__":
    typer.run(main)