push eval script to oss (#1796)
This commit is contained in:
67
evaluation/script/create_webvoyager_evaluation_result.py
Normal file
67
evaluation/script/create_webvoyager_evaluation_result.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import csv
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import SkyvernClient
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRunStatus
|
||||
|
||||
# Column order for the evaluation result CSV; DictWriter uses this list as its
# fieldnames, so reordering here reorders the output columns.
csv_headers = [
    "id",
    "status",
    "assertion",
    "failure_reason",
    "url",
    "question",
    "answer",
    "summary",
    "output",
    "is_updated",
    "workflow_permanent_id",
    "workflow_run_id",
]
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
    workflow_pid: str = typer.Option(..., "--workflow-pid", help="workflow pid to execute the evaluation test"),
    record_json_path: str = typer.Option(..., "--record-json", help="record json path for evaluation run"),
    output_csv_path: str = typer.Option("output.csv", "--output-path", help="output csv path for evaluation run"),
) -> None:
    """Collect WebVoyager workflow-run results into a CSV report.

    Reads one JSON record per line from ``record_json_path``, fetches the
    current state of each workflow run from Skyvern, and writes one CSV row
    per record to ``output_csv_path`` with ``csv_headers`` as the columns.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)

    # Stream records in and rows out so memory stays flat regardless of run size.
    with open(record_json_path, "r", encoding="utf-8") as file, open(
        output_csv_path, newline="", mode="w", encoding="utf-8"
    ) as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
        writer.writeheader()

        for line in file:
            one_record: dict[str, Any] = json.loads(line)
            workflow_run_id: str = one_record.get("workflow_run_id", "")

            # NOTE(review): called without ``await`` here, while the async eval
            # script awaits ``client.get_workflow_run`` — confirm this method is
            # synchronous when used from a non-async context.
            workflow_run_response = client.get_workflow_run(
                workflow_pid=workflow_pid, workflow_run_id=workflow_run_id
            )
            one_record.update(
                {
                    "workflow_permanent_id": workflow_pid,
                    "status": str(workflow_run_response.status),
                    "summary": workflow_run_response.observer_cruise.summary,
                    "output": workflow_run_response.observer_cruise.output,
                    "assertion": workflow_run_response.status == WorkflowRunStatus.completed,
                    "failure_reason": workflow_run_response.failure_reason or "",
                }
            )
            # ``.get`` tolerates records missing optional columns (e.g. produced
            # by an older run) instead of raising KeyError mid-export.
            csv_data = {key: one_record.get(key, "") for key in csv_headers}
            print(
                f"{workflow_run_id}(id={one_record.get('id')}) {workflow_run_response.status}. Saving to the output csv.."
            )
            writer.writerow(csv_data)

    print(f"Exported all records in {output_csv_path}")


if __name__ == "__main__":
    typer.run(main)
|
||||
66
evaluation/script/create_webvoyager_observer.py
Normal file
66
evaluation/script/create_webvoyager_observer.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
from uuid import uuid4
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import Evaluator, SkyvernClient
|
||||
from evaluation.core.utils import load_webvoyager_case_from_json
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.schemas.observers import ObserverTaskRequest
|
||||
|
||||
|
||||
async def create_observer_cruise(
    base_url: str,
    cred: str,
) -> None:
    """Queue a Skyvern observer cruise for every WebVoyager test case.

    For each case the user goal is first "tweaked" via the
    ``check-evaluation-goal`` LLM prompt, an observer task is queued, and the
    queued run's identifiers are appended to a per-group JSONL record file
    that the evaluation scripts consume later.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)
    group_id = uuid4()

    cnt = 0
    record_file_path = f"{group_id}-webvoyager-record.jsonl"
    with open(record_file_path, "w", encoding="utf-8") as f:
        for case_data in load_webvoyager_case_from_json(
            file_path="evaluation/datasets/webvoyager_tasks.jsonl", group_id=str(group_id)
        ):
            prompt = prompt_engine.load_prompt(
                "check-evaluation-goal", user_goal=case_data.question, local_datetime=datetime.now().isoformat()
            )
            response = await app.LLM_API_HANDLER(prompt=prompt, prompt_name="check-evaluation-goal")
            # Fall back to the original question when the LLM response lacks the
            # key — otherwise the queued task would get a None user prompt.
            tweaked_user_goal = response.get("tweaked_user_goal") or case_data.question
            case_data.is_updated = tweaked_user_goal != case_data.question
            case_data.question = tweaked_user_goal

            evaluator = Evaluator(client=client, artifact_folder=f"test/artifacts/{case_data.group_id}/{case_data.id}")
            request_body = ObserverTaskRequest(
                url=case_data.url,
                user_prompt=case_data.question,
            )
            cruise = evaluator.queue_skyvern_cruise(cruise_request=request_body, max_step=case_data.max_steps)
            dumped_data = case_data.model_dump()
            dumped_data.update(
                {
                    "observer_cruise_id": cruise.observer_cruise_id,
                    "workflow_run_id": cruise.workflow_run_id,
                    "workflow_permanent_id": cruise.workflow_permanent_id,
                    # str() so the URL object serializes to JSON; keep falsy values as-is.
                    "cruise_url": str(cruise.url) if cruise.url else cruise.url,
                }
            )
            print(f"Queued {cruise.observer_cruise_id} for {case_data.model_dump_json()}")
            f.write(json.dumps(dumped_data) + "\n")
            cnt += 1

    print(f"Queued {cnt} cruises to launch webvoyager evaluation test. saving the records file in {record_file_path}")
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
) -> None:
    """CLI entry point: queue observer cruises for the WebVoyager dataset."""
    coro = create_observer_cruise(base_url=base_url, cred=cred)
    asyncio.run(coro)


if __name__ == "__main__":
    typer.run(main)
|
||||
75
evaluation/script/create_webvoyager_workflow.py
Normal file
75
evaluation/script/create_webvoyager_workflow.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from uuid import uuid4
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import Evaluator, SkyvernClient
|
||||
from evaluation.core.utils import load_webvoyager_case_from_json
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.schemas.tasks import ProxyLocation
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRequestBody
|
||||
|
||||
|
||||
async def create_workflow_run(
    base_url: str,
    cred: str,
    workflow_pid: str,
    proxy_location: ProxyLocation | None = None,
) -> None:
    """Queue a Skyvern workflow run for every WebVoyager test case.

    Mirrors ``create_webvoyager_observer``: tweaks each case's goal via the
    ``check-evaluation-goal`` prompt, queues a workflow run, and appends the
    queued run's identifiers to a per-group JSONL record file.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)
    group_id = uuid4()

    cnt = 0
    record_file_path = f"{group_id}-webvoyager-record.jsonl"
    with open(record_file_path, "w", encoding="utf-8") as f:
        for case_data in load_webvoyager_case_from_json(
            file_path="evaluation/datasets/webvoyager_tasks.jsonl", group_id=str(group_id)
        ):
            prompt = prompt_engine.load_prompt(
                "check-evaluation-goal", user_goal=case_data.question, local_datetime=datetime.now().isoformat()
            )
            response = await app.LLM_API_HANDLER(prompt=prompt, prompt_name="check-evaluation-goal")
            # Keep the original question when the LLM response has no tweak,
            # so the workflow never receives a None instruction.
            tweaked_user_goal = response.get("tweaked_user_goal") or case_data.question
            case_data.is_updated = tweaked_user_goal != case_data.question
            case_data.question = tweaked_user_goal

            evaluator = Evaluator(client=client, artifact_folder=f"test/artifacts/{case_data.group_id}/{case_data.id}")
            request_body = WorkflowRequestBody(
                data={
                    "url": case_data.url,
                    "instruction": case_data.question,
                    "answer": case_data.answer,
                },
                proxy_location=proxy_location,
            )
            workflow_run_id = evaluator.queue_skyvern_workflow(
                workflow_pid=workflow_pid, workflow_request=request_body, max_step=case_data.max_steps
            )
            dumped_data = case_data.model_dump()
            # Record the workflow pid alongside the run id so downstream readers
            # (the eval script looks up "workflow_permanent_id" per record, as
            # the observer records provide) can consume these records too.
            dumped_data.update({"workflow_run_id": workflow_run_id, "workflow_permanent_id": workflow_pid})
            print(f"Queued {workflow_run_id} for {case_data.model_dump_json()}")
            f.write(json.dumps(dumped_data) + "\n")
            cnt += 1

    print(f"Queued {cnt} workflows to launch webvoyager evaluation test. saving the records file in {record_file_path}")
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
    workflow_pid: str = typer.Option(..., "--workflow-pid", help="workflow pid to execute the evaluation test"),
    proxy_location: Optional[ProxyLocation] = typer.Option(
        None, "--proxy-location", help="overwrite the workflow proxy location"
    ),
) -> None:
    """CLI entry point: queue WebVoyager workflow runs for evaluation."""
    coro = create_workflow_run(base_url=base_url, cred=cred, workflow_pid=workflow_pid, proxy_location=proxy_location)
    asyncio.run(coro)


if __name__ == "__main__":
    typer.run(main)
|
||||
115
evaluation/script/eval_webvoyager_cruise.py
Normal file
115
evaluation/script/eval_webvoyager_cruise.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import Evaluator, SkyvernClient
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRunStatus
|
||||
|
||||
# Column order for the evaluation result CSV; DictWriter uses this list as its
# fieldnames, so reordering here reorders the output columns.
csv_headers = [
    "id",
    "status",
    "assertion",
    "failure_reason",
    "url",
    "question",
    "answer",
    "summary",
    "output",
    "is_updated",
    "workflow_permanent_id",
    "workflow_run_id",
]

# Number of records evaluated concurrently per asyncio.gather batch.
BATCH_SIZE = 5
|
||||
|
||||
|
||||
async def process_record(client: SkyvernClient, one_record: dict[str, Any]) -> dict[str, Any]:
    """Evaluate one recorded workflow run and return its CSV row.

    Fetches the run's current state; non-completed runs fail the assertion
    outright, completed runs are checked by the Evaluator against the expected
    answer. Mutates ``one_record`` in place with the outcome fields.
    """
    workflow_pid: str = one_record.get("workflow_permanent_id", "")
    workflow_run_id: str = one_record.get("workflow_run_id", "")
    workflow_run_response = await client.get_workflow_run(workflow_pid=workflow_pid, workflow_run_id=workflow_run_id)
    one_record.update(
        {
            "status": str(workflow_run_response.status),
            "summary": workflow_run_response.observer_cruise.summary,
            "output": workflow_run_response.observer_cruise.output,
        }
    )
    if workflow_run_response.status != WorkflowRunStatus.completed:
        one_record.update(
            {
                "assertion": False,
                # ``or ""`` keeps the column a string (the completed path writes
                # "" too, and the sync result script normalizes the same way).
                "failure_reason": workflow_run_response.failure_reason or "",
            },
        )
    else:
        evaluator = Evaluator(
            client=client,
            artifact_folder=f"test/artifacts/{one_record.get('group_id', '')}/{one_record.get('id', '')}",
        )
        try:
            await evaluator.eval_skyvern_workflow_run(
                workflow_pid=workflow_pid,
                workflow_run_id=workflow_run_id,
                question=one_record.get("question", ""),
                answer=one_record.get("answer", ""),
                is_updated=one_record.get("is_updated", False),
            )
            one_record.update({"assertion": True, "failure_reason": ""})
        except Exception as e:
            # A failed evaluation is a test failure, not a crash of the batch.
            one_record.update({"assertion": False, "failure_reason": str(e)})

    # ``.get`` tolerates records missing optional columns instead of raising
    # KeyError and aborting the whole batch.
    csv_data = {key: one_record.get(key, "") for key in csv_headers}
    print(
        f"{workflow_pid}/{workflow_run_id}(id={one_record.get('id')}) {workflow_run_response.status}. Saving to the output csv.."
    )
    return csv_data
|
||||
|
||||
|
||||
async def run_eval(
    base_url: str,
    cred: str,
    record_json_path: str,
    output_csv_path: str,
) -> None:
    """Evaluate all recorded runs in batches and export the results as CSV.

    Records are processed concurrently in groups of ``BATCH_SIZE`` to bound
    the number of in-flight requests against the Skyvern API.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)

    async def _flush(batch: list[dict[str, Any]], writer: csv.DictWriter) -> None:
        # Evaluate one batch concurrently and write its rows in input order.
        results = await asyncio.gather(*(process_record(client, record) for record in batch))
        writer.writerows(results)

    with open(record_json_path, "r", encoding="utf-8") as file:
        with open(output_csv_path, newline="", mode="w", encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
            writer.writeheader()

            current_batch: list[dict[str, Any]] = []
            for line in file:
                one_record: dict[str, Any] = json.loads(line)
                current_batch.append(one_record)

                if len(current_batch) >= BATCH_SIZE:
                    await _flush(current_batch, writer)
                    current_batch = []

            # Flush the final partial batch, if any.
            if current_batch:
                await _flush(current_batch, writer)

    print(f"Exported all records in {output_csv_path}")
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
    record_json_path: str = typer.Option(..., "--record-json", help="record json path for evaluation run"),
    output_csv_path: str = typer.Option("output.csv", "--output-path", help="output csv path for evaluation run"),
) -> None:
    """CLI entry point: run the batched WebVoyager evaluation and export a CSV."""
    coro = run_eval(base_url=base_url, cred=cred, record_json_path=record_json_path, output_csv_path=output_csv_path)
    asyncio.run(coro)


if __name__ == "__main__":
    typer.run(main)
|
||||
Reference in New Issue
Block a user