push eval script to oss (#1796)
This commit is contained in:
67
evaluation/script/create_webvoyager_evaluation_result.py
Normal file
67
evaluation/script/create_webvoyager_evaluation_result.py
Normal file
@@ -0,0 +1,67 @@
|
||||
import csv
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import SkyvernClient
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRunStatus
|
||||
|
||||
# Column order for the evaluation result CSV; DictWriter uses this list as its
# fieldnames, so reordering here reorders the output columns.
csv_headers = [
    "id",
    "status",
    "assertion",
    "failure_reason",
    "url",
    "question",
    "answer",
    "summary",
    "output",
    "is_updated",
    "workflow_permanent_id",
    "workflow_run_id",
]
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
    workflow_pid: str = typer.Option(..., "--workflow-pid", help="workflow pid to execute the evaluation test"),
    record_json_path: str = typer.Option(..., "--record-json", help="record json path for evaluation run"),
    output_csv_path: str = typer.Option("output.csv", "--output-path", help="output csv path for evaluation run"),
) -> None:
    """Collect WebVoyager workflow-run results into a CSV report.

    Reads one JSON record per line from ``record_json_path``, fetches the
    current state of each workflow run from Skyvern, and writes one CSV row
    per record to ``output_csv_path`` with ``csv_headers`` as the columns.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)

    # Stream records in and rows out so memory stays flat regardless of run size.
    with open(record_json_path, "r", encoding="utf-8") as file, open(
        output_csv_path, newline="", mode="w", encoding="utf-8"
    ) as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
        writer.writeheader()

        for line in file:
            one_record: dict[str, Any] = json.loads(line)
            workflow_run_id: str = one_record.get("workflow_run_id", "")

            # NOTE(review): called without ``await`` here, while the async eval
            # script awaits ``client.get_workflow_run`` — confirm this method is
            # synchronous when used from a non-async context.
            workflow_run_response = client.get_workflow_run(
                workflow_pid=workflow_pid, workflow_run_id=workflow_run_id
            )
            one_record.update(
                {
                    "workflow_permanent_id": workflow_pid,
                    "status": str(workflow_run_response.status),
                    "summary": workflow_run_response.observer_cruise.summary,
                    "output": workflow_run_response.observer_cruise.output,
                    "assertion": workflow_run_response.status == WorkflowRunStatus.completed,
                    "failure_reason": workflow_run_response.failure_reason or "",
                }
            )
            # ``.get`` tolerates records missing optional columns (e.g. produced
            # by an older run) instead of raising KeyError mid-export.
            csv_data = {key: one_record.get(key, "") for key in csv_headers}
            print(
                f"{workflow_run_id}(id={one_record.get('id')}) {workflow_run_response.status}. Saving to the output csv.."
            )
            writer.writerow(csv_data)

    print(f"Exported all records in {output_csv_path}")


if __name__ == "__main__":
    typer.run(main)
|
||||
66
evaluation/script/create_webvoyager_observer.py
Normal file
66
evaluation/script/create_webvoyager_observer.py
Normal file
@@ -0,0 +1,66 @@
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
from uuid import uuid4
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import Evaluator, SkyvernClient
|
||||
from evaluation.core.utils import load_webvoyager_case_from_json
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.schemas.observers import ObserverTaskRequest
|
||||
|
||||
|
||||
async def create_observer_cruise(
    base_url: str,
    cred: str,
) -> None:
    """Queue a Skyvern observer cruise for every WebVoyager test case.

    For each case the user goal is first "tweaked" via the
    ``check-evaluation-goal`` LLM prompt, an observer task is queued, and the
    queued run's identifiers are appended to a per-group JSONL record file
    that the evaluation scripts consume later.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)
    group_id = uuid4()

    cnt = 0
    record_file_path = f"{group_id}-webvoyager-record.jsonl"
    with open(record_file_path, "w", encoding="utf-8") as f:
        for case_data in load_webvoyager_case_from_json(
            file_path="evaluation/datasets/webvoyager_tasks.jsonl", group_id=str(group_id)
        ):
            prompt = prompt_engine.load_prompt(
                "check-evaluation-goal", user_goal=case_data.question, local_datetime=datetime.now().isoformat()
            )
            response = await app.LLM_API_HANDLER(prompt=prompt, prompt_name="check-evaluation-goal")
            # Fall back to the original question when the LLM response lacks the
            # key — otherwise the queued task would get a None user prompt.
            tweaked_user_goal = response.get("tweaked_user_goal") or case_data.question
            case_data.is_updated = tweaked_user_goal != case_data.question
            case_data.question = tweaked_user_goal

            evaluator = Evaluator(client=client, artifact_folder=f"test/artifacts/{case_data.group_id}/{case_data.id}")
            request_body = ObserverTaskRequest(
                url=case_data.url,
                user_prompt=case_data.question,
            )
            cruise = evaluator.queue_skyvern_cruise(cruise_request=request_body, max_step=case_data.max_steps)
            dumped_data = case_data.model_dump()
            dumped_data.update(
                {
                    "observer_cruise_id": cruise.observer_cruise_id,
                    "workflow_run_id": cruise.workflow_run_id,
                    "workflow_permanent_id": cruise.workflow_permanent_id,
                    # str() so the URL object serializes to JSON; keep falsy values as-is.
                    "cruise_url": str(cruise.url) if cruise.url else cruise.url,
                }
            )
            print(f"Queued {cruise.observer_cruise_id} for {case_data.model_dump_json()}")
            f.write(json.dumps(dumped_data) + "\n")
            cnt += 1

    print(f"Queued {cnt} cruises to launch webvoyager evaluation test. saving the records file in {record_file_path}")
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
) -> None:
    """CLI entry point: queue observer cruises for the WebVoyager dataset."""
    coro = create_observer_cruise(base_url=base_url, cred=cred)
    asyncio.run(coro)


if __name__ == "__main__":
    typer.run(main)
|
||||
75
evaluation/script/create_webvoyager_workflow.py
Normal file
75
evaluation/script/create_webvoyager_workflow.py
Normal file
@@ -0,0 +1,75 @@
|
||||
import asyncio
|
||||
import json
|
||||
from datetime import datetime
|
||||
from typing import Optional
|
||||
from uuid import uuid4
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import Evaluator, SkyvernClient
|
||||
from evaluation.core.utils import load_webvoyager_case_from_json
|
||||
from skyvern.forge import app
|
||||
from skyvern.forge.prompts import prompt_engine
|
||||
from skyvern.forge.sdk.schemas.tasks import ProxyLocation
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRequestBody
|
||||
|
||||
|
||||
async def create_workflow_run(
    base_url: str,
    cred: str,
    workflow_pid: str,
    proxy_location: ProxyLocation | None = None,
) -> None:
    """Queue a Skyvern workflow run for every WebVoyager test case.

    Mirrors ``create_webvoyager_observer``: tweaks each case's goal via the
    ``check-evaluation-goal`` prompt, queues a workflow run, and appends the
    queued run's identifiers to a per-group JSONL record file.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)
    group_id = uuid4()

    cnt = 0
    record_file_path = f"{group_id}-webvoyager-record.jsonl"
    with open(record_file_path, "w", encoding="utf-8") as f:
        for case_data in load_webvoyager_case_from_json(
            file_path="evaluation/datasets/webvoyager_tasks.jsonl", group_id=str(group_id)
        ):
            prompt = prompt_engine.load_prompt(
                "check-evaluation-goal", user_goal=case_data.question, local_datetime=datetime.now().isoformat()
            )
            response = await app.LLM_API_HANDLER(prompt=prompt, prompt_name="check-evaluation-goal")
            # Keep the original question when the LLM response has no tweak,
            # so the workflow never receives a None instruction.
            tweaked_user_goal = response.get("tweaked_user_goal") or case_data.question
            case_data.is_updated = tweaked_user_goal != case_data.question
            case_data.question = tweaked_user_goal

            evaluator = Evaluator(client=client, artifact_folder=f"test/artifacts/{case_data.group_id}/{case_data.id}")
            request_body = WorkflowRequestBody(
                data={
                    "url": case_data.url,
                    "instruction": case_data.question,
                    "answer": case_data.answer,
                },
                proxy_location=proxy_location,
            )
            workflow_run_id = evaluator.queue_skyvern_workflow(
                workflow_pid=workflow_pid, workflow_request=request_body, max_step=case_data.max_steps
            )
            dumped_data = case_data.model_dump()
            # Record the workflow pid alongside the run id so downstream readers
            # (the eval script looks up "workflow_permanent_id" per record, as
            # the observer records provide) can consume these records too.
            dumped_data.update({"workflow_run_id": workflow_run_id, "workflow_permanent_id": workflow_pid})
            print(f"Queued {workflow_run_id} for {case_data.model_dump_json()}")
            f.write(json.dumps(dumped_data) + "\n")
            cnt += 1

    print(f"Queued {cnt} workflows to launch webvoyager evaluation test. saving the records file in {record_file_path}")
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
    workflow_pid: str = typer.Option(..., "--workflow-pid", help="workflow pid to execute the evaluation test"),
    proxy_location: Optional[ProxyLocation] = typer.Option(
        None, "--proxy-location", help="overwrite the workflow proxy location"
    ),
) -> None:
    """CLI entry point: queue WebVoyager workflow runs for evaluation."""
    coro = create_workflow_run(base_url=base_url, cred=cred, workflow_pid=workflow_pid, proxy_location=proxy_location)
    asyncio.run(coro)


if __name__ == "__main__":
    typer.run(main)
|
||||
115
evaluation/script/eval_webvoyager_cruise.py
Normal file
115
evaluation/script/eval_webvoyager_cruise.py
Normal file
@@ -0,0 +1,115 @@
|
||||
import asyncio
|
||||
import csv
|
||||
import json
|
||||
from typing import Any
|
||||
|
||||
import typer
|
||||
|
||||
from evaluation.core import Evaluator, SkyvernClient
|
||||
from skyvern.forge.sdk.workflow.models.workflow import WorkflowRunStatus
|
||||
|
||||
# Column order for the evaluation result CSV; DictWriter uses this list as its
# fieldnames, so reordering here reorders the output columns.
csv_headers = [
    "id",
    "status",
    "assertion",
    "failure_reason",
    "url",
    "question",
    "answer",
    "summary",
    "output",
    "is_updated",
    "workflow_permanent_id",
    "workflow_run_id",
]

# Number of records evaluated concurrently per asyncio.gather batch.
BATCH_SIZE = 5
|
||||
|
||||
|
||||
async def process_record(client: SkyvernClient, one_record: dict[str, Any]) -> dict[str, Any]:
    """Evaluate one recorded workflow run and return its CSV row.

    Fetches the run's current state; non-completed runs fail the assertion
    outright, completed runs are checked by the Evaluator against the expected
    answer. Mutates ``one_record`` in place with the outcome fields.
    """
    workflow_pid: str = one_record.get("workflow_permanent_id", "")
    workflow_run_id: str = one_record.get("workflow_run_id", "")
    workflow_run_response = await client.get_workflow_run(workflow_pid=workflow_pid, workflow_run_id=workflow_run_id)
    one_record.update(
        {
            "status": str(workflow_run_response.status),
            "summary": workflow_run_response.observer_cruise.summary,
            "output": workflow_run_response.observer_cruise.output,
        }
    )
    if workflow_run_response.status != WorkflowRunStatus.completed:
        one_record.update(
            {
                "assertion": False,
                # ``or ""`` keeps the column a string (the completed path writes
                # "" too, and the sync result script normalizes the same way).
                "failure_reason": workflow_run_response.failure_reason or "",
            },
        )
    else:
        evaluator = Evaluator(
            client=client,
            artifact_folder=f"test/artifacts/{one_record.get('group_id', '')}/{one_record.get('id', '')}",
        )
        try:
            await evaluator.eval_skyvern_workflow_run(
                workflow_pid=workflow_pid,
                workflow_run_id=workflow_run_id,
                question=one_record.get("question", ""),
                answer=one_record.get("answer", ""),
                is_updated=one_record.get("is_updated", False),
            )
            one_record.update({"assertion": True, "failure_reason": ""})
        except Exception as e:
            # A failed evaluation is a test failure, not a crash of the batch.
            one_record.update({"assertion": False, "failure_reason": str(e)})

    # ``.get`` tolerates records missing optional columns instead of raising
    # KeyError and aborting the whole batch.
    csv_data = {key: one_record.get(key, "") for key in csv_headers}
    print(
        f"{workflow_pid}/{workflow_run_id}(id={one_record.get('id')}) {workflow_run_response.status}. Saving to the output csv.."
    )
    return csv_data
|
||||
|
||||
|
||||
async def run_eval(
    base_url: str,
    cred: str,
    record_json_path: str,
    output_csv_path: str,
) -> None:
    """Evaluate all recorded runs in batches and export the results as CSV.

    Records are processed concurrently in groups of ``BATCH_SIZE`` to bound
    the number of in-flight requests against the Skyvern API.
    """
    client = SkyvernClient(base_url=base_url, credentials=cred)

    async def _flush(batch: list[dict[str, Any]], writer: csv.DictWriter) -> None:
        # Evaluate one batch concurrently and write its rows in input order.
        results = await asyncio.gather(*(process_record(client, record) for record in batch))
        writer.writerows(results)

    with open(record_json_path, "r", encoding="utf-8") as file:
        with open(output_csv_path, newline="", mode="w", encoding="utf-8") as csv_file:
            writer = csv.DictWriter(csv_file, fieldnames=csv_headers)
            writer.writeheader()

            current_batch: list[dict[str, Any]] = []
            for line in file:
                one_record: dict[str, Any] = json.loads(line)
                current_batch.append(one_record)

                if len(current_batch) >= BATCH_SIZE:
                    await _flush(current_batch, writer)
                    current_batch = []

            # Flush the final partial batch, if any.
            if current_batch:
                await _flush(current_batch, writer)

    print(f"Exported all records in {output_csv_path}")
|
||||
|
||||
|
||||
def main(
    base_url: str = typer.Option(..., "--base-url", help="base url for Skyvern client"),
    cred: str = typer.Option(..., "--cred", help="credential for Skyvern organization"),
    record_json_path: str = typer.Option(..., "--record-json", help="record json path for evaluation run"),
    output_csv_path: str = typer.Option("output.csv", "--output-path", help="output csv path for evaluation run"),
) -> None:
    """CLI entry point: run the batched WebVoyager evaluation and export a CSV."""
    coro = run_eval(base_url=base_url, cred=cred, record_json_path=record_json_path, output_csv_path=output_csv_path)
    asyncio.run(coro)


if __name__ == "__main__":
    typer.run(main)
|
||||
Reference in New Issue
Block a user