From aacc61236514fdef405d27ada2e4dc82f9da9bf2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Celal=20Zamano=C4=9Flu?= <95054566+celalzamanoglu@users.noreply.github.com> Date: Tue, 17 Feb 2026 21:22:56 +0300 Subject: [PATCH] Pass existing schema as context to data schema generation prompt (#SKY-7484) (#4766) --- .../nodes/FileParserNode/FileParserNode.tsx | 2 +- .../nodes/PDFParserNode/PDFParserNode.tsx | 2 +- .../nodes/TextPromptNode/TextPromptNode.tsx | 2 +- .../prompts/skyvern/suggest-data-schema.j2 | 89 ++++++++++++++++--- skyvern/forge/sdk/routes/agent_protocol.py | 29 +++++- 5 files changed, 110 insertions(+), 14 deletions(-) diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx b/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx index cbe80227..16ec92f2 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/FileParserNode/FileParserNode.tsx @@ -95,7 +95,7 @@ function FileParserNode({ id, data }: NodeProps) { onChange={(value) => { update({ jsonSchema: value }); }} - suggestionContext={{}} + suggestionContext={{ current_schema: data.jsonSchema }} /> ) { onChange={(value) => { update({ jsonSchema: value }); }} - suggestionContext={{}} + suggestionContext={{ current_schema: data.jsonSchema }} /> ) { onChange={(value) => { update({ jsonSchema: value }); }} - suggestionContext={{}} + suggestionContext={{ current_schema: data.jsonSchema }} /> diff --git a/skyvern/forge/prompts/skyvern/suggest-data-schema.j2 b/skyvern/forge/prompts/skyvern/suggest-data-schema.j2 index cf16a364..5ef4e86f 100644 --- a/skyvern/forge/prompts/skyvern/suggest-data-schema.j2 +++ b/skyvern/forge/prompts/skyvern/suggest-data-schema.j2 @@ -1,22 +1,22 @@ -We are developing an interface for AI agent tasks that use JSON schemas to describe the shape of the data to that needs to extracted from a web page. +We are developing an interface for AI agent tasks that use JSON schemas to describe the shape of the data that needs to be extracted from a web page. You are given an input prompt from a user, and some additional context. Your goal is to generate a JSON schema given the user prompt and the context. IMPORTANT: All field names in the schema MUST use snake_case naming convention (e.g., "first_name", "total_price", "order_date"). Do not use camelCase, PascalCase, or spaces in field names. -If additional context is given, try to use the it for further clues about the data that needs to be extracted. For example, the user might provide some detail about +If additional context is given, try to use it for further clues about the data that needs to be extracted. For example, the user might provide some detail about product information to be extracted in a "data_extraction_goal" inside the context, but maybe not necessarily pass it in the input prompt. In these cases, you should use the context. -Here is an example: +If an existing data schema is provided, you MUST use it as a baseline and modify it according to the user's prompt. Preserve existing fields unless the user explicitly asks to remove them. Add new fields, rename fields, or restructure as the user requests, but keep unchanged parts intact. + +Here is an example of creating a new data schema: User prompt: Generate a data schema that extracts the title, link, and author name for the posts as a list Additional context: ```json { - "url": "https://news.ycombinator.com", - "data_extraction_goal": "Extract the title, link, and author name of the top 5 posts", - "existing_schema": "null" + "data_extraction_goal": "Extract the title, link, and author name of the top 5 posts" } ``` @@ -51,15 +51,84 @@ Suggested Data Schema: } ``` +Here is an example of modifying an existing data schema: + +User prompt: Also extract the score for each post +Existing Data Schema: +```json +{ + "posts" : { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Title of the post" + }, + "link": { + "type": "string", + "description": "Link to the post" + } + }, + "required": [ + "title", + "link" + ] + } + } +} +``` + +Suggested Data Schema: +```json +{ + "posts" : { + "type": "array", + "items": { + "type": "object", + "properties": { + "title": { + "type": "string", + "description": "Title of the post" + }, + "link": { + "type": "string", + "description": "Link to the post" + }, + "score": { + "type": "integer", + "description": "Score of the post" + } + }, + "required": [ + "title", + "link", + "score" + ] + } + } +} +``` + +{% if existing_schema %} +The user has an existing data schema. Use it as a baseline and modify it according to the user's prompt below. Preserve all existing fields unless the user explicitly asks to remove them. + +Existing Data Schema: +```json +{{ existing_schema }} +``` + +{% endif %} {% if additional_context %} +You are provided some additional context about the suggestion here: -You are provided some additional context about the suggestion here: - -{{additional_context}} +```json +{{ additional_context | tojson(indent=2) }} +``` {% endif %} Respond only with JSON output containing a single key "output" with the value of the suggested data schema given the following input: {{ input }} - diff --git a/skyvern/forge/sdk/routes/agent_protocol.py b/skyvern/forge/sdk/routes/agent_protocol.py index 9840cec8..cdb1f6ff 100644 --- a/skyvern/forge/sdk/routes/agent_protocol.py +++ b/skyvern/forge/sdk/routes/agent_protocol.py @@ -1,4 +1,5 @@ import asyncio +import json from enum import Enum from typing import Annotated, Any @@ -2767,7 +2768,33 @@ async def suggest( llm_prompt = "" if ai_suggestion_type == AISuggestionType.DATA_SCHEMA: - llm_prompt = prompt_engine.load_prompt("suggest-data-schema", input=data.input, additional_context=data.context) + existing_schema = None + additional_context = data.context + if data.context: + raw_schema = data.context.get("current_schema") + if raw_schema: + if isinstance(raw_schema, dict): + existing_schema = json.dumps(raw_schema, indent=2) + elif isinstance(raw_schema, str) and raw_schema not in ("null", ""): + try: + existing_schema = json.dumps(json.loads(raw_schema), indent=2) + except (json.JSONDecodeError, TypeError): + LOG.warning("Invalid JSON in current_schema context, ignoring", raw_schema=raw_schema) + additional_context = {k: v for k, v in data.context.items() if k != "current_schema"} + if not additional_context: + additional_context = None + if existing_schema: + LOG.info( + "Using existing schema for data schema suggestion", + schema_length=len(existing_schema), + has_additional_context=bool(additional_context), + ) + llm_prompt = prompt_engine.load_prompt( + "suggest-data-schema", + input=data.input, + additional_context=additional_context, + existing_schema=existing_schema, + ) try: new_ai_suggestion = await app.DATABASE.create_ai_suggestion(