From 095ef114c4288a91f8c26bffebc33365bbf4589f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Celal=20Zamano=C4=9Flu?= <95054566+celalzamanoglu@users.noreply.github.com> Date: Tue, 17 Feb 2026 20:05:10 +0300 Subject: [PATCH] Improve suggested prompt generation for data extractions (#SKY-7447) (#4765) --- .../nodes/ExtractionNode/ExtractionNode.tsx | 12 ++- .../editor/nodes/TaskNode/TaskNode.tsx | 13 +++- .../improve-prompt-for-data-extraction.j2 | 75 +++++++++++++++++++ skyvern/forge/sdk/routes/prompts.py | 4 + 4 files changed, 102 insertions(+), 2 deletions(-) create mode 100644 skyvern/forge/prompts/skyvern/improve-prompt-for-data-extraction.j2 diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/ExtractionNode/ExtractionNode.tsx b/skyvern-frontend/src/routes/workflows/editor/nodes/ExtractionNode/ExtractionNode.tsx index f2f9b474..6b00354a 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/ExtractionNode/ExtractionNode.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/ExtractionNode/ExtractionNode.tsx @@ -118,7 +118,17 @@ function ExtractionNode({ id, data, type }: NodeProps) { { if (!editable) { diff --git a/skyvern-frontend/src/routes/workflows/editor/nodes/TaskNode/TaskNode.tsx b/skyvern-frontend/src/routes/workflows/editor/nodes/TaskNode/TaskNode.tsx index 574823e2..d5f3632e 100644 --- a/skyvern-frontend/src/routes/workflows/editor/nodes/TaskNode/TaskNode.tsx +++ b/skyvern-frontend/src/routes/workflows/editor/nodes/TaskNode/TaskNode.tsx @@ -178,7 +178,18 @@ function TaskNode({ id, data, type }: NodeProps) { /> { update({ dataExtractionGoal: value }); diff --git a/skyvern/forge/prompts/skyvern/improve-prompt-for-data-extraction.j2 b/skyvern/forge/prompts/skyvern/improve-prompt-for-data-extraction.j2 new file mode 100644 index 00000000..976e487b --- /dev/null +++ b/skyvern/forge/prompts/skyvern/improve-prompt-for-data-extraction.j2 @@ -0,0 +1,75 @@ +I am going to give you an original prompt for a data extraction task performed by an 
AI browser agent, and a field name. + +Can you improve this original prompt using the following best practices for data extraction prompts: + +Depending on the field, you will need to structure the prompt differently. + +**data_extraction_goal**: Describes WHAT data the agent should extract from a web page. This is not about navigation or clicking — it is about reading, identifying, and pulling structured information from the visible content on a page. + +**data_schema**: A JSON Schema that defines the expected shape of the extracted output. + +Rules for improving a data extraction goal: +1. Be specific about what data to extract (e.g., "Extract the product name, price, and availability" not "Extract product info") +2. Mention whether the data is in a list/table or a single item +3. If the data spans multiple sections of a page, describe where to find each piece +4. Clarify how to handle missing data (e.g., "If price is not listed, output null") +5. If there are multiple similar items, clarify whether to extract all of them or just specific ones +6. Do NOT include navigation instructions — data extraction goals assume the agent is already on the correct page +7. If a data schema is provided in the context, ensure the extraction goal aligns with the schema fields + +Rules for improving a data schema: +1. Use JSON Schema specification style +2. Include "type" and "description" for every field +3. Mark fields as required only when they are guaranteed to exist +4. For lists of items, use "type": "array" with an "items" schema +5. Use descriptive field names (e.g., "product_name" not "name") +6. Add a "description" to clarify ambiguous fields + +Examples of good data extraction goals: + +Example 1 (simple): +``` +Extract the product name, price, and availability status from the product detail page. +``` + +Example 2 (list extraction): +``` +Extract all job listings visible on the page. 
For each listing, capture the job title, company name, location, and salary range if available. If salary is not listed, output null for that field. +``` + +Example 3 (structured extraction with schema reference): +``` +Extract the order summary information including: order number, order date, each line item (product name, quantity, unit price), subtotal, tax, and total amount. Output items as an array matching the provided schema. +``` + +Respond ONLY with valid JSON in this format with no additional text before or after it: +```json +{ + "improved_prompt": str // The improved version of the prompt +} +``` + +Ensure that the "improved_prompt" contains liberal whitespace tokens for formatting, clarity, and legibility. + +Here is the original prompt from the user: + +``` +{{ prompt }} +``` + +The field being improved is: {{ context.field | default("data_extraction_goal") }} +{% if context and context.data_schema %} + +The expected data schema for the extraction output is: +``` +{{ context.data_schema }} +``` +Ensure the improved prompt aligns with the fields defined in this schema. 
+{% endif %} +{% if context and context.navigation_goal %} + +Note: This data extraction happens after the following navigation goal is completed: +``` +{{ context.navigation_goal }} +``` +{% endif %} diff --git a/skyvern/forge/sdk/routes/prompts.py b/skyvern/forge/sdk/routes/prompts.py index d047520f..dab21a2f 100644 --- a/skyvern/forge/sdk/routes/prompts.py +++ b/skyvern/forge/sdk/routes/prompts.py @@ -24,9 +24,13 @@ LOG = structlog.get_logger() class Constants: DEFAULT_TEMPLATE_NAME = "improve-prompt-for-ai-browser-agent" + EXTRACTION_TEMPLATE_NAME = "improve-prompt-for-data-extraction" IMPROVE_PROMPT_USE_CASE_TO_TEMPLATE_MAP = { "new_workflow": DEFAULT_TEMPLATE_NAME, "task_v2_prompt": DEFAULT_TEMPLATE_NAME, + "workflow_editor.extraction.data_extraction_goal": EXTRACTION_TEMPLATE_NAME, + "workflow_editor.extraction.data_schema": EXTRACTION_TEMPLATE_NAME, + "workflow_editor.task.data_extraction_goal": EXTRACTION_TEMPLATE_NAME, }