Add "Print PDF" Block (#4452)

This commit is contained in:
Marc Kelechava
2026-01-14 15:46:49 -08:00
committed by GitHub
parent 7dcfa00508
commit 4c2c7df42c
16 changed files with 539 additions and 7 deletions

View File

@@ -14,6 +14,7 @@ export const ArtifactType = {
HTMLScrape: "html_scrape",
SkyvernLog: "skyvern_log",
SkyvernLogRaw: "skyvern_log_raw",
PDF: "pdf",
} as const;
export type ArtifactType = (typeof ArtifactType)[keyof typeof ArtifactType];

View File

@@ -0,0 +1,162 @@
import { HelpTooltip } from "@/components/HelpTooltip";
import { Label } from "@/components/ui/label";
import { Handle, NodeProps, Position } from "@xyflow/react";
import type { PrintPageNode } from "./types";
import { Input } from "@/components/ui/input";
import { Switch } from "@/components/ui/switch";
import {
Select,
SelectContent,
SelectItem,
SelectTrigger,
SelectValue,
} from "@/components/ui/select";
import {
Accordion,
AccordionContent,
AccordionItem,
AccordionTrigger,
} from "@/components/ui/accordion";
import { Separator } from "@/components/ui/separator";
import { cn } from "@/util/utils";
import { NodeHeader } from "../components/NodeHeader";
import { useParams } from "react-router-dom";
import { statusIsRunningOrQueued } from "@/routes/tasks/types";
import { useWorkflowRunQuery } from "@/routes/workflows/hooks/useWorkflowRunQuery";
import { useUpdate } from "@/routes/workflows/editor/useUpdate";
/**
 * Workflow-editor node for the "print_page" block.
 *
 * Renders the block header plus controls for page format, background
 * printing, headers/footers, and (under "Advanced Settings") a custom
 * filename and landscape orientation. Every control writes back through
 * `useUpdate` and is disabled when the node is not editable. While the
 * workflow run is running or queued and this block is the one named in the
 * URL, pointer events on the node body are disabled.
 */
function PrintPageNode({ id, data }: NodeProps<PrintPageNode>) {
  const {
    editable,
    label,
    format,
    printBackground,
    includeTimestamp,
    customFilename,
    landscape,
  } = data;
  const { blockLabel: urlBlockLabel } = useParams();
  const { data: workflowRun } = useWorkflowRunQuery();
  const update = useUpdate<PrintPageNode["data"]>({ id, editable });

  // The block is "targeted" when the URL's block label names this node.
  const isTargeted = urlBlockLabel !== undefined && urlBlockLabel === label;
  const runIsActive = workflowRun && statusIsRunningOrQueued(workflowRun);
  const isPlaying = runIsActive && isTargeted;
  const readOnly = !editable;

  const handleFormatChange = (value: string) => update({ format: value });
  const handlePrintBackgroundChange = (checked: boolean) =>
    update({ printBackground: checked });
  const handleHeaderFooterChange = (checked: boolean) =>
    update({ includeTimestamp: checked });
  const handleLandscapeChange = (checked: boolean) =>
    update({ landscape: checked });

  const containerClassName = cn(
    "w-[30rem] space-y-4 rounded-lg bg-slate-elevation3 px-6 py-4 transition-all",
    {
      "pointer-events-none": isPlaying,
      "bg-slate-950 outline outline-2 outline-slate-300": isTargeted,
    },
  );

  return (
    <div>
      <Handle
        type="source"
        position={Position.Bottom}
        id="a"
        className="opacity-0"
      />
      <Handle
        type="target"
        position={Position.Top}
        id="b"
        className="opacity-0"
      />
      <div className={containerClassName}>
        <NodeHeader
          blockLabel={label}
          editable={editable}
          nodeId={id}
          totpIdentifier={null}
          totpUrl={null}
          type="print_page"
        />
        <div className="space-y-4">
          <div className="flex items-center justify-between">
            <Label className="text-xs text-slate-300">Page Format</Label>
            <Select
              value={format}
              onValueChange={handleFormatChange}
              disabled={readOnly}
            >
              <SelectTrigger className="nopan w-36 text-xs">
                <SelectValue />
              </SelectTrigger>
              <SelectContent>
                <SelectItem value="A4">A4</SelectItem>
                <SelectItem value="Letter">Letter</SelectItem>
                <SelectItem value="Legal">Legal</SelectItem>
                <SelectItem value="Tabloid">Tabloid</SelectItem>
              </SelectContent>
            </Select>
          </div>
          <div className="flex items-center justify-between">
            <div className="flex gap-2">
              <Label className="text-xs text-slate-300">Print Background</Label>
              <HelpTooltip content="Include CSS background colors and images in the PDF" />
            </div>
            <Switch
              checked={printBackground}
              onCheckedChange={handlePrintBackgroundChange}
              disabled={readOnly}
            />
          </div>
          <div className="flex items-center justify-between">
            <div className="flex gap-2">
              <Label className="text-xs font-normal text-slate-300">
                Headers & Footers
              </Label>
              <HelpTooltip content="Adds date, title, URL, and page numbers to the PDF" />
            </div>
            <Switch
              checked={includeTimestamp}
              onCheckedChange={handleHeaderFooterChange}
              disabled={readOnly}
            />
          </div>
        </div>
        <Separator />
        <Accordion type="single" collapsible>
          <AccordionItem value="advanced" className="border-b-0">
            <AccordionTrigger className="py-0">
              Advanced Settings
            </AccordionTrigger>
            <AccordionContent className="pl-6 pr-1 pt-1">
              <div className="space-y-4">
                <div className="space-y-2">
                  <Label className="text-xs text-slate-300">
                    Custom Filename
                  </Label>
                  <Input
                    value={customFilename}
                    onChange={(event) =>
                      update({ customFilename: event.target.value })
                    }
                    placeholder="my_report"
                    disabled={readOnly}
                    className="nopan text-xs"
                  />
                </div>
                <div className="flex items-center justify-between">
                  <Label className="text-xs font-normal text-slate-300">
                    Landscape
                  </Label>
                  <Switch
                    checked={landscape}
                    onCheckedChange={handleLandscapeChange}
                    disabled={readOnly}
                  />
                </div>
              </div>
            </AccordionContent>
          </AccordionItem>
        </Accordion>
      </div>
    </div>
  );
}
export { PrintPageNode };

View File

@@ -0,0 +1,30 @@
import type { Node } from "@xyflow/react";
import { debuggableWorkflowBlockTypes } from "@/routes/workflows/types/workflowTypes";
import { NodeBaseData } from "../types";
/**
 * Editor-side data carried by a Print Page node, on top of the fields shared
 * by every node (`NodeBaseData`). Field names are the camelCase counterparts
 * of the snake_case fields on the `print_page` workflow block.
 */
export type PrintPageNodeData = NodeBaseData & {
// Backs the "Headers & Footers" switch; serialized as `include_timestamp`.
includeTimestamp: boolean;
// Free-text filename from "Advanced Settings"; an empty string means "not set".
customFilename: string;
// Page format chosen in the node's select (A4, Letter, Legal or Tabloid).
format: string;
// Backs the "Landscape" switch.
landscape: boolean;
// Backs the "Print Background" switch (CSS background colors and images).
printBackground: boolean;
};
/** Node whose `type` is "printPage" and whose data is `PrintPageNodeData`. */
export type PrintPageNode = Node<PrintPageNodeData, "printPage">;
/**
 * Initial data for a newly created Print Page node. The label is left empty
 * here; callers are expected to supply the real label when spreading these
 * defaults into a node.
 */
export const printPageNodeDefaultData: PrintPageNodeData = {
// Whether "print_page" is listed among the debuggable block types.
debuggable: debuggableWorkflowBlockTypes.has("print_page"),
label: "",
continueOnFailure: false,
editable: true,
model: null,
// Print-specific defaults: headers/footers on, no custom filename,
// A4 portrait, backgrounds printed.
includeTimestamp: true,
customFilename: "",
format: "A4",
landscape: false,
printBackground: true,
};
/**
 * Type guard that narrows a generic node to `PrintPageNode`.
 *
 * @param node - Any node from the editor graph.
 * @returns `true` when the node's `type` is exactly "printPage".
 */
export function isPrintPageNode(node: Node): node is PrintPageNode {
  const { type: nodeType } = node;
  return nodeType === "printPage";
}

View File

@@ -91,6 +91,9 @@ function WorkflowBlockIcon({ workflowBlockType, className }: Props) {
case "http_request": {
return <GlobeIcon className={className} />;
}
case "print_page": {
return <FileTextIcon className={className} />;
}
}
}

View File

@@ -47,6 +47,8 @@ import { HttpRequestNode } from "./HttpRequestNode/types";
import { HttpRequestNode as HttpRequestNodeComponent } from "./HttpRequestNode/HttpRequestNode";
import { HumanInteractionNode } from "./HumanInteractionNode/types";
import { HumanInteractionNode as HumanInteractionNodeComponent } from "./HumanInteractionNode/HumanInteractionNode";
import { PrintPageNode } from "./PrintPageNode/types";
import { PrintPageNode as PrintPageNodeComponent } from "./PrintPageNode/PrintPageNode";
export type UtilityNode = StartNode | NodeAdderNode;
@@ -72,7 +74,8 @@ export type WorkflowBlockNode =
| PDFParserNode
| Taskv2Node
| URLNode
| HttpRequestNode;
| HttpRequestNode
| PrintPageNode;
export function isUtilityNode(node: AppNode): node is UtilityNode {
return node.type === "nodeAdder" || node.type === "start";
@@ -109,4 +112,5 @@ export const nodeTypes = {
taskv2: memo(Taskv2NodeComponent),
url: memo(URLNodeComponent),
http_request: memo(HttpRequestNodeComponent),
printPage: memo(PrintPageNodeComponent),
} as const;

View File

@@ -66,4 +66,5 @@ export const workflowBlockTitle: {
task_v2: "Browser Task v2",
goto_url: "Go to URL",
http_request: "HTTP Request",
print_page: "Print Page",
};

View File

@@ -266,6 +266,17 @@ const nodeLibraryItems: Array<{
title: "HTTP Request Block",
description: "Make HTTP API calls",
},
{
nodeType: "printPage",
icon: (
<WorkflowBlockIcon
workflowBlockType={WorkflowBlockTypes.PrintPage}
className="size-6"
/>
),
title: "Print Page Block",
description: "Print current page to PDF",
},
];
type Props = {

View File

@@ -46,6 +46,7 @@ import {
URLBlockYAML,
FileUploadBlockYAML,
HttpRequestBlockYAML,
PrintPageBlockYAML,
} from "../types/workflowYamlTypes";
import {
EMAIL_BLOCK_SENDER,
@@ -122,6 +123,7 @@ import { taskv2NodeDefaultData } from "./nodes/Taskv2Node/types";
import { urlNodeDefaultData } from "./nodes/URLNode/types";
import { fileUploadNodeDefaultData } from "./nodes/FileUploadNode/types";
import { httpRequestNodeDefaultData } from "./nodes/HttpRequestNode/types";
import { printPageNodeDefaultData } from "./nodes/PrintPageNode/types";
export const NEW_NODE_LABEL_PREFIX = "block_";
@@ -839,6 +841,21 @@ function convertToNode(
},
};
}
case "print_page": {
  // Convert a stored `print_page` block into editor node data. Any field the
  // block omits falls back to printPageNodeDefaultData, so a loaded block and
  // a freshly created node agree on defaults. Previously a missing
  // `include_timestamp` became `false`, although the default everywhere else
  // is `true`.
  return {
    ...identifiers,
    ...common,
    type: "printPage",
    data: {
      ...commonData,
      includeTimestamp:
        block.include_timestamp ?? printPageNodeDefaultData.includeTimestamp,
      customFilename:
        block.custom_filename ?? printPageNodeDefaultData.customFilename,
      format: block.format ?? printPageNodeDefaultData.format,
      landscape: block.landscape ?? printPageNodeDefaultData.landscape,
      printBackground:
        block.print_background ?? printPageNodeDefaultData.printBackground,
    },
  };
}
}
}
@@ -1877,6 +1894,17 @@ function createNode(
},
};
}
case "printPage": {
return {
...identifiers,
...common,
type: "printPage",
data: {
...printPageNodeDefaultData,
label,
},
};
}
case "conditional": {
const branches = createDefaultBranchConditions();
return {
@@ -2332,6 +2360,17 @@ function getWorkflowBlock(
save_response_as_file: node.data.saveResponseAsFile,
};
}
case "printPage": {
return {
...base,
block_type: "print_page",
include_timestamp: node.data.includeTimestamp,
custom_filename: node.data.customFilename || null,
format: node.data.format,
landscape: node.data.landscape,
print_background: node.data.printBackground,
};
}
case "conditional": {
return serializeConditionalBlock(node as ConditionalNode, nodes, edges);
}
@@ -3338,6 +3377,18 @@ function convertBlocksToBlockYAML(
};
return blockYaml;
}
case "print_page": {
const blockYaml: PrintPageBlockYAML = {
...base,
block_type: "print_page",
include_timestamp: block.include_timestamp,
custom_filename: block.custom_filename,
format: block.format,
landscape: block.landscape,
print_background: block.print_background,
};
return blockYaml;
}
}
});
}

View File

@@ -211,7 +211,8 @@ export type WorkflowBlock =
| PDFParserBlock
| Taskv2Block
| URLBlock
| HttpRequestBlock;
| HttpRequestBlock
| PrintPageBlock;
export const WorkflowBlockTypes = {
Task: "task",
@@ -236,6 +237,7 @@ export const WorkflowBlockTypes = {
Taskv2: "task_v2",
URL: "goto_url",
HttpRequest: "http_request",
PrintPage: "print_page",
} as const;
// all of them
@@ -554,6 +556,15 @@ export type HttpRequestBlock = WorkflowBlockBase & {
save_response_as_file: boolean;
};
/**
 * API shape of a `print_page` workflow block: the block-type discriminator
 * plus the PDF options, in snake_case.
 */
export type PrintPageBlock = WorkflowBlockBase & {
block_type: "print_page";
// Counterpart of the editor's "Headers & Footers" switch.
include_timestamp: boolean;
// `null` when no custom filename is set.
custom_filename: string | null;
// Page format name, e.g. "A4".
format: string;
landscape: boolean;
print_background: boolean;
};
export type WorkflowDefinition = {
version?: number | null;
parameters: Array<Parameter>;

View File

@@ -141,7 +141,8 @@ export type BlockYAML =
| PDFParserBlockYAML
| Taskv2BlockYAML
| URLBlockYAML
| HttpRequestBlockYAML;
| HttpRequestBlockYAML
| PrintPageBlockYAML;
export type BlockYAMLBase = {
block_type: WorkflowBlockType;
@@ -404,3 +405,12 @@ export type HttpRequestBlockYAML = BlockYAMLBase & {
download_filename?: string | null;
save_response_as_file?: boolean;
};
/**
 * YAML representation of a `print_page` block. It carries the same
 * print-specific fields as the `PrintPageBlock` API type, on top of
 * `BlockYAMLBase`.
 */
export type PrintPageBlockYAML = BlockYAMLBase & {
block_type: "print_page";
// Counterpart of the editor's "Headers & Footers" switch.
include_timestamp: boolean;
// `null` when no custom filename is set.
custom_filename: string | null;
// Page format name, e.g. "A4".
format: string;
landscape: boolean;
print_background: boolean;
};

View File

@@ -279,13 +279,13 @@ class ArtifactManager:
path=path,
)
async def create_workflow_run_block_artifact(
async def _create_workflow_run_block_artifact_internal(
self,
workflow_run_block: WorkflowRunBlock,
artifact_type: ArtifactType,
data: bytes | None = None,
path: str | None = None,
) -> str:
) -> tuple[str, str]:
artifact_id = generate_artifact_id()
uri = app.STORAGE.build_workflow_run_block_uri(
organization_id=workflow_run_block.organization_id,
@@ -293,7 +293,7 @@ class ArtifactManager:
workflow_run_block=workflow_run_block,
artifact_type=artifact_type,
)
return await self._create_artifact(
await self._create_artifact(
aio_task_primary_key=workflow_run_block.workflow_run_block_id,
artifact_id=artifact_id,
artifact_type=artifact_type,
@@ -304,6 +304,36 @@ class ArtifactManager:
data=data,
path=path,
)
return artifact_id, uri
async def create_workflow_run_block_artifact(
    self,
    workflow_run_block: WorkflowRunBlock,
    artifact_type: ArtifactType,
    data: bytes | None = None,
    path: str | None = None,
) -> str:
    """Create an artifact for a workflow run block and return its ID.

    Delegates to ``_create_workflow_run_block_artifact_internal`` and drops
    the URI part of its ``(artifact_id, uri)`` result.

    Args:
        workflow_run_block: Block the artifact belongs to.
        artifact_type: Kind of artifact being stored.
        data: Artifact content, passed through unchanged.
        path: Artifact source path, passed through unchanged.

    Returns:
        The ID of the created artifact.
    """
    creation_kwargs = {
        "workflow_run_block": workflow_run_block,
        "artifact_type": artifact_type,
        "data": data,
        "path": path,
    }
    artifact_id, _uri = await self._create_workflow_run_block_artifact_internal(**creation_kwargs)
    return artifact_id
async def create_workflow_run_block_artifact_with_uri(
    self,
    workflow_run_block: WorkflowRunBlock,
    artifact_type: ArtifactType,
    data: bytes | None = None,
    path: str | None = None,
) -> tuple[str, str]:
    """Create an artifact for a workflow run block and return its ID and URI.

    Public wrapper around ``_create_workflow_run_block_artifact_internal``
    for callers that also need the URI.

    Args:
        workflow_run_block: Block the artifact belongs to.
        artifact_type: Kind of artifact being stored.
        data: Artifact content, passed through unchanged.
        path: Artifact source path, passed through unchanged.

    Returns:
        The ``(artifact_id, uri)`` pair produced by the internal helper.
    """
    creation_kwargs = {
        "workflow_run_block": workflow_run_block,
        "artifact_type": artifact_type,
        "data": data,
        "path": path,
    }
    id_and_uri = await self._create_workflow_run_block_artifact_internal(**creation_kwargs)
    return id_and_uri
async def create_workflow_run_block_artifacts(
self,

View File

@@ -52,6 +52,9 @@ class ArtifactType(StrEnum):
# Script files
SCRIPT_FILE = "script_file"
# PDF files
PDF = "pdf"
class Artifact(BaseModel):
created_at: datetime = Field(

View File

@@ -34,6 +34,7 @@ FILE_EXTENTSION_MAP: dict[ArtifactType, str] = {
ArtifactType.HASHED_HREF_MAP: "json",
# DEPRECATED: we're using CSS selector map now
ArtifactType.VISIBLE_ELEMENTS_ID_XPATH_MAP: "json",
ArtifactType.PDF: "pdf",
}

View File

@@ -11,13 +11,14 @@ import smtplib
import textwrap
import uuid
from collections import defaultdict, deque
from datetime import datetime
from datetime import datetime, timezone
from email.message import EmailMessage
from pathlib import Path
from types import SimpleNamespace
from typing import Annotated, Any, Awaitable, Callable, ClassVar, Literal, Union, cast
from urllib.parse import quote, urlparse
import aiofiles
import aiohttp
import filetype
import pandas as pd
@@ -66,6 +67,7 @@ from skyvern.forge.sdk.artifact.models import ArtifactType
from skyvern.forge.sdk.core import skyvern_context
from skyvern.forge.sdk.core.aiohttp_helper import aiohttp_request
from skyvern.forge.sdk.db.enums import TaskType
from skyvern.forge.sdk.db.exceptions import NotFoundError
from skyvern.forge.sdk.experimentation.llm_prompt_config import get_llm_handler_for_prompt_type
from skyvern.forge.sdk.schemas.files import FileInfo
from skyvern.forge.sdk.schemas.task_v2 import TaskV2Status
@@ -4474,6 +4476,189 @@ class HttpRequestBlock(Block):
)
class PrintPageBlock(Block):
    """Workflow block that saves the current browser page as a PDF.

    The PDF is written to the workflow run's download directory and, on a
    best-effort basis, uploaded as a ``PDF`` artifact. The block output
    records the filename, the local file path, the size in bytes and the
    artifact URI (``None`` when the upload was skipped or failed).
    """

    block_type: Literal[BlockType.PRINT_PAGE] = BlockType.PRINT_PAGE  # type: ignore

    # When true, a header/footer is added to every page (see _build_pdf_options).
    include_timestamp: bool = True
    # Optional filename template; rendered against the workflow run context and
    # sanitized in execute(). When unset, a timestamped name is generated.
    custom_filename: str | None = None
    # Page format; values outside VALID_FORMATS fall back to "A4".
    format: str = "A4"
    landscape: bool = False
    print_background: bool = True

    VALID_FORMATS: ClassVar[set[str]] = {"A4", "Letter", "Legal", "Tabloid"}

    def get_all_parameters(self, workflow_run_id: str) -> list[PARAMETER_TYPE]:
        """Return the parameters this block depends on (always none)."""
        return []

    @staticmethod
    def _sanitize_filename(filename: str) -> str:
        """Turn an arbitrary string into a safe single-component filename.

        Control characters are removed, reserved characters
        (``< > : " / \\ | ? *``) become underscores, and leading/trailing dots
        and spaces are stripped. The result is capped at 200 characters; an
        empty result becomes ``"document"``.
        """
        # A rendered template may contain newlines or NUL bytes; NUL in
        # particular makes open() raise, so drop control characters entirely.
        sanitized = re.sub(r"[\x00-\x1f\x7f]", "", filename)
        sanitized = re.sub(r'[<>:"/\\|?*]', "_", sanitized)
        sanitized = sanitized.strip(". ")
        return sanitized[:200] if sanitized else "document"

    def _build_pdf_options(self) -> dict[str, Any]:
        """Build the keyword arguments passed to ``page.pdf()``.

        Returns:
            A dict with ``format``, ``landscape`` and ``print_background``;
            when ``include_timestamp`` is set it also carries the
            header/footer templates and top/bottom margins.
        """
        pdf_format = self.format if self.format in self.VALID_FORMATS else "A4"
        pdf_options: dict[str, Any] = {
            "format": pdf_format,
            "landscape": self.landscape,
            "print_background": self.print_background,
        }

        if self.include_timestamp:
            # NOTE(review): the empty spans with classes date/title/url/
            # pageNumber/totalPages are presumably filled in by the browser's
            # PDF renderer -- confirm against the page.pdf() documentation.
            pdf_options["display_header_footer"] = True
            pdf_options["header_template"] = (
                '<div style="font-size:10px;width:100%;display:flex;justify-content:space-between;padding:0 10px;">'
                '<span class="date"></span><span class="title"></span><span></span></div>'
            )
            pdf_options["footer_template"] = (
                '<div style="font-size:10px;width:100%;display:flex;justify-content:space-between;padding:0 10px;">'
                '<span class="url"></span><span></span><span><span class="pageNumber"></span>/<span class="totalPages"></span></span></div>'
            )
            pdf_options["margin"] = {"top": "40px", "bottom": "40px"}

        return pdf_options

    async def _upload_pdf_artifact(
        self,
        *,
        pdf_bytes: bytes,
        workflow_run_id: str,
        workflow_run_block_id: str,
        workflow_run_context: WorkflowRunContext,
        organization_id: str | None,
    ) -> str | None:
        """Upload the PDF as a workflow-run-block artifact, best effort.

        Returns:
            The artifact URI, or ``None`` when the organization ID is missing,
            the workflow run block cannot be found, or the artifact could not
            be created/uploaded. Failures are logged, never raised.
        """
        artifact_org_id = organization_id or workflow_run_context.organization_id
        if not artifact_org_id:
            LOG.warning(
                "PrintPageBlock: Missing organization_id, skipping artifact upload",
                workflow_run_id=workflow_run_id,
                workflow_run_block_id=workflow_run_block_id,
            )
            return None

        try:
            workflow_run_block = await app.DATABASE.get_workflow_run_block(
                workflow_run_block_id,
                organization_id=artifact_org_id,
            )
        except NotFoundError:
            LOG.warning(
                "PrintPageBlock: Workflow run block not found, skipping artifact upload",
                workflow_run_id=workflow_run_id,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=artifact_org_id,
            )
            return None

        # Creating the artifact and waiting for its upload are both best
        # effort: the PDF is already saved locally, so a storage failure must
        # not fail the whole block.
        try:
            _, artifact_uri = await app.ARTIFACT_MANAGER.create_workflow_run_block_artifact_with_uri(
                workflow_run_block=workflow_run_block,
                artifact_type=ArtifactType.PDF,
                data=pdf_bytes,
            )
            await app.ARTIFACT_MANAGER.wait_for_upload_aiotasks([workflow_run_block.workflow_run_block_id])
        except Exception:
            LOG.warning(
                "PrintPageBlock: Failed to upload PDF artifact",
                workflow_run_id=workflow_run_id,
                workflow_run_block_id=workflow_run_block.workflow_run_block_id,
                exc_info=True,
            )
            return None

        return artifact_uri

    async def execute(
        self,
        workflow_run_id: str,
        workflow_run_block_id: str,
        organization_id: str | None = None,
        browser_session_id: str | None = None,
        **kwargs: dict,
    ) -> BlockResult:
        """Print the workflow run's working page to a PDF and record the result.

        Returns:
            A failed ``BlockResult`` when there is no browser state, no working
            page, or PDF generation raises. Otherwise a completed result whose
            output holds ``filename``, ``file_path``, ``size_bytes`` and
            ``artifact_uri``.
        """
        workflow_run_context = self.get_workflow_run_context(workflow_run_id)

        browser_state = app.BROWSER_MANAGER.get_for_workflow_run(workflow_run_id)
        if not browser_state:
            return await self.build_block_result(
                success=False,
                failure_reason="No browser state available",
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        page = await browser_state.get_working_page()
        if not page:
            return await self.build_block_result(
                success=False,
                failure_reason="No page available",
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        pdf_options = self._build_pdf_options()

        try:
            pdf_bytes = await page.pdf(**pdf_options)
        except Exception as e:
            error_msg = str(e)
            # Replace the browser's "unsupported" error with an actionable message.
            if "pdf" in error_msg.lower() and ("not supported" in error_msg.lower() or "chromium" in error_msg.lower()):
                error_msg = "PDF generation requires Chromium browser. Current browser does not support page.pdf()."
            LOG.warning("PrintPageBlock: Failed to generate PDF", error=error_msg, workflow_run_id=workflow_run_id)
            return await self.build_block_result(
                success=False,
                failure_reason=f"Failed to generate PDF: {error_msg}",
                status=BlockStatus.failed,
                workflow_run_block_id=workflow_run_block_id,
                organization_id=organization_id,
            )

        if self.custom_filename:
            filename = self.format_block_parameter_template_from_workflow_run_context(
                self.custom_filename, workflow_run_context
            )
            filename = self._sanitize_filename(filename)
            # Case-insensitive so "Report.PDF" is not turned into "Report.PDF.pdf".
            if not filename.lower().endswith(".pdf"):
                filename += ".pdf"
        else:
            timestamp_str = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
            filename = f"page_{timestamp_str}.pdf"

        # Save PDF to download directory so it appears in runs UI
        download_dir = get_download_dir(workflow_run_id)
        file_path = os.path.join(download_dir, filename)
        async with aiofiles.open(file_path, "wb") as f:
            await f.write(pdf_bytes)

        # Upload to artifact storage for downstream block access (e.g., File Extraction Block)
        artifact_uri = await self._upload_pdf_artifact(
            pdf_bytes=pdf_bytes,
            workflow_run_id=workflow_run_id,
            workflow_run_block_id=workflow_run_block_id,
            workflow_run_context=workflow_run_context,
            organization_id=organization_id,
        )

        output = {
            "filename": filename,
            "file_path": file_path,
            "size_bytes": len(pdf_bytes),
            "artifact_uri": artifact_uri,
        }
        await self.record_output_parameter_value(workflow_run_context, workflow_run_id, output)

        return await self.build_block_result(
            success=True,
            failure_reason=None,
            output_parameter_value=output,
            status=BlockStatus.completed,
            workflow_run_block_id=workflow_run_block_id,
            organization_id=organization_id,
        )
class BranchEvaluationContext:
"""Collection of runtime data that BranchCriteria evaluators can consume."""
@@ -5246,6 +5431,7 @@ BlockSubclasses = Union[
TaskV2Block,
FileUploadBlock,
HttpRequestBlock,
PrintPageBlock,
]
BlockTypeVar = Annotated[BlockSubclasses, Field(discriminator="block_type")]

View File

@@ -43,6 +43,7 @@ from skyvern.forge.sdk.workflow.models.block import (
LoginBlock,
NavigationBlock,
PDFParserBlock,
PrintPageBlock,
PromptBranchCriteria,
SendEmailBlock,
TaskBlock,
@@ -729,6 +730,15 @@ def block_yaml_to_block(
url=block_yaml.url,
complete_verification=False,
)
elif block_yaml.block_type == BlockType.PRINT_PAGE:
return PrintPageBlock(
**base_kwargs,
include_timestamp=block_yaml.include_timestamp,
custom_filename=block_yaml.custom_filename,
format=block_yaml.format,
landscape=block_yaml.landscape,
print_background=block_yaml.print_background,
)
raise ValueError(f"Invalid block type {block_yaml.block_type}")

View File

@@ -41,6 +41,7 @@ class BlockType(StrEnum):
PDF_PARSER = "pdf_parser"
HTTP_REQUEST = "http_request"
HUMAN_INTERACTION = "human_interaction"
PRINT_PAGE = "print_page"
class BlockStatus(StrEnum):
@@ -68,6 +69,13 @@ class FileType(StrEnum):
PDF = "pdf"
# Page formats accepted by the print_page block's YAML `format` field.
# Each member's value is the format name as it is written in the YAML.
class PDFFormat(StrEnum):
A4 = "A4"
LETTER = "Letter"
LEGAL = "Legal"
TABLOID = "Tabloid"
class FileStorageType(StrEnum):
S3 = "s3"
AZURE = "azure"
@@ -546,6 +554,15 @@ class HttpRequestBlockYAML(BlockYAML):
parameter_keys: list[str] | None = None
# YAML schema for the "print_page" workflow block. Every print option has a
# default, so a YAML block may omit any of them.
class PrintPageBlockYAML(BlockYAML):
# Discriminator value that selects this schema among the block YAML types.
block_type: Literal[BlockType.PRINT_PAGE] = BlockType.PRINT_PAGE # type: ignore
include_timestamp: bool = True
# None means no custom filename was provided.
custom_filename: str | None = None
# Restricted to the PDFFormat members; defaults to A4.
format: PDFFormat = PDFFormat.A4
landscape: bool = False
print_background: bool = True
PARAMETER_YAML_SUBCLASSES = (
AWSSecretParameterYAML
| BitwardenLoginCredentialParameterYAML
@@ -583,6 +600,7 @@ BLOCK_YAML_SUBCLASSES = (
| TaskV2BlockYAML
| HttpRequestBlockYAML
| ConditionalBlockYAML
| PrintPageBlockYAML
)
BLOCK_YAML_TYPES = Annotated[BLOCK_YAML_SUBCLASSES, Field(discriminator="block_type")]