Making file parser flexible to deprecate pdf parser (#3073)

Co-authored-by: Suchintan <suchintan@users.noreply.github.com>
This commit is contained in:
PHSB
2025-08-06 11:15:04 -06:00
committed by GitHub
parent 31aa7d6973
commit 468f5c6051
15 changed files with 555 additions and 49 deletions

View File

@@ -10,6 +10,8 @@ import { useDebugStore } from "@/store/useDebugStore";
import { cn } from "@/util/utils";
import { NodeHeader } from "../components/NodeHeader";
import { useParams } from "react-router-dom";
import { WorkflowDataSchemaInputGroup } from "@/components/DataSchemaInputGroup/WorkflowDataSchemaInputGroup";
import { dataSchemaExampleForFileExtraction } from "../types";
function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
const { updateNodeData } = useReactFlow();
@@ -21,8 +23,17 @@ function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
urlBlockLabel !== undefined && urlBlockLabel === label;
const [inputs, setInputs] = useState({
fileUrl: data.fileUrl,
jsonSchema: data.jsonSchema,
});
/**
 * Applies a single-field edit made in this node's UI.
 *
 * Stores the new value in the component's local `inputs` state and writes the
 * same key/value pair to the node's data through `updateNodeData`. Does
 * nothing when the node is not editable.
 *
 * @param key - Name of the field being changed (e.g. "fileUrl", "jsonSchema").
 * @param value - New value for that field.
 */
function handleChange(key: string, value: unknown) {
  if (!data.editable) {
    return;
  }
  // Use the updater form so the new state is derived from the latest queued
  // state, not from the `inputs` snapshot captured by this render; otherwise
  // two changes dispatched before a re-render would overwrite each other.
  setInputs((previous) => ({ ...previous, [key]: value }));
  updateNodeData(id, { [key]: value });
}
const isFirstWorkflowBlock = useIsFirstBlockInWorkflow({ id });
return (
@@ -75,15 +86,19 @@ function FileParserNode({ id, data }: NodeProps<FileParserNode>) {
nodeId={id}
value={inputs.fileUrl}
onChange={(value) => {
if (!data.editable) {
return;
}
setInputs({ ...inputs, fileUrl: value });
updateNodeData(id, { fileUrl: value });
handleChange("fileUrl", value);
}}
className="nopan text-xs"
/>
</div>
<WorkflowDataSchemaInputGroup
exampleValue={dataSchemaExampleForFileExtraction}
value={inputs.jsonSchema}
onChange={(value) => {
handleChange("jsonSchema", value);
}}
suggestionContext={{}}
/>
</div>
</div>
</div>

View File

@@ -1,9 +1,11 @@
import type { Node } from "@xyflow/react";
import { NodeBaseData } from "../types";
import { AppNode } from "..";
import { debuggableWorkflowBlockTypes } from "@/routes/workflows/types/workflowTypes";
/**
 * Data carried by a file parser node: the shared `NodeBaseData` fields plus
 * the parser-specific inputs declared below.
 */
export type FileParserNodeData = NodeBaseData & {
/** URL of the file to parse; sent as `file_url` on the "file_url_parser" block. */
fileUrl: string;
/** Data schema held as JSON text (the default data uses the string "null"). */
jsonSchema: string;
};
/** `Node` from `@xyflow/react` whose data is `FileParserNodeData` and whose type is "fileParser". */
export type FileParserNode = Node<FileParserNodeData, "fileParser">;
@@ -14,5 +16,10 @@ export const fileParserNodeDefaultData: FileParserNodeData = {
label: "",
fileUrl: "",
continueOnFailure: false,
jsonSchema: "null",
model: null,
} as const;
/**
 * Type guard for file parser nodes.
 *
 * @param node - Any node from the workflow editor.
 * @returns `true` when the node's `type` is "fileParser", narrowing it to
 *   `FileParserNode`.
 */
export function isFileParserNode(node: AppNode): node is FileParserNode {
  const { type: nodeType } = node;
  return nodeType === "fileParser";
}

View File

@@ -162,20 +162,19 @@ const nodeLibraryItems: Array<{
/>
),
title: "File Parser Block",
description: "Parse data from files",
},
{
nodeType: "pdfParser",
icon: (
<WorkflowBlockIcon
workflowBlockType={WorkflowBlockTypes.PDFParser}
className="size-6"
/>
),
title: "PDF Parser Block",
description: "Extract data from PDF files",
description: "Parse PDFs, CSVs, and Excel files",
},
// {
// nodeType: "pdfParser",
// icon: (
// <WorkflowBlockIcon
// workflowBlockType={WorkflowBlockTypes.PDFParser}
// className="size-6"
// />
// ),
// title: "PDF Parser Block",
// description: "Extract data from PDF files",
// },
// nodeType: "upload",
// icon: (
// <WorkflowBlockIcon

View File

@@ -56,7 +56,10 @@ import { ParametersState } from "./types";
import { AppNode, isWorkflowBlockNode, WorkflowBlockNode } from "./nodes";
import { codeBlockNodeDefaultData } from "./nodes/CodeBlockNode/types";
import { downloadNodeDefaultData } from "./nodes/DownloadNode/types";
import { fileParserNodeDefaultData } from "./nodes/FileParserNode/types";
import {
isFileParserNode,
fileParserNodeDefaultData,
} from "./nodes/FileParserNode/types";
import {
isLoopNode,
LoopNode,
@@ -468,6 +471,7 @@ function convertToNode(
data: {
...commonData,
fileUrl: block.file_url,
jsonSchema: JSON.stringify(block.json_schema, null, 2),
},
};
}
@@ -1254,7 +1258,8 @@ function getWorkflowBlock(node: WorkflowBlockNode): BlockYAML {
...base,
block_type: "file_url_parser",
file_url: node.data.fileUrl,
file_type: "csv",
file_type: "csv", // Backend will auto-detect based on file extension
json_schema: JSONParseSafe(node.data.jsonSchema),
};
}
case "textPrompt": {
@@ -2187,6 +2192,15 @@ function getWorkflowErrors(nodes: Array<AppNode>): Array<string> {
}
});
const fileParserNodes = nodes.filter(isFileParserNode);
fileParserNodes.forEach((node) => {
try {
JSON.parse(node.data.jsonSchema);
} catch {
errors.push(`${node.data.label}: Data schema is not valid JSON.`);
}
});
const waitNodes = nodes.filter(isWaitNode);
waitNodes.forEach((node) => {
const waitTimeString = node.data.waitInSeconds.trim();

View File

@@ -354,7 +354,8 @@ export type SendEmailBlock = WorkflowBlockBase & {
/**
 * Workflow block with `block_type` "file_url_parser": the shared
 * `WorkflowBlockBase` fields plus the file parsing settings below.
 */
export type FileURLParserBlock = WorkflowBlockBase & {
block_type: "file_url_parser";
file_url: string;
// NOTE(review): `file_type` is declared twice in this view; the "csv"-only
// line looks superseded by the wider union that follows — confirm and keep one.
file_type: "csv";
file_type: "csv" | "excel" | "pdf";
// Data schema as a parsed JSON object, or null.
json_schema: Record<string, unknown> | null;
};
export type ValidationBlock = WorkflowBlockBase & {

View File

@@ -308,7 +308,8 @@ export type SendEmailBlockYAML = BlockYAMLBase & {
/**
 * YAML form of the "file_url_parser" block: the shared `BlockYAMLBase` fields
 * plus the file parsing settings below.
 */
export type FileUrlParserBlockYAML = BlockYAMLBase & {
block_type: "file_url_parser";
file_url: string;
// NOTE(review): `file_type` is declared twice in this view; the "csv"-only
// line looks superseded by the wider union that follows — confirm and keep one.
file_type: "csv";
file_type: "csv" | "excel" | "pdf";
// Optional data schema as a parsed JSON object, or null.
json_schema?: Record<string, unknown> | null;
};
export type ForLoopBlockYAML = BlockYAMLBase & {