From 1dbaccbde3e2b83b6929fa2236026ffdc8a3d2da Mon Sep 17 00:00:00 2001 From: "glm-5.1" Date: Tue, 19 May 2026 11:36:55 +0000 Subject: [PATCH] specify failure propagation semantics (C-04) Key design decisions: - Failure follows dependency edges, not structural scope - Parallel branches are independent: failure in one branch doesn't cancel sibling branches - blockedByFailure computed signal detects failed/aborted predecessors - Conditionals serve as error boundaries (caught failures redirect to else branch, uncaught failures cascade) - aborted nodes don't satisfy preconditions; skipped nodes do - abortAll() for systemic failures (provider outage, auth failure) Changes: - reactive-execution.md: new Failure Propagation section with sequential/parallel/join/conditional examples, blockedByFailure effect, partial success model - host-configs.md: add blockedByFailure to WorkflowNode, update status propagation effects, replace cascadeAbort with abortAll - schema.md: document precondition semantics for NodeStatus - build-distribution.md + README.md: add blockedByFailure to node-status.ts comments - review checklist: mark C-04 resolved --- docs/architecture/README.md | 2 +- docs/architecture/build-distribution.md | 2 +- docs/architecture/host-configs.md | 39 ++- docs/architecture/reactive-execution.md | 298 +++++++++++++++--- docs/architecture/schema.md | 2 + docs/reviews/001-architecture-gap-analysis.md | 2 +- 6 files changed, 280 insertions(+), 65 deletions(-) diff --git a/docs/architecture/README.md b/docs/architecture/README.md index 4cf2670..012e113 100644 --- a/docs/architecture/README.md +++ b/docs/architecture/README.md @@ -118,7 +118,7 @@ src/ index.ts reactive/ workflow.ts # ReactiveRoot for workflow state - node-status.ts # Per-node status signals + computed preconditions + node-status.ts # Per-node status signals + computed preconditions + blockedByFailure index.ts analysis/ type-compat.ts # Schema compatibility checking between operation input/output diff --git 
a/docs/architecture/build-distribution.md b/docs/architecture/build-distribution.md index 81ff1a3..ee5c680 100644 --- a/docs/architecture/build-distribution.md +++ b/docs/architecture/build-distribution.md @@ -36,7 +36,7 @@ Package structure, exports map, dependencies, and platform targets. │ │ └── index.ts │ ├── reactive/ │ │ ├── workflow.ts # WorkflowReactiveRoot (signal-backed execution) -│ │ ├── node-status.ts # Signal, computed preconditions +│ │ ├── node-status.ts # Signal, computed preconditions, computed blockedByFailure │ │ └── index.ts │ ├── analysis/ │ │ ├── type-compat.ts # typeCompat, buildTypeEdges, analyzeTypeCompat diff --git a/docs/architecture/host-configs.md b/docs/architecture/host-configs.md index 6116fac..f56dc77 100644 --- a/docs/architecture/host-configs.md +++ b/docs/architecture/host-configs.md @@ -172,7 +172,8 @@ interface WorkflowNode { key: string; // Operation name or structural container ID type: "operation" | "sequential" | "parallel" | "conditional" | "map"; status: Signal; // Reactive status signal - preconditions: Computed; // Computed: true when all preconditions are met + preconditions: Computed; // Computed: true when all preconditions are met + blockedByFailure: Computed; // Computed: true when any predecessor failed/aborted (uncaught) operationId?: string; // For operation nodes: the fully qualified ID output?: Signal; // For operation nodes: the call result (when completed) children: WorkflowNode[]; // Child nodes (structural containers have children) @@ -181,7 +182,8 @@ interface WorkflowNode { Each `WorkflowNode` holds: - A `signal` that tracks the call's lifecycle (`idle` → `waiting` → `ready` → `running` → `completed`/`failed`/`aborted`/`skipped`) -- A `computed` that derives `preconditions` from parent nodes' statuses +- A `computed` that derives `preconditions` from parent nodes' statuses (true when all predecessors are `completed` or `skipped`) +- A `computed` that derives `blockedByFailure` from parent nodes' 
statuses (true when any predecessor is `failed` or `aborted`) - An optional `output` signal that holds the call result when completed ### ReactiveContext @@ -243,29 +245,42 @@ function computePreconditions(node: WorkflowNode, ctx: ReactiveContext): boolean ### Status Propagation -When a node's `status` signal changes, its dependents' `preconditions` computed automatically re-evaluate. If preconditions are met, the node transitions to `ready`: +When a node's `status` signal changes, its dependents' `preconditions` and `blockedByFailure` computed values automatically re-evaluate. If preconditions are met, the node transitions to `ready`; if blocked by failure, it transitions to `aborted`: ```typescript +// Start when preconditions are met effect(() => { if (node.preconditions.value) { - node.status.value = "ready"; + if (node.status.value === "idle" || node.status.value === "waiting") { + node.status.value = "ready"; + } + } +}); + +// Abort when a predecessor fails (uncaught failure propagation) +effect(() => { + if (node.blockedByFailure.value) { + if (node.status.value === "idle" || node.status.value === "waiting") { + node.status.value = "aborted"; + } } }); ``` -The reactive engine then starts the call associated with the node, which sets `status` to `running`, and eventually `completed` or `failed`. +The reactive engine then starts the call associated with the node (when `ready`), which sets `status` to `running`, and eventually `completed` or `failed`. + +**Note**: Failure propagation follows dependency edges, not structural scope. A failed node only causes its downstream dependents (via DAG edges) to abort. Sibling branches in a `Parallel` group are independent and continue running. See [reactive-execution.md](reactive-execution.md) for the full failure propagation model. 
### Abort Cascading -When a node is aborted, all its descendants are also aborted: +System-level abort (e.g., provider outage) aborts the entire workflow: ```typescript -function cascadeAbort(node: WorkflowNode): void { - if (node.status.value === "running" || node.status.value === "ready" || node.status.value === "waiting") { - node.status.value = "aborted"; - } - for (const child of node.children) { - cascadeAbort(child); +function abortAll(root: WorkflowReactiveRoot): void { + for (const [nodeId, status] of root.statusMap) { + if (status.value !== "completed" && status.value !== "failed") { + status.value = "aborted"; + } } } ``` diff --git a/docs/architecture/reactive-execution.md b/docs/architecture/reactive-execution.md index 701d169..8e20f6e 100644 --- a/docs/architecture/reactive-execution.md +++ b/docs/architecture/reactive-execution.md @@ -5,7 +5,7 @@ last_updated: 2026-05-19 # Reactive Execution -Signal-driven status propagation, computed preconditions, and abort cascading for workflow template execution. +Signal-driven status propagation, computed preconditions, and failure propagation for workflow template execution. ## Overview @@ -13,9 +13,10 @@ The reactive execution layer bridges workflow template structure (DAG) to runtim - Each `` node gets a `signal` tracking its lifecycle state - Preconditions are `computed` values that automatically resolve when upstream dependencies complete -- Abort cascades propagate through the signal graph — setting one node to `"aborted"` automatically prevents downstream nodes from starting +- Failure propagation follows dependency edges — a failed predecessor causes downstream dependents to abort, while independent branches continue running +- Conditionals can serve as error boundaries, catching failures and redirecting to fallback paths -This layer does NOT execute operations directly. It provides reactive state that the hub coordinator reads and writes. 
The coordinator calls `registry.execute()` when a node's preconditions are met, and updates the node's status signal when the call completes. +This layer does NOT execute operations directly. It provides reactive state that the hub coordinator reads and writes. The coordinator calls `registry.execute()` when a node's preconditions are met, and updates the node's status signal when the call completes or fails. ## ReactiveRoot for Workflows @@ -23,14 +24,16 @@ This layer does NOT execute operations directly. It provides reactive state that class WorkflowReactiveRoot { private statusMap: Map>; private preconditions: Map>; + private blockedByFailure: Map>; private graph: DirectedGraph; - private abortMap: Map void>; + private effectDisposers: (() => void)[]; constructor(graph: DirectedGraph) { this.graph = graph; this.statusMap = new Map(); this.preconditions = new Map(); - this.abortMap = new Map(); + this.blockedByFailure = new Map(); + this.effectDisposers = []; this.initializeSignals(); } } @@ -48,25 +51,36 @@ private initializeSignals(): void { const status = signal("idle"); + const predecessors = this.graph.inNeighbors(node); + + // Preconditions: all predecessors completed or skipped const preconditions = computed(() => { - const predecessors = this.graph.inNeighbors(node); return predecessors.every(pred => { const predStatus = this.statusMap.get(pred); - return predStatus && predStatus.value === "completed"; + return predStatus && (predStatus.value === "completed" || predStatus.value === "skipped"); + }); + }); + + // Blocked by failure: any predecessor failed or aborted (uncaught) + const blockedByFailure = computed(() => { + return predecessors.some(pred => { + const predStatus = this.statusMap.get(pred); + return predStatus && (predStatus.value === "failed" || predStatus.value === "aborted"); }); }); this.statusMap.set(node, status); this.preconditions.set(node, preconditions); - this.abortMap.set(node, () => this.cascadeAbort(node)); + 
this.blockedByFailure.set(node, blockedByFailure); } } ``` For each operation node in the DAG: 1. Create a `signal` starting at `"idle"` -2. Create a `computed` that's `true` when all predecessor nodes have status `"completed"` -3. Register an abort function that cascades to all descendants +2. Create a `computed` that's `true` when all predecessor nodes have status `"completed"` (or `"skipped"` — a skipped node satisfies its dependents' preconditions) +3. Create a `computed` that detects whether any predecessor has failed or been aborted, triggering a cascade +4. Store the status signal and both computeds in their per-node maps (`statusMap`, `preconditions`, `blockedByFailure`) ### Status lifecycle @@ -74,22 +88,47 @@ The signal-based status lifecycle mirrors `CallStatus` with workflow-specific ad ``` idle → waiting → ready → running → completed - → failed - → aborted → aborted + ↓ ↑ + failed │ + ↓ │ + (uncaught) → aborted ←──┘ + ↑ + (cascade from failed predecessor) + ↑ + skipped (conditional) +``` + +Full transition rules: + +``` +idle → waiting (predecessor starts running) +idle → ready (no predecessors — root node) +waiting → ready (all predecessors completed or skipped) +waiting → aborted (predecessor failed and failure is uncaught) +ready → running (hub starts the call) +running → completed (call succeeded) +running → failed (call threw an error) +running → aborted (call cancelled externally) +failed → [terminal] (no further transitions) +aborted → [terminal] (no further transitions) +skipped → [terminal] (conditional branch not taken) +completed → [terminal] (no further transitions) ``` | Status | Meaning | Signal trigger | |--------|---------|---------------| -| `idle` | Node just created, no parent completion yet | Initial state | -| `waiting` | At least one predecessor is running, none have completed | Any predecessor status change | -| `ready` | All predecessors completed (preconditions met) | `computed` resolves to `true` | +| `idle` | Node just created, no predecessor activity yet | Initial state | +| 
`waiting` | At least one predecessor is running, none have completed yet | Any predecessor status change | +| `ready` | All predecessors completed or skipped (preconditions met) | `computed` resolves to `true` | | `running` | Call executing | Hub sets `status.value = "running"` | | `completed` | Call succeeded | Hub sets `status.value = "completed"` | -| `failed` | Call failed | Hub sets `status.value = "failed"` | -| `aborted` | Call cancelled (or parent cancelled) | Hub or cascade sets `status.value = "aborted"` | +| `failed` | Call failed (uncaught error) | Hub sets `status.value = "failed"` | +| `aborted` | Call cancelled, or cascaded from failed predecessor | Hub or cascade sets `status.value = "aborted"` | | `skipped` | Conditional branch not taken | Conditional evaluation sets this | -The hub coordinator reads the `ready` state (via `preconditions`) and triggers execution. When the call completes, the hub writes the new status to the signal. The signal propagates to all downstream `computed` values automatically. +The key distinction between `failed` and `aborted`: +- **`failed`** means the operation itself threw an error. The node is the *source* of the failure. +- **`aborted`** means the operation was cancelled or a predecessor failed. The node is a *victim* of failure propagation. ## Computed Preconditions @@ -98,14 +137,20 @@ The core innovation of reactive execution: each node's "can I start?" question i ```typescript const preconditions = computed(() => { const predecessors = graph.inNeighbors(node); - return predecessors.every(pred => statusMap.get(pred)!.value === "completed"); + return predecessors.every(pred => { + const status = statusMap.get(pred)!.value; + return status === "completed" || status === "skipped"; + }); }); ``` +A node's preconditions are met when **all predecessors have reached a satisfying terminal state** (`completed` or `skipped`). 
A `failed` or `aborted` predecessor does NOT satisfy preconditions — it prevents the dependent from ever becoming `ready`. + This means: - Adding a new predecessor automatically includes it in the check (if the DAG changes) - A predecessor completing automatically re-evaluates all dependent preconditions -- An aborted predecessor prevents all dependents from becoming `ready` +- An aborted predecessor prevents dependents from becoming `ready` +- A skipped predecessor satisfies preconditions (the branch was deliberately bypassed, not broken) - No manual event wiring or callback chains ### Sequential preconditions @@ -135,43 +180,160 @@ When a node depends on multiple predecessors (e.g., D depends on both B and C co D only becomes `ready` when all predecessors complete. This is the "join" in fork-join parallelism. -## Abort Cascade +## Failure Propagation -Abort cascading is signal-driven. When a node is aborted: +Failure propagation is the mechanism by which a failed or aborted node causes its downstream dependents to abort. The key design principle: **failure follows dependency edges, not structural scope**. + +This means: +- In a `Sequential` group, failure propagates forward through the chain (B depends on A, so if A fails, B aborts) +- In a `Parallel` group, sibling branches are independent — a failure in branch A does NOT affect branch B, because there are no dependency edges between them +- A node that depends on multiple predecessors (a join) aborts only when it's impossible for its preconditions to ever be met + +### The preconditions-failure duality + +Each node has two complementary reactive computations: + +1. **`preconditions`** (`computed`) — true when all predecessors are `completed` or `skipped`. Node can start. +2. **`blockedByFailure`** (`computed`) — true when any predecessor is `failed` or `aborted` and the failure is uncaught (not handled by a `Conditional`). 
```typescript -cascadeAbort(nodeId: string): void { - const status = this.statusMap.get(nodeId); - if (status && !isTerminal(status.value)) { - status.value = "aborted"; - // Cascade to all descendants - for (const desc of this.graph.descendants(nodeId)) { - const descStatus = this.statusMap.get(desc); - if (descStatus && !isTerminal(descStatus.value)) { - descStatus.value = "aborted"; - } - } - } -} +const preconditions = computed(() => { + const predecessors = graph.inNeighbors(node); + return predecessors.every(pred => { + const status = statusMap.get(pred)!.value; + return status === "completed" || status === "skipped"; + }); +}); + +const blockedByFailure = computed(() => { + const predecessors = graph.inNeighbors(node); + return predecessors.some(pred => { + const status = statusMap.get(pred)!.value; + return status === "failed" || status === "aborted"; + }); +}); ``` -This sets the status of the aborted node and all of its descendants to `"aborted"`. The `computed` preconditions of these nodes automatically re-evaluate — but since aborted nodes never become "completed", their dependents will never become "ready". +When `blockedByFailure` becomes `true` and the node hasn't started (`idle` or `waiting`), the node transitions to `aborted`. This happens via an `effect()`: + +```typescript +effect(() => { + if (blockedByFailure.value && (status.value === "idle" || status.value === "waiting")) { + status.value = "aborted"; + } +}); +``` + +This cascade is automatic and reactive — when a predecessor fails, all downstream `blockedByFailure` computations re-evaluate, and their effects fire, aborting any waiting dependents. + +### Sequential failure propagation + +``` +A (failed) → B (aborted) → C (aborted) +``` + +When A fails, B's `blockedByFailure` becomes true. B transitions from `waiting` to `aborted`. C's `blockedByFailure` then becomes true (B is now `aborted`). C transitions to `aborted`. The entire downstream chain aborts. 
+ +### Parallel independence + +``` + ┌── B (completed) ──┐ +A (completed) ├── D (aborted) + └── C (failed) ─────┘ +``` + +When C fails: +- C's downstream dependents see `blockedByFailure = true` +- B is unaffected — it's on an independent branch +- D depends on both B and C. D's `preconditions` will never be met (C is `failed`, not `completed`). D's `blockedByFailure` is true (C is `failed`). D transitions to `aborted`. + +But crucially, this is because D *depends on* C, not because they share a structural scope: + +``` + ┌── B (completed) ──┐ +A (completed) │ (no edge from C to E) + └── C (failed) ─────┘ + └── E (completed) +``` + +E has no dependency on C. E continues running regardless of C's failure. **Failure follows dependency edges, not structural boundaries.** + +### Join semantics + +When a node depends on multiple predecessors (fork-join): + +``` + ┌── B (completed) ──┐ +A (completed) ├── D (aborted) + └── C (failed) ─────┘ +``` + +D's `preconditions` requires both B and C to be completed/skipped. Since C is `failed`, D's preconditions can never be met. D transitions to `aborted`. + +The alternative would be "partial success" — D starts with B's output even though C failed. This is NOT supported by the precondition model. If partial execution is needed, the template author should use a `Conditional` to handle the failure case explicitly. + +### Conditional as error boundary + +A `Conditional` can catch a failure and redirect to a fallback path: + +```typescript +h(Sequential, {}, + h(Operation, { name: "fetch-data" }), + h(Conditional, { + test: (results) => results["fetch-data"].status !== "failed", + }, + // then: proceed with data processing + h(Sequential, {}, + h(Operation, { name: "transform" }), + h(Operation, { name: "store" }), + ), + // else: fallback path + h(Operation, { name: "notify-error" }), + ), +) +``` + +If `fetch-data` fails: +1. The `Conditional`'s `test` function receives the results map including `fetch-data`'s status +2. 
`test` evaluates to `false` (the operation failed) +3. The `then` branch transitions to `skipped` +4. The `else` branch (`notify-error`) becomes `ready` +5. Downstream nodes after the `Conditional` see the `Conditional` as `completed` (it resolved successfully, just on a different branch) + +This makes `Conditional` a **caught error boundary**. The failure is handled — downstream nodes don't see a cascade because the `Conditional` resolved successfully. + +Without a `Conditional`, the failure is **uncaught**. It cascades through dependency edges to all dependents, which transition to `aborted`. + +### Systemic failure: aborting the entire workflow + +For failures that should cancel everything (e.g., provider outage, authentication failure), the hub coordinator can abort the entire `WorkflowReactiveRoot`: + +```typescript +workflowRoot.abortAll(); // Sets all non-terminal nodes to "aborted" +``` + +This is separate from dependency-edge failure propagation. It's for systemic failures where the workflow cannot meaningfully continue regardless of which branches are independent. ### Interaction with call protocol abort -There are two abort mechanisms: +There are three abort mechanisms: -1. **Signal cascade** (this layer) — sets `status.value = "aborted"` for the node and all descendants. This is immediate and graph-based. +1. **Signal cascade** (this layer) — `blockedByFailure` effects transition dependents to `aborted`. This is automatic and follows dependency edges. 2. **Call protocol abort** (operations layer) — `PendingRequestMap.abort(requestId)` propagates `call.aborted` events through the pub/sub layer. This is network-aware and handles remote calls. +3. **Full workflow abort** — `workflowRoot.abortAll()` aborts all non-terminal nodes. For systemic failures. 
-The hub coordinator should invoke both: +The hub coordinator should invoke signal cascade and protocol abort together: ```typescript // When aborting a call: -workflowRoot.cascadeAbort(nodeId); // Signal cascade -prm.abort(requestId); // Protocol cascade +workflowRoot.abortNode(nodeId); // Signal: transition dependents to aborted +prm.abort(requestId); // Protocol: cancel the remote call + +// When aborting entire workflow: +workflowRoot.abortAll(); // Signal: abort everything +prm.abortAll(pendingRequestIds); // Protocol: cancel all pending calls ``` -The signal cascade is for local state (the reactive graph). The protocol cascade is for remote state (the running calls). They're complementary — the protocol cascade may take time to propagate, but the signal cascade is instant. +Signal cascades are instant. Protocol aborts may take time to propagate. They're complementary — the signal cascade ensures local state is immediately consistent, while the protocol abort ensures remote state eventually catches up. 
## NodeStatus vs CallStatus @@ -210,15 +372,16 @@ function callStatusToNodeStatus(callStatus: CallStatus): NodeStatus { ## Effect-Driven Execution -The hub coordinator uses `effect()` to react to precondition changes: +The hub coordinator uses two `effect()`s per node — one for starting when preconditions are met, and one for aborting when failure propagates: ```typescript -for (const [nodeId, preconditions] of workflowRoot.preconditions) { +for (const [nodeId, preconditions, blockedByFailure] of workflowRoot.nodes) { + // Start the call when preconditions are met effect(() => { if (preconditions.value) { const status = workflowRoot.statusMap.get(nodeId)!; if (status.value === "idle" || status.value === "waiting") { - // All preconditions met — start the call + // All preconditions satisfied — start the call status.value = "running"; const operationId = graph.getNodeAttributes(nodeId).name; prm.call(operationId, getInput(nodeId), { parentRequestId: parentCallId }) @@ -227,12 +390,23 @@ for (const [nodeId, preconditions] of workflowRoot.preconditions) { } } }); + + // Abort when a predecessor fails (uncaught failure propagation) + effect(() => { + if (blockedByFailure.value) { + const status = workflowRoot.statusMap.get(nodeId)!; + if (status.value === "idle" || status.value === "waiting") { + // A predecessor failed and no Conditional caught it — abort + status.value = "aborted"; + } + } + }); } ``` -Each node gets an `effect()` that watches its `preconditions` computed value. When preconditions resolve to `true` and the node is in a startable state (`idle` or `waiting`), the effect starts the call via `PendingRequestMap.call()`. +Both effects are reactive. When a predecessor completes, the `preconditions` computed re-evaluates, potentially triggering the start effect. When a predecessor fails, the `blockedByFailure` computed re-evaluates, potentially triggering the abort effect. 
-The call's promise resolution updates the node's status signal, which triggers downstream preconditions to re-evaluate, which triggers their effects, and so on. +The call's promise resolution updates the node's status signal, which triggers downstream preconditions and failure propagations to re-evaluate, which triggers their effects, and so on. ### Effect disposal @@ -245,23 +419,43 @@ dispose(): void { } this.statusMap.clear(); this.preconditions.clear(); - this.abortMap.clear(); + this.blockedByFailure.clear(); } ``` This is critical for cleaning up when a workflow completes, fails, or is aborted. Without disposal, signal subscriptions leak. +### Full workflow abort + +For systemic failures (provider outage, authentication failure), `WorkflowReactiveRoot` provides `abortAll()`: + +```typescript +abortAll(): void { + for (const [nodeId, status] of this.statusMap) { + if (status.value !== "completed" && status.value !== "failed") { + status.value = "aborted"; + } + } + // Effects will fire and clean up any waiting/ready nodes +} +``` + +This transitions all non-terminal, non-failed nodes to `aborted`. It's for cases where the entire workflow should stop, regardless of which branches are independent. + ## Constraints - **Signals are in-memory** — `WorkflowReactiveRoot` state is not persisted. If the hub restarts, the reactive state is lost and must be reconstructed from call protocol events + template re-render. -- **Effect-driven execution is optional** — the hub coordinator can choose not to use `effect()` and instead poll `preconditions.value` manually. The reactive layer provides the building blocks; the coordinator decides how to use them. +- **Effect-driven execution is optional** — the hub coordinator can choose not to use `effect()` and instead poll `preconditions.value` and `blockedByFailure.value` manually. The reactive layer provides the building blocks; the coordinator decides how to use them. 
+- **Failure follows dependency edges, not structural scope** — a failed node causes only its downstream dependents (via DAG edges) to abort. Sibling branches in a `Parallel` group are independent and continue running. This enables partial success: one branch can fail while another completes. +- **Conditionals are error boundaries** — a `Conditional` whose test evaluates against a failed predecessor can redirect to an else branch, catching the failure. Without a `Conditional`, failures cascade uncaught through dependency edges. - **Abort is immediate in signals, delayed in protocol** — setting `status.value = "aborted"` is instant, but `prm.abort(requestId)` takes time to propagate through the call protocol. The hub should invoke both. -- **`skipped` is set by conditional evaluation, not by the call protocol** — a `Conditional` node whose test evaluates to `false` sets its child's status to `skipped`, which prevents the call from ever starting. +- **`skipped` satisfies preconditions** — a `skipped` predecessor is treated as "completed for the purpose of preconditions." It means the branch was deliberately bypassed, not broken. +- **`failed` and `aborted` block preconditions** — a `failed` or `aborted` predecessor means the dependent's preconditions can never be met. The `blockedByFailure` effect transitions the dependent to `aborted`. - **`NodeStatus` and `CallStatus` share terminal states** — `running`, `completed`, `failed`, `aborted` map directly. `idle`, `waiting`, `ready`, `skipped` are workflow-specific additions. ## Open Questions -1. **Should preconditions support OR logic?** Currently all predecessors must complete. An `anyOf` predicate would allow "start this node as soon as any predecessor completes." This would require an edge attribute or node-level configuration. +1. **Should preconditions support OR logic?** Currently all predecessors must complete (AND logic). An `anyOf` predicate would allow "start this node as soon as any predecessor completes." 
This would require an edge attribute or node-level configuration. 2. **How are retries handled at the signal level?** If an operation fails and should be retried, the status would go `running → failed → ready → running`. This requires resetting the status back to `ready`, which the current state machine doesn't support (failed is terminal). A `retried` status or a separate `retryCount` attribute may be needed. @@ -269,6 +463,10 @@ This is critical for cleaning up when a workflow completes, fails, or is aborted 4. **How does `maxConcurrency` interact with preconditions?** A `Parallel` group with `maxConcurrency: 3` should only start 3 nodes at a time, even though all preconditions are met. This is a scheduling concern, not a structural one. The reactive layer could implement this as a semaphore signal, or it could be the coordinator's responsibility. +5. **Should `blockedByFailure` be a separate `computed` or derived from `preconditions`?** Currently the design has two separate computeds — `preconditions` (all predecessors completed/skipped) and `blockedByFailure` (any predecessor failed/aborted). An alternative is a single `computed` that returns `"ready" | "blocked" | "failed"` or similar. This reduces the number of effects but makes the readiness check less composable. + +6. **What happens to running nodes when a predecessor fails?** The current spec transitions `idle` and `waiting` nodes to `aborted`. But what about a node that's already `running`? Should it be cancelled (set to `aborted` and call `prm.abort()`), or should it be allowed to complete? The answer depends on whether the running node's output is still needed — which the template author decides via `Conditional` error boundaries. 
+ ## References - ujsx reactive layer: `@alkdev/ujsx/docs/architecture/reactive-layer.md` diff --git a/docs/architecture/schema.md b/docs/architecture/schema.md index f0ffdb4..3891f31 100644 --- a/docs/architecture/schema.md +++ b/docs/architecture/schema.md @@ -97,6 +97,8 @@ type NodeStatus = Static; `NodeStatus` extends `CallStatus` with workflow-specific states (`idle`, `waiting`, `ready`, `skipped`) that have no call protocol equivalent. A node that is `waiting` has no call yet because its preconditions haven't been met. +**Precondition semantics**: A predecessor in `completed` or `skipped` status satisfies a dependent's preconditions. A predecessor in `failed` or `aborted` status does NOT satisfy preconditions — it blocks the dependent and triggers failure propagation (the dependent transitions to `aborted`). This enables partial success: independent parallel branches continue running even when one branch fails. + ### EdgeType The type of edge in a flowgraph. Matches the call graph storage schema's `edgeType` column: diff --git a/docs/reviews/001-architecture-gap-analysis.md b/docs/reviews/001-architecture-gap-analysis.md index 1e601b7..2729ea5 100644 --- a/docs/reviews/001-architecture-gap-analysis.md +++ b/docs/reviews/001-architecture-gap-analysis.md @@ -273,7 +273,7 @@ When addressing these issues, use this checklist to track progress: - [x] C-01: Fix README cross-reference link - [x] C-02: Add `CallEdgeAttrs` type alias to schema.md - [x] C-03: Resolve `OperationEdgeAttrs` vs `TypedEdgeAttrs` naming (renamed `TypedEdgeAttrs` → `OperationEdgeAttrs`) -- [ ] C-04: Specify failure propagation semantics in reactive-execution.md +- [x] C-04: Specify failure propagation semantics in reactive-execution.md (failure follows dependency edges, not structural scope; Conditionals as error boundaries; blockedByFailure computed; partial success for parallel branches) - [ ] C-05: Create FlowGraph public API document - [ ] C-06: Document `` component in 
workflow-templates.md - [ ] C-07: Specify `Conditional` else-branch behavior