diff --git a/README.md b/README.md
index a72529d..f8e238b 100644
--- a/README.md
+++ b/README.md
@@ -92,8 +92,8 @@ Sykora is tested by [CCRL](https://computerchess.org.uk/ccrl/404/). Current entr
 <summary><b>Evaluation</b>: NNUE (default) with classical fallback</summary>
 
 - **NNUE evaluation** (default, embedded in binary):
-  - `SYKNNUE3` and `SYKNNUE4` network loading
-  - Legacy `768 -> Nx2 -> 1` and mirrored king-bucketed sparse-input nets
+  - `SYKNNUE5` material-output-bucket nets, with compatibility for the current embedded net
+  - Mirrored king-bucketed sparse-input nets
   - SCReLU activation with incremental accumulators during search
   - Trained on high-depth self-play data via the Bullet trainer
   - King-bucket training path via `nnue/bullet_repo/examples/sykora_bucketed.rs`
@@ -260,7 +260,7 @@ See `history/README.md` for folder schema and the archived workflow.
 
 ## NNUE
 
-Sykora supports both legacy `768 -> Nx2 -> 1` nets and mirrored king-bucketed nets with dual-perspective accumulator updates and SCReLU activation. The engine can load both `SYKNNUE3` and `SYKNNUE4` files.
+Sykora's current training target is `SYKNNUE5`: mirrored king-bucketed sparse inputs with dual-perspective accumulator updates, SCReLU activation, and material-count output buckets. The engine still keeps loader compatibility for the current embedded net until `src/net.sknnue` is replaced.
 
 ### Runtime
 
@@ -270,7 +270,7 @@ Sykora supports both legacy `768 -> Nx2 -> 1` nets and mirrored king-bucketed ne
 - To use a different net, set `EvalFile` to the path of an external `.sknnue` file.
 - `NnueScale` scales the NNUE score before it is fed into the search.
 
-For exact file-format details, see [specs/syknnue4_spec.md](specs/syknnue4_spec.md) and `src/nnue.zig`.
+For exact file-format details, see [specs/syknnue5_spec.md](specs/syknnue5_spec.md) and `src/nnue.zig`.
 
 ### Training Pipeline
 
@@ -284,8 +284,9 @@ python utils/nnue/bullet/train_cuda_longrun.py \
   --data-format binpack \
   --bullet-repo nnue/bullet_repo \
   --output-root nnue/models/bullet \
-  --network-format syk3 \
-  --hidden 256 --end-superbatch 320 --threads 8
+  --network-format syk5 \
+  --bucket-layout sykora16 \
+  --hidden 512 --end-superbatch 320 --threads 8
 ```
 
 **Using BulletFormat .data files:**
@@ -295,8 +296,9 @@ python utils/nnue/bullet/train_cuda_longrun.py \
   --dataset nnue/data/bullet/train/train_main.data \
   --bullet-repo nnue/bullet_repo \
   --output-root nnue/models/bullet \
-  --network-format syk3 \
-  --hidden 256 --end-superbatch 320 --threads 8
+  --network-format syk5 \
+  --bucket-layout sykora16 \
+  --hidden 512 --end-superbatch 320 --threads 8
 ```
 
 **Multiple datasets** can be passed space-separated:
@@ -319,16 +321,15 @@ python utils/nnue/bullet/train_cuda_longrun.py \
   ...
 ```
 
-**Training a `SYKNNUE4` baseline:**
+**Training a `SYKNNUE5` material-output-bucket net:**
 
 ```bash
 python utils/nnue/bullet/train_cuda_longrun.py \
   --dataset data/training.binpack \
   --data-format binpack \
-  --network-format syk4 \
+  --network-format syk5 \
   --bucket-layout sykora16 \
-  --hidden 1536 \
-  --dense-l1 16 --dense-l2 32 \
+  --hidden 512 \
   --end-superbatch 320 --threads 8
 ```
 
@@ -343,15 +344,15 @@ Sykora can generate its own training data via the `gensfen` command:
 
 ### Exporting a Trained Net
 
-Export a `SYKNNUE4` checkpoint:
+Export a `SYKNNUE5` checkpoint:
 
 ```bash
 python utils/nnue/bullet/checkpoint_raw_to_npz.py \
   --input nnue/models/bullet/<run_id>/checkpoints/<checkpoint> \
-  --output checkpoint_syk4.npz
+  --output checkpoint_syk5.npz
 
-python utils/nnue/bullet/export_npz_to_syk4.py \
-  --input checkpoint_syk4.npz \
+python utils/nnue/bullet/export_npz_to_syk5.py \
+  --input checkpoint_syk5.npz \
   --output-net output.sknnue
 ```
 
@@ -379,7 +380,7 @@ python utils/nnue/bullet/gate_checkpoints.py \
 
 This gate now evaluates recent checkpoints by selfplay only. STS is intentionally not part of the checkpoint promotion path.
 
-SYKNNUE4 design spec: `specs/syknnue4_spec.md`.
+SYKNNUE5 design spec: `specs/syknnue5_spec.md`.
 
 ## Contributing
 
diff --git a/launch_training.ps1 b/launch_training.ps1
index 3cb48b2..40de935 100644
--- a/launch_training.ps1
+++ b/launch_training.ps1
@@ -1,4 +1,4 @@
-# Sykora NNUE V4 Training Launch Script
+# Sykora NNUE V5 Training Launch Script
 # Run from project root: .\launch_training.ps1
 #
 # Dataset: T80-2023 (jun-dec) + T80-2024 (jan-jun) .min-v2.v6 binpacks
@@ -60,11 +60,12 @@ foreach ($bp in $binpacks) {
 }
 
 # --- Training Parameters ---
-# SYKNNUE4 baseline:
-# mirrored king buckets (sykora16) -> FT 768 -> shared linear output
-$networkFormat = "syk4"
+# SYKNNUE5:
+# mirrored king buckets (sykora16) -> FT 512 -> 8 material-count output heads
+$networkFormat = "syk5"
 $bucketLayout = "sykora16"
-$hidden = 768
+$hidden = 512
+$outputBuckets = 8
 $endSuperbatch = 600
 $lrStart = 0.001
 $wdl = 0.25
@@ -72,7 +73,7 @@ $saveRate = 10
 $threads = 8
 
 Write-Host "============================================"
-Write-Host "  Sykora NNUE V4 Training (RTX 4070 Ti SUPER)"
+Write-Host "  Sykora NNUE V5 Training (RTX 4070 Ti SUPER)"
 Write-Host "============================================"
 Write-Host "Data:          T80-2023/2024 filtered set"
 Write-Host "Filtering:     .min-v2.v6 on T80 inputs"
@@ -81,7 +82,8 @@ Write-Host "Format:        binpack (sfbinpack)"
 Write-Host "Net format:    $networkFormat"
 Write-Host "Bucket layout: $bucketLayout"
 Write-Host "FT hidden:     $hidden"
-Write-Host "Dense head:    linear $($hidden * 2) -> 1"
+Write-Host "Output heads:  $outputBuckets material-count buckets"
+Write-Host "Dense head:    bucketed linear $($hidden * 2) -> 1"
 Write-Host "Superbatches:  1 -> $endSuperbatch"
 Write-Host "Save rate:     every $saveRate superbatches"
 Write-Host "Threads:       $threads"
@@ -104,6 +106,7 @@ python "$PSScriptRoot\utils\nnue\bullet\train_cuda_longrun.py" `
     --network-format $networkFormat `
     --bucket-layout $bucketLayout `
     --hidden $hidden `
+    --output-buckets $outputBuckets `
     --end-superbatch $endSuperbatch `
     --save-rate $saveRate `
     --threads $threads `
diff --git a/specs/syknnue4_spec.md b/specs/syknnue4_spec.md
deleted file mode 100644
index da5de16..0000000
--- a/specs/syknnue4_spec.md
+++ /dev/null
@@ -1,342 +0,0 @@
-# SYKNNUE4 Design Spec
-
-## Goal
-
-`SYKNNUE4` is the simple, stable baseline Sykora NNUE format.
-
-The design goal is:
-
-- keep the sparse incremental part large
-- keep the head shared
-- stay close to the already-working v3 math
-- make the file format self-describing for mirrored king-bucket inputs
-
-The baseline `SYKNNUE4` net is:
-
-```text
-king_buckets_mirrored(16 buckets)
--> shared sparse FT, width 768, two color-fixed accumulators
--> concat(screlu(A_us), screlu(A_them))   # 1536 inputs
--> shared linear output
-```
-
-Short form:
-
-```text
-shared FT: 12288 -> 768, color-fixed dual perspective
--> concat(us, them) -> 1
-```
-
-This is intentionally a monotonic upgrade from the v3 family:
-
-- same shared-head philosophy
-- same SCReLU inference contract
-- wider FT
-- explicit mirrored king-bucket layout stored in the file
-
-## Non-Goals
-
-The first `SYKNNUE4` implementation should not include:
-
-- multiple output heads
-- multi-layer dense heads
-- PSQT side channels
-- product pooling
-- mixed float/int inference
-- approximate rescale rules in the reference path
-
-## Architecture
-
-### Inputs
-
-- Feature set: `king_buckets_mirrored`
-- Per-bucket base feature size: `768`
-- Default input bucket count: `16`
-- Bucket layout: stored explicitly in the file
-- Horizontal mirroring: enabled
-- Training-only factorization is allowed, but exported nets must contain merged
-  sparse weights only
-
-Per perspective:
-
-```text
-INPUT_SIZE = 768
-INPUT_BUCKET_COUNT = 16
-HORIZONTAL_MIRRORING = true
-```
-
-Feature indexing is defined for color-fixed perspectives `white` and `black`,
-not for side-to-move / side-not-to-move.
-
-For a perspective `p`:
-
-```text
-feature =
-    king_bucket(p.king_sq) * 768
-  + relative_color(piece, p) * (6 * 64)
-  + piece_type * 64
-  + mirrored_square(p.king_sq, sq)
-```
-
-### Sparse Transformer
-
-The sparse transformer is:
-
-```text
-SparseAffine(768, 768) per king bucket
-```
-
-Maintain two color-fixed accumulators:
-
-- `A_white[768]`
-- `A_black[768]`
-
-For the reference implementation, store these accumulators as `i32`.
-
-At evaluation time:
-
-```text
-if side_to_move == white:
-    A_us   = A_white
-    A_them = A_black
-else:
-    A_us   = A_black
-    A_them = A_white
-```
-
-### Hidden Activation
-
-For each hidden accumulator entry:
-
-```text
-u = clamp(A_us[i],   0, Q0)
-t = clamp(A_them[i], 0, Q0)
-```
-
-Apply the activation selected by `activation_type`:
-
-- `0 = ReLU`
-- `1 = SCReLU`
-
-Baseline `SYKNNUE4` uses `SCReLU`.
-
-For `SCReLU`:
-
-```text
-U[i] = u * u
-T[i] = t * t
-```
-
-Concatenate:
-
-```text
-X = [U, T]
-```
-
-So:
-
-- `X` has length `2 * H`
-- with the baseline `H = 768`, `X` has length `1536`
-- each entry is in the `Q0^2` domain for `SCReLU`
-
-### Output Head
-
-The output head is shared. There are no phase-specific output stacks.
-
-```text
-Out: Affine(2 * H, 1)
-```
-
-## Quantization Contract
-
-Use the following constants:
-
-```text
-Q0 = 255
-Q  = 64
-SCALE = 400
-```
-
-Interpretation:
-
-- `Q0`: sparse hidden clamp / scale
-- `Q`: output-weight scale
-- `SCALE`: final centipawn conversion
-
-All float-to-int quantization in this spec uses:
-
-```text
-quantize_round(x, scale) =
-    if x >= 0:
-        floor(x * scale + 0.5)
-    else:
-        -floor((-x) * scale + 0.5)
-```
-
-This is round-to-nearest with ties away from zero.
-
-### Hidden FT Storage
-
-Export the sparse branch as:
-
-- hidden biases: `i16`
-- hidden weights: `i16`
-
-Quantization:
-
-```text
-hidden_bias_int   = quantize_round(hidden_bias_float, Q0)
-hidden_weight_int = quantize_round(hidden_weight_float, Q0)
-```
-
-### Output Head Storage
-
-Export the shared output head as:
-
-- output weights: `i16`
-- output bias: `i32`
-
-Quantization:
-
-```text
-out_weight_int = quantize_round(out_weight_float, Q)
-out_bias_int   = quantize_round(out_bias_float, Q0 * Q)
-```
-
-## Integer Inference Contract
-
-### Hidden Accumulators
-
-The reference accumulator update path sums stored sparse integers directly:
-
-```text
-A_white[i] = hidden_bias_int[i] + sum(active white-perspective feature weights)
-A_black[i] = hidden_bias_int[i] + sum(active black-perspective feature weights)
-```
-
-### Output Evaluation
-
-For `SCReLU`:
-
-```text
-sum_int =
-    Σ_i (clamp(A_us[i],   0, Q0)^2 * out_weight_int[i])
-  + Σ_i (clamp(A_them[i], 0, Q0)^2 * out_weight_int[H + i])
-```
-
-Rescale by one factor of `Q0` before adding bias:
-
-```text
-sum_rescaled = div_round_nearest_signed(sum_int, Q0)
-z_int = sum_rescaled + out_bias_int
-```
-
-Convert to centipawns:
-
-```text
-eval_cp = div_round_nearest_signed(z_int * SCALE, Q0 * Q)
-```
-
-For `ReLU`, omit the squaring and the intermediate `/Q0` rescale:
-
-```text
-sum_int =
-    Σ_i (clamp(A_us[i],   0, Q0) * out_weight_int[i])
-  + Σ_i (clamp(A_them[i], 0, Q0) * out_weight_int[H + i])
-
-z_int = sum_int + out_bias_int
-eval_cp = div_round_nearest_signed(z_int * SCALE, Q0 * Q)
-```
-
-### Signed Division
-
-The reference path uses signed round-to-nearest:
-
-```text
-div_round_nearest_signed(x, d) =
-    if x >= 0:
-        (x + d / 2) / d
-    else:
-        -(((-x) + d / 2) / d)
-```
-
-This is the reference contract to match across trainer, exporter, and runtime.
-
-## File Format
-
-All integers are little-endian.
-
-### Header
-
-```text
-u8[8]     magic                 = "SYKNNUE4"
-u16       format_version        = 4
-u8        feature_set           = 1   # king_buckets_mirrored
-u16       ft_hidden_size        # baseline 768
-u8        activation_type       # baseline 1 = SCReLU
-u8        input_bucket_count    # baseline 16
-u16       q0                    # baseline 255
-u16       q                     # baseline 64
-u16       scale                 # baseline 400
-u8[64]    bucket_layout_64
-```
-
-### Payload
-
-Let:
-
-- `I = 768 * input_bucket_count`
-- `H = ft_hidden_size`
-
-Payload order:
-
-```text
-i32                    output_bias
-i16[H]                 ft_biases
-i16[I * H]             ft_weights
-i16[2 * H]             output_weights
-```
-
-Weight order:
-
-- `ft_weights[input_feature][hidden]`
-- `output_weights[0..H]` are `us`
-- `output_weights[H..2H]` are `them`
-
-## Loader Validation
-
-A loader should reject nets where:
-
-- `magic != "SYKNNUE4"`
-- `format_version != 4`
-- `feature_set != 1`
-- `ft_hidden_size == 0`
-- `input_bucket_count == 0`
-- any `bucket_layout_64` entry is `>= input_bucket_count`
-- `q0 == 0`
-- `q == 0`
-- `scale == 0`
-- payload size does not match the header
-
-## Baseline Defaults
-
-Baseline values:
-
-```text
-feature_set       = king_buckets_mirrored
-input_bucket_count = 16
-ft_hidden_size    = 768
-activation_type   = SCReLU
-q0                = 255
-q                 = 64
-scale             = 400
-```
-
-## Reference Implementation Priorities
-
-If implementing or training this architecture, the recommended order is:
-
-1. make the sparse update path correct
-2. make exporter and runtime agree bit-for-bit on fixed FENs
-3. validate the shared-head model against v3-like sanity positions
-4. only then consider widening the FT or adding extra head complexity
diff --git a/specs/syknnue5_spec.md b/specs/syknnue5_spec.md
new file mode 100644
index 0000000..45b9f48
--- /dev/null
+++ b/specs/syknnue5_spec.md
@@ -0,0 +1,83 @@
+# SYKNNUE5 Design Spec
+
+`SYKNNUE5` is Sykora's current king-bucketed SCReLU training target with
+material-count output buckets.
+
+## Architecture
+
+```text
+king_buckets_mirrored(16 buckets)
+-> shared sparse FT, H hidden units, color-fixed dual perspective
+-> concat(screlu(A_us), screlu(A_them))
+-> material-count bucketed linear output head
+```
+
+The first intended training target is:
+
+```text
+shared FT: 12288 -> 512
+-> concat(us, them): 1024
+-> 8 material-count output heads
+```
+
+`H = 768` is the larger follow-up target.
+
+## Output Buckets
+
+The output bucket selector matches Bullet's `MaterialCount<8>`:
+
+```text
+piece_count = popcount(occupied)
+non_king_count = piece_count - 2
+divisor = ceil(32 / output_bucket_count)
+output_bucket = min(floor(non_king_count / divisor), output_bucket_count - 1)
+```
+
+With the default `output_bucket_count = 8`, the divisor is `4`.
+
+## File Format
+
+All integers are little-endian.
+
+```text
+u8[8]  magic = "SYKNNUE5"
+u16    version = 5
+u8     feature_set = 1                  # king_buckets_mirrored
+u16    ft_hidden_size = H
+u8     activation_type                  # 0 = ReLU, 1 = SCReLU
+u8     input_bucket_count
+u8     output_bucket_count
+u16    q0
+u16    q
+u16    scale
+u8[64] bucket_layout
+i16[H] ft_biases
+i16[input_bucket_count * 768 * H] ft_weights
+i32[output_bucket_count] output_biases
+i16[output_bucket_count * 2 * H] output_weights
+```
+
+`output_weights` are bucket-major. For bucket `b`, the slice is:
+
+```text
+output_weights[b * 2H .. (b + 1) * 2H]
+```
+
+The first `H` weights apply to `A_us`; the second `H` apply to `A_them`.
+
+## Quantization
+
+The baseline constants are:
+
+```text
+Q0 = 255
+Q  = 64
+SCALE = 400
+```
+
+With SCReLU, the output dot product is divided by `Q0` (rounded to nearest) before
+the selected bucket's bias is added. The result is converted to centipawns with:
+
+```text
+score = round(sum * SCALE / (Q0 * Q))
+```
diff --git a/src/nnue.zig b/src/nnue.zig
index ecdb0cd..62fef27 100644
--- a/src/nnue.zig
+++ b/src/nnue.zig
@@ -13,9 +13,9 @@ pub const SCALE: i32 = 400;
 const MAX_NETWORK_BYTES = 64 * 1024 * 1024;
 
 const MAGIC_V3 = "SYKNNUE3";
-const MAGIC_V4 = "SYKNNUE4";
+const MAGIC_V5 = "SYKNNUE5";
 const FORMAT_VERSION_V3: u16 = 3;
-const FORMAT_VERSION_V4: u16 = 4;
+const FORMAT_VERSION_V5: u16 = 5;
 
 pub const FeatureSet = enum(u8) {
     legacy_psqt = 0,
@@ -43,13 +43,14 @@ pub const Network = struct {
         output_bias: i32,
     };
 
-    pub const V4Head = struct {
+    pub const V5Head = struct {
         activation_type: u8, // 0 = ReLU, 1 = SCReLU
         q0: u16,
         q: u16,
         scale: u16,
-        output_weights: []i16, // [2 * H]
-        output_bias: i32,
+        output_bucket_count: u8,
+        output_weights: []i16, // [output_bucket_count * 2 * H], bucket-major
+        output_biases: []i32, // [output_bucket_count]
     };
 
     allocator: std.mem.Allocator,
@@ -61,7 +62,7 @@ pub const Network = struct {
     ft_weights: []i16,
     head: union(enum) {
         v3: V3Head,
-        v4: V4Head,
+        v5: V5Head,
     },
 
     pub fn deinit(self: *Network) void {
@@ -71,8 +72,9 @@ pub const Network = struct {
             .v3 => |v3| {
                 self.allocator.free(v3.output_weights);
             },
-            .v4 => |v4| {
-                self.allocator.free(v4.output_weights);
+            .v5 => |v5| {
+                self.allocator.free(v5.output_weights);
+                self.allocator.free(v5.output_biases);
             },
         }
     }
@@ -83,8 +85,8 @@ pub const Network = struct {
         if (std.mem.eql(u8, data[0..8], MAGIC_V3)) {
             return loadFromBytesV3(allocator, data);
         }
-        if (std.mem.eql(u8, data[0..8], MAGIC_V4)) {
-            return loadFromBytesV4(allocator, data);
+        if (std.mem.eql(u8, data[0..8], MAGIC_V5)) {
+            return loadFromBytesV5(allocator, data);
         }
         return error.UnsupportedVersion;
     }
@@ -144,21 +146,29 @@ fn checkedAddU64(a: u64, b: u64) ?u64 {
     return std.math.add(u64, a, b) catch null;
 }
 
-fn computeV4PayloadBytes(
+fn computeV5PayloadBytes(
     input_size: usize,
     ft_hidden_size: usize,
+    output_bucket_count: usize,
 ) ?u64 {
     var total: u64 = 0;
 
-    const ft_bias_bytes = checkedMulU64(@as(u64, @intCast(ft_hidden_size)), @sizeOf(i16)) orelse return null;
+    const hidden_size_u64: u64 = @intCast(ft_hidden_size);
+    const ft_bias_bytes = checkedMulU64(hidden_size_u64, @sizeOf(i16)) orelse return null;
     total = checkedAddU64(total, ft_bias_bytes) orelse return null;
 
-    const ft_weight_count = checkedMulU64(@as(u64, @intCast(input_size)), @as(u64, @intCast(ft_hidden_size))) orelse return null;
+    const ft_weight_count = checkedMulU64(@as(u64, @intCast(input_size)), hidden_size_u64) orelse return null;
     total = checkedAddU64(total, checkedMulU64(ft_weight_count, @sizeOf(i16)) orelse return null) orelse return null;
 
-    total = checkedAddU64(total, @sizeOf(i32)) orelse return null;
-    const out_weight_count = checkedMulU64(2, @as(u64, @intCast(ft_hidden_size))) orelse return null;
-    total = checkedAddU64(total, checkedMulU64(out_weight_count, @sizeOf(i16)) orelse return null) orelse return null;
+    const bias_bytes = checkedMulU64(@as(u64, @intCast(output_bucket_count)), @sizeOf(i32)) orelse return null;
+    total = checkedAddU64(total, bias_bytes) orelse return null;
+
+    const single_head_weight_count = checkedMulU64(2, hidden_size_u64) orelse return null;
+    const output_weight_count = checkedMulU64(
+        @as(u64, @intCast(output_bucket_count)),
+        single_head_weight_count,
+    ) orelse return null;
+    total = checkedAddU64(total, checkedMulU64(output_weight_count, @sizeOf(i16)) orelse return null) orelse return null;
 
     return total;
 }
@@ -229,11 +239,11 @@ fn loadFromBytesV3(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
     };
 }
 
-fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Network {
+fn loadFromBytesV5(allocator: std.mem.Allocator, data: []const u8) LoadError!Network {
     var pos: usize = 8;
 
     const version = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
-    if (version != FORMAT_VERSION_V4) return error.UnsupportedVersion;
+    if (version != FORMAT_VERSION_V5) return error.UnsupportedVersion;
 
     if (pos >= data.len) return error.InvalidNetwork;
     const feature_set = std.meta.intToEnum(FeatureSet, data[pos]) catch return error.InvalidNetwork;
@@ -255,6 +265,11 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
     pos += 1;
     if (bucket_count == 0) return error.InvalidNetwork;
 
+    if (pos >= data.len) return error.InvalidNetwork;
+    const output_bucket_count = data[pos];
+    pos += 1;
+    if (output_bucket_count == 0) return error.InvalidNetwork;
+
     const q0 = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
     const q = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
     const scale = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
@@ -269,21 +284,24 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
     }
 
     const input_size = LEGACY_INPUT_SIZE * @as(usize, bucket_count);
-    const payload_size = computeV4PayloadBytes(
+    const payload_size = computeV5PayloadBytes(
         input_size,
         ft_hidden_size,
+        output_bucket_count,
     ) orelse return error.InvalidNetwork;
     const expected_size = checkedAddU64(@as(u64, @intCast(pos)), payload_size) orelse return error.InvalidNetwork;
     if (expected_size != data.len) return error.InvalidNetwork;
 
-    const output_bias = readBytesInt(i32, data, &pos) orelse return error.InvalidNetwork;
     const ft_biases = try allocAndReadInts(i16, allocator, data, &pos, ft_hidden_size);
     errdefer allocator.free(ft_biases);
 
     const ft_weights = try allocAndReadInts(i16, allocator, data, &pos, input_size * ft_hidden_size);
     errdefer allocator.free(ft_weights);
 
-    const output_weights = try allocAndReadInts(i16, allocator, data, &pos, 2 * ft_hidden_size);
+    const output_biases = try allocAndReadInts(i32, allocator, data, &pos, output_bucket_count);
+    errdefer allocator.free(output_biases);
+
+    const output_weights = try allocAndReadInts(i16, allocator, data, &pos, output_bucket_count * 2 * ft_hidden_size);
     errdefer allocator.free(output_weights);
 
     if (pos != data.len) return error.InvalidNetwork;
@@ -297,13 +315,14 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
         .ft_biases = ft_biases,
         .ft_weights = ft_weights,
         .head = .{
-            .v4 = .{
+            .v5 = .{
                 .activation_type = activation_type,
                 .q0 = q0,
                 .q = q,
                 .scale = scale,
+                .output_bucket_count = output_bucket_count,
                 .output_weights = output_weights,
-                .output_bias = output_bias,
+                .output_biases = output_biases,
             },
         },
     };
@@ -1043,10 +1062,19 @@ pub fn updateAccumulators(
     }
 }
 
-fn evaluateV4FromAccumulators(
+inline fn materialCountOutputBucket(b: *Board, output_bucket_count: u8) usize {
+    const piece_count = @popCount(b.board.occupied());
+    const non_king_count = if (piece_count >= 2) piece_count - 2 else 0;
+    const divisor = (32 + @as(usize, output_bucket_count) - 1) / @as(usize, output_bucket_count);
+    const bucket = non_king_count / divisor;
+    return @min(bucket, @as(usize, output_bucket_count) - 1);
+}
+
+fn evaluateV5FromAccumulators(
     net: *const Network,
-    head: *const Network.V4Head,
+    head: *const Network.V5Head,
     acc: *const AccumulatorPair,
+    b: *Board,
     stm_is_white: bool,
 ) i32 {
     const hidden_size: usize = @intCast(net.ft_hidden_size);
@@ -1055,17 +1083,20 @@ fn evaluateV4FromAccumulators(
     const scale: i32 = head.scale;
     const use_screlu = head.activation_type == 1;
     const final_den: i64 = @as(i64, q0) * @as(i64, q);
+    const output_bucket = materialCountOutputBucket(b, head.output_bucket_count);
+    const weights_base = output_bucket * 2 * hidden_size;
+    const weights = head.output_weights[weights_base .. weights_base + 2 * hidden_size];
 
     const us_acc = if (stm_is_white) acc.white[0..hidden_size] else acc.black[0..hidden_size];
     const them_acc = if (stm_is_white) acc.black[0..hidden_size] else acc.white[0..hidden_size];
 
-    var sum = activatedDot(us_acc, head.output_weights[0..hidden_size], hidden_size, head.activation_type, q0) +
-        activatedDot(them_acc, head.output_weights[hidden_size .. 2 * hidden_size], hidden_size, head.activation_type, q0);
+    var sum = activatedDot(us_acc, weights[0..hidden_size], hidden_size, head.activation_type, q0) +
+        activatedDot(them_acc, weights[hidden_size .. 2 * hidden_size], hidden_size, head.activation_type, q0);
 
     if (use_screlu) {
         sum = divRoundNearestSigned(sum, q0);
     }
-    sum += head.output_bias;
+    sum += head.output_biases[output_bucket];
     return @intCast(divRoundNearestSigned(sum * scale, final_den));
 }
 
@@ -1100,7 +1131,7 @@ pub fn evaluateFromAccumulators(
     const stm_is_white = b.board.move == .white;
     return switch (net.head) {
         .v3 => |*head| evaluateV3FromAccumulators(net, head, acc, stm_is_white),
-        .v4 => |*head| evaluateV4FromAccumulators(net, head, acc, stm_is_white),
+        .v5 => |*head| evaluateV5FromAccumulators(net, head, acc, b, stm_is_white),
     };
 }
 
diff --git a/utils/nnue/bullet/checkpoint_raw_to_npz.py b/utils/nnue/bullet/checkpoint_raw_to_npz.py
index 8aaca7a..5009583 100644
--- a/utils/nnue/bullet/checkpoint_raw_to_npz.py
+++ b/utils/nnue/bullet/checkpoint_raw_to_npz.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Convert a SYKNNUE4 Bullet checkpoint raw.bin into explicit NPZ tensors."""
+"""Convert a Sykora Bullet checkpoint raw.bin into explicit NPZ tensors."""
 
 from __future__ import annotations
 
@@ -14,8 +14,8 @@
 
 from common import (  # noqa: E402
     SCALE,
-    V4_Q0,
-    V4_Q,
+    NNUE_Q0,
+    NNUE_Q,
     SYKORA16_BUCKET_LAYOUT_32,
     expand_mirrored_bucket_layout,
 )
@@ -23,7 +23,7 @@
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Convert a SYKNNUE4 Bullet raw checkpoint into NPZ tensors."
+        description="Convert a SYKNNUE5 Bullet raw checkpoint into NPZ tensors."
     )
     parser.add_argument(
         "--input",
@@ -76,25 +76,29 @@ def take_f32(buf, offset: int, count: int):
 
 
 def expected_raw_sizes(
-    *, bucket_count: int, ft_hidden: int
+    *, bucket_count: int, ft_hidden: int, network_format: str, output_bucket_count: int
 ) -> dict[str, int]:
     input_size = 768 * bucket_count
+    if network_format != "syk5":
+        raise ValueError(f"unsupported network format: {network_format}")
     return {
-        "spec_merged_ft": (
+        "syk5_output_buckets": (
             input_size * ft_hidden
             + ft_hidden
-            + (2 * ft_hidden)
-            + 1
+            + (output_bucket_count * 2 * ft_hidden)
+            + output_bucket_count
         ),
     }
 
 
 def detect_layout(
-    *, raw_len: int, bucket_count: int, ft_hidden: int
+    *, raw_len: int, bucket_count: int, ft_hidden: int, network_format: str, output_bucket_count: int
 ) -> str:
     sizes = expected_raw_sizes(
         bucket_count=bucket_count,
         ft_hidden=ft_hidden,
+        network_format=network_format,
+        output_bucket_count=output_bucket_count,
     )
     for name, expected in sizes.items():
         if raw_len == expected:
@@ -109,10 +113,10 @@ def parse_network_config(run_meta: dict) -> dict:
     network = dict(run_meta.get("network", {}))
     env = run_meta.get("env", {})
 
-    network_format = network.get("format") or env.get("SYK_NETWORK_FORMAT") or "syk4"
-    if network_format != "syk4":
+    network_format = network.get("format") or env.get("SYK_NETWORK_FORMAT") or "syk5"
+    if network_format != "syk5":
         raise ValueError(
-            f"run_meta.json does not describe a SYKNNUE4 run: {network_format!r}"
+            f"run_meta.json does not describe a SYKNNUE5 run: {network_format!r}"
         )
 
     if "bucket_layout_64" in network:
@@ -130,6 +134,11 @@ def parse_network_config(run_meta: dict) -> dict:
         "format": network_format,
         "bucket_layout_64": bucket_layout_64,
         "ft_hidden": int(network.get("ft_hidden") or env["SYK_HIDDEN"]),
+        "output_bucket_count": int(
+            network.get("output_bucket_count")
+            or env.get("SYK_OUTPUT_BUCKETS")
+            or 8
+        ),
     }
 
 
@@ -148,6 +157,8 @@ def main() -> int:
     bucket_layout_64 = [int(v) for v in network["bucket_layout_64"]]
     bucket_count = max(bucket_layout_64) + 1
     ft_hidden = int(network["ft_hidden"])
+    network_format = str(network["format"])
+    output_bucket_count = int(network["output_bucket_count"])
     input_size = 768 * bucket_count
 
     raw = np.fromfile(raw_path, dtype="<f4")
@@ -155,13 +166,17 @@ def main() -> int:
         raw_len=raw.shape[0],
         bucket_count=bucket_count,
         ft_hidden=ft_hidden,
+        network_format=network_format,
+        output_bucket_count=output_bucket_count,
     )
     offset = 0
 
     l0w, offset = take_f32(raw, offset, input_size * ft_hidden)
     l0b, offset = take_f32(raw, offset, ft_hidden)
-    outw, offset = take_f32(raw, offset, 2 * ft_hidden)
-    outb, offset = take_f32(raw, offset, 1)
+    outw_len = output_bucket_count * 2 * ft_hidden
+    outb_len = output_bucket_count
+    outw, offset = take_f32(raw, offset, outw_len)
+    outb, offset = take_f32(raw, offset, outb_len)
 
     if offset != raw.shape[0]:
         raise ValueError(
@@ -170,8 +185,8 @@ def main() -> int:
 
     ft_weights = l0w.reshape(input_size, ft_hidden)
     ft_bias = l0b.reshape(ft_hidden)
-    out_weights = outw.reshape(2 * ft_hidden)
-    out_bias = outb.reshape(1)
+    out_weights = outw.reshape(output_bucket_count, 2 * ft_hidden)
+    out_bias = outb.reshape(output_bucket_count)
 
     out_path = Path(args.output)
     out_path.parent.mkdir(parents=True, exist_ok=True)
@@ -184,19 +199,21 @@ def main() -> int:
         bucket_layout_64=np.asarray(bucket_layout_64, dtype=np.uint8),
         feature_set=np.asarray([1], dtype=np.uint8),
         input_bucket_count=np.asarray([bucket_count], dtype=np.uint8),
+        output_bucket_count=np.asarray([output_bucket_count], dtype=np.uint8),
         activation_type=np.asarray([1], dtype=np.uint8),
-        q0=np.asarray([V4_Q0], dtype=np.uint16),
-        q=np.asarray([V4_Q], dtype=np.uint16),
+        q0=np.asarray([NNUE_Q0], dtype=np.uint16),
+        q=np.asarray([NNUE_Q], dtype=np.uint16),
         scale=np.asarray([SCALE], dtype=np.uint16),
     )
 
     print(f"Input: {raw_path}")
     print(f"Run metadata: {run_meta_path}")
-    print("Network format: SYKNNUE4")
+    print(f"Network format: {network_format.upper()}")
     print(f"Detected raw layout: {layout}")
     print(f"Bucket count: {bucket_count}")
     print(f"FT hidden: {ft_hidden}")
-    print(f"Dense head: linear {2 * ft_hidden} -> 1")
+    print(f"Output buckets: {output_bucket_count}")
+    print(f"Dense head: bucketed linear {2 * ft_hidden} -> 1")
     print(f"Wrote: {out_path}")
     return 0
 
diff --git a/utils/nnue/bullet/export_npz_to_syk4.py b/utils/nnue/bullet/export_npz_to_syk5.py
similarity index 76%
rename from utils/nnue/bullet/export_npz_to_syk4.py
rename to utils/nnue/bullet/export_npz_to_syk5.py
index b5465d4..841fe32 100644
--- a/utils/nnue/bullet/export_npz_to_syk4.py
+++ b/utils/nnue/bullet/export_npz_to_syk5.py
@@ -1,5 +1,5 @@
 #!/usr/bin/env python3
-"""Convert a float-domain NPZ checkpoint into Sykora SYKNNUE4 format."""
+"""Convert a float-domain NPZ checkpoint into Sykora SYKNNUE5 format."""
 
 from __future__ import annotations
 
@@ -16,18 +16,18 @@
     ACTIVATION_SCRELU,
     FEATURE_SET_KING_BUCKETS_MIRRORED,
     SCALE,
-    V4_Q,
-    V4_Q0,
+    NNUE_Q,
+    NNUE_Q0,
     SYKORA16_BUCKET_LAYOUT_32,
     expand_mirrored_bucket_layout,
     input_size_for_feature_set,
-    write_syk_nnue_v4,
+    write_syk_nnue_v5,
 )
 
 
 def parse_args() -> argparse.Namespace:
     parser = argparse.ArgumentParser(
-        description="Export NPZ checkpoint to SYKNNUE4 net."
+        description="Export NPZ checkpoint to SYKNNUE5 net."
     )
     parser.add_argument("--input", required=True, help="Input .npz checkpoint")
     parser.add_argument("--output-net", required=True, help="Output .sknnue path")
@@ -73,7 +73,7 @@ def main() -> int:
     with np.load(in_path) as ckpt:
         ft_weights = np.asarray(expect_array(ckpt, "ft_weights"), dtype=np.float32)
         ft_bias = np.asarray(expect_array(ckpt, "ft_bias"), dtype=np.float32).reshape(-1)
-        out_weights = np.asarray(expect_array(ckpt, "out_weights"), dtype=np.float32).reshape(-1)
+        out_weights = np.asarray(expect_array(ckpt, "out_weights"), dtype=np.float32)
         out_bias = np.asarray(expect_array(ckpt, "out_bias"), dtype=np.float32).reshape(-1)
 
         if "bucket_layout_64" in ckpt:
@@ -92,8 +92,8 @@ def main() -> int:
         else:
             activation_type = ACTIVATION_SCRELU
 
-        q0 = int(np.asarray(ckpt["q0"]).reshape(-1)[0]) if "q0" in ckpt else V4_Q0
-        q = int(np.asarray(ckpt["q"]).reshape(-1)[0]) if "q" in ckpt else V4_Q
+        q0 = int(np.asarray(ckpt["q0"]).reshape(-1)[0]) if "q0" in ckpt else NNUE_Q0
+        q = int(np.asarray(ckpt["q"]).reshape(-1)[0]) if "q" in ckpt else NNUE_Q
         scale = int(np.asarray(ckpt["scale"]).reshape(-1)[0]) if "scale" in ckpt else SCALE
 
     if args.q0 is not None:
@@ -104,7 +104,7 @@ def main() -> int:
         scale = args.scale
 
     if feature_set != FEATURE_SET_KING_BUCKETS_MIRRORED:
-        raise ValueError("SYKNNUE4 only supports king_buckets_mirrored inputs")
+        raise ValueError("SYKNNUE5 only supports king_buckets_mirrored inputs")
     if len(bucket_layout) != 64:
         raise ValueError(f"bucket_layout_64 must have 64 entries, got {len(bucket_layout)}")
 
@@ -120,22 +120,35 @@ def main() -> int:
         raise ValueError(
             f"ft_bias length mismatch: expected {ft_hidden_size}, got {ft_bias.shape[0]}"
         )
-    if out_weights.shape[0] != 2 * ft_hidden_size:
+
+    out_weights = np.asarray(out_weights, dtype=np.float32)
+    if out_weights.ndim == 1:
+        if out_bias.shape[0] <= 0:
+            raise ValueError("out_bias must contain at least one output bucket")
+        output_bucket_count = out_bias.shape[0]
+        out_weights = out_weights.reshape(output_bucket_count, 2 * ft_hidden_size)
+    elif out_weights.ndim == 2:
+        output_bucket_count = out_weights.shape[0]
+    else:
+        raise ValueError(f"out_weights must be rank-1 or rank-2, got shape {out_weights.shape}")
+
+    if output_bucket_count <= 1:
+        raise ValueError("SYKNNUE5 requires more than one output bucket")
+    if out_weights.shape != (output_bucket_count, 2 * ft_hidden_size):
         raise ValueError(
-            f"out_weights shape mismatch: expected {(2 * ft_hidden_size,)}, got {out_weights.shape}"
+            f"out_weights shape mismatch: expected {(output_bucket_count, 2 * ft_hidden_size)}, got {out_weights.shape}"
         )
-    if out_bias.shape[0] != 1:
+    if out_bias.shape[0] != output_bucket_count:
         raise ValueError(
-            f"out_bias length mismatch: expected 1, got {out_bias.shape[0]}"
+            f"out_bias length mismatch: expected {output_bucket_count}, got {out_bias.shape[0]}"
         )
 
     ft_bias_i16 = quantize_clipped(ft_bias, q0, -32768, 32767, np.int16)
     ft_weights_i16 = quantize_clipped(
         ft_weights.reshape(-1), q0, -32768, 32767, np.int16
     )
-
     out_bias_i32 = quantize_clipped(
-        out_bias,
+        out_bias.reshape(-1),
         q0 * q,
         -2147483648,
         2147483647,
@@ -146,26 +159,28 @@ def main() -> int:
     )
 
     out_path = Path(args.output_net)
-    write_syk_nnue_v4(
+    write_syk_nnue_v5(
         out_path,
         ft_hidden_size=ft_hidden_size,
         ft_biases_i16=ft_bias_i16.tolist(),
         ft_weights_i16=ft_weights_i16.tolist(),
-        out_bias_i32=int(out_bias_i32[0]),
+        out_biases_i32=out_bias_i32.tolist(),
         out_weights_i16=out_weights_i16.tolist(),
         activation_type=activation_type,
         feature_set=feature_set,
         bucket_layout_64=bucket_layout,
+        output_bucket_count=output_bucket_count,
         q0=q0,
         q=q,
         scale=scale,
     )
 
     print(f"Input: {in_path}")
-    print("Output format: SYKNNUE4")
-    print(f"Bucket count: {max(bucket_layout) + 1}")
+    print("Output format: SYKNNUE5")
+    print(f"Input bucket count: {max(bucket_layout) + 1}")
+    print(f"Output bucket count: {output_bucket_count}")
     print(f"FT hidden: {ft_hidden_size}")
-    print(f"Dense head: linear {2 * ft_hidden_size} -> 1")
+    print(f"Dense head: bucketed linear {2 * ft_hidden_size} -> 1")
     print(f"Wrote: {out_path}")
     return 0
 
diff --git a/utils/nnue/bullet/gate_checkpoints.py b/utils/nnue/bullet/gate_checkpoints.py
index bc46f99..57e7631 100755
--- a/utils/nnue/bullet/gate_checkpoints.py
+++ b/utils/nnue/bullet/gate_checkpoints.py
@@ -29,8 +29,8 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--npz-to-net",
-        default="utils/nnue/bullet/export_npz_to_syk4.py",
-        help="Path to NPZ -> SYKNNUE4 exporter",
+        default="utils/nnue/bullet/export_npz_to_syk5.py",
+        help="Path to NPZ -> SYKNNUE5 exporter",
     )
     parser.add_argument(
         "--engine", default="./zig-out/bin/sykora", help="Engine under test"
@@ -172,7 +172,7 @@ def main() -> int:
 
     for ckpt in ckpts:
         npz_out = nets_dir / f"{ckpt.name}.npz"
-        net_out = nets_dir / f"{ckpt.name}.sknnue4"
+        net_out = nets_dir / f"{ckpt.name}.sknnue"
         run_capture(
             [
                 sys.executable,
diff --git a/utils/nnue/bullet/train_cuda_longrun.py b/utils/nnue/bullet/train_cuda_longrun.py
index 802cd4a..6c4cc6a 100755
--- a/utils/nnue/bullet/train_cuda_longrun.py
+++ b/utils/nnue/bullet/train_cuda_longrun.py
@@ -65,8 +65,8 @@ def parse_args() -> argparse.Namespace:
     )
     parser.add_argument(
         "--network-format",
-        choices=["syk4"],
-        default="syk4",
+        choices=["syk5"],
+        default="syk5",
         help="Training network format",
     )
     parser.add_argument(
@@ -97,6 +97,12 @@ def parse_args() -> argparse.Namespace:
     parser.add_argument(
         "--threads", type=int, default=8, help="Bullet training/data threads"
     )
+    parser.add_argument(
+        "--output-buckets",
+        type=int,
+        default=8,
+        help="SYKNNUE5 material-count output buckets (currently fixed at 8)",
+    )
 
     # Data format
     parser.add_argument(
@@ -163,6 +169,9 @@ def main() -> int:
     if args.lr_start <= 0:
         print("--lr-start must be > 0", file=sys.stderr)
         return 2
+    if args.output_buckets != 8:
+        print("SYKNNUE5 currently supports exactly 8 output buckets", file=sys.stderr)
+        return 2
     if args.save_rate <= 0 or args.threads <= 0:
         print("--save-rate and --threads must be > 0", file=sys.stderr)
         return 2
@@ -191,6 +200,7 @@ def main() -> int:
             "SYK_WDL": str(args.wdl),
             "SYK_SAVE_RATE": str(args.save_rate),
             "SYK_THREADS": str(args.threads),
+            "SYK_OUTPUT_BUCKETS": str(args.output_buckets),
             "SYK_OUTPUT_DIR": str(ckpt_dir.resolve()),
             "SYK_NET_ID": run_id,
             "SYK_DATA_FORMAT": data_format,
@@ -219,7 +229,8 @@ def main() -> int:
             "bucket_layout_64": bucket_layout_64(args.bucket_layout),
             "ft_hidden": args.hidden,
             "hidden_activation": "screlu",
-            "head": "shared_linear",
+            "head": "material_count_output_buckets",
+            "output_bucket_count": args.output_buckets,
         },
         "env": {
             "SYK_DATASET": env["SYK_DATASET"],
@@ -233,6 +244,7 @@ def main() -> int:
             "SYK_WDL": env["SYK_WDL"],
             "SYK_SAVE_RATE": env["SYK_SAVE_RATE"],
             "SYK_THREADS": env["SYK_THREADS"],
+            "SYK_OUTPUT_BUCKETS": env["SYK_OUTPUT_BUCKETS"],
             "SYK_OUTPUT_DIR": env["SYK_OUTPUT_DIR"],
             "SYK_NET_ID": env["SYK_NET_ID"],
             "SYK_DATA_FORMAT": env["SYK_DATA_FORMAT"],
diff --git a/utils/nnue/bullet_runner/src/main.rs b/utils/nnue/bullet_runner/src/main.rs
index 6dab11d..a71b81b 100644
--- a/utils/nnue/bullet_runner/src/main.rs
+++ b/utils/nnue/bullet_runner/src/main.rs
@@ -1,18 +1,16 @@
 use bullet_lib::{
     game::{
         formats::sfbinpack::TrainingDataEntry,
-        inputs::{get_num_buckets, ChessBucketsMirrored},
-    },
-    nn::{
-        optimiser::{AdamW, AdamWParams},
-        InitSettings, Shape,
+        inputs::{ChessBucketsMirrored, get_num_buckets},
+        outputs::MaterialCount,
     },
+    nn::optimiser::{AdamW, AdamWParams},
     trainer::{
         save::SavedFormat,
-        schedule::{lr, wdl, TrainingSchedule, TrainingSteps},
+        schedule::{TrainingSchedule, TrainingSteps, lr, wdl},
         settings::LocalSettings,
     },
-    value::{loader::DirectSequentialDataLoader, ValueTrainerBuilder},
+    value::{ValueTrainerBuilder, loader::DirectSequentialDataLoader},
 };
 use std::env;
 
@@ -27,7 +25,7 @@ const BUCKET_LAYOUT_SYKORA16: [usize; 32] = [
     12, 12, 13, 13,
     14, 14, 15, 15,
 ];
-
+const SYK5_OUTPUT_BUCKETS: usize = 8;
 
 fn env_usize(name: &str, default: usize) -> usize {
     env::var(name)
@@ -61,7 +59,7 @@ fn binpack_filter(entry: &TrainingDataEntry) -> bool {
         && entry.score.unsigned_abs() <= 10000
 }
 
-fn run_syk4(
+fn run_syk5(
     bucket_layout: [usize; 32],
     num_input_buckets: usize,
     dataset_paths: &[&str],
@@ -82,38 +80,25 @@ fn run_syk4(
         .dual_perspective()
         .optimiser(AdamW)
         .inputs(ChessBucketsMirrored::new(bucket_layout))
+        .output_buckets(MaterialCount::<SYK5_OUTPUT_BUCKETS>)
         .use_threads(threads)
         .save_format(&[
-            SavedFormat::id("l0w")
-                .transform(move |store, weights| {
-                    let factoriser = store.get("l0f").values.repeat(num_input_buckets);
-                    weights
-                        .into_iter()
-                        .zip(factoriser)
-                        .map(|(a, b)| a + b)
-                        .collect()
-                })
-                .round()
-                .quantise::<i16>(255),
+            SavedFormat::id("l0w").round().quantise::<i16>(255),
             SavedFormat::id("l0b").round().quantise::<i16>(255),
             SavedFormat::id("outw").round().quantise::<i16>(64),
             SavedFormat::id("outb").round().quantise::<i32>(255 * 64),
         ])
         .loss_fn(|output, target| output.sigmoid().squared_error(target))
-        .build(|builder, stm_inputs, ntm_inputs| {
-            let l0f = builder.new_weights("l0f", Shape::new(hl_size, 768), InitSettings::Zeroed);
-            let expanded_factoriser = l0f.repeat(num_input_buckets);
-
-            let mut l0 = builder.new_affine("l0", 768 * num_input_buckets, hl_size);
+        .build(|builder, stm_inputs, ntm_inputs, output_buckets| {
+            let l0 = builder.new_affine("l0", 768 * num_input_buckets, hl_size);
             l0.init_with_effective_input_size(32);
-            l0.weights = l0.weights + expanded_factoriser;
 
-            let out = builder.new_affine("out", 2 * hl_size, 1);
+            let out = builder.new_affine("out", 2 * hl_size, SYK5_OUTPUT_BUCKETS);
 
             let stm_hidden = l0.forward(stm_inputs).screlu();
             let ntm_hidden = l0.forward(ntm_inputs).screlu();
             let hidden = stm_hidden.concat(ntm_hidden);
-            out.forward(hidden)
+            out.forward(hidden).select(output_buckets)
         });
 
     let stricter_clipping = AdamWParams {
@@ -124,9 +109,6 @@ fn run_syk4(
     trainer
         .optimiser
         .set_params_for_weight("l0w", stricter_clipping);
-    trainer
-        .optimiser
-        .set_params_for_weight("l0f", stricter_clipping);
 
     let schedule = TrainingSchedule {
         net_id,
@@ -173,11 +155,11 @@ fn run_syk4(
                 binpack_buffer_mb, binpack_threads
             );
             println!(
-                "Input layout: mirrored king buckets ({} buckets), shared head",
-                num_input_buckets
+                "Input layout: mirrored king buckets ({} buckets), material output buckets ({})",
+                num_input_buckets, SYK5_OUTPUT_BUCKETS
             );
             println!("FT width: {} per perspective", hl_size);
-            println!("Dense head: linear {} -> 1", 2 * hl_size);
+            println!("Dense head: bucketed linear {} -> 1", 2 * hl_size);
             for p in dataset_paths {
                 println!("  Dataset: {}", p);
             }
@@ -195,11 +177,11 @@ fn run_syk4(
         _ => {
             println!("Using DirectSequentialDataLoader (bullet format)");
             println!(
-                "Input layout: mirrored king buckets ({} buckets), shared head",
-                num_input_buckets
+                "Input layout: mirrored king buckets ({} buckets), material output buckets ({})",
+                num_input_buckets, SYK5_OUTPUT_BUCKETS
             );
             println!("FT width: {} per perspective", hl_size);
-            println!("Dense head: linear {} -> 1", 2 * hl_size);
+            println!("Dense head: bucketed linear {} -> 1", 2 * hl_size);
             for p in dataset_paths {
                 println!("  Dataset: {}", p);
             }
@@ -223,8 +205,8 @@ fn main() {
     let net_id = env_string("SYK_NET_ID", "sykora_bucketed");
     let resume_from = env::var("SYK_RESUME").ok();
     let data_format = env_string("SYK_DATA_FORMAT", "bullet");
-    let network_format = env_string("SYK_NETWORK_FORMAT", "syk4");
-    let hl_size = env_usize("SYK_HIDDEN", 768);
+    let network_format = env_string("SYK_NETWORK_FORMAT", "syk5");
+    let hl_size = env_usize("SYK_HIDDEN", 512);
     let bucket_layout_name = env_string("SYK_BUCKET_LAYOUT", "sykora16");
 
     let bucket_layout = selected_bucket_layout(&bucket_layout_name);
@@ -235,11 +217,11 @@ fn main() {
     println!("Network format: {}", network_format);
     println!("Bucket layout: {}", bucket_layout_name);
 
-    if network_format != "syk4" {
+    if network_format != "syk5" {
         panic!("unsupported network format: {network_format}");
     }
 
-    run_syk4(
+    run_syk5(
         bucket_layout,
         num_input_buckets,
         &dataset_paths,
diff --git a/utils/nnue/common.py b/utils/nnue/common.py
index 83bb703..89eb6f6 100644
--- a/utils/nnue/common.py
+++ b/utils/nnue/common.py
@@ -11,15 +11,11 @@
 
 
 LEGACY_INPUT_SIZE = 768
-QA = 255
-QB = 64
-V4_Q0 = 255
-V4_Q = 64
+NNUE_Q0 = 255
+NNUE_Q = 64
 SCALE = 400
-MAGIC_V3 = b"SYKNNUE3"
-FORMAT_VERSION_V3 = 3
-MAGIC_V4 = b"SYKNNUE4"
-FORMAT_VERSION_V4 = 4
+MAGIC_V5 = b"SYKNNUE5"
+FORMAT_VERSION_V5 = 5
 
 FEATURE_SET_LEGACY = 0
 FEATURE_SET_KING_BUCKETS_MIRRORED = 1
@@ -27,17 +23,6 @@
 ACTIVATION_RELU = 0
 ACTIVATION_SCRELU = 1
 
-SYKORA_BUCKET_LAYOUT_32 = [
-    0, 1, 2, 3,
-    4, 4, 5, 5,
-    6, 6, 6, 6,
-    7, 7, 7, 7,
-    8, 8, 8, 8,
-    8, 8, 8, 8,
-    9, 9, 9, 9,
-    9, 9, 9, 9,
-]
-
 SYKORA16_BUCKET_LAYOUT_32 = [
     0, 0, 1, 1,
     2, 2, 3, 3,
@@ -170,91 +155,30 @@ def _pack_i32(values: Iterable[int]) -> bytes:
     return b"".join(struct.pack("<i", int(v)) for v in values)
 
 
-def _pack_i8(values: Iterable[int]) -> bytes:
-    return b"".join(struct.pack("<b", int(v)) for v in values)
-
-
-def write_syk_nnue(
-    path: Path,
-    *,
-    hidden_size: int,
-    input_biases_i16: List[int],
-    input_weights_i16: List[int],
-    output_weights_i16: List[int],
-    output_bias_i32: int,
-    activation_type: int = 1,
-    feature_set: int = FEATURE_SET_LEGACY,
-    bucket_layout_64: List[int] | None = None,
-) -> None:
-    if hidden_size <= 0:
-        raise ValueError("hidden_size must be > 0")
-    if len(input_biases_i16) != hidden_size:
-        raise ValueError("input_biases length mismatch")
-    input_size = input_size_for_feature_set(feature_set, bucket_layout_64)
-    if len(input_weights_i16) != input_size * hidden_size:
-        raise ValueError("input_weights length mismatch")
-    if len(output_weights_i16) != 2 * hidden_size:
-        raise ValueError("output_weights length mismatch")
-
-    if feature_set == FEATURE_SET_LEGACY:
-        bucket_layout_64 = [0] * 64
-    elif bucket_layout_64 is None or len(bucket_layout_64) != 64:
-        raise ValueError("bucket_layout_64 must contain exactly 64 entries")
-
-    bucket_count = num_buckets(bucket_layout_64)
-
-    path.parent.mkdir(parents=True, exist_ok=True)
-    with path.open("wb") as handle:
-        handle.write(MAGIC_V3)
-        handle.write(struct.pack("<H", FORMAT_VERSION_V3))
-        handle.write(struct.pack("<B", feature_set))
-        handle.write(struct.pack("<H", hidden_size))
-        handle.write(struct.pack("<B", activation_type))
-        handle.write(struct.pack("<H", bucket_count))
-        handle.write(bytes(int(v) for v in bucket_layout_64))
-        handle.write(struct.pack("<i", int(output_bias_i32)))
-        handle.write(_pack_i16(input_biases_i16))
-        handle.write(_pack_i16(input_weights_i16))
-        handle.write(_pack_i16(output_weights_i16))
-
-
-def syk_nnue_v4_payload_size(
-    *,
-    feature_set: int,
-    bucket_layout_64: List[int],
-    ft_hidden_size: int,
-) -> int:
-    input_size = input_size_for_feature_set(feature_set, bucket_layout_64)
-    h = ft_hidden_size
-    return (
-        2 * h
-        + 2 * input_size * h
-        + 4
-        + 4 * h
-    )
-
-
-def write_syk_nnue_v4(
+def write_syk_nnue_v5(
     path: Path,
     *,
     ft_hidden_size: int,
     ft_biases_i16: List[int],
     ft_weights_i16: List[int],
-    out_bias_i32: int,
+    out_biases_i32: List[int],
     out_weights_i16: List[int],
     activation_type: int = ACTIVATION_SCRELU,
     feature_set: int = FEATURE_SET_KING_BUCKETS_MIRRORED,
     bucket_layout_64: List[int] | None = None,
-    q0: int = V4_Q0,
-    q: int = V4_Q,
+    output_bucket_count: int = 8,
+    q0: int = NNUE_Q0,
+    q: int = NNUE_Q,
     scale: int = SCALE,
 ) -> None:
     if feature_set != FEATURE_SET_KING_BUCKETS_MIRRORED:
-        raise ValueError("SYKNNUE4 currently requires king_buckets_mirrored inputs")
+        raise ValueError("SYKNNUE5 currently requires king_buckets_mirrored inputs")
     if bucket_layout_64 is None or len(bucket_layout_64) != 64:
         raise ValueError("bucket_layout_64 must contain exactly 64 entries")
     if ft_hidden_size <= 0:
         raise ValueError("ft_hidden_size must be > 0")
+    if output_bucket_count <= 0 or output_bucket_count > 255:
+        raise ValueError("output_bucket_count must be in 1..255")
     if activation_type not in (ACTIVATION_RELU, ACTIVATION_SCRELU):
         raise ValueError("unsupported activation_type")
 
@@ -266,22 +190,25 @@ def write_syk_nnue_v4(
         raise ValueError("ft_biases length mismatch")
     if len(ft_weights_i16) != input_size * h:
         raise ValueError("ft_weights length mismatch")
-    if len(out_weights_i16) != 2 * h:
+    if len(out_biases_i32) != output_bucket_count:
+        raise ValueError("out_biases length mismatch")
+    if len(out_weights_i16) != output_bucket_count * 2 * h:
         raise ValueError("out_weights length mismatch")
 
     path.parent.mkdir(parents=True, exist_ok=True)
     with path.open("wb") as handle:
-        handle.write(MAGIC_V4)
-        handle.write(struct.pack("<H", FORMAT_VERSION_V4))
+        handle.write(MAGIC_V5)
+        handle.write(struct.pack("<H", FORMAT_VERSION_V5))
         handle.write(struct.pack("<B", feature_set))
         handle.write(struct.pack("<H", h))
         handle.write(struct.pack("<B", activation_type))
         handle.write(struct.pack("<B", input_bucket_count))
+        handle.write(struct.pack("<B", output_bucket_count))
         handle.write(struct.pack("<H", q0))
         handle.write(struct.pack("<H", q))
         handle.write(struct.pack("<H", scale))
         handle.write(bytes(int(v) for v in bucket_layout_64))
-        handle.write(struct.pack("<i", int(out_bias_i32)))
         handle.write(_pack_i16(ft_biases_i16))
         handle.write(_pack_i16(ft_weights_i16))
+        handle.write(_pack_i32(out_biases_i32))
         handle.write(_pack_i16(out_weights_i16))