diff --git a/README.md b/README.md
index a72529d..f8e238b 100644
--- a/README.md
+++ b/README.md
@@ -92,8 +92,8 @@ Sykora is tested by [CCRL](https://computerchess.org.uk/ccrl/404/). Current entr
Evaluation: NNUE (default) with classical fallback
- **NNUE evaluation** (default, embedded in binary):
- - `SYKNNUE3` and `SYKNNUE4` network loading
- - Legacy `768 -> Nx2 -> 1` and mirrored king-bucketed sparse-input nets
+ - `SYKNNUE5` material-output-bucket nets; `SYKNNUE3` files still load so the current embedded net keeps working
+ - Mirrored king-bucketed sparse-input nets
- SCReLU activation with incremental accumulators during search
- Trained on high-depth self-play data via the Bullet trainer
- King-bucket training path via `nnue/bullet_repo/examples/sykora_bucketed.rs`
@@ -260,7 +260,7 @@ See `history/README.md` for folder schema and the archived workflow.
## NNUE
-Sykora supports both legacy `768 -> Nx2 -> 1` nets and mirrored king-bucketed nets with dual-perspective accumulator updates and SCReLU activation. The engine can load both `SYKNNUE3` and `SYKNNUE4` files.
+Sykora's current training target is `SYKNNUE5`: mirrored king-bucketed sparse inputs with dual-perspective accumulator updates, SCReLU activation, and material-count output buckets. The loader still accepts `SYKNNUE3` files so that the current embedded net (`src/net.sknnue`) keeps working until it is replaced; `SYKNNUE4` files are no longer loadable.
### Runtime
@@ -270,7 +270,7 @@ Sykora supports both legacy `768 -> Nx2 -> 1` nets and mirrored king-bucketed ne
- To use a different net, set `EvalFile` to the path of an external `.sknnue` file.
- `NnueScale` scales the NNUE score before it is fed into the search.
-For exact file-format details, see [specs/syknnue4_spec.md](specs/syknnue4_spec.md) and `src/nnue.zig`.
+For exact file-format details, see [specs/syknnue5_spec.md](specs/syknnue5_spec.md) and `src/nnue.zig`.
### Training Pipeline
@@ -284,8 +284,9 @@ python utils/nnue/bullet/train_cuda_longrun.py \
--data-format binpack \
--bullet-repo nnue/bullet_repo \
--output-root nnue/models/bullet \
- --network-format syk3 \
- --hidden 256 --end-superbatch 320 --threads 8
+ --network-format syk5 \
+ --bucket-layout sykora16 \
+ --hidden 512 --end-superbatch 320 --threads 8
```
**Using BulletFormat .data files:**
@@ -295,8 +296,9 @@ python utils/nnue/bullet/train_cuda_longrun.py \
--dataset nnue/data/bullet/train/train_main.data \
--bullet-repo nnue/bullet_repo \
--output-root nnue/models/bullet \
- --network-format syk3 \
- --hidden 256 --end-superbatch 320 --threads 8
+ --network-format syk5 \
+ --bucket-layout sykora16 \
+ --hidden 512 --end-superbatch 320 --threads 8
```
**Multiple datasets** can be passed space-separated:
@@ -319,16 +321,15 @@ python utils/nnue/bullet/train_cuda_longrun.py \
...
```
-**Training a `SYKNNUE4` baseline:**
+**Training a `SYKNNUE5` material-output-bucket net:**
```bash
python utils/nnue/bullet/train_cuda_longrun.py \
--dataset data/training.binpack \
--data-format binpack \
- --network-format syk4 \
+ --network-format syk5 \
--bucket-layout sykora16 \
- --hidden 1536 \
- --dense-l1 16 --dense-l2 32 \
+ --hidden 512 \
--end-superbatch 320 --threads 8
```
@@ -343,15 +344,15 @@ Sykora can generate its own training data via the `gensfen` command:
### Exporting a Trained Net
-Export a `SYKNNUE4` checkpoint:
+Export a `SYKNNUE5` checkpoint:
```bash
python utils/nnue/bullet/checkpoint_raw_to_npz.py \
--input nnue/models/bullet/<run_id>/checkpoints/<checkpoint>/ \
- --output checkpoint_syk4.npz
+ --output checkpoint_syk5.npz
-python utils/nnue/bullet/export_npz_to_syk4.py \
- --input checkpoint_syk4.npz \
+python utils/nnue/bullet/export_npz_to_syk5.py \
+ --input checkpoint_syk5.npz \
--output-net output.sknnue
```
@@ -379,7 +380,7 @@ python utils/nnue/bullet/gate_checkpoints.py \
This gate now evaluates recent checkpoints by selfplay only. STS is intentionally not part of the checkpoint promotion path.
-SYKNNUE4 design spec: `specs/syknnue4_spec.md`.
+SYKNNUE5 design spec: `specs/syknnue5_spec.md`.
## Contributing
diff --git a/launch_training.ps1 b/launch_training.ps1
index 3cb48b2..40de935 100644
--- a/launch_training.ps1
+++ b/launch_training.ps1
@@ -1,4 +1,4 @@
-# Sykora NNUE V4 Training Launch Script
+# Sykora NNUE V5 Training Launch Script
# Run from project root: .\launch_training.ps1
#
# Dataset: T80-2023 (jun-dec) + T80-2024 (jan-jun) .min-v2.v6 binpacks
@@ -60,11 +60,12 @@ foreach ($bp in $binpacks) {
}
# --- Training Parameters ---
-# SYKNNUE4 baseline:
-# mirrored king buckets (sykora16) -> FT 768 -> shared linear output
-$networkFormat = "syk4"
+# SYKNNUE5:
+# mirrored king buckets (sykora16) -> FT 512 -> 8 material-count output heads
+$networkFormat = "syk5"
$bucketLayout = "sykora16"
-$hidden = 768
+$hidden = 512
+$outputBuckets = 8
$endSuperbatch = 600
$lrStart = 0.001
$wdl = 0.25
@@ -72,7 +73,7 @@ $saveRate = 10
$threads = 8
Write-Host "============================================"
-Write-Host " Sykora NNUE V4 Training (RTX 4070 Ti SUPER)"
+Write-Host " Sykora NNUE V5 Training (RTX 4070 Ti SUPER)"
Write-Host "============================================"
Write-Host "Data: T80-2023/2024 filtered set"
Write-Host "Filtering: .min-v2.v6 on T80 inputs"
@@ -81,7 +82,8 @@ Write-Host "Format: binpack (sfbinpack)"
Write-Host "Net format: $networkFormat"
Write-Host "Bucket layout: $bucketLayout"
Write-Host "FT hidden: $hidden"
-Write-Host "Dense head: linear $($hidden * 2) -> 1"
+Write-Host "Output heads: $outputBuckets material-count buckets"
+Write-Host "Dense head: bucketed linear $($hidden * 2) -> 1"
Write-Host "Superbatches: 1 -> $endSuperbatch"
Write-Host "Save rate: every $saveRate superbatches"
Write-Host "Threads: $threads"
@@ -104,6 +106,7 @@ python "$PSScriptRoot\utils\nnue\bullet\train_cuda_longrun.py" `
--network-format $networkFormat `
--bucket-layout $bucketLayout `
--hidden $hidden `
+ --output-buckets $outputBuckets `
--end-superbatch $endSuperbatch `
--save-rate $saveRate `
--threads $threads `
diff --git a/specs/syknnue4_spec.md b/specs/syknnue4_spec.md
deleted file mode 100644
index da5de16..0000000
--- a/specs/syknnue4_spec.md
+++ /dev/null
@@ -1,342 +0,0 @@
-# SYKNNUE4 Design Spec
-
-## Goal
-
-`SYKNNUE4` is the simple, stable baseline Sykora NNUE format.
-
-The design goal is:
-
-- keep the sparse incremental part large
-- keep the head shared
-- stay close to the already-working v3 math
-- make the file format self-describing for mirrored king-bucket inputs
-
-The baseline `SYKNNUE4` net is:
-
-```text
-king_buckets_mirrored(16 buckets)
--> shared sparse FT, width 768, two color-fixed accumulators
--> concat(screlu(A_us), screlu(A_them)) # 1536 inputs
--> shared linear output
-```
-
-Short form:
-
-```text
-shared FT: 12288 -> 768, color-fixed dual perspective
--> concat(us, them) -> 1
-```
-
-This is intentionally a monotonic upgrade from the v3 family:
-
-- same shared-head philosophy
-- same SCReLU inference contract
-- wider FT
-- explicit mirrored king-bucket layout stored in the file
-
-## Non-Goals
-
-The first `SYKNNUE4` implementation should not include:
-
-- multiple output heads
-- multi-layer dense heads
-- PSQT side channels
-- product pooling
-- mixed float/int inference
-- approximate rescale rules in the reference path
-
-## Architecture
-
-### Inputs
-
-- Feature set: `king_buckets_mirrored`
-- Per-bucket base feature size: `768`
-- Default input bucket count: `16`
-- Bucket layout: stored explicitly in the file
-- Horizontal mirroring: enabled
-- Training-only factorization is allowed, but exported nets must contain merged
- sparse weights only
-
-Per perspective:
-
-```text
-INPUT_SIZE = 768
-INPUT_BUCKET_COUNT = 16
-HORIZONTAL_MIRRORING = true
-```
-
-Feature indexing is defined for color-fixed perspectives `white` and `black`,
-not for side-to-move / side-not-to-move.
-
-For a perspective `p`:
-
-```text
-feature =
- king_bucket(p.king_sq) * 768
- + relative_color(piece, p) * (6 * 64)
- + piece_type * 64
- + mirrored_square(p.king_sq, sq)
-```
-
-### Sparse Transformer
-
-The sparse transformer is:
-
-```text
-SparseAffine(768, 768) per king bucket
-```
-
-Maintain two color-fixed accumulators:
-
-- `A_white[768]`
-- `A_black[768]`
-
-For the reference implementation, store these accumulators as `i32`.
-
-At evaluation time:
-
-```text
-if side_to_move == white:
- A_us = A_white
- A_them = A_black
-else:
- A_us = A_black
- A_them = A_white
-```
-
-### Hidden Activation
-
-For each hidden accumulator entry:
-
-```text
-u = clamp(A_us[i], 0, Q0)
-t = clamp(A_them[i], 0, Q0)
-```
-
-Apply the activation selected by `activation_type`:
-
-- `0 = ReLU`
-- `1 = SCReLU`
-
-Baseline `SYKNNUE4` uses `SCReLU`.
-
-For `SCReLU`:
-
-```text
-U[i] = u * u
-T[i] = t * t
-```
-
-Concatenate:
-
-```text
-X = [U, T]
-```
-
-So:
-
-- `X` has length `2 * H`
-- with the baseline `H = 768`, `X` has length `1536`
-- each entry is in the `Q0^2` domain for `SCReLU`
-
-### Output Head
-
-The output head is shared. There are no phase-specific output stacks.
-
-```text
-Out: Affine(2 * H, 1)
-```
-
-## Quantization Contract
-
-Use the following constants:
-
-```text
-Q0 = 255
-Q = 64
-SCALE = 400
-```
-
-Interpretation:
-
-- `Q0`: sparse hidden clamp / scale
-- `Q`: output-weight scale
-- `SCALE`: final centipawn conversion
-
-All float-to-int quantization in this spec uses:
-
-```text
-quantize_round(x, scale) =
- if x >= 0:
- floor(x * scale + 0.5)
- else:
- -floor((-x) * scale + 0.5)
-```
-
-This is round-to-nearest with ties away from zero.
-
-### Hidden FT Storage
-
-Export the sparse branch as:
-
-- hidden biases: `i16`
-- hidden weights: `i16`
-
-Quantization:
-
-```text
-hidden_bias_int = quantize_round(hidden_bias_float, Q0)
-hidden_weight_int = quantize_round(hidden_weight_float, Q0)
-```
-
-### Output Head Storage
-
-Export the shared output head as:
-
-- output weights: `i16`
-- output bias: `i32`
-
-Quantization:
-
-```text
-out_weight_int = quantize_round(out_weight_float, Q)
-out_bias_int = quantize_round(out_bias_float, Q0 * Q)
-```
-
-## Integer Inference Contract
-
-### Hidden Accumulators
-
-The reference accumulator update path sums stored sparse integers directly:
-
-```text
-A_white[i] = hidden_bias_int[i] + sum(active white-perspective feature weights)
-A_black[i] = hidden_bias_int[i] + sum(active black-perspective feature weights)
-```
-
-### Output Evaluation
-
-For `SCReLU`:
-
-```text
-sum_int =
- Σ_i (clamp(A_us[i], 0, Q0)^2 * out_weight_int[i])
- + Σ_i (clamp(A_them[i], 0, Q0)^2 * out_weight_int[H + i])
-```
-
-Rescale by one factor of `Q0` before adding bias:
-
-```text
-sum_rescaled = div_round_nearest_signed(sum_int, Q0)
-z_int = sum_rescaled + out_bias_int
-```
-
-Convert to centipawns:
-
-```text
-eval_cp = div_round_nearest_signed(z_int * SCALE, Q0 * Q)
-```
-
-For `ReLU`, omit the squaring and the intermediate `/Q0` rescale:
-
-```text
-sum_int =
- Σ_i (clamp(A_us[i], 0, Q0) * out_weight_int[i])
- + Σ_i (clamp(A_them[i], 0, Q0) * out_weight_int[H + i])
-
-z_int = sum_int + out_bias_int
-eval_cp = div_round_nearest_signed(z_int * SCALE, Q0 * Q)
-```
-
-### Signed Division
-
-The reference path uses signed round-to-nearest:
-
-```text
-div_round_nearest_signed(x, d) =
- if x >= 0:
- (x + d / 2) / d
- else:
- -(((-x) + d / 2) / d)
-```
-
-This is the reference contract to match across trainer, exporter, and runtime.
-
-## File Format
-
-All integers are little-endian.
-
-### Header
-
-```text
-u8[8] magic = "SYKNNUE4"
-u16 format_version = 4
-u8 feature_set = 1 # king_buckets_mirrored
-u16 ft_hidden_size # baseline 768
-u8 activation_type # baseline 1 = SCReLU
-u8 input_bucket_count # baseline 16
-u16 q0 # baseline 255
-u16 q # baseline 64
-u16 scale # baseline 400
-u8[64] bucket_layout_64
-```
-
-### Payload
-
-Let:
-
-- `I = 768 * input_bucket_count`
-- `H = ft_hidden_size`
-
-Payload order:
-
-```text
-i32 output_bias
-i16[H] ft_biases
-i16[I * H] ft_weights
-i16[2 * H] output_weights
-```
-
-Weight order:
-
-- `ft_weights[input_feature][hidden]`
-- `output_weights[0..H]` are `us`
-- `output_weights[H..2H]` are `them`
-
-## Loader Validation
-
-A loader should reject nets where:
-
-- `magic != "SYKNNUE4"`
-- `format_version != 4`
-- `feature_set != 1`
-- `ft_hidden_size == 0`
-- `input_bucket_count == 0`
-- any `bucket_layout_64` entry is `>= input_bucket_count`
-- `q0 == 0`
-- `q == 0`
-- `scale == 0`
-- payload size does not match the header
-
-## Baseline Defaults
-
-Baseline values:
-
-```text
-feature_set = king_buckets_mirrored
-input_bucket_count = 16
-ft_hidden_size = 768
-activation_type = SCReLU
-q0 = 255
-q = 64
-scale = 400
-```
-
-## Reference Implementation Priorities
-
-If implementing or training this architecture, the recommended order is:
-
-1. make the sparse update path correct
-2. make exporter and runtime agree bit-for-bit on fixed FENs
-3. validate the shared-head model against v3-like sanity positions
-4. only then consider widening the FT or adding extra head complexity
diff --git a/specs/syknnue5_spec.md b/specs/syknnue5_spec.md
new file mode 100644
index 0000000..45b9f48
--- /dev/null
+++ b/specs/syknnue5_spec.md
@@ -0,0 +1,83 @@
+# SYKNNUE5 Design Spec
+
+`SYKNNUE5` is Sykora's current king-bucketed SCReLU training target with
+material-count output buckets.
+
+## Architecture
+
+```text
+king_buckets_mirrored(16 buckets)
+-> shared sparse FT, H hidden units, color-fixed dual perspective
+-> concat(screlu(A_us), screlu(A_them))
+-> material-count bucketed linear output head
+```
+
+The first intended training target is:
+
+```text
+shared FT: 12288 -> 512
+-> concat(us, them): 1024
+-> 8 material-count output heads
+```
+
+`H = 768` is the larger follow-up target.
+
+## Output Buckets
+
+The output bucket selector matches Bullet's `MaterialCount<8>`:
+
+```text
+piece_count = popcount(occupied)
+non_king_count = piece_count - 2
+divisor = ceil(32 / output_bucket_count)
+output_bucket = min(non_king_count / divisor, output_bucket_count - 1)
+```
+
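+The division is integer (truncating) division. With the default `output_bucket_count = 8`, the divisor is `4`, so for example the start position (30 non-king pieces) selects bucket `7`.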
+
+## File Format
+
+All integers are little-endian.
+
+```text
+u8[8] magic = "SYKNNUE5"
+u16 version = 5
+u8 feature_set = 1 # king_buckets_mirrored
+u16 ft_hidden_size = H
+u8 activation_type # 0 = ReLU, 1 = SCReLU
+u8 input_bucket_count
+u8 output_bucket_count
+u16 q0
+u16 q
+u16 scale
+u8[64] bucket_layout
+i16[H] ft_biases
+i16[input_bucket_count * 768 * H] ft_weights
+i32[output_bucket_count] output_biases
+i16[output_bucket_count * 2 * H] output_weights
+```
+
+`output_weights` are bucket-major. For bucket `b`, the slice is:
+
+```text
+output_weights[b * 2H .. (b + 1) * 2H]
+```
+
+The first `H` weights apply to `A_us`; the second `H` apply to `A_them`.
+
+## Quantization
+
+The baseline constants are:
+
+```text
+Q0 = 255
+Q = 64
+SCALE = 400
+```
+
+With SCReLU, the integer dot-product sum is divided by `Q0` (signed round-to-nearest) before the
+selected bucket's output bias is added (ReLU skips this rescale); the biased sum is then converted to centipawns with:
+
+```text
+score = round(sum * SCALE / (Q0 * Q))
+```
diff --git a/src/nnue.zig b/src/nnue.zig
index ecdb0cd..62fef27 100644
--- a/src/nnue.zig
+++ b/src/nnue.zig
@@ -13,9 +13,9 @@ pub const SCALE: i32 = 400;
const MAX_NETWORK_BYTES = 64 * 1024 * 1024;
const MAGIC_V3 = "SYKNNUE3";
-const MAGIC_V4 = "SYKNNUE4";
+const MAGIC_V5 = "SYKNNUE5";
const FORMAT_VERSION_V3: u16 = 3;
-const FORMAT_VERSION_V4: u16 = 4;
+const FORMAT_VERSION_V5: u16 = 5;
pub const FeatureSet = enum(u8) {
legacy_psqt = 0,
@@ -43,13 +43,14 @@ pub const Network = struct {
output_bias: i32,
};
- pub const V4Head = struct {
+ pub const V5Head = struct {
activation_type: u8, // 0 = ReLU, 1 = SCReLU
q0: u16,
q: u16,
scale: u16,
- output_weights: []i16, // [2 * H]
- output_bias: i32,
+ output_bucket_count: u8,
+ output_weights: []i16, // [output_bucket_count * 2 * H], bucket-major
+ output_biases: []i32, // [output_bucket_count]
};
allocator: std.mem.Allocator,
@@ -61,7 +62,7 @@ pub const Network = struct {
ft_weights: []i16,
head: union(enum) {
v3: V3Head,
- v4: V4Head,
+ v5: V5Head,
},
pub fn deinit(self: *Network) void {
@@ -71,8 +72,9 @@ pub const Network = struct {
.v3 => |v3| {
self.allocator.free(v3.output_weights);
},
- .v4 => |v4| {
- self.allocator.free(v4.output_weights);
+ .v5 => |v5| {
+ self.allocator.free(v5.output_weights);
+ self.allocator.free(v5.output_biases);
},
}
}
@@ -83,8 +85,8 @@ pub const Network = struct {
if (std.mem.eql(u8, data[0..8], MAGIC_V3)) {
return loadFromBytesV3(allocator, data);
}
- if (std.mem.eql(u8, data[0..8], MAGIC_V4)) {
- return loadFromBytesV4(allocator, data);
+ if (std.mem.eql(u8, data[0..8], MAGIC_V5)) {
+ return loadFromBytesV5(allocator, data);
}
return error.UnsupportedVersion;
}
@@ -144,21 +146,29 @@ fn checkedAddU64(a: u64, b: u64) ?u64 {
return std.math.add(u64, a, b) catch null;
}
-fn computeV4PayloadBytes(
+fn computeV5PayloadBytes(
input_size: usize,
ft_hidden_size: usize,
+ output_bucket_count: usize,
) ?u64 {
var total: u64 = 0;
- const ft_bias_bytes = checkedMulU64(@as(u64, @intCast(ft_hidden_size)), @sizeOf(i16)) orelse return null;
+ const hidden_size_u64: u64 = @intCast(ft_hidden_size);
+ const ft_bias_bytes = checkedMulU64(hidden_size_u64, @sizeOf(i16)) orelse return null;
total = checkedAddU64(total, ft_bias_bytes) orelse return null;
- const ft_weight_count = checkedMulU64(@as(u64, @intCast(input_size)), @as(u64, @intCast(ft_hidden_size))) orelse return null;
+ const ft_weight_count = checkedMulU64(@as(u64, @intCast(input_size)), hidden_size_u64) orelse return null;
total = checkedAddU64(total, checkedMulU64(ft_weight_count, @sizeOf(i16)) orelse return null) orelse return null;
- total = checkedAddU64(total, @sizeOf(i32)) orelse return null;
- const out_weight_count = checkedMulU64(2, @as(u64, @intCast(ft_hidden_size))) orelse return null;
- total = checkedAddU64(total, checkedMulU64(out_weight_count, @sizeOf(i16)) orelse return null) orelse return null;
+ const bias_bytes = checkedMulU64(@as(u64, @intCast(output_bucket_count)), @sizeOf(i32)) orelse return null;
+ total = checkedAddU64(total, bias_bytes) orelse return null;
+
+ const single_head_weight_count = checkedMulU64(2, hidden_size_u64) orelse return null;
+ const output_weight_count = checkedMulU64(
+ @as(u64, @intCast(output_bucket_count)),
+ single_head_weight_count,
+ ) orelse return null;
+ total = checkedAddU64(total, checkedMulU64(output_weight_count, @sizeOf(i16)) orelse return null) orelse return null;
return total;
}
@@ -229,11 +239,11 @@ fn loadFromBytesV3(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
};
}
-fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Network {
+fn loadFromBytesV5(allocator: std.mem.Allocator, data: []const u8) LoadError!Network {
var pos: usize = 8;
const version = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
- if (version != FORMAT_VERSION_V4) return error.UnsupportedVersion;
+ if (version != FORMAT_VERSION_V5) return error.UnsupportedVersion;
if (pos >= data.len) return error.InvalidNetwork;
const feature_set = std.meta.intToEnum(FeatureSet, data[pos]) catch return error.InvalidNetwork;
@@ -255,6 +265,11 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
pos += 1;
if (bucket_count == 0) return error.InvalidNetwork;
+ if (pos >= data.len) return error.InvalidNetwork;
+ const output_bucket_count = data[pos];
+ pos += 1;
+ if (output_bucket_count == 0) return error.InvalidNetwork;
+
const q0 = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
const q = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
const scale = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork;
@@ -269,21 +284,24 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
}
const input_size = LEGACY_INPUT_SIZE * @as(usize, bucket_count);
- const payload_size = computeV4PayloadBytes(
+ const payload_size = computeV5PayloadBytes(
input_size,
ft_hidden_size,
+ output_bucket_count,
) orelse return error.InvalidNetwork;
const expected_size = checkedAddU64(@as(u64, @intCast(pos)), payload_size) orelse return error.InvalidNetwork;
if (expected_size != data.len) return error.InvalidNetwork;
- const output_bias = readBytesInt(i32, data, &pos) orelse return error.InvalidNetwork;
const ft_biases = try allocAndReadInts(i16, allocator, data, &pos, ft_hidden_size);
errdefer allocator.free(ft_biases);
const ft_weights = try allocAndReadInts(i16, allocator, data, &pos, input_size * ft_hidden_size);
errdefer allocator.free(ft_weights);
- const output_weights = try allocAndReadInts(i16, allocator, data, &pos, 2 * ft_hidden_size);
+ const output_biases = try allocAndReadInts(i32, allocator, data, &pos, output_bucket_count);
+ errdefer allocator.free(output_biases);
+
+ const output_weights = try allocAndReadInts(i16, allocator, data, &pos, output_bucket_count * 2 * ft_hidden_size);
errdefer allocator.free(output_weights);
if (pos != data.len) return error.InvalidNetwork;
@@ -297,13 +315,14 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net
.ft_biases = ft_biases,
.ft_weights = ft_weights,
.head = .{
- .v4 = .{
+ .v5 = .{
.activation_type = activation_type,
.q0 = q0,
.q = q,
.scale = scale,
+ .output_bucket_count = output_bucket_count,
.output_weights = output_weights,
- .output_bias = output_bias,
+ .output_biases = output_biases,
},
},
};
@@ -1043,10 +1062,19 @@ pub fn updateAccumulators(
}
}
-fn evaluateV4FromAccumulators(
+inline fn materialCountOutputBucket(b: *Board, output_bucket_count: u8) usize {
+ const piece_count = @popCount(b.board.occupied());
+ const non_king_count = if (piece_count >= 2) piece_count - 2 else 0;
+ const divisor = (32 + @as(usize, output_bucket_count) - 1) / @as(usize, output_bucket_count);
+ const bucket = non_king_count / divisor;
+ return @min(bucket, @as(usize, output_bucket_count) - 1);
+}
+
+fn evaluateV5FromAccumulators(
net: *const Network,
- head: *const Network.V4Head,
+ head: *const Network.V5Head,
acc: *const AccumulatorPair,
+ b: *Board,
stm_is_white: bool,
) i32 {
const hidden_size: usize = @intCast(net.ft_hidden_size);
@@ -1055,17 +1083,20 @@ fn evaluateV4FromAccumulators(
const scale: i32 = head.scale;
const use_screlu = head.activation_type == 1;
const final_den: i64 = @as(i64, q0) * @as(i64, q);
+ const output_bucket = materialCountOutputBucket(b, head.output_bucket_count);
+ const weights_base = output_bucket * 2 * hidden_size;
+ const weights = head.output_weights[weights_base .. weights_base + 2 * hidden_size];
const us_acc = if (stm_is_white) acc.white[0..hidden_size] else acc.black[0..hidden_size];
const them_acc = if (stm_is_white) acc.black[0..hidden_size] else acc.white[0..hidden_size];
- var sum = activatedDot(us_acc, head.output_weights[0..hidden_size], hidden_size, head.activation_type, q0) +
- activatedDot(them_acc, head.output_weights[hidden_size .. 2 * hidden_size], hidden_size, head.activation_type, q0);
+ var sum = activatedDot(us_acc, weights[0..hidden_size], hidden_size, head.activation_type, q0) +
+ activatedDot(them_acc, weights[hidden_size .. 2 * hidden_size], hidden_size, head.activation_type, q0);
if (use_screlu) {
sum = divRoundNearestSigned(sum, q0);
}
- sum += head.output_bias;
+ sum += head.output_biases[output_bucket];
return @intCast(divRoundNearestSigned(sum * scale, final_den));
}
@@ -1100,7 +1131,7 @@ pub fn evaluateFromAccumulators(
const stm_is_white = b.board.move == .white;
return switch (net.head) {
.v3 => |*head| evaluateV3FromAccumulators(net, head, acc, stm_is_white),
- .v4 => |*head| evaluateV4FromAccumulators(net, head, acc, stm_is_white),
+ .v5 => |*head| evaluateV5FromAccumulators(net, head, acc, b, stm_is_white),
};
}
diff --git a/utils/nnue/bullet/checkpoint_raw_to_npz.py b/utils/nnue/bullet/checkpoint_raw_to_npz.py
index 8aaca7a..5009583 100644
--- a/utils/nnue/bullet/checkpoint_raw_to_npz.py
+++ b/utils/nnue/bullet/checkpoint_raw_to_npz.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-"""Convert a SYKNNUE4 Bullet checkpoint raw.bin into explicit NPZ tensors."""
+"""Convert a Sykora Bullet checkpoint raw.bin into explicit NPZ tensors."""
from __future__ import annotations
@@ -14,8 +14,8 @@
from common import ( # noqa: E402
SCALE,
- V4_Q0,
- V4_Q,
+ NNUE_Q0,
+ NNUE_Q,
SYKORA16_BUCKET_LAYOUT_32,
expand_mirrored_bucket_layout,
)
@@ -23,7 +23,7 @@
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
- description="Convert a SYKNNUE4 Bullet raw checkpoint into NPZ tensors."
+ description="Convert a SYKNNUE5 Bullet raw checkpoint into NPZ tensors."
)
parser.add_argument(
"--input",
@@ -76,25 +76,29 @@ def take_f32(buf, offset: int, count: int):
def expected_raw_sizes(
- *, bucket_count: int, ft_hidden: int
+ *, bucket_count: int, ft_hidden: int, network_format: str, output_bucket_count: int
) -> dict[str, int]:
input_size = 768 * bucket_count
+ if network_format != "syk5":
+ raise ValueError(f"unsupported network format: {network_format}")
return {
- "spec_merged_ft": (
+ "syk5_output_buckets": (
input_size * ft_hidden
+ ft_hidden
- + (2 * ft_hidden)
- + 1
+ + (output_bucket_count * 2 * ft_hidden)
+ + output_bucket_count
),
}
def detect_layout(
- *, raw_len: int, bucket_count: int, ft_hidden: int
+ *, raw_len: int, bucket_count: int, ft_hidden: int, network_format: str, output_bucket_count: int
) -> str:
sizes = expected_raw_sizes(
bucket_count=bucket_count,
ft_hidden=ft_hidden,
+ network_format=network_format,
+ output_bucket_count=output_bucket_count,
)
for name, expected in sizes.items():
if raw_len == expected:
@@ -109,10 +113,10 @@ def parse_network_config(run_meta: dict) -> dict:
network = dict(run_meta.get("network", {}))
env = run_meta.get("env", {})
- network_format = network.get("format") or env.get("SYK_NETWORK_FORMAT") or "syk4"
- if network_format != "syk4":
+ network_format = network.get("format") or env.get("SYK_NETWORK_FORMAT") or "syk5"
+ if network_format != "syk5":
raise ValueError(
- f"run_meta.json does not describe a SYKNNUE4 run: {network_format!r}"
+ f"run_meta.json does not describe a SYKNNUE5 run: {network_format!r}"
)
if "bucket_layout_64" in network:
@@ -130,6 +134,11 @@ def parse_network_config(run_meta: dict) -> dict:
"format": network_format,
"bucket_layout_64": bucket_layout_64,
"ft_hidden": int(network.get("ft_hidden") or env["SYK_HIDDEN"]),
+ "output_bucket_count": int(
+ network.get("output_bucket_count")
+ or env.get("SYK_OUTPUT_BUCKETS")
+ or 8
+ ),
}
@@ -148,6 +157,8 @@ def main() -> int:
bucket_layout_64 = [int(v) for v in network["bucket_layout_64"]]
bucket_count = max(bucket_layout_64) + 1
ft_hidden = int(network["ft_hidden"])
+ network_format = str(network["format"])
+ output_bucket_count = int(network["output_bucket_count"])
input_size = 768 * bucket_count
raw = np.fromfile(raw_path, dtype=" int:
raw_len=raw.shape[0],
bucket_count=bucket_count,
ft_hidden=ft_hidden,
+ network_format=network_format,
+ output_bucket_count=output_bucket_count,
)
offset = 0
l0w, offset = take_f32(raw, offset, input_size * ft_hidden)
l0b, offset = take_f32(raw, offset, ft_hidden)
- outw, offset = take_f32(raw, offset, 2 * ft_hidden)
- outb, offset = take_f32(raw, offset, 1)
+ outw_len = output_bucket_count * 2 * ft_hidden
+ outb_len = output_bucket_count
+ outw, offset = take_f32(raw, offset, outw_len)
+ outb, offset = take_f32(raw, offset, outb_len)
if offset != raw.shape[0]:
raise ValueError(
@@ -170,8 +185,8 @@ def main() -> int:
ft_weights = l0w.reshape(input_size, ft_hidden)
ft_bias = l0b.reshape(ft_hidden)
- out_weights = outw.reshape(2 * ft_hidden)
- out_bias = outb.reshape(1)
+ out_weights = outw.reshape(output_bucket_count, 2 * ft_hidden)
+ out_bias = outb.reshape(output_bucket_count)
out_path = Path(args.output)
out_path.parent.mkdir(parents=True, exist_ok=True)
@@ -184,19 +199,21 @@ def main() -> int:
bucket_layout_64=np.asarray(bucket_layout_64, dtype=np.uint8),
feature_set=np.asarray([1], dtype=np.uint8),
input_bucket_count=np.asarray([bucket_count], dtype=np.uint8),
+ output_bucket_count=np.asarray([output_bucket_count], dtype=np.uint8),
activation_type=np.asarray([1], dtype=np.uint8),
- q0=np.asarray([V4_Q0], dtype=np.uint16),
- q=np.asarray([V4_Q], dtype=np.uint16),
+ q0=np.asarray([NNUE_Q0], dtype=np.uint16),
+ q=np.asarray([NNUE_Q], dtype=np.uint16),
scale=np.asarray([SCALE], dtype=np.uint16),
)
print(f"Input: {raw_path}")
print(f"Run metadata: {run_meta_path}")
- print("Network format: SYKNNUE4")
+ print(f"Network format: {network_format.upper()}")
print(f"Detected raw layout: {layout}")
print(f"Bucket count: {bucket_count}")
print(f"FT hidden: {ft_hidden}")
- print(f"Dense head: linear {2 * ft_hidden} -> 1")
+ print(f"Output buckets: {output_bucket_count}")
+ print(f"Dense head: bucketed linear {2 * ft_hidden} -> 1")
print(f"Wrote: {out_path}")
return 0
diff --git a/utils/nnue/bullet/export_npz_to_syk4.py b/utils/nnue/bullet/export_npz_to_syk5.py
similarity index 76%
rename from utils/nnue/bullet/export_npz_to_syk4.py
rename to utils/nnue/bullet/export_npz_to_syk5.py
index b5465d4..841fe32 100644
--- a/utils/nnue/bullet/export_npz_to_syk4.py
+++ b/utils/nnue/bullet/export_npz_to_syk5.py
@@ -1,5 +1,5 @@
#!/usr/bin/env python3
-"""Convert a float-domain NPZ checkpoint into Sykora SYKNNUE4 format."""
+"""Convert a float-domain NPZ checkpoint into Sykora SYKNNUE5 format."""
from __future__ import annotations
@@ -16,18 +16,18 @@
ACTIVATION_SCRELU,
FEATURE_SET_KING_BUCKETS_MIRRORED,
SCALE,
- V4_Q,
- V4_Q0,
+ NNUE_Q,
+ NNUE_Q0,
SYKORA16_BUCKET_LAYOUT_32,
expand_mirrored_bucket_layout,
input_size_for_feature_set,
- write_syk_nnue_v4,
+ write_syk_nnue_v5,
)
def parse_args() -> argparse.Namespace:
parser = argparse.ArgumentParser(
- description="Export NPZ checkpoint to SYKNNUE4 net."
+ description="Export NPZ checkpoint to SYKNNUE5 net."
)
parser.add_argument("--input", required=True, help="Input .npz checkpoint")
parser.add_argument("--output-net", required=True, help="Output .sknnue path")
@@ -73,7 +73,7 @@ def main() -> int:
with np.load(in_path) as ckpt:
ft_weights = np.asarray(expect_array(ckpt, "ft_weights"), dtype=np.float32)
ft_bias = np.asarray(expect_array(ckpt, "ft_bias"), dtype=np.float32).reshape(-1)
- out_weights = np.asarray(expect_array(ckpt, "out_weights"), dtype=np.float32).reshape(-1)
+ out_weights = np.asarray(expect_array(ckpt, "out_weights"), dtype=np.float32)
out_bias = np.asarray(expect_array(ckpt, "out_bias"), dtype=np.float32).reshape(-1)
if "bucket_layout_64" in ckpt:
@@ -92,8 +92,8 @@ def main() -> int:
else:
activation_type = ACTIVATION_SCRELU
- q0 = int(np.asarray(ckpt["q0"]).reshape(-1)[0]) if "q0" in ckpt else V4_Q0
- q = int(np.asarray(ckpt["q"]).reshape(-1)[0]) if "q" in ckpt else V4_Q
+ q0 = int(np.asarray(ckpt["q0"]).reshape(-1)[0]) if "q0" in ckpt else NNUE_Q0
+ q = int(np.asarray(ckpt["q"]).reshape(-1)[0]) if "q" in ckpt else NNUE_Q
scale = int(np.asarray(ckpt["scale"]).reshape(-1)[0]) if "scale" in ckpt else SCALE
if args.q0 is not None:
@@ -104,7 +104,7 @@ def main() -> int:
scale = args.scale
if feature_set != FEATURE_SET_KING_BUCKETS_MIRRORED:
- raise ValueError("SYKNNUE4 only supports king_buckets_mirrored inputs")
+ raise ValueError("SYKNNUE5 only supports king_buckets_mirrored inputs")
if len(bucket_layout) != 64:
raise ValueError(f"bucket_layout_64 must have 64 entries, got {len(bucket_layout)}")
@@ -120,22 +120,35 @@ def main() -> int:
raise ValueError(
f"ft_bias length mismatch: expected {ft_hidden_size}, got {ft_bias.shape[0]}"
)
- if out_weights.shape[0] != 2 * ft_hidden_size:
+
+ out_weights = np.asarray(out_weights, dtype=np.float32)
+ if out_weights.ndim == 1:
+ if out_bias.shape[0] <= 0:
+ raise ValueError("out_bias must contain at least one output bucket")
+ output_bucket_count = out_bias.shape[0]
+ out_weights = out_weights.reshape(output_bucket_count, 2 * ft_hidden_size)
+ elif out_weights.ndim == 2:
+ output_bucket_count = out_weights.shape[0]
+ else:
+ raise ValueError(f"out_weights must be rank-1 or rank-2, got shape {out_weights.shape}")
+
+ if output_bucket_count <= 1:
+ raise ValueError("SYKNNUE5 requires more than one output bucket")
+ if out_weights.shape != (output_bucket_count, 2 * ft_hidden_size):
raise ValueError(
- f"out_weights shape mismatch: expected {(2 * ft_hidden_size,)}, got {out_weights.shape}"
+ f"out_weights shape mismatch: expected {(output_bucket_count, 2 * ft_hidden_size)}, got {out_weights.shape}"
)
- if out_bias.shape[0] != 1:
+ if out_bias.shape[0] != output_bucket_count:
raise ValueError(
- f"out_bias length mismatch: expected 1, got {out_bias.shape[0]}"
+ f"out_bias length mismatch: expected {output_bucket_count}, got {out_bias.shape[0]}"
)
ft_bias_i16 = quantize_clipped(ft_bias, q0, -32768, 32767, np.int16)
ft_weights_i16 = quantize_clipped(
ft_weights.reshape(-1), q0, -32768, 32767, np.int16
)
-
out_bias_i32 = quantize_clipped(
- out_bias,
+ out_bias.reshape(-1),
q0 * q,
-2147483648,
2147483647,
@@ -146,26 +159,28 @@ def main() -> int:
)
out_path = Path(args.output_net)
- write_syk_nnue_v4(
+ write_syk_nnue_v5(
out_path,
ft_hidden_size=ft_hidden_size,
ft_biases_i16=ft_bias_i16.tolist(),
ft_weights_i16=ft_weights_i16.tolist(),
- out_bias_i32=int(out_bias_i32[0]),
+ out_biases_i32=out_bias_i32.tolist(),
out_weights_i16=out_weights_i16.tolist(),
activation_type=activation_type,
feature_set=feature_set,
bucket_layout_64=bucket_layout,
+ output_bucket_count=output_bucket_count,
q0=q0,
q=q,
scale=scale,
)
print(f"Input: {in_path}")
- print("Output format: SYKNNUE4")
- print(f"Bucket count: {max(bucket_layout) + 1}")
+ print("Output format: SYKNNUE5")
+ print(f"Input bucket count: {max(bucket_layout) + 1}")
+ print(f"Output bucket count: {output_bucket_count}")
print(f"FT hidden: {ft_hidden_size}")
- print(f"Dense head: linear {2 * ft_hidden_size} -> 1")
+ print(f"Dense head: bucketed linear {2 * ft_hidden_size} -> 1")
print(f"Wrote: {out_path}")
return 0
diff --git a/utils/nnue/bullet/gate_checkpoints.py b/utils/nnue/bullet/gate_checkpoints.py
index bc46f99..57e7631 100755
--- a/utils/nnue/bullet/gate_checkpoints.py
+++ b/utils/nnue/bullet/gate_checkpoints.py
@@ -29,8 +29,8 @@ def parse_args() -> argparse.Namespace:
)
parser.add_argument(
"--npz-to-net",
- default="utils/nnue/bullet/export_npz_to_syk4.py",
- help="Path to NPZ -> SYKNNUE4 exporter",
+ default="utils/nnue/bullet/export_npz_to_syk5.py",
+ help="Path to NPZ -> SYKNNUE5 exporter",
)
parser.add_argument(
"--engine", default="./zig-out/bin/sykora", help="Engine under test"
@@ -172,7 +172,7 @@ def main() -> int:
for ckpt in ckpts:
npz_out = nets_dir / f"{ckpt.name}.npz"
- net_out = nets_dir / f"{ckpt.name}.sknnue4"
+ net_out = nets_dir / f"{ckpt.name}.sknnue"
run_capture(
[
sys.executable,
diff --git a/utils/nnue/bullet/train_cuda_longrun.py b/utils/nnue/bullet/train_cuda_longrun.py
index 802cd4a..6c4cc6a 100755
--- a/utils/nnue/bullet/train_cuda_longrun.py
+++ b/utils/nnue/bullet/train_cuda_longrun.py
@@ -65,8 +65,8 @@ def parse_args() -> argparse.Namespace:
)
parser.add_argument(
"--network-format",
- choices=["syk4"],
- default="syk4",
+ choices=["syk5"],
+ default="syk5",
help="Training network format",
)
parser.add_argument(
@@ -97,6 +97,12 @@ def parse_args() -> argparse.Namespace:
parser.add_argument(
"--threads", type=int, default=8, help="Bullet training/data threads"
)
+ parser.add_argument(
+ "--output-buckets",
+ type=int,
+ default=8,
+ help="SYKNNUE5 material-count output buckets (currently fixed at 8)",
+ )
# Data format
parser.add_argument(
@@ -163,6 +169,9 @@ def main() -> int:
if args.lr_start <= 0:
print("--lr-start must be > 0", file=sys.stderr)
return 2
+ if args.output_buckets != 8:
+ print("SYKNNUE5 currently supports exactly 8 output buckets", file=sys.stderr)
+ return 2
if args.save_rate <= 0 or args.threads <= 0:
print("--save-rate and --threads must be > 0", file=sys.stderr)
return 2
@@ -191,6 +200,7 @@ def main() -> int:
"SYK_WDL": str(args.wdl),
"SYK_SAVE_RATE": str(args.save_rate),
"SYK_THREADS": str(args.threads),
+ "SYK_OUTPUT_BUCKETS": str(args.output_buckets),
"SYK_OUTPUT_DIR": str(ckpt_dir.resolve()),
"SYK_NET_ID": run_id,
"SYK_DATA_FORMAT": data_format,
@@ -219,7 +229,8 @@ def main() -> int:
"bucket_layout_64": bucket_layout_64(args.bucket_layout),
"ft_hidden": args.hidden,
"hidden_activation": "screlu",
- "head": "shared_linear",
+ "head": "material_count_output_buckets",
+ "output_bucket_count": args.output_buckets,
},
"env": {
"SYK_DATASET": env["SYK_DATASET"],
@@ -233,6 +244,7 @@ def main() -> int:
"SYK_WDL": env["SYK_WDL"],
"SYK_SAVE_RATE": env["SYK_SAVE_RATE"],
"SYK_THREADS": env["SYK_THREADS"],
+ "SYK_OUTPUT_BUCKETS": env["SYK_OUTPUT_BUCKETS"],
"SYK_OUTPUT_DIR": env["SYK_OUTPUT_DIR"],
"SYK_NET_ID": env["SYK_NET_ID"],
"SYK_DATA_FORMAT": env["SYK_DATA_FORMAT"],
diff --git a/utils/nnue/bullet_runner/src/main.rs b/utils/nnue/bullet_runner/src/main.rs
index 6dab11d..a71b81b 100644
--- a/utils/nnue/bullet_runner/src/main.rs
+++ b/utils/nnue/bullet_runner/src/main.rs
@@ -1,18 +1,16 @@
use bullet_lib::{
game::{
formats::sfbinpack::TrainingDataEntry,
- inputs::{get_num_buckets, ChessBucketsMirrored},
- },
- nn::{
- optimiser::{AdamW, AdamWParams},
- InitSettings, Shape,
+ inputs::{ChessBucketsMirrored, get_num_buckets},
+ outputs::MaterialCount,
},
+ nn::optimiser::{AdamW, AdamWParams},
trainer::{
save::SavedFormat,
- schedule::{lr, wdl, TrainingSchedule, TrainingSteps},
+ schedule::{TrainingSchedule, TrainingSteps, lr, wdl},
settings::LocalSettings,
},
- value::{loader::DirectSequentialDataLoader, ValueTrainerBuilder},
+ value::{ValueTrainerBuilder, loader::DirectSequentialDataLoader},
};
use std::env;
@@ -27,7 +25,7 @@ const BUCKET_LAYOUT_SYKORA16: [usize; 32] = [
12, 12, 13, 13,
14, 14, 15, 15,
];
-
+const SYK5_OUTPUT_BUCKETS: usize = 8;
fn env_usize(name: &str, default: usize) -> usize {
env::var(name)
@@ -61,7 +59,7 @@ fn binpack_filter(entry: &TrainingDataEntry) -> bool {
&& entry.score.unsigned_abs() <= 10000
}
-fn run_syk4(
+fn run_syk5(
bucket_layout: [usize; 32],
num_input_buckets: usize,
dataset_paths: &[&str],
@@ -82,38 +80,25 @@ fn run_syk4(
.dual_perspective()
.optimiser(AdamW)
.inputs(ChessBucketsMirrored::new(bucket_layout))
+ .output_buckets(MaterialCount::<SYK5_OUTPUT_BUCKETS>)
.use_threads(threads)
.save_format(&[
- SavedFormat::id("l0w")
- .transform(move |store, weights| {
- let factoriser = store.get("l0f").values.repeat(num_input_buckets);
- weights
- .into_iter()
- .zip(factoriser)
- .map(|(a, b)| a + b)
- .collect()
- })
- .round()
- .quantise::(255),
+ SavedFormat::id("l0w").round().quantise::<i16>(255),
SavedFormat::id("l0b").round().quantise::<i16>(255),
SavedFormat::id("outw").round().quantise::<i16>(64),
SavedFormat::id("outb").round().quantise::(255 * 64),
])
.loss_fn(|output, target| output.sigmoid().squared_error(target))
- .build(|builder, stm_inputs, ntm_inputs| {
- let l0f = builder.new_weights("l0f", Shape::new(hl_size, 768), InitSettings::Zeroed);
- let expanded_factoriser = l0f.repeat(num_input_buckets);
-
- let mut l0 = builder.new_affine("l0", 768 * num_input_buckets, hl_size);
+ .build(|builder, stm_inputs, ntm_inputs, output_buckets| {
+ let l0 = builder.new_affine("l0", 768 * num_input_buckets, hl_size);
l0.init_with_effective_input_size(32);
- l0.weights = l0.weights + expanded_factoriser;
- let out = builder.new_affine("out", 2 * hl_size, 1);
+ let out = builder.new_affine("out", 2 * hl_size, SYK5_OUTPUT_BUCKETS);
let stm_hidden = l0.forward(stm_inputs).screlu();
let ntm_hidden = l0.forward(ntm_inputs).screlu();
let hidden = stm_hidden.concat(ntm_hidden);
- out.forward(hidden)
+ out.forward(hidden).select(output_buckets)
});
let stricter_clipping = AdamWParams {
@@ -124,9 +109,6 @@ fn run_syk4(
trainer
.optimiser
.set_params_for_weight("l0w", stricter_clipping);
- trainer
- .optimiser
- .set_params_for_weight("l0f", stricter_clipping);
let schedule = TrainingSchedule {
net_id,
@@ -173,11 +155,11 @@ fn run_syk4(
binpack_buffer_mb, binpack_threads
);
println!(
- "Input layout: mirrored king buckets ({} buckets), shared head",
- num_input_buckets
+ "Input layout: mirrored king buckets ({} buckets), material output buckets ({})",
+ num_input_buckets, SYK5_OUTPUT_BUCKETS
);
println!("FT width: {} per perspective", hl_size);
- println!("Dense head: linear {} -> 1", 2 * hl_size);
+ println!("Dense head: bucketed linear {} -> 1", 2 * hl_size);
for p in dataset_paths {
println!(" Dataset: {}", p);
}
@@ -195,11 +177,11 @@ fn run_syk4(
_ => {
println!("Using DirectSequentialDataLoader (bullet format)");
println!(
- "Input layout: mirrored king buckets ({} buckets), shared head",
- num_input_buckets
+ "Input layout: mirrored king buckets ({} buckets), material output buckets ({})",
+ num_input_buckets, SYK5_OUTPUT_BUCKETS
);
println!("FT width: {} per perspective", hl_size);
- println!("Dense head: linear {} -> 1", 2 * hl_size);
+ println!("Dense head: bucketed linear {} -> 1", 2 * hl_size);
for p in dataset_paths {
println!(" Dataset: {}", p);
}
@@ -223,8 +205,8 @@ fn main() {
let net_id = env_string("SYK_NET_ID", "sykora_bucketed");
let resume_from = env::var("SYK_RESUME").ok();
let data_format = env_string("SYK_DATA_FORMAT", "bullet");
- let network_format = env_string("SYK_NETWORK_FORMAT", "syk4");
- let hl_size = env_usize("SYK_HIDDEN", 768);
+ let network_format = env_string("SYK_NETWORK_FORMAT", "syk5");
+ let hl_size = env_usize("SYK_HIDDEN", 512);
let bucket_layout_name = env_string("SYK_BUCKET_LAYOUT", "sykora16");
let bucket_layout = selected_bucket_layout(&bucket_layout_name);
@@ -235,11 +217,11 @@ fn main() {
println!("Network format: {}", network_format);
println!("Bucket layout: {}", bucket_layout_name);
- if network_format != "syk4" {
+ if network_format != "syk5" {
panic!("unsupported network format: {network_format}");
}
- run_syk4(
+ run_syk5(
bucket_layout,
num_input_buckets,
&dataset_paths,
diff --git a/utils/nnue/common.py b/utils/nnue/common.py
index 83bb703..89eb6f6 100644
--- a/utils/nnue/common.py
+++ b/utils/nnue/common.py
@@ -11,15 +11,11 @@
LEGACY_INPUT_SIZE = 768
-QA = 255
-QB = 64
-V4_Q0 = 255
-V4_Q = 64
+NNUE_Q0 = 255
+NNUE_Q = 64
SCALE = 400
-MAGIC_V3 = b"SYKNNUE3"
-FORMAT_VERSION_V3 = 3
-MAGIC_V4 = b"SYKNNUE4"
-FORMAT_VERSION_V4 = 4
+MAGIC_V5 = b"SYKNNUE5"
+FORMAT_VERSION_V5 = 5
FEATURE_SET_LEGACY = 0
FEATURE_SET_KING_BUCKETS_MIRRORED = 1
@@ -27,17 +23,6 @@
ACTIVATION_RELU = 0
ACTIVATION_SCRELU = 1
-SYKORA_BUCKET_LAYOUT_32 = [
- 0, 1, 2, 3,
- 4, 4, 5, 5,
- 6, 6, 6, 6,
- 7, 7, 7, 7,
- 8, 8, 8, 8,
- 8, 8, 8, 8,
- 9, 9, 9, 9,
- 9, 9, 9, 9,
-]
-
SYKORA16_BUCKET_LAYOUT_32 = [
0, 0, 1, 1,
2, 2, 3, 3,
@@ -170,91 +155,30 @@ def _pack_i32(values: Iterable[int]) -> bytes:
return b"".join(struct.pack(" bytes:
- return b"".join(struct.pack(" None:
- if hidden_size <= 0:
- raise ValueError("hidden_size must be > 0")
- if len(input_biases_i16) != hidden_size:
- raise ValueError("input_biases length mismatch")
- input_size = input_size_for_feature_set(feature_set, bucket_layout_64)
- if len(input_weights_i16) != input_size * hidden_size:
- raise ValueError("input_weights length mismatch")
- if len(output_weights_i16) != 2 * hidden_size:
- raise ValueError("output_weights length mismatch")
-
- if feature_set == FEATURE_SET_LEGACY:
- bucket_layout_64 = [0] * 64
- elif bucket_layout_64 is None or len(bucket_layout_64) != 64:
- raise ValueError("bucket_layout_64 must contain exactly 64 entries")
-
- bucket_count = num_buckets(bucket_layout_64)
-
- path.parent.mkdir(parents=True, exist_ok=True)
- with path.open("wb") as handle:
- handle.write(MAGIC_V3)
- handle.write(struct.pack(" int:
- input_size = input_size_for_feature_set(feature_set, bucket_layout_64)
- h = ft_hidden_size
- return (
- 2 * h
- + 2 * input_size * h
- + 4
- + 4 * h
- )
-
-
-def write_syk_nnue_v4(
+def write_syk_nnue_v5(
path: Path,
*,
ft_hidden_size: int,
ft_biases_i16: List[int],
ft_weights_i16: List[int],
- out_bias_i32: int,
+ out_biases_i32: List[int],
out_weights_i16: List[int],
activation_type: int = ACTIVATION_SCRELU,
feature_set: int = FEATURE_SET_KING_BUCKETS_MIRRORED,
bucket_layout_64: List[int] | None = None,
- q0: int = V4_Q0,
- q: int = V4_Q,
+ output_bucket_count: int = 8,
+ q0: int = NNUE_Q0,
+ q: int = NNUE_Q,
scale: int = SCALE,
) -> None:
if feature_set != FEATURE_SET_KING_BUCKETS_MIRRORED:
- raise ValueError("SYKNNUE4 currently requires king_buckets_mirrored inputs")
+ raise ValueError("SYKNNUE5 currently requires king_buckets_mirrored inputs")
if bucket_layout_64 is None or len(bucket_layout_64) != 64:
raise ValueError("bucket_layout_64 must contain exactly 64 entries")
if ft_hidden_size <= 0:
raise ValueError("ft_hidden_size must be > 0")
+ if output_bucket_count <= 0 or output_bucket_count > 255:
+ raise ValueError("output_bucket_count must be in 1..255")
if activation_type not in (ACTIVATION_RELU, ACTIVATION_SCRELU):
raise ValueError("unsupported activation_type")
@@ -266,22 +190,25 @@ def write_syk_nnue_v4(
raise ValueError("ft_biases length mismatch")
if len(ft_weights_i16) != input_size * h:
raise ValueError("ft_weights length mismatch")
- if len(out_weights_i16) != 2 * h:
+ if len(out_biases_i32) != output_bucket_count:
+ raise ValueError("out_biases length mismatch")
+ if len(out_weights_i16) != output_bucket_count * 2 * h:
raise ValueError("out_weights length mismatch")
path.parent.mkdir(parents=True, exist_ok=True)
with path.open("wb") as handle:
- handle.write(MAGIC_V4)
- handle.write(struct.pack("