diff --git a/README.md b/README.md index a72529d..f8e238b 100644 --- a/README.md +++ b/README.md @@ -92,8 +92,8 @@ Sykora is tested by [CCRL](https://computerchess.org.uk/ccrl/404/). Current entr Evaluation: NNUE (default) with classical fallback - **NNUE evaluation** (default, embedded in binary): - - `SYKNNUE3` and `SYKNNUE4` network loading - - Legacy `768 -> Nx2 -> 1` and mirrored king-bucketed sparse-input nets + - `SYKNNUE5` material-output-bucket nets, with compatibility for the current embedded net + - Mirrored king-bucketed sparse-input nets - SCReLU activation with incremental accumulators during search - Trained on high-depth self-play data via the Bullet trainer - King-bucket training path via `nnue/bullet_repo/examples/sykora_bucketed.rs` @@ -260,7 +260,7 @@ See `history/README.md` for folder schema and the archived workflow. ## NNUE -Sykora supports both legacy `768 -> Nx2 -> 1` nets and mirrored king-bucketed nets with dual-perspective accumulator updates and SCReLU activation. The engine can load both `SYKNNUE3` and `SYKNNUE4` files. +Sykora's current training target is `SYKNNUE5`: mirrored king-bucketed sparse inputs with dual-perspective accumulator updates, SCReLU activation, and material-count output buckets. The engine still keeps loader compatibility for the current embedded net until `src/net.sknnue` is replaced. ### Runtime @@ -270,7 +270,7 @@ Sykora supports both legacy `768 -> Nx2 -> 1` nets and mirrored king-bucketed ne - To use a different net, set `EvalFile` to the path of an external `.sknnue` file. - `NnueScale` scales the NNUE score before it is fed into the search. -For exact file-format details, see [specs/syknnue4_spec.md](specs/syknnue4_spec.md) and `src/nnue.zig`. +For exact file-format details, see [specs/syknnue5_spec.md](specs/syknnue5_spec.md) and `src/nnue.zig`. 
### Training Pipeline @@ -284,8 +284,9 @@ python utils/nnue/bullet/train_cuda_longrun.py \ --data-format binpack \ --bullet-repo nnue/bullet_repo \ --output-root nnue/models/bullet \ - --network-format syk3 \ - --hidden 256 --end-superbatch 320 --threads 8 + --network-format syk5 \ + --bucket-layout sykora16 \ + --hidden 512 --end-superbatch 320 --threads 8 ``` **Using BulletFormat .data files:** @@ -295,8 +296,9 @@ python utils/nnue/bullet/train_cuda_longrun.py \ --dataset nnue/data/bullet/train/train_main.data \ --bullet-repo nnue/bullet_repo \ --output-root nnue/models/bullet \ - --network-format syk3 \ - --hidden 256 --end-superbatch 320 --threads 8 + --network-format syk5 \ + --bucket-layout sykora16 \ + --hidden 512 --end-superbatch 320 --threads 8 ``` **Multiple datasets** can be passed space-separated: @@ -319,16 +321,15 @@ python utils/nnue/bullet/train_cuda_longrun.py \ ... ``` -**Training a `SYKNNUE4` baseline:** +**Training a `SYKNNUE5` material-output-bucket net:** ```bash python utils/nnue/bullet/train_cuda_longrun.py \ --dataset data/training.binpack \ --data-format binpack \ - --network-format syk4 \ + --network-format syk5 \ --bucket-layout sykora16 \ - --hidden 1536 \ - --dense-l1 16 --dense-l2 32 \ + --hidden 512 \ --end-superbatch 320 --threads 8 ``` @@ -343,15 +344,15 @@ Sykora can generate its own training data via the `gensfen` command: ### Exporting a Trained Net -Export a `SYKNNUE4` checkpoint: +Export a `SYKNNUE5` checkpoint: ```bash python utils/nnue/bullet/checkpoint_raw_to_npz.py \ --input nnue/models/bullet//checkpoints/ \ - --output checkpoint_syk4.npz + --output checkpoint_syk5.npz -python utils/nnue/bullet/export_npz_to_syk4.py \ - --input checkpoint_syk4.npz \ +python utils/nnue/bullet/export_npz_to_syk5.py \ + --input checkpoint_syk5.npz \ --output-net output.sknnue ``` @@ -379,7 +380,7 @@ python utils/nnue/bullet/gate_checkpoints.py \ This gate now evaluates recent checkpoints by selfplay only. 
STS is intentionally not part of the checkpoint promotion path. -SYKNNUE4 design spec: `specs/syknnue4_spec.md`. +SYKNNUE5 design spec: `specs/syknnue5_spec.md`. ## Contributing diff --git a/launch_training.ps1 b/launch_training.ps1 index 3cb48b2..40de935 100644 --- a/launch_training.ps1 +++ b/launch_training.ps1 @@ -1,4 +1,4 @@ -# Sykora NNUE V4 Training Launch Script +# Sykora NNUE V5 Training Launch Script # Run from project root: .\launch_training.ps1 # # Dataset: T80-2023 (jun-dec) + T80-2024 (jan-jun) .min-v2.v6 binpacks @@ -60,11 +60,12 @@ foreach ($bp in $binpacks) { } # --- Training Parameters --- -# SYKNNUE4 baseline: -# mirrored king buckets (sykora16) -> FT 768 -> shared linear output -$networkFormat = "syk4" +# SYKNNUE5: +# mirrored king buckets (sykora16) -> FT 512 -> 8 material-count output heads +$networkFormat = "syk5" $bucketLayout = "sykora16" -$hidden = 768 +$hidden = 512 +$outputBuckets = 8 $endSuperbatch = 600 $lrStart = 0.001 $wdl = 0.25 @@ -72,7 +73,7 @@ $saveRate = 10 $threads = 8 Write-Host "============================================" -Write-Host " Sykora NNUE V4 Training (RTX 4070 Ti SUPER)" +Write-Host " Sykora NNUE V5 Training (RTX 4070 Ti SUPER)" Write-Host "============================================" Write-Host "Data: T80-2023/2024 filtered set" Write-Host "Filtering: .min-v2.v6 on T80 inputs" @@ -81,7 +82,8 @@ Write-Host "Format: binpack (sfbinpack)" Write-Host "Net format: $networkFormat" Write-Host "Bucket layout: $bucketLayout" Write-Host "FT hidden: $hidden" -Write-Host "Dense head: linear $($hidden * 2) -> 1" +Write-Host "Output heads: $outputBuckets material-count buckets" +Write-Host "Dense head: bucketed linear $($hidden * 2) -> 1" Write-Host "Superbatches: 1 -> $endSuperbatch" Write-Host "Save rate: every $saveRate superbatches" Write-Host "Threads: $threads" @@ -104,6 +106,7 @@ python "$PSScriptRoot\utils\nnue\bullet\train_cuda_longrun.py" ` --network-format $networkFormat ` --bucket-layout $bucketLayout ` --hidden 
$hidden ` + --output-buckets $outputBuckets ` --end-superbatch $endSuperbatch ` --save-rate $saveRate ` --threads $threads ` diff --git a/specs/syknnue4_spec.md b/specs/syknnue4_spec.md deleted file mode 100644 index da5de16..0000000 --- a/specs/syknnue4_spec.md +++ /dev/null @@ -1,342 +0,0 @@ -# SYKNNUE4 Design Spec - -## Goal - -`SYKNNUE4` is the simple, stable baseline Sykora NNUE format. - -The design goal is: - -- keep the sparse incremental part large -- keep the head shared -- stay close to the already-working v3 math -- make the file format self-describing for mirrored king-bucket inputs - -The baseline `SYKNNUE4` net is: - -```text -king_buckets_mirrored(16 buckets) --> shared sparse FT, width 768, two color-fixed accumulators --> concat(screlu(A_us), screlu(A_them)) # 1536 inputs --> shared linear output -``` - -Short form: - -```text -shared FT: 12288 -> 768, color-fixed dual perspective --> concat(us, them) -> 1 -``` - -This is intentionally a monotonic upgrade from the v3 family: - -- same shared-head philosophy -- same SCReLU inference contract -- wider FT -- explicit mirrored king-bucket layout stored in the file - -## Non-Goals - -The first `SYKNNUE4` implementation should not include: - -- multiple output heads -- multi-layer dense heads -- PSQT side channels -- product pooling -- mixed float/int inference -- approximate rescale rules in the reference path - -## Architecture - -### Inputs - -- Feature set: `king_buckets_mirrored` -- Per-bucket base feature size: `768` -- Default input bucket count: `16` -- Bucket layout: stored explicitly in the file -- Horizontal mirroring: enabled -- Training-only factorization is allowed, but exported nets must contain merged - sparse weights only - -Per perspective: - -```text -INPUT_SIZE = 768 -INPUT_BUCKET_COUNT = 16 -HORIZONTAL_MIRRORING = true -``` - -Feature indexing is defined for color-fixed perspectives `white` and `black`, -not for side-to-move / side-not-to-move. 
- -For a perspective `p`: - -```text -feature = - king_bucket(p.king_sq) * 768 - + relative_color(piece, p) * (6 * 64) - + piece_type * 64 - + mirrored_square(p.king_sq, sq) -``` - -### Sparse Transformer - -The sparse transformer is: - -```text -SparseAffine(768, 768) per king bucket -``` - -Maintain two color-fixed accumulators: - -- `A_white[768]` -- `A_black[768]` - -For the reference implementation, store these accumulators as `i32`. - -At evaluation time: - -```text -if side_to_move == white: - A_us = A_white - A_them = A_black -else: - A_us = A_black - A_them = A_white -``` - -### Hidden Activation - -For each hidden accumulator entry: - -```text -u = clamp(A_us[i], 0, Q0) -t = clamp(A_them[i], 0, Q0) -``` - -Apply the activation selected by `activation_type`: - -- `0 = ReLU` -- `1 = SCReLU` - -Baseline `SYKNNUE4` uses `SCReLU`. - -For `SCReLU`: - -```text -U[i] = u * u -T[i] = t * t -``` - -Concatenate: - -```text -X = [U, T] -``` - -So: - -- `X` has length `2 * H` -- with the baseline `H = 768`, `X` has length `1536` -- each entry is in the `Q0^2` domain for `SCReLU` - -### Output Head - -The output head is shared. There are no phase-specific output stacks. - -```text -Out: Affine(2 * H, 1) -``` - -## Quantization Contract - -Use the following constants: - -```text -Q0 = 255 -Q = 64 -SCALE = 400 -``` - -Interpretation: - -- `Q0`: sparse hidden clamp / scale -- `Q`: output-weight scale -- `SCALE`: final centipawn conversion - -All float-to-int quantization in this spec uses: - -```text -quantize_round(x, scale) = - if x >= 0: - floor(x * scale + 0.5) - else: - -floor((-x) * scale + 0.5) -``` - -This is round-to-nearest with ties away from zero. 
- -### Hidden FT Storage - -Export the sparse branch as: - -- hidden biases: `i16` -- hidden weights: `i16` - -Quantization: - -```text -hidden_bias_int = quantize_round(hidden_bias_float, Q0) -hidden_weight_int = quantize_round(hidden_weight_float, Q0) -``` - -### Output Head Storage - -Export the shared output head as: - -- output weights: `i16` -- output bias: `i32` - -Quantization: - -```text -out_weight_int = quantize_round(out_weight_float, Q) -out_bias_int = quantize_round(out_bias_float, Q0 * Q) -``` - -## Integer Inference Contract - -### Hidden Accumulators - -The reference accumulator update path sums stored sparse integers directly: - -```text -A_white[i] = hidden_bias_int[i] + sum(active white-perspective feature weights) -A_black[i] = hidden_bias_int[i] + sum(active black-perspective feature weights) -``` - -### Output Evaluation - -For `SCReLU`: - -```text -sum_int = - Σ_i (clamp(A_us[i], 0, Q0)^2 * out_weight_int[i]) - + Σ_i (clamp(A_them[i], 0, Q0)^2 * out_weight_int[H + i]) -``` - -Rescale by one factor of `Q0` before adding bias: - -```text -sum_rescaled = div_round_nearest_signed(sum_int, Q0) -z_int = sum_rescaled + out_bias_int -``` - -Convert to centipawns: - -```text -eval_cp = div_round_nearest_signed(z_int * SCALE, Q0 * Q) -``` - -For `ReLU`, omit the squaring and the intermediate `/Q0` rescale: - -```text -sum_int = - Σ_i (clamp(A_us[i], 0, Q0) * out_weight_int[i]) - + Σ_i (clamp(A_them[i], 0, Q0) * out_weight_int[H + i]) - -z_int = sum_int + out_bias_int -eval_cp = div_round_nearest_signed(z_int * SCALE, Q0 * Q) -``` - -### Signed Division - -The reference path uses signed round-to-nearest: - -```text -div_round_nearest_signed(x, d) = - if x >= 0: - (x + d / 2) / d - else: - -(((-x) + d / 2) / d) -``` - -This is the reference contract to match across trainer, exporter, and runtime. - -## File Format - -All integers are little-endian. 
- -### Header - -```text -u8[8] magic = "SYKNNUE4" -u16 format_version = 4 -u8 feature_set = 1 # king_buckets_mirrored -u16 ft_hidden_size # baseline 768 -u8 activation_type # baseline 1 = SCReLU -u8 input_bucket_count # baseline 16 -u16 q0 # baseline 255 -u16 q # baseline 64 -u16 scale # baseline 400 -u8[64] bucket_layout_64 -``` - -### Payload - -Let: - -- `I = 768 * input_bucket_count` -- `H = ft_hidden_size` - -Payload order: - -```text -i32 output_bias -i16[H] ft_biases -i16[I * H] ft_weights -i16[2 * H] output_weights -``` - -Weight order: - -- `ft_weights[input_feature][hidden]` -- `output_weights[0..H]` are `us` -- `output_weights[H..2H]` are `them` - -## Loader Validation - -A loader should reject nets where: - -- `magic != "SYKNNUE4"` -- `format_version != 4` -- `feature_set != 1` -- `ft_hidden_size == 0` -- `input_bucket_count == 0` -- any `bucket_layout_64` entry is `>= input_bucket_count` -- `q0 == 0` -- `q == 0` -- `scale == 0` -- payload size does not match the header - -## Baseline Defaults - -Baseline values: - -```text -feature_set = king_buckets_mirrored -input_bucket_count = 16 -ft_hidden_size = 768 -activation_type = SCReLU -q0 = 255 -q = 64 -scale = 400 -``` - -## Reference Implementation Priorities - -If implementing or training this architecture, the recommended order is: - -1. make the sparse update path correct -2. make exporter and runtime agree bit-for-bit on fixed FENs -3. validate the shared-head model against v3-like sanity positions -4. only then consider widening the FT or adding extra head complexity diff --git a/specs/syknnue5_spec.md b/specs/syknnue5_spec.md new file mode 100644 index 0000000..45b9f48 --- /dev/null +++ b/specs/syknnue5_spec.md @@ -0,0 +1,83 @@ +# SYKNNUE5 Design Spec + +`SYKNNUE5` is Sykora's current king-bucketed SCReLU training target with +material-count output buckets. 
+ +## Architecture + +```text +king_buckets_mirrored(16 buckets) +-> shared sparse FT, H hidden units, color-fixed dual perspective +-> concat(screlu(A_us), screlu(A_them)) +-> material-count bucketed linear output head +``` + +The first intended training target is: + +```text +shared FT: 12288 -> 512 +-> concat(us, them): 1024 +-> 8 material-count output heads +``` + +`H = 768` is the larger follow-up target. + +## Output Buckets + +The output bucket selector matches Bullet's `MaterialCount<8>`: + +```text +piece_count = popcount(occupied) +non_king_count = piece_count - 2 +divisor = ceil(32 / output_bucket_count) +output_bucket = min(floor(non_king_count / divisor), output_bucket_count - 1) +``` + +With the default `output_bucket_count = 8`, the divisor is `4`. + +## File Format + +All integers are little-endian. + +```text +u8[8] magic = "SYKNNUE5" +u16 version = 5 +u8 feature_set = 1 # king_buckets_mirrored +u16 ft_hidden_size = H +u8 activation_type # 0 = ReLU, 1 = SCReLU +u8 input_bucket_count +u8 output_bucket_count +u16 q0 +u16 q +u16 scale +u8[64] bucket_layout +i16[H] ft_biases +i16[input_bucket_count * 768 * H] ft_weights +i32[output_bucket_count] output_biases +i16[output_bucket_count * 2 * H] output_weights +``` + +`output_weights` are bucket-major. For bucket `b`, the slice is: + +```text +output_weights[b * 2H .. (b + 1) * 2H] +``` + +The first `H` weights apply to `A_us`; the second `H` apply to `A_them`. 
+ +## Quantization + +The baseline constants are: + +```text +Q0 = 255 +Q = 64 +SCALE = 400 +``` + +SCReLU output is divided by `Q0` before adding the selected output bias, then the +final score is converted to centipawns with: + +```text +score = round(sum * SCALE / (Q0 * Q)) +``` diff --git a/src/nnue.zig b/src/nnue.zig index ecdb0cd..62fef27 100644 --- a/src/nnue.zig +++ b/src/nnue.zig @@ -13,9 +13,9 @@ pub const SCALE: i32 = 400; const MAX_NETWORK_BYTES = 64 * 1024 * 1024; const MAGIC_V3 = "SYKNNUE3"; -const MAGIC_V4 = "SYKNNUE4"; +const MAGIC_V5 = "SYKNNUE5"; const FORMAT_VERSION_V3: u16 = 3; -const FORMAT_VERSION_V4: u16 = 4; +const FORMAT_VERSION_V5: u16 = 5; pub const FeatureSet = enum(u8) { legacy_psqt = 0, @@ -43,13 +43,14 @@ pub const Network = struct { output_bias: i32, }; - pub const V4Head = struct { + pub const V5Head = struct { activation_type: u8, // 0 = ReLU, 1 = SCReLU q0: u16, q: u16, scale: u16, - output_weights: []i16, // [2 * H] - output_bias: i32, + output_bucket_count: u8, + output_weights: []i16, // [output_bucket_count * 2 * H], bucket-major + output_biases: []i32, // [output_bucket_count] }; allocator: std.mem.Allocator, @@ -61,7 +62,7 @@ pub const Network = struct { ft_weights: []i16, head: union(enum) { v3: V3Head, - v4: V4Head, + v5: V5Head, }, pub fn deinit(self: *Network) void { @@ -71,8 +72,9 @@ pub const Network = struct { .v3 => |v3| { self.allocator.free(v3.output_weights); }, - .v4 => |v4| { - self.allocator.free(v4.output_weights); + .v5 => |v5| { + self.allocator.free(v5.output_weights); + self.allocator.free(v5.output_biases); }, } } @@ -83,8 +85,8 @@ pub const Network = struct { if (std.mem.eql(u8, data[0..8], MAGIC_V3)) { return loadFromBytesV3(allocator, data); } - if (std.mem.eql(u8, data[0..8], MAGIC_V4)) { - return loadFromBytesV4(allocator, data); + if (std.mem.eql(u8, data[0..8], MAGIC_V5)) { + return loadFromBytesV5(allocator, data); } return error.UnsupportedVersion; } @@ -144,21 +146,29 @@ fn checkedAddU64(a: u64, 
b: u64) ?u64 { return std.math.add(u64, a, b) catch null; } -fn computeV4PayloadBytes( +fn computeV5PayloadBytes( input_size: usize, ft_hidden_size: usize, + output_bucket_count: usize, ) ?u64 { var total: u64 = 0; - const ft_bias_bytes = checkedMulU64(@as(u64, @intCast(ft_hidden_size)), @sizeOf(i16)) orelse return null; + const hidden_size_u64: u64 = @intCast(ft_hidden_size); + const ft_bias_bytes = checkedMulU64(hidden_size_u64, @sizeOf(i16)) orelse return null; total = checkedAddU64(total, ft_bias_bytes) orelse return null; - const ft_weight_count = checkedMulU64(@as(u64, @intCast(input_size)), @as(u64, @intCast(ft_hidden_size))) orelse return null; + const ft_weight_count = checkedMulU64(@as(u64, @intCast(input_size)), hidden_size_u64) orelse return null; total = checkedAddU64(total, checkedMulU64(ft_weight_count, @sizeOf(i16)) orelse return null) orelse return null; - total = checkedAddU64(total, @sizeOf(i32)) orelse return null; - const out_weight_count = checkedMulU64(2, @as(u64, @intCast(ft_hidden_size))) orelse return null; - total = checkedAddU64(total, checkedMulU64(out_weight_count, @sizeOf(i16)) orelse return null) orelse return null; + const bias_bytes = checkedMulU64(@as(u64, @intCast(output_bucket_count)), @sizeOf(i32)) orelse return null; + total = checkedAddU64(total, bias_bytes) orelse return null; + + const single_head_weight_count = checkedMulU64(2, hidden_size_u64) orelse return null; + const output_weight_count = checkedMulU64( + @as(u64, @intCast(output_bucket_count)), + single_head_weight_count, + ) orelse return null; + total = checkedAddU64(total, checkedMulU64(output_weight_count, @sizeOf(i16)) orelse return null) orelse return null; return total; } @@ -229,11 +239,11 @@ fn loadFromBytesV3(allocator: std.mem.Allocator, data: []const u8) LoadError!Net }; } -fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Network { +fn loadFromBytesV5(allocator: std.mem.Allocator, data: []const u8) LoadError!Network { var pos: 
usize = 8; const version = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork; - if (version != FORMAT_VERSION_V4) return error.UnsupportedVersion; + if (version != FORMAT_VERSION_V5) return error.UnsupportedVersion; if (pos >= data.len) return error.InvalidNetwork; const feature_set = std.meta.intToEnum(FeatureSet, data[pos]) catch return error.InvalidNetwork; @@ -255,6 +265,11 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net pos += 1; if (bucket_count == 0) return error.InvalidNetwork; + if (pos >= data.len) return error.InvalidNetwork; + const output_bucket_count = data[pos]; + pos += 1; + if (output_bucket_count == 0) return error.InvalidNetwork; + const q0 = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork; const q = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork; const scale = readBytesInt(u16, data, &pos) orelse return error.InvalidNetwork; @@ -269,21 +284,24 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net } const input_size = LEGACY_INPUT_SIZE * @as(usize, bucket_count); - const payload_size = computeV4PayloadBytes( + const payload_size = computeV5PayloadBytes( input_size, ft_hidden_size, + output_bucket_count, ) orelse return error.InvalidNetwork; const expected_size = checkedAddU64(@as(u64, @intCast(pos)), payload_size) orelse return error.InvalidNetwork; if (expected_size != data.len) return error.InvalidNetwork; - const output_bias = readBytesInt(i32, data, &pos) orelse return error.InvalidNetwork; const ft_biases = try allocAndReadInts(i16, allocator, data, &pos, ft_hidden_size); errdefer allocator.free(ft_biases); const ft_weights = try allocAndReadInts(i16, allocator, data, &pos, input_size * ft_hidden_size); errdefer allocator.free(ft_weights); - const output_weights = try allocAndReadInts(i16, allocator, data, &pos, 2 * ft_hidden_size); + const output_biases = try allocAndReadInts(i32, allocator, data, &pos, output_bucket_count); + 
errdefer allocator.free(output_biases); + + const output_weights = try allocAndReadInts(i16, allocator, data, &pos, output_bucket_count * 2 * ft_hidden_size); errdefer allocator.free(output_weights); if (pos != data.len) return error.InvalidNetwork; @@ -297,13 +315,14 @@ fn loadFromBytesV4(allocator: std.mem.Allocator, data: []const u8) LoadError!Net .ft_biases = ft_biases, .ft_weights = ft_weights, .head = .{ - .v4 = .{ + .v5 = .{ .activation_type = activation_type, .q0 = q0, .q = q, .scale = scale, + .output_bucket_count = output_bucket_count, .output_weights = output_weights, - .output_bias = output_bias, + .output_biases = output_biases, }, }, }; @@ -1043,10 +1062,19 @@ pub fn updateAccumulators( } } -fn evaluateV4FromAccumulators( +inline fn materialCountOutputBucket(b: *Board, output_bucket_count: u8) usize { + const piece_count = @popCount(b.board.occupied()); + const non_king_count = if (piece_count >= 2) piece_count - 2 else 0; + const divisor = (32 + @as(usize, output_bucket_count) - 1) / @as(usize, output_bucket_count); + const bucket = non_king_count / divisor; + return @min(bucket, @as(usize, output_bucket_count) - 1); +} + +fn evaluateV5FromAccumulators( net: *const Network, - head: *const Network.V4Head, + head: *const Network.V5Head, acc: *const AccumulatorPair, + b: *Board, stm_is_white: bool, ) i32 { const hidden_size: usize = @intCast(net.ft_hidden_size); @@ -1055,17 +1083,20 @@ fn evaluateV4FromAccumulators( const scale: i32 = head.scale; const use_screlu = head.activation_type == 1; const final_den: i64 = @as(i64, q0) * @as(i64, q); + const output_bucket = materialCountOutputBucket(b, head.output_bucket_count); + const weights_base = output_bucket * 2 * hidden_size; + const weights = head.output_weights[weights_base .. 
weights_base + 2 * hidden_size]; const us_acc = if (stm_is_white) acc.white[0..hidden_size] else acc.black[0..hidden_size]; const them_acc = if (stm_is_white) acc.black[0..hidden_size] else acc.white[0..hidden_size]; - var sum = activatedDot(us_acc, head.output_weights[0..hidden_size], hidden_size, head.activation_type, q0) + - activatedDot(them_acc, head.output_weights[hidden_size .. 2 * hidden_size], hidden_size, head.activation_type, q0); + var sum = activatedDot(us_acc, weights[0..hidden_size], hidden_size, head.activation_type, q0) + + activatedDot(them_acc, weights[hidden_size .. 2 * hidden_size], hidden_size, head.activation_type, q0); if (use_screlu) { sum = divRoundNearestSigned(sum, q0); } - sum += head.output_bias; + sum += head.output_biases[output_bucket]; return @intCast(divRoundNearestSigned(sum * scale, final_den)); } @@ -1100,7 +1131,7 @@ pub fn evaluateFromAccumulators( const stm_is_white = b.board.move == .white; return switch (net.head) { .v3 => |*head| evaluateV3FromAccumulators(net, head, acc, stm_is_white), - .v4 => |*head| evaluateV4FromAccumulators(net, head, acc, stm_is_white), + .v5 => |*head| evaluateV5FromAccumulators(net, head, acc, b, stm_is_white), }; } diff --git a/utils/nnue/bullet/checkpoint_raw_to_npz.py b/utils/nnue/bullet/checkpoint_raw_to_npz.py index 8aaca7a..5009583 100644 --- a/utils/nnue/bullet/checkpoint_raw_to_npz.py +++ b/utils/nnue/bullet/checkpoint_raw_to_npz.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Convert a SYKNNUE4 Bullet checkpoint raw.bin into explicit NPZ tensors.""" +"""Convert a Sykora Bullet checkpoint raw.bin into explicit NPZ tensors.""" from __future__ import annotations @@ -14,8 +14,8 @@ from common import ( # noqa: E402 SCALE, - V4_Q0, - V4_Q, + NNUE_Q0, + NNUE_Q, SYKORA16_BUCKET_LAYOUT_32, expand_mirrored_bucket_layout, ) @@ -23,7 +23,7 @@ def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Convert a SYKNNUE4 Bullet raw checkpoint into NPZ tensors." 
+ description="Convert a SYKNNUE5 Bullet raw checkpoint into NPZ tensors." ) parser.add_argument( "--input", @@ -76,25 +76,29 @@ def take_f32(buf, offset: int, count: int): def expected_raw_sizes( - *, bucket_count: int, ft_hidden: int + *, bucket_count: int, ft_hidden: int, network_format: str, output_bucket_count: int ) -> dict[str, int]: input_size = 768 * bucket_count + if network_format != "syk5": + raise ValueError(f"unsupported network format: {network_format}") return { - "spec_merged_ft": ( + "syk5_output_buckets": ( input_size * ft_hidden + ft_hidden - + (2 * ft_hidden) - + 1 + + (output_bucket_count * 2 * ft_hidden) + + output_bucket_count ), } def detect_layout( - *, raw_len: int, bucket_count: int, ft_hidden: int + *, raw_len: int, bucket_count: int, ft_hidden: int, network_format: str, output_bucket_count: int ) -> str: sizes = expected_raw_sizes( bucket_count=bucket_count, ft_hidden=ft_hidden, + network_format=network_format, + output_bucket_count=output_bucket_count, ) for name, expected in sizes.items(): if raw_len == expected: @@ -109,10 +113,10 @@ def parse_network_config(run_meta: dict) -> dict: network = dict(run_meta.get("network", {})) env = run_meta.get("env", {}) - network_format = network.get("format") or env.get("SYK_NETWORK_FORMAT") or "syk4" - if network_format != "syk4": + network_format = network.get("format") or env.get("SYK_NETWORK_FORMAT") or "syk5" + if network_format != "syk5": raise ValueError( - f"run_meta.json does not describe a SYKNNUE4 run: {network_format!r}" + f"run_meta.json does not describe a SYKNNUE5 run: {network_format!r}" ) if "bucket_layout_64" in network: @@ -130,6 +134,11 @@ def parse_network_config(run_meta: dict) -> dict: "format": network_format, "bucket_layout_64": bucket_layout_64, "ft_hidden": int(network.get("ft_hidden") or env["SYK_HIDDEN"]), + "output_bucket_count": int( + network.get("output_bucket_count") + or env.get("SYK_OUTPUT_BUCKETS") + or 8 + ), } @@ -148,6 +157,8 @@ def main() -> int: 
bucket_layout_64 = [int(v) for v in network["bucket_layout_64"]] bucket_count = max(bucket_layout_64) + 1 ft_hidden = int(network["ft_hidden"]) + network_format = str(network["format"]) + output_bucket_count = int(network["output_bucket_count"]) input_size = 768 * bucket_count raw = np.fromfile(raw_path, dtype=" int: raw_len=raw.shape[0], bucket_count=bucket_count, ft_hidden=ft_hidden, + network_format=network_format, + output_bucket_count=output_bucket_count, ) offset = 0 l0w, offset = take_f32(raw, offset, input_size * ft_hidden) l0b, offset = take_f32(raw, offset, ft_hidden) - outw, offset = take_f32(raw, offset, 2 * ft_hidden) - outb, offset = take_f32(raw, offset, 1) + outw_len = output_bucket_count * 2 * ft_hidden + outb_len = output_bucket_count + outw, offset = take_f32(raw, offset, outw_len) + outb, offset = take_f32(raw, offset, outb_len) if offset != raw.shape[0]: raise ValueError( @@ -170,8 +185,8 @@ def main() -> int: ft_weights = l0w.reshape(input_size, ft_hidden) ft_bias = l0b.reshape(ft_hidden) - out_weights = outw.reshape(2 * ft_hidden) - out_bias = outb.reshape(1) + out_weights = outw.reshape(output_bucket_count, 2 * ft_hidden) + out_bias = outb.reshape(output_bucket_count) out_path = Path(args.output) out_path.parent.mkdir(parents=True, exist_ok=True) @@ -184,19 +199,21 @@ def main() -> int: bucket_layout_64=np.asarray(bucket_layout_64, dtype=np.uint8), feature_set=np.asarray([1], dtype=np.uint8), input_bucket_count=np.asarray([bucket_count], dtype=np.uint8), + output_bucket_count=np.asarray([output_bucket_count], dtype=np.uint8), activation_type=np.asarray([1], dtype=np.uint8), - q0=np.asarray([V4_Q0], dtype=np.uint16), - q=np.asarray([V4_Q], dtype=np.uint16), + q0=np.asarray([NNUE_Q0], dtype=np.uint16), + q=np.asarray([NNUE_Q], dtype=np.uint16), scale=np.asarray([SCALE], dtype=np.uint16), ) print(f"Input: {raw_path}") print(f"Run metadata: {run_meta_path}") - print("Network format: SYKNNUE4") + print(f"Network format: {network_format.upper()}") 
print(f"Detected raw layout: {layout}") print(f"Bucket count: {bucket_count}") print(f"FT hidden: {ft_hidden}") - print(f"Dense head: linear {2 * ft_hidden} -> 1") + print(f"Output buckets: {output_bucket_count}") + print(f"Dense head: bucketed linear {2 * ft_hidden} -> 1") print(f"Wrote: {out_path}") return 0 diff --git a/utils/nnue/bullet/export_npz_to_syk4.py b/utils/nnue/bullet/export_npz_to_syk5.py similarity index 76% rename from utils/nnue/bullet/export_npz_to_syk4.py rename to utils/nnue/bullet/export_npz_to_syk5.py index b5465d4..841fe32 100644 --- a/utils/nnue/bullet/export_npz_to_syk4.py +++ b/utils/nnue/bullet/export_npz_to_syk5.py @@ -1,5 +1,5 @@ #!/usr/bin/env python3 -"""Convert a float-domain NPZ checkpoint into Sykora SYKNNUE4 format.""" +"""Convert a float-domain NPZ checkpoint into Sykora SYKNNUE5 format.""" from __future__ import annotations @@ -16,18 +16,18 @@ ACTIVATION_SCRELU, FEATURE_SET_KING_BUCKETS_MIRRORED, SCALE, - V4_Q, - V4_Q0, + NNUE_Q, + NNUE_Q0, SYKORA16_BUCKET_LAYOUT_32, expand_mirrored_bucket_layout, input_size_for_feature_set, - write_syk_nnue_v4, + write_syk_nnue_v5, ) def parse_args() -> argparse.Namespace: parser = argparse.ArgumentParser( - description="Export NPZ checkpoint to SYKNNUE4 net." + description="Export NPZ checkpoint to SYKNNUE5 net." 
) parser.add_argument("--input", required=True, help="Input .npz checkpoint") parser.add_argument("--output-net", required=True, help="Output .sknnue path") @@ -73,7 +73,7 @@ def main() -> int: with np.load(in_path) as ckpt: ft_weights = np.asarray(expect_array(ckpt, "ft_weights"), dtype=np.float32) ft_bias = np.asarray(expect_array(ckpt, "ft_bias"), dtype=np.float32).reshape(-1) - out_weights = np.asarray(expect_array(ckpt, "out_weights"), dtype=np.float32).reshape(-1) + out_weights = np.asarray(expect_array(ckpt, "out_weights"), dtype=np.float32) out_bias = np.asarray(expect_array(ckpt, "out_bias"), dtype=np.float32).reshape(-1) if "bucket_layout_64" in ckpt: @@ -92,8 +92,8 @@ def main() -> int: else: activation_type = ACTIVATION_SCRELU - q0 = int(np.asarray(ckpt["q0"]).reshape(-1)[0]) if "q0" in ckpt else V4_Q0 - q = int(np.asarray(ckpt["q"]).reshape(-1)[0]) if "q" in ckpt else V4_Q + q0 = int(np.asarray(ckpt["q0"]).reshape(-1)[0]) if "q0" in ckpt else NNUE_Q0 + q = int(np.asarray(ckpt["q"]).reshape(-1)[0]) if "q" in ckpt else NNUE_Q scale = int(np.asarray(ckpt["scale"]).reshape(-1)[0]) if "scale" in ckpt else SCALE if args.q0 is not None: @@ -104,7 +104,7 @@ def main() -> int: scale = args.scale if feature_set != FEATURE_SET_KING_BUCKETS_MIRRORED: - raise ValueError("SYKNNUE4 only supports king_buckets_mirrored inputs") + raise ValueError("SYKNNUE5 only supports king_buckets_mirrored inputs") if len(bucket_layout) != 64: raise ValueError(f"bucket_layout_64 must have 64 entries, got {len(bucket_layout)}") @@ -120,22 +120,35 @@ def main() -> int: raise ValueError( f"ft_bias length mismatch: expected {ft_hidden_size}, got {ft_bias.shape[0]}" ) - if out_weights.shape[0] != 2 * ft_hidden_size: + + out_weights = np.asarray(out_weights, dtype=np.float32) + if out_weights.ndim == 1: + if out_bias.shape[0] <= 0: + raise ValueError("out_bias must contain at least one output bucket") + output_bucket_count = out_bias.shape[0] + out_weights = 
out_weights.reshape(output_bucket_count, 2 * ft_hidden_size) + elif out_weights.ndim == 2: + output_bucket_count = out_weights.shape[0] + else: + raise ValueError(f"out_weights must be rank-1 or rank-2, got shape {out_weights.shape}") + + if output_bucket_count <= 1: + raise ValueError("SYKNNUE5 requires more than one output bucket") + if out_weights.shape != (output_bucket_count, 2 * ft_hidden_size): raise ValueError( - f"out_weights shape mismatch: expected {(2 * ft_hidden_size,)}, got {out_weights.shape}" + f"out_weights shape mismatch: expected {(output_bucket_count, 2 * ft_hidden_size)}, got {out_weights.shape}" ) - if out_bias.shape[0] != 1: + if out_bias.shape[0] != output_bucket_count: raise ValueError( - f"out_bias length mismatch: expected 1, got {out_bias.shape[0]}" + f"out_bias length mismatch: expected {output_bucket_count}, got {out_bias.shape[0]}" ) ft_bias_i16 = quantize_clipped(ft_bias, q0, -32768, 32767, np.int16) ft_weights_i16 = quantize_clipped( ft_weights.reshape(-1), q0, -32768, 32767, np.int16 ) - out_bias_i32 = quantize_clipped( - out_bias, + out_bias.reshape(-1), q0 * q, -2147483648, 2147483647, @@ -146,26 +159,28 @@ def main() -> int: ) out_path = Path(args.output_net) - write_syk_nnue_v4( + write_syk_nnue_v5( out_path, ft_hidden_size=ft_hidden_size, ft_biases_i16=ft_bias_i16.tolist(), ft_weights_i16=ft_weights_i16.tolist(), - out_bias_i32=int(out_bias_i32[0]), + out_biases_i32=out_bias_i32.tolist(), out_weights_i16=out_weights_i16.tolist(), activation_type=activation_type, feature_set=feature_set, bucket_layout_64=bucket_layout, + output_bucket_count=output_bucket_count, q0=q0, q=q, scale=scale, ) print(f"Input: {in_path}") - print("Output format: SYKNNUE4") - print(f"Bucket count: {max(bucket_layout) + 1}") + print("Output format: SYKNNUE5") + print(f"Input bucket count: {max(bucket_layout) + 1}") + print(f"Output bucket count: {output_bucket_count}") print(f"FT hidden: {ft_hidden_size}") - print(f"Dense head: linear {2 * 
ft_hidden_size} -> 1") + print(f"Dense head: bucketed linear {2 * ft_hidden_size} -> 1") print(f"Wrote: {out_path}") return 0 diff --git a/utils/nnue/bullet/gate_checkpoints.py b/utils/nnue/bullet/gate_checkpoints.py index bc46f99..57e7631 100755 --- a/utils/nnue/bullet/gate_checkpoints.py +++ b/utils/nnue/bullet/gate_checkpoints.py @@ -29,8 +29,8 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--npz-to-net", - default="utils/nnue/bullet/export_npz_to_syk4.py", - help="Path to NPZ -> SYKNNUE4 exporter", + default="utils/nnue/bullet/export_npz_to_syk5.py", + help="Path to NPZ -> SYKNNUE5 exporter", ) parser.add_argument( "--engine", default="./zig-out/bin/sykora", help="Engine under test" @@ -172,7 +172,7 @@ def main() -> int: for ckpt in ckpts: npz_out = nets_dir / f"{ckpt.name}.npz" - net_out = nets_dir / f"{ckpt.name}.sknnue4" + net_out = nets_dir / f"{ckpt.name}.sknnue" run_capture( [ sys.executable, diff --git a/utils/nnue/bullet/train_cuda_longrun.py b/utils/nnue/bullet/train_cuda_longrun.py index 802cd4a..6c4cc6a 100755 --- a/utils/nnue/bullet/train_cuda_longrun.py +++ b/utils/nnue/bullet/train_cuda_longrun.py @@ -65,8 +65,8 @@ def parse_args() -> argparse.Namespace: ) parser.add_argument( "--network-format", - choices=["syk4"], - default="syk4", + choices=["syk5"], + default="syk5", help="Training network format", ) parser.add_argument( @@ -97,6 +97,12 @@ def parse_args() -> argparse.Namespace: parser.add_argument( "--threads", type=int, default=8, help="Bullet training/data threads" ) + parser.add_argument( + "--output-buckets", + type=int, + default=8, + help="SYKNNUE5 material-count output buckets (currently fixed at 8)", + ) # Data format parser.add_argument( @@ -163,6 +169,9 @@ def main() -> int: if args.lr_start <= 0: print("--lr-start must be > 0", file=sys.stderr) return 2 + if args.output_buckets != 8: + print("SYKNNUE5 currently supports exactly 8 output buckets", file=sys.stderr) + return 2 if args.save_rate <= 0 or args.threads 
<= 0: print("--save-rate and --threads must be > 0", file=sys.stderr) return 2 @@ -191,6 +200,7 @@ def main() -> int: "SYK_WDL": str(args.wdl), "SYK_SAVE_RATE": str(args.save_rate), "SYK_THREADS": str(args.threads), + "SYK_OUTPUT_BUCKETS": str(args.output_buckets), "SYK_OUTPUT_DIR": str(ckpt_dir.resolve()), "SYK_NET_ID": run_id, "SYK_DATA_FORMAT": data_format, @@ -219,7 +229,8 @@ def main() -> int: "bucket_layout_64": bucket_layout_64(args.bucket_layout), "ft_hidden": args.hidden, "hidden_activation": "screlu", - "head": "shared_linear", + "head": "material_count_output_buckets", + "output_bucket_count": args.output_buckets, }, "env": { "SYK_DATASET": env["SYK_DATASET"], @@ -233,6 +244,7 @@ def main() -> int: "SYK_WDL": env["SYK_WDL"], "SYK_SAVE_RATE": env["SYK_SAVE_RATE"], "SYK_THREADS": env["SYK_THREADS"], + "SYK_OUTPUT_BUCKETS": env["SYK_OUTPUT_BUCKETS"], "SYK_OUTPUT_DIR": env["SYK_OUTPUT_DIR"], "SYK_NET_ID": env["SYK_NET_ID"], "SYK_DATA_FORMAT": env["SYK_DATA_FORMAT"], diff --git a/utils/nnue/bullet_runner/src/main.rs b/utils/nnue/bullet_runner/src/main.rs index 6dab11d..a71b81b 100644 --- a/utils/nnue/bullet_runner/src/main.rs +++ b/utils/nnue/bullet_runner/src/main.rs @@ -1,18 +1,16 @@ use bullet_lib::{ game::{ formats::sfbinpack::TrainingDataEntry, - inputs::{get_num_buckets, ChessBucketsMirrored}, - }, - nn::{ - optimiser::{AdamW, AdamWParams}, - InitSettings, Shape, + inputs::{ChessBucketsMirrored, get_num_buckets}, + outputs::MaterialCount, }, + nn::optimiser::{AdamW, AdamWParams}, trainer::{ save::SavedFormat, - schedule::{lr, wdl, TrainingSchedule, TrainingSteps}, + schedule::{TrainingSchedule, TrainingSteps, lr, wdl}, settings::LocalSettings, }, - value::{loader::DirectSequentialDataLoader, ValueTrainerBuilder}, + value::{ValueTrainerBuilder, loader::DirectSequentialDataLoader}, }; use std::env; @@ -27,7 +25,7 @@ const BUCKET_LAYOUT_SYKORA16: [usize; 32] = [ 12, 12, 13, 13, 14, 14, 15, 15, ]; - +const SYK5_OUTPUT_BUCKETS: usize = 8; fn env_usize(name: 
&str, default: usize) -> usize { env::var(name) @@ -61,7 +59,7 @@ fn binpack_filter(entry: &TrainingDataEntry) -> bool { && entry.score.unsigned_abs() <= 10000 } -fn run_syk4( +fn run_syk5( bucket_layout: [usize; 32], num_input_buckets: usize, dataset_paths: &[&str], @@ -82,38 +80,25 @@ fn run_syk4( .dual_perspective() .optimiser(AdamW) .inputs(ChessBucketsMirrored::new(bucket_layout)) + .output_buckets(MaterialCount::) .use_threads(threads) .save_format(&[ - SavedFormat::id("l0w") - .transform(move |store, weights| { - let factoriser = store.get("l0f").values.repeat(num_input_buckets); - weights - .into_iter() - .zip(factoriser) - .map(|(a, b)| a + b) - .collect() - }) - .round() - .quantise::(255), + SavedFormat::id("l0w").round().quantise::(255), SavedFormat::id("l0b").round().quantise::(255), SavedFormat::id("outw").round().quantise::(64), SavedFormat::id("outb").round().quantise::(255 * 64), ]) .loss_fn(|output, target| output.sigmoid().squared_error(target)) - .build(|builder, stm_inputs, ntm_inputs| { - let l0f = builder.new_weights("l0f", Shape::new(hl_size, 768), InitSettings::Zeroed); - let expanded_factoriser = l0f.repeat(num_input_buckets); - - let mut l0 = builder.new_affine("l0", 768 * num_input_buckets, hl_size); + .build(|builder, stm_inputs, ntm_inputs, output_buckets| { + let l0 = builder.new_affine("l0", 768 * num_input_buckets, hl_size); l0.init_with_effective_input_size(32); - l0.weights = l0.weights + expanded_factoriser; - let out = builder.new_affine("out", 2 * hl_size, 1); + let out = builder.new_affine("out", 2 * hl_size, SYK5_OUTPUT_BUCKETS); let stm_hidden = l0.forward(stm_inputs).screlu(); let ntm_hidden = l0.forward(ntm_inputs).screlu(); let hidden = stm_hidden.concat(ntm_hidden); - out.forward(hidden) + out.forward(hidden).select(output_buckets) }); let stricter_clipping = AdamWParams { @@ -124,9 +109,6 @@ fn run_syk4( trainer .optimiser .set_params_for_weight("l0w", stricter_clipping); - trainer - .optimiser - 
.set_params_for_weight("l0f", stricter_clipping); let schedule = TrainingSchedule { net_id, @@ -173,11 +155,11 @@ fn run_syk4( binpack_buffer_mb, binpack_threads ); println!( - "Input layout: mirrored king buckets ({} buckets), shared head", - num_input_buckets + "Input layout: mirrored king buckets ({} buckets), material output buckets ({})", + num_input_buckets, SYK5_OUTPUT_BUCKETS ); println!("FT width: {} per perspective", hl_size); - println!("Dense head: linear {} -> 1", 2 * hl_size); + println!("Dense head: bucketed linear {} -> 1", 2 * hl_size); for p in dataset_paths { println!(" Dataset: {}", p); } @@ -195,11 +177,11 @@ fn run_syk4( _ => { println!("Using DirectSequentialDataLoader (bullet format)"); println!( - "Input layout: mirrored king buckets ({} buckets), shared head", - num_input_buckets + "Input layout: mirrored king buckets ({} buckets), material output buckets ({})", + num_input_buckets, SYK5_OUTPUT_BUCKETS ); println!("FT width: {} per perspective", hl_size); - println!("Dense head: linear {} -> 1", 2 * hl_size); + println!("Dense head: bucketed linear {} -> 1", 2 * hl_size); for p in dataset_paths { println!(" Dataset: {}", p); } @@ -223,8 +205,8 @@ fn main() { let net_id = env_string("SYK_NET_ID", "sykora_bucketed"); let resume_from = env::var("SYK_RESUME").ok(); let data_format = env_string("SYK_DATA_FORMAT", "bullet"); - let network_format = env_string("SYK_NETWORK_FORMAT", "syk4"); - let hl_size = env_usize("SYK_HIDDEN", 768); + let network_format = env_string("SYK_NETWORK_FORMAT", "syk5"); + let hl_size = env_usize("SYK_HIDDEN", 512); let bucket_layout_name = env_string("SYK_BUCKET_LAYOUT", "sykora16"); let bucket_layout = selected_bucket_layout(&bucket_layout_name); @@ -235,11 +217,11 @@ fn main() { println!("Network format: {}", network_format); println!("Bucket layout: {}", bucket_layout_name); - if network_format != "syk4" { + if network_format != "syk5" { panic!("unsupported network format: {network_format}"); } - run_syk4( + 
run_syk5( bucket_layout, num_input_buckets, &dataset_paths, diff --git a/utils/nnue/common.py b/utils/nnue/common.py index 83bb703..89eb6f6 100644 --- a/utils/nnue/common.py +++ b/utils/nnue/common.py @@ -11,15 +11,11 @@ LEGACY_INPUT_SIZE = 768 -QA = 255 -QB = 64 -V4_Q0 = 255 -V4_Q = 64 +NNUE_Q0 = 255 +NNUE_Q = 64 SCALE = 400 -MAGIC_V3 = b"SYKNNUE3" -FORMAT_VERSION_V3 = 3 -MAGIC_V4 = b"SYKNNUE4" -FORMAT_VERSION_V4 = 4 +MAGIC_V5 = b"SYKNNUE5" +FORMAT_VERSION_V5 = 5 FEATURE_SET_LEGACY = 0 FEATURE_SET_KING_BUCKETS_MIRRORED = 1 @@ -27,17 +23,6 @@ ACTIVATION_RELU = 0 ACTIVATION_SCRELU = 1 -SYKORA_BUCKET_LAYOUT_32 = [ - 0, 1, 2, 3, - 4, 4, 5, 5, - 6, 6, 6, 6, - 7, 7, 7, 7, - 8, 8, 8, 8, - 8, 8, 8, 8, - 9, 9, 9, 9, - 9, 9, 9, 9, -] - SYKORA16_BUCKET_LAYOUT_32 = [ 0, 0, 1, 1, 2, 2, 3, 3, @@ -170,91 +155,30 @@ def _pack_i32(values: Iterable[int]) -> bytes: return b"".join(struct.pack(" bytes: - return b"".join(struct.pack(" None: - if hidden_size <= 0: - raise ValueError("hidden_size must be > 0") - if len(input_biases_i16) != hidden_size: - raise ValueError("input_biases length mismatch") - input_size = input_size_for_feature_set(feature_set, bucket_layout_64) - if len(input_weights_i16) != input_size * hidden_size: - raise ValueError("input_weights length mismatch") - if len(output_weights_i16) != 2 * hidden_size: - raise ValueError("output_weights length mismatch") - - if feature_set == FEATURE_SET_LEGACY: - bucket_layout_64 = [0] * 64 - elif bucket_layout_64 is None or len(bucket_layout_64) != 64: - raise ValueError("bucket_layout_64 must contain exactly 64 entries") - - bucket_count = num_buckets(bucket_layout_64) - - path.parent.mkdir(parents=True, exist_ok=True) - with path.open("wb") as handle: - handle.write(MAGIC_V3) - handle.write(struct.pack(" int: - input_size = input_size_for_feature_set(feature_set, bucket_layout_64) - h = ft_hidden_size - return ( - 2 * h - + 2 * input_size * h - + 4 - + 4 * h - ) - - -def write_syk_nnue_v4( +def write_syk_nnue_v5( path: 
Path, *, ft_hidden_size: int, ft_biases_i16: List[int], ft_weights_i16: List[int], - out_bias_i32: int, + out_biases_i32: List[int], out_weights_i16: List[int], activation_type: int = ACTIVATION_SCRELU, feature_set: int = FEATURE_SET_KING_BUCKETS_MIRRORED, bucket_layout_64: List[int] | None = None, - q0: int = V4_Q0, - q: int = V4_Q, + output_bucket_count: int = 8, + q0: int = NNUE_Q0, + q: int = NNUE_Q, scale: int = SCALE, ) -> None: if feature_set != FEATURE_SET_KING_BUCKETS_MIRRORED: - raise ValueError("SYKNNUE4 currently requires king_buckets_mirrored inputs") + raise ValueError("SYKNNUE5 currently requires king_buckets_mirrored inputs") if bucket_layout_64 is None or len(bucket_layout_64) != 64: raise ValueError("bucket_layout_64 must contain exactly 64 entries") if ft_hidden_size <= 0: raise ValueError("ft_hidden_size must be > 0") + if output_bucket_count <= 0 or output_bucket_count > 255: + raise ValueError("output_bucket_count must be in 1..255") if activation_type not in (ACTIVATION_RELU, ACTIVATION_SCRELU): raise ValueError("unsupported activation_type") @@ -266,22 +190,25 @@ def write_syk_nnue_v4( raise ValueError("ft_biases length mismatch") if len(ft_weights_i16) != input_size * h: raise ValueError("ft_weights length mismatch") - if len(out_weights_i16) != 2 * h: + if len(out_biases_i32) != output_bucket_count: + raise ValueError("out_biases length mismatch") + if len(out_weights_i16) != output_bucket_count * 2 * h: raise ValueError("out_weights length mismatch") path.parent.mkdir(parents=True, exist_ok=True) with path.open("wb") as handle: - handle.write(MAGIC_V4) - handle.write(struct.pack("