From e79154effbf157312db5c5e68fcd61612a2e50c1 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Sun, 24 May 2026 19:18:25 +0200 Subject: [PATCH 1/7] perf: parallelize GKR phase2 sumcheck + incremental eq-table fold MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The GKR phase2 pair_coeffs loop was completely sequential — processing up to 4M pairs on a single thread for the initial layer. This change: 1. Replaces the sequential for-loop with par_iter + parallel reduction (gated by PARALLEL_THRESHOLD for small arrays) 2. Replaces per-round eval_eq recomputation with incremental pairwise addition fold, saving O(2^k) ext-field muls per round Measured: -14.1% e2e (phase2 handles ~70% of GKR layer proving). Origin: blake3-autoresearch h19 (fc7cd33c), independent of blake3. Co-Authored-By: Claude Opus 4.6 (1M context) --- .../src/quotient_gkr/sumcheck_utils.rs | 102 +++++++++++++----- 1 file changed, 75 insertions(+), 27 deletions(-) diff --git a/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs b/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs index 8f45a3494..865f9b14e 100644 --- a/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs +++ b/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs @@ -291,43 +291,77 @@ pub(super) fn run_phase2_sumcheck>>( mut sum: EF, mut mmf: EF, ) -> (Vec, [EF; 4]) { + let eq_prefix_init = &remaining_eq[..remaining_eq.len().saturating_sub(1)]; + let mut eq_table = eval_eq(eq_prefix_init); + for _round in 0..remaining_eq.len() { let eq_alpha = *remaining_eq.last().unwrap(); - let eq_prefix = &remaining_eq[..remaining_eq.len() - 1]; - let eq_table = eval_eq(eq_prefix); let active_l = num_l.len(); let active_r = num_r.len(); let active_pairs = active_l.div_ceil(2); let fully_active = active_r / 2; - let pair = |arr: &[EF], idx: usize, pad: EF| { - ( - arr.get(idx).copied().unwrap_or(pad), - arr.get(idx + 1).copied().unwrap_or(pad), - ) + let acc = if active_pairs >= PARALLEL_THRESHOLD 
{ + let fa = fully_active; + (0..active_pairs) + .into_par_iter() + .fold(RoundCoeffs::zero, |mut acc, j| { + let coeffs = if j < fa { + pair_coeffs::( + (num_l[2 * j], num_l[2 * j + 1]), + (num_r[2 * j], num_r[2 * j + 1]), + (den_l[2 * j], den_l[2 * j + 1]), + (den_r[2 * j], den_r[2 * j + 1]), + ) + } else { + let get_pair = |arr: &[EF], idx: usize, pad: EF| { + ( + arr.get(idx).copied().unwrap_or(pad), + arr.get(idx + 1).copied().unwrap_or(pad), + ) + }; + pair_coeffs::( + get_pair(&num_l, 2 * j, EF::ZERO), + get_pair(&num_r, 2 * j, EF::ZERO), + get_pair(&den_l, 2 * j, EF::ONE), + get_pair(&den_r, 2 * j, EF::ONE), + ) + }; + acc += coeffs * eq_table[j]; + acc + }) + .reduce(RoundCoeffs::zero, Add::add) + } else { + let mut acc = RoundCoeffs::::zero(); + for j in 0..active_pairs { + let coeffs = if j < fully_active { + pair_coeffs::( + (num_l[2 * j], num_l[2 * j + 1]), + (num_r[2 * j], num_r[2 * j + 1]), + (den_l[2 * j], den_l[2 * j + 1]), + (den_r[2 * j], den_r[2 * j + 1]), + ) + } else { + let get_pair = |arr: &[EF], idx: usize, pad: EF| { + ( + arr.get(idx).copied().unwrap_or(pad), + arr.get(idx + 1).copied().unwrap_or(pad), + ) + }; + pair_coeffs::( + get_pair(&num_l, 2 * j, EF::ZERO), + get_pair(&num_r, 2 * j, EF::ZERO), + get_pair(&den_l, 2 * j, EF::ONE), + get_pair(&den_r, 2 * j, EF::ONE), + ) + }; + acc += coeffs * eq_table[j]; + } + acc }; - let mut acc = RoundCoeffs::::zero(); - for j in 0..active_pairs { - let coeffs = if j < fully_active { - pair_coeffs::( - (num_l[2 * j], num_l[2 * j + 1]), - (num_r[2 * j], num_r[2 * j + 1]), - (den_l[2 * j], den_l[2 * j + 1]), - (den_r[2 * j], den_r[2 * j + 1]), - ) - } else { - pair_coeffs::( - pair(&num_l, 2 * j, EF::ZERO), - pair(&num_r, 2 * j, EF::ZERO), - pair(&den_l, 2 * j, EF::ONE), - pair(&den_r, 2 * j, EF::ONE), - ) - }; - acc += coeffs * eq_table[j]; - } - + let eq_prefix = &remaining_eq[..remaining_eq.len() - 1]; let padding_sum = alpha * mle_of_zeros_then_ones(active_pairs, eq_prefix); let bare = 
build_bare_from_coeffs( @@ -349,6 +383,20 @@ pub(super) fn run_phase2_sumcheck>>( den_l = fold_normal_with_padding(&den_l, r, EF::ONE); den_r = fold_normal_with_padding(&den_r, r, EF::ONE); + let new_eq_len = eq_table.len() / 2; + if new_eq_len > 0 { + let mut new_eq = unsafe { uninitialized_vec(new_eq_len) }; + let fold_eq = |(i, slot): (usize, &mut EF)| { + *slot = eq_table[2 * i] + eq_table[2 * i + 1]; + }; + if new_eq_len >= PARALLEL_THRESHOLD { + new_eq.par_iter_mut().enumerate().for_each(fold_eq); + } else { + new_eq.iter_mut().enumerate().for_each(fold_eq); + } + eq_table = new_eq; + } + q_natural.push(r); remaining_eq.pop(); } From 98eb62e91ea2212b310dae5d6349b9fff7cbc5d8 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Sun, 24 May 2026 19:18:40 +0200 Subject: [PATCH 2/7] perf: sumcheck optimizations (fused dual-eq, packed SIMD, zero_vec skip) - Fused dual-point eq computation: process both full-domain eq polynomials in single recursive pass, eliminates 1.28GB DRAM round-trip (h42b) - Packed SIMD first-round product sumcheck (h68) - combine_statement zero_vec skip: uninitialized buffer + STORE path when first OOD statement covers full array (h37) Measured: -2.54% (h42b), -1.18% (h68), combined ~-3.7%. Origin: pw5-clean (3e951179), independent of blake3. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- .../koala-bear/src/quintic_extension/mod.rs | 2 + crates/backend/poly/src/eq_mle.rs | 138 +++++++++++++++++- .../sumcheck/src/product_computation.rs | 6 +- crates/whir/src/open.rs | 47 +++++- 4 files changed, 185 insertions(+), 8 deletions(-) diff --git a/crates/backend/koala-bear/src/quintic_extension/mod.rs b/crates/backend/koala-bear/src/quintic_extension/mod.rs index 6ccdca4f7..11a47cbf5 100644 --- a/crates/backend/koala-bear/src/quintic_extension/mod.rs +++ b/crates/backend/koala-bear/src/quintic_extension/mod.rs @@ -11,6 +11,8 @@ use crate::{KoalaBear, KoalaBearParameters}; pub mod extension; pub(crate) mod packed_extension; pub(crate) mod packing; +#[cfg(test)] +mod tests; pub type QuinticExtensionFieldKB = QuinticExtensionField; pub type PackedQuinticExtensionFieldKB = PackedQuinticExtensionField::Packing>; diff --git a/crates/backend/poly/src/eq_mle.rs b/crates/backend/poly/src/eq_mle.rs index 978330001..b6cbe6ae5 100644 --- a/crates/backend/poly/src/eq_mle.rs +++ b/crates/backend/poly/src/eq_mle.rs @@ -881,6 +881,109 @@ fn eval_eq_with_packed_output, const INITIALIZED } } +#[inline] +fn eval_eq_with_packed_output_dual>( + eval_a: &[EF], + eval_b: &[EF], + out: &mut [EF::ExtensionPacking], + scalar_a: EF::ExtensionPacking, + scalar_b: EF::ExtensionPacking, +) { + debug_assert_eq!(eval_a.len(), eval_b.len()); + debug_assert_eq!(out.len(), 1 << eval_a.len()); + + match eval_a.len() { + 0 => { + out[0] = scalar_a + scalar_b; + } + 1 => { + let [a0, a1] = eval_eq_1(eval_a, scalar_a); + let [b0, b1] = eval_eq_1(eval_b, scalar_b); + out[0] = a0 + b0; + out[1] = a1 + b1; + } + 2 => { + let eq_a = eval_eq_2(eval_a, scalar_a); + let eq_b = eval_eq_2(eval_b, scalar_b); + for i in 0..4 { + out[i] = eq_a[i] + eq_b[i]; + } + } + 3 => { + let eq_a = eval_eq_3(eval_a, scalar_a); + let eq_b = eval_eq_3(eval_b, scalar_b); + for i in 0..8 { + out[i] = eq_a[i] + eq_b[i]; + } + } + _ => { + let (low, high) = 
out.split_at_mut(out.len() / 2); + let sa1 = scalar_a * eval_a[0]; + let sa0 = scalar_a - sa1; + let sb1 = scalar_b * eval_b[0]; + let sb0 = scalar_b - sb1; + eval_eq_with_packed_output_dual::( + &eval_a[1..], &eval_b[1..], low, sa0, sb0, + ); + eval_eq_with_packed_output_dual::( + &eval_a[1..], &eval_b[1..], high, sa1, sb1, + ); + } + } +} + +pub fn compute_eval_eq_packed_dual( + eval_a: &[EF], + eval_b: &[EF], + out: &mut [EF::ExtensionPacking], + scalar_a: EF, + scalar_b: EF, +) where + EF: ExtensionField>, +{ + let packing_width = packing_width::(); + let log_packing_width = log2_strict_usize(packing_width); + + assert_eq!(eval_a.len(), eval_b.len()); + assert!(log_packing_width <= eval_a.len()); + assert_eq!(out.len(), 1 << (eval_a.len() - log_packing_width)); + + if eval_a.len() <= log_packing_width + 1 + LOG_NUM_THREADS { + let mut output_no_packing = EF::zero_vec(1 << eval_a.len()); + eval_eq_basic::<_, _, _, false>(eval_a, &mut output_no_packing, scalar_a); + eval_eq_basic::<_, _, _, true>(eval_b, &mut output_no_packing, scalar_b); + out.par_iter_mut() + .zip(output_no_packing.par_chunks_exact(packing_width)) + .for_each(|(out_elem, chunk)| { + *out_elem = EF::ExtensionPacking::from_ext_slice(chunk); + }); + } else { + let eval_len_min_packing = eval_a.len() - log_packing_width; + + let mut parallel_buffer_a = EF::ExtensionPacking::zero_vec(NUM_THREADS_PADDED); + let mut parallel_buffer_b = EF::ExtensionPacking::zero_vec(NUM_THREADS_PADDED); + let out_chunk_size = out.len() / NUM_THREADS_PADDED; + + parallel_buffer_a[0] = packed_eq_poly(&eval_a[eval_len_min_packing..], scalar_a); + fill_buffer(eval_a[..LOG_NUM_THREADS].iter().rev(), &mut parallel_buffer_a); + + parallel_buffer_b[0] = packed_eq_poly(&eval_b[eval_len_min_packing..], scalar_b); + fill_buffer(eval_b[..LOG_NUM_THREADS].iter().rev(), &mut parallel_buffer_b); + + out.par_chunks_exact_mut(out_chunk_size) + .enumerate() + .for_each(|(i, out_chunk)| { + eval_eq_with_packed_output_dual::, EF>( + 
&eval_a[LOG_NUM_THREADS..eval_len_min_packing], + &eval_b[LOG_NUM_THREADS..eval_len_min_packing], + out_chunk, + parallel_buffer_a[i], + parallel_buffer_b[i], + ); + }); + } +} + /// Computes the equality polynomial evaluations via a simple recursive algorithm. /// /// Unlike [`eval_eq_basic`], this function makes heavy use of packed values to speed up computations. @@ -968,10 +1071,19 @@ fn base_eval_eq_packed_with_packed_output( F: Field, EF: ExtensionField, { + // Ensure that the output buffer size is correct: + // It should be of size `2^n`, where `n` is the number of variables. + let width = F::Packing::WIDTH; + let log_packing_width = log2_strict_usize(width); debug_assert_eq!(out.len(), 1 << eval_points.len()); + debug_assert!(log_packing_width <= eval_points.len()); match eval_points.len() { - 0 => unreachable!(), + 0 => { + debug_assert_eq!(F::Packing::WIDTH, 1); + let base_vals = F::Packing::pack_slice(eq_evals.as_slice()); + scale_and_add_pf::(out, base_vals, packed_scalar); + } 1 => { let eq_evaluations = eval_eq_1(eval_points, eq_evals); scale_and_add_pf::(out, eq_evaluations.as_slice(), packed_scalar); @@ -1248,4 +1360,28 @@ mod tests { } } } + + #[test] + fn test_compute_eval_eq_packed_dual() { + let packing_width = ::Packing::WIDTH; + let log_packing_width = log2_strict_usize(packing_width); + let mut rng = StdRng::seed_from_u64(42); + + for n_vars in log_packing_width..22 { + let eval_a: Vec = (0..n_vars).map(|_| rng.random()).collect(); + let eval_b: Vec = (0..n_vars).map(|_| rng.random()).collect(); + let scalar_a: EF = rng.random(); + let scalar_b: EF = rng.random(); + + let packed_len = 1 << (n_vars - log_packing_width); + let mut out_dual = EFPacking::::zero_vec(packed_len); + compute_eval_eq_packed_dual::(&eval_a, &eval_b, &mut out_dual, scalar_a, scalar_b); + + let mut out_separate = EFPacking::::zero_vec(packed_len); + compute_eval_eq_packed::(&eval_a, &mut out_separate, scalar_a); + compute_eval_eq_packed::(&eval_b, &mut out_separate, 
scalar_b); + + assert_eq!(out_dual, out_separate, "Mismatch at n_vars={}", n_vars); + } + } } diff --git a/crates/backend/sumcheck/src/product_computation.rs b/crates/backend/sumcheck/src/product_computation.rs index ecce379fb..2828af039 100644 --- a/crates/backend/sumcheck/src/product_computation.rs +++ b/crates/backend/sumcheck/src/product_computation.rs @@ -45,11 +45,7 @@ pub fn run_product_sumcheck>>( assert!(n_rounds >= 1); let first_sumcheck_poly = match (pol_a, pol_b) { (MleRef::BasePacked(evals), MleRef::ExtensionPacked(weights)) => { - if EF::DIMENSION == 5 { - compute_product_sumcheck_polynomial_base_ext_packed::<5, _, _, _, EF>(evals, weights, sum) - } else { - unimplemented!() - } + compute_product_sumcheck_polynomial(evals, weights, sum, |e| EFPacking::::to_ext_iter([e]).collect()) } (MleRef::ExtensionPacked(evals), MleRef::ExtensionPacked(weights)) => { compute_product_sumcheck_polynomial(evals, weights, sum, |e| EFPacking::::to_ext_iter([e]).collect()) diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index f9634c918..0919b2cff 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -522,12 +522,55 @@ where let num_variables = statements[0].total_num_variables; assert!(statements.iter().all(|e| e.total_num_variables == num_variables)); - let mut combined_weights = EFPacking::::zero_vec(1 << (num_variables - packing_log_width::())); + let out_len = 1 << (num_variables - packing_log_width::()); + let first = &statements[0]; + let first_is_full_initializer = !first.is_next + && first.values.len() == 1 + && first.values[0].selector == 0 + && first.inner_num_variables() == num_variables; + + let mut combined_weights: Vec>; let mut combined_sum = EF::ZERO; let mut gamma_pow = EF::ONE; + let start_idx; + + if first_is_full_initializer { + combined_weights = unsafe { uninitialized_vec(out_len) }; + let first_scalar = gamma_pow; + combined_sum += first.values[0].value * gamma_pow; + gamma_pow *= gamma; + + let second = 
statements.get(1); + let second_is_full_domain = second.is_some_and(|s| { + !s.is_next + && s.values.len() == 1 + && s.values[0].selector == 0 + && s.inner_num_variables() == num_variables + }); + + if second_is_full_domain { + let second = &statements[1]; + compute_eval_eq_packed_dual::( + &first.point.0, + &second.point.0, + &mut combined_weights, + first_scalar, + gamma_pow, + ); + combined_sum += second.values[0].value * gamma_pow; + gamma_pow *= gamma; + start_idx = 2; + } else { + compute_eval_eq_packed::(&first.point.0, &mut combined_weights, first_scalar); + start_idx = 1; + } + } else { + combined_weights = EFPacking::::zero_vec(out_len); + start_idx = 0; + } - for smt in statements { + for smt in &statements[start_idx..] { if !smt.is_next && (smt.values.len() == 1 || smt.inner_num_variables() < packing_log_width::()) { for evaluation in &smt.values { compute_sparse_eval_eq_packed::(evaluation.selector, &smt.point, &mut combined_weights, gamma_pow); From 64b77b349a4620d64c6968e3f51b80fda2ce3476 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Sun, 24 May 2026 19:18:57 +0200 Subject: [PATCH 3/7] =?UTF-8?q?perf:=20WHIR=20initial=20folding=20factor?= =?UTF-8?q?=207=E2=86=928?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Doubles the initial WHIR folding width (2^7=128 → 2^8=256 evaluation points per fold), which halves FFT rows (2^20 → 2^19), halves Merkle tree leaves, and eliminates one subsequent WHIR round (3 → 2). Adds num_chunks=32 support to decompose_and_verify_merkle_batch in the recursion circuit verifier (256/8=32 chunks per Merkle leaf). Measured: -2.73% e2e. Origin: blake3-autoresearch h14v2 (3aab3dbc), independent of blake3. 
Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/lean_prover/src/lib.rs | 2 +- crates/rec_aggregation/zkdsl_implem/whir.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/crates/lean_prover/src/lib.rs b/crates/lean_prover/src/lib.rs index 143474bb8..58605bc47 100644 --- a/crates/lean_prover/src/lib.rs +++ b/crates/lean_prover/src/lib.rs @@ -23,7 +23,7 @@ pub const SECURITY_BITS: usize = 124; // TODO 128 bits security pub const GRINDING_BITS: usize = 16; pub const MAX_NUM_VARIABLES_TO_SEND_COEFFS: usize = 8; -pub const WHIR_INITIAL_FOLDING_FACTOR: usize = 7; +pub const WHIR_INITIAL_FOLDING_FACTOR: usize = 8; pub const WHIR_SUBSEQUENT_FOLDING_FACTOR: usize = 5; pub const RS_DOMAIN_INITIAL_REDUCTION_FACTOR: usize = 5; diff --git a/crates/rec_aggregation/zkdsl_implem/whir.py b/crates/rec_aggregation/zkdsl_implem/whir.py index 3124f2534..c2a00c1ad 100644 --- a/crates/rec_aggregation/zkdsl_implem/whir.py +++ b/crates/rec_aggregation/zkdsl_implem/whir.py @@ -269,6 +269,9 @@ def decompose_and_verify_merkle_batch_with_height( if num_chunks == 5: decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 5, circle_values, answers) return + if num_chunks == 32: + decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 32, circle_values, answers) + return print(num_chunks) assert False, "decompose_and_verify_merkle_batch called with unsupported num_chunks" From c756d82c992f4e3531210c91b14bb33ae24ffe31 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Sun, 24 May 2026 19:19:39 +0200 Subject: [PATCH 4/7] perf: fix GKR pivot computation in logup Fix min_section_log calculation to use .max() before .min(), ensuring the bytecode section doesn't unnecessarily pull down the GKR pivot. Also updates the surface assertion for WHIR folding factor 8. The MIN_LOG_N_ROWS_PER_TABLE stays at 8 (no blake3 small table to pad). 
Measured: -8.1% e2e on blake3 branch (primarily from the pivot fix enabling the ENDIANNESS_PIVOT_GKR=12 fast path). Origin: blake3-autoresearch h24 (bc3bd7e3), logup fix independent of blake3. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/lean_vm/src/core/constants.rs | 2 +- crates/sub_protocols/src/logup.rs | 6 ++---- 2 files changed, 3 insertions(+), 5 deletions(-) diff --git a/crates/lean_vm/src/core/constants.rs b/crates/lean_vm/src/core/constants.rs index 50b9371e1..800be717a 100644 --- a/crates/lean_vm/src/core/constants.rs +++ b/crates/lean_vm/src/core/constants.rs @@ -77,6 +77,6 @@ mod tests { for (table, max_log_n_rows) in MAX_LOG_N_ROWS_PER_TABLE { max_surface += (table.n_columns() as u64) << (max_log_n_rows as u64); } - assert!(max_surface <= 1 << 30); // Maximum data we can commit via WHIR using an initial folding factor of 7, and rate = 1/2 + assert!(max_surface <= 1 << 31); // Maximum data we can commit via WHIR using an initial folding factor of 8, and rate = 1/2 } } diff --git a/crates/sub_protocols/src/logup.rs b/crates/sub_protocols/src/logup.rs index 55af0a320..8bf38368f 100644 --- a/crates/sub_protocols/src/logup.rs +++ b/crates/sub_protocols/src/logup.rs @@ -56,10 +56,8 @@ pub fn prove_generic_logup( let memory_domainsep_packed = PFPacking::::from(F::from_usize(LOGUP_MEMORY_DOMAINSEP)); let bytecode_domainsep_packed = PFPacking::::from(F::from_usize(LOGUP_BYTECODE_DOMAINSEP)); - let min_section_log = log_bytecode.min(tables_log_heights_sorted.last().unwrap().1); - if min_section_log < ENDIANNESS_PIVOT_GKR { - tracing::info!("TODO: suboptimal GKR pivot (could be improved)."); - } + let log_bytecode_section = log_bytecode.max(tables_log_heights_sorted[0].1); + let min_section_log = log_bytecode_section.min(tables_log_heights_sorted.last().unwrap().1); let pivot = ENDIANNESS_PIVOT_GKR.min(min_section_log); let chunk_size = 1usize << pivot; let chunk_shift = usize::BITS as usize - pivot; From e16ce0a61320f9287678ddaa27b33ea17b313e99 Mon 
Sep 17 00:00:00 2001 From: Barnadrot Date: Sun, 24 May 2026 19:21:23 +0200 Subject: [PATCH 5/7] =?UTF-8?q?fix:=20println=E2=86=92eprintln=20in=20self?= =?UTF-8?q?-referential=20compilation=20retry?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Compilation retry messages should go to stderr, not stdout, to avoid polluting JSON benchmark output. Origin: blake3-autoresearch (89a8b612), independent of blake3. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/rec_aggregation/src/compilation.rs | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/crates/rec_aggregation/src/compilation.rs b/crates/rec_aggregation/src/compilation.rs index 5db63d36a..7a1b99e5f 100644 --- a/crates/rec_aggregation/src/compilation.rs +++ b/crates/rec_aggregation/src/compilation.rs @@ -89,7 +89,7 @@ fn compile_main_program_self_referential() -> Bytecode { if actual_log_size == log_size_guess { return bytecode; } - println!( + eprintln!( "Wrong guess at `compile_main_program_self_referential` (log_size {log_size_guess}->{actual_log_size})" ); log_size_guess = actual_log_size; From 74f9e0592ed844cb84b42b07924d3cea142c7de1 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Mon, 25 May 2026 19:40:58 +0200 Subject: [PATCH 6/7] perf: remove 8 dead Poseidon outputs_right columns (h31) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove outputs_right from Poseidon1Cols16 struct, reducing committed columns by 8. The right half of the permutation output is no longer committed or looked up. 
Changes: - Poseidon1Cols16: removed outputs_right field (-8 columns) - bus_interactions: result lookup reduced from DIGEST_LEN*2 to DIGEST_LEN - eval: removed 8 flag_permute*(state[i+8]-outputs_right[i]) constraints - n_constraints: 99 → 91 - trace_gen: removed outputs_right generation - trace override: simplified to only handle half_output for outputs_left The lookup now writes only outputs_left (8 values) to memory. For permute rows: outputs_left = state (matches permuted output in memory). For compression rows: outputs_left = state + input (matches output in memory). For half_output rows: outputs_left[4..=7] overridden with memory values. ALL 5 TESTS PASS. Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/lean_prover/src/trace_gen.rs | 22 +++++++------------ crates/lean_vm/src/tables/poseidon/mod.rs | 12 +++------- .../lean_vm/src/tables/poseidon/trace_gen.rs | 4 +--- 3 files changed, 12 insertions(+), 26 deletions(-) diff --git a/crates/lean_prover/src/trace_gen.rs b/crates/lean_prover/src/trace_gen.rs index c6d6429a5..caabf81ea 100644 --- a/crates/lean_prover/src/trace_gen.rs +++ b/crates/lean_prover/src/trace_gen.rs @@ -112,31 +112,25 @@ pub fn get_execution_trace( let poseidon_trace = traces.get_mut(&Table::poseidon16()).unwrap(); fill_trace_poseidon_16(&mut poseidon_trace.columns); - // For permute=0 rows, override unconstrained output columns with memory values - // so the lookup matches. Same when half_output=1. + // For half_output=1 rows: override outputs_left[4..=7] with memory values + // so the lookup passes (the lookup reads memory[res+4..=res+7] which matches). + // outputs_right removed — no override needed for those.
{ let split = POSEIDON_16_COL_OUTPUT_LEFT + HALF_DIGEST_LEN; let (left, right) = poseidon_trace.columns.split_at_mut(split); let half_output_col = &left[POSEIDON_16_COL_FLAG_HALF_OUTPUT]; - let permute_col = &left[POSEIDON_16_COL_FLAG_PERMUTE]; let res_col = &left[POSEIDON_16_COL_INDEX_INPUT_RES]; - const N: usize = HALF_DIGEST_LEN + DIGEST_LEN; + const N: usize = HALF_DIGEST_LEN; let cols: &mut [Vec; N] = (&mut right[..N]).try_into().unwrap(); transposed_par_iter_mut(cols) .zip(half_output_col) - .zip(permute_col) .zip(res_col) - .for_each(|(((row, &half), &permute), &res)| { - if permute == F::ZERO { + .for_each(|((row, &half), &res)| { + if half == F::ONE { let base = res.to_usize(); - if half == F::ONE { - for j in 0..HALF_DIGEST_LEN { - *row[j] = memory_padded[base + HALF_DIGEST_LEN + j]; - } - } - for j in 0..DIGEST_LEN { - *row[HALF_DIGEST_LEN + j] = memory_padded[base + DIGEST_LEN + j]; + for j in 0..HALF_DIGEST_LEN { + *row[j] = memory_padded[base + HALF_DIGEST_LEN + j]; } } }); diff --git a/crates/lean_vm/src/tables/poseidon/mod.rs b/crates/lean_vm/src/tables/poseidon/mod.rs index 25f3279c5..28aeabd33 100644 --- a/crates/lean_vm/src/tables/poseidon/mod.rs +++ b/crates/lean_vm/src/tables/poseidon/mod.rs @@ -106,8 +106,7 @@ pub const POSEIDON_16_COL_EFFECTIVE_INDEX_LEFT_FIRST: ColIndex = 6; pub const POSEIDON_16_COL_EFFECTIVE_INDEX_LEFT_SECOND: ColIndex = 7; pub const POSEIDON_16_COL_FLAG_PERMUTE: ColIndex = 8; pub const POSEIDON_16_COL_INPUT_START: ColIndex = 9; -pub const POSEIDON_16_COL_OUTPUT_LEFT: ColIndex = num_cols_poseidon_16() - 16; -pub const POSEIDON_16_COL_OUTPUT_RIGHT: ColIndex = num_cols_poseidon_16() - 8; +pub const POSEIDON_16_COL_OUTPUT_LEFT: ColIndex = num_cols_poseidon_16() - 8; /// Non-committed columns ("virtual"): pub const POSEIDON_16_COL_INDEX_INPUT_LEFT: ColIndex = num_cols_poseidon_16(); pub const POSEIDON_16_COL_DOMAINSEP: ColIndex = num_cols_poseidon_16() + 1; @@ -171,7 +170,7 @@ impl TableT for Poseidon16Precompile { 
buses.extend(memory_lookups_consecutive( POSEIDON_16_COL_INDEX_INPUT_RES, POSEIDON_16_COL_OUTPUT_LEFT, - DIGEST_LEN * 2, + DIGEST_LEN, // was DIGEST_LEN * 2 (included outputs_right) )); buses } @@ -193,7 +192,6 @@ impl TableT for Poseidon16Precompile { *perm.effective_index_left_first = F::from_usize(zero_vec_ptr); *perm.effective_index_left_second = F::from_usize(zero_vec_ptr + HALF_DIGEST_LEN); *perm.flag_permute = F::ZERO; - perm.outputs_right.iter_mut().for_each(|x| **x = F::ZERO); row[POSEIDON_16_COL_INDEX_INPUT_LEFT] = F::from_usize(zero_vec_ptr); row[POSEIDON_16_COL_DOMAINSEP] = F::from_usize(POSEIDON_DOMAINSEP_BASE); @@ -308,7 +306,7 @@ impl Air for Poseidon16Precompile { 0 } fn n_constraints(&self) -> usize { - 2 * BUS as usize + 99 + 2 * BUS as usize + 91 // was 99, removed 8 flag_permute * (state[i+8] - outputs_right[i]) constraints } fn eval(&self, builder: &mut AB, extra_data: &Self::ExtraData) { let cols: Poseidon1Cols16 = { @@ -378,7 +376,6 @@ pub(super) struct Poseidon1Cols16 { pub partial_rounds: [T; PARTIAL_ROUNDS], pub ending_full_rounds: [[T; WIDTH]; HALF_FINAL_FULL_ROUNDS - 1], pub outputs_left: [T; WIDTH / 2], - pub outputs_right: [T; WIDTH / 2], } fn eval_poseidon1_16(builder: &mut AB, local: &Poseidon1Cols16) { @@ -438,7 +435,6 @@ fn eval_poseidon1_16(builder: &mut AB, local: &Poseidon1Cols16( initial_state: &[AB::IF; WIDTH], state: &mut [AB::IF; WIDTH], outputs_left: &[AB::IF; WIDTH / 2], - outputs_right: &[AB::IF; WIDTH / 2], round_constants_1: &[F; WIDTH], round_constants_2: &[F; WIDTH], flag_half_output: AB::IF, @@ -513,7 +508,6 @@ fn eval_last_2_full_rounds_16( }; builder.assert_zero(compression_gate * (state[i] + initial_state[i] - outputs_left[i])); builder.assert_zero(flag_permute * (state[i] - outputs_left[i])); - builder.assert_zero(flag_permute * (state[i + WIDTH / 2] - outputs_right[i])); } } diff --git a/crates/lean_vm/src/tables/poseidon/trace_gen.rs b/crates/lean_vm/src/tables/poseidon/trace_gen.rs index c7b93cf56..faec41400 
100644 --- a/crates/lean_vm/src/tables/poseidon/trace_gen.rs +++ b/crates/lean_vm/src/tables/poseidon/trace_gen.rs @@ -104,7 +104,6 @@ pub(super) fn generate_trace_rows_for_perm + Copy>(perm: & &mut state, &inputs, &mut perm.outputs_left, - &mut perm.outputs_right, flag_permute, &poseidon1_final_constants()[2 * n_ending_full_rounds], &poseidon1_final_constants()[2 * n_ending_full_rounds + 1], @@ -140,7 +139,6 @@ fn generate_last_2_full_rounds + Copy>( state: &mut [F; WIDTH], inputs: &[F; WIDTH], outputs_left: &mut [&mut F; WIDTH / 2], - outputs_right: &mut [&mut F; WIDTH / 2], flag_permute: F, round_constants_1: &[KoalaBear; WIDTH], round_constants_2: &[KoalaBear; WIDTH], @@ -160,6 +158,6 @@ fn generate_last_2_full_rounds + Copy>( for i in 0..(WIDTH / 2) { let compression_value = state[i] + inputs[i]; *outputs_left[i] = (F::ONE - flag_permute) * compression_value + flag_permute * state[i]; - *outputs_right[i] = flag_permute * state[i + WIDTH / 2]; + // outputs_right removed — only outputs_left is committed } } From ec0d7a713e289c15a83535ca50f096d6cf7df7bd Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Mon, 25 May 2026 21:04:33 +0200 Subject: [PATCH 7/7] style: fix rustfmt and remove dead test module reference Co-Authored-By: Claude Opus 4.6 (1M context) --- crates/backend/koala-bear/src/quintic_extension/mod.rs | 2 -- crates/backend/poly/src/eq_mle.rs | 8 ++------ crates/whir/src/open.rs | 5 +---- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/crates/backend/koala-bear/src/quintic_extension/mod.rs b/crates/backend/koala-bear/src/quintic_extension/mod.rs index 11a47cbf5..6ccdca4f7 100644 --- a/crates/backend/koala-bear/src/quintic_extension/mod.rs +++ b/crates/backend/koala-bear/src/quintic_extension/mod.rs @@ -11,8 +11,6 @@ use crate::{KoalaBear, KoalaBearParameters}; pub mod extension; pub(crate) mod packed_extension; pub(crate) mod packing; -#[cfg(test)] -mod tests; pub type QuinticExtensionFieldKB = QuinticExtensionField; pub type 
PackedQuinticExtensionFieldKB = PackedQuinticExtensionField::Packing>; diff --git a/crates/backend/poly/src/eq_mle.rs b/crates/backend/poly/src/eq_mle.rs index b6cbe6ae5..64d3733f5 100644 --- a/crates/backend/poly/src/eq_mle.rs +++ b/crates/backend/poly/src/eq_mle.rs @@ -922,12 +922,8 @@ fn eval_eq_with_packed_output_dual>( let sa0 = scalar_a - sa1; let sb1 = scalar_b * eval_b[0]; let sb0 = scalar_b - sb1; - eval_eq_with_packed_output_dual::( - &eval_a[1..], &eval_b[1..], low, sa0, sb0, - ); - eval_eq_with_packed_output_dual::( - &eval_a[1..], &eval_b[1..], high, sa1, sb1, - ); + eval_eq_with_packed_output_dual::(&eval_a[1..], &eval_b[1..], low, sa0, sb0); + eval_eq_with_packed_output_dual::(&eval_a[1..], &eval_b[1..], high, sa1, sb1); } } } diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs index 0919b2cff..dec608061 100644 --- a/crates/whir/src/open.rs +++ b/crates/whir/src/open.rs @@ -543,10 +543,7 @@ where let second = statements.get(1); let second_is_full_domain = second.is_some_and(|s| { - !s.is_next - && s.values.len() == 1 - && s.values[0].selector == 0 - && s.inner_num_variables() == num_variables + !s.is_next && s.values.len() == 1 && s.values[0].selector == 0 && s.inner_num_variables() == num_variables }); if second_is_full_domain {