From e79154effbf157312db5c5e68fcd61612a2e50c1 Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Sun, 24 May 2026 19:18:25 +0200
Subject: [PATCH 1/7] perf: parallelize GKR phase2 sumcheck + incremental
 eq-table fold
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The GKR phase2 pair_coeffs loop was completely sequential — processing up
to 4M pairs on a single thread for the initial layer. This change:

1. Replaces the sequential for-loop with a parallel fold + reduce over
   (0..active_pairs).into_par_iter(); inputs with fewer than
   PARALLEL_THRESHOLD pairs keep the sequential loop
2. Computes the eq table once up front and halves it after each round by
   summing adjacent entries (eq_table[2i] + eq_table[2i+1]) instead of
   calling eval_eq again every round, saving O(2^k) ext-field muls per
   round

Measured: -14.1% e2e (phase2 handles ~70% of GKR layer proving).

Origin: blake3-autoresearch h19 (fc7cd33c), independent of blake3.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../src/quotient_gkr/sumcheck_utils.rs        | 102 +++++++++++++-----
 1 file changed, 75 insertions(+), 27 deletions(-)
diff --git a/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs b/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs
index 8f45a3494..865f9b14e 100644
--- a/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs
+++ b/crates/sub_protocols/src/quotient_gkr/sumcheck_utils.rs
@@ -291,43 +291,77 @@ pub(super) fn run_phase2_sumcheck<EF: ExtensionField<PF<EF>>>(
     mut sum: EF,
     mut mmf: EF,
 ) -> (Vec<EF>, [EF; 4]) {
+    let eq_prefix_init = &remaining_eq[..remaining_eq.len().saturating_sub(1)];
+    let mut eq_table = eval_eq(eq_prefix_init);
+
     for _round in 0..remaining_eq.len() {
         let eq_alpha = *remaining_eq.last().unwrap();
-        let eq_prefix = &remaining_eq[..remaining_eq.len() - 1];
-        let eq_table = eval_eq(eq_prefix);
 
         let active_l = num_l.len();
         let active_r = num_r.len();
         let active_pairs = active_l.div_ceil(2);
         let fully_active = active_r / 2;
 
-        let pair = |arr: &[EF], idx: usize, pad: EF| {
-            (
-                arr.get(idx).copied().unwrap_or(pad),
-                arr.get(idx + 1).copied().unwrap_or(pad),
-            )
+        let acc = if active_pairs >= PARALLEL_THRESHOLD {
+            let fa = fully_active;
+            (0..active_pairs)
+                .into_par_iter()
+                .fold(RoundCoeffs::zero, |mut acc, j| {
+                    let coeffs = if j < fa {
+                        pair_coeffs::<EF, EF>(
+                            (num_l[2 * j], num_l[2 * j + 1]),
+                            (num_r[2 * j], num_r[2 * j + 1]),
+                            (den_l[2 * j], den_l[2 * j + 1]),
+                            (den_r[2 * j], den_r[2 * j + 1]),
+                        )
+                    } else {
+                        let get_pair = |arr: &[EF], idx: usize, pad: EF| {
+                            (
+                                arr.get(idx).copied().unwrap_or(pad),
+                                arr.get(idx + 1).copied().unwrap_or(pad),
+                            )
+                        };
+                        pair_coeffs::<EF, EF>(
+                            get_pair(&num_l, 2 * j, EF::ZERO),
+                            get_pair(&num_r, 2 * j, EF::ZERO),
+                            get_pair(&den_l, 2 * j, EF::ONE),
+                            get_pair(&den_r, 2 * j, EF::ONE),
+                        )
+                    };
+                    acc += coeffs * eq_table[j];
+                    acc
+                })
+                .reduce(RoundCoeffs::zero, Add::add)
+        } else {
+            let mut acc = RoundCoeffs::<EF>::zero();
+            for j in 0..active_pairs {
+                let coeffs = if j < fully_active {
+                    pair_coeffs::<EF, EF>(
+                        (num_l[2 * j], num_l[2 * j + 1]),
+                        (num_r[2 * j], num_r[2 * j + 1]),
+                        (den_l[2 * j], den_l[2 * j + 1]),
+                        (den_r[2 * j], den_r[2 * j + 1]),
+                    )
+                } else {
+                    let get_pair = |arr: &[EF], idx: usize, pad: EF| {
+                        (
+                            arr.get(idx).copied().unwrap_or(pad),
+                            arr.get(idx + 1).copied().unwrap_or(pad),
+                        )
+                    };
+                    pair_coeffs::<EF, EF>(
+                        get_pair(&num_l, 2 * j, EF::ZERO),
+                        get_pair(&num_r, 2 * j, EF::ZERO),
+                        get_pair(&den_l, 2 * j, EF::ONE),
+                        get_pair(&den_r, 2 * j, EF::ONE),
+                    )
+                };
+                acc += coeffs * eq_table[j];
+            }
+            acc
         };
 
-        let mut acc = RoundCoeffs::<EF>::zero();
-        for j in 0..active_pairs {
-            let coeffs = if j < fully_active {
-                pair_coeffs::<EF, EF>(
-                    (num_l[2 * j], num_l[2 * j + 1]),
-                    (num_r[2 * j], num_r[2 * j + 1]),
-                    (den_l[2 * j], den_l[2 * j + 1]),
-                    (den_r[2 * j], den_r[2 * j + 1]),
-                )
-            } else {
-                pair_coeffs::<EF, EF>(
-                    pair(&num_l, 2 * j, EF::ZERO),
-                    pair(&num_r, 2 * j, EF::ZERO),
-                    pair(&den_l, 2 * j, EF::ONE),
-                    pair(&den_r, 2 * j, EF::ONE),
-                )
-            };
-            acc += coeffs * eq_table[j];
-        }
-
+        let eq_prefix = &remaining_eq[..remaining_eq.len() - 1];
         let padding_sum = alpha * mle_of_zeros_then_ones(active_pairs, eq_prefix);
 
         let bare = build_bare_from_coeffs(
@@ -349,6 +383,20 @@ pub(super) fn run_phase2_sumcheck<EF: ExtensionField<PF<EF>>>(
         den_l = fold_normal_with_padding(&den_l, r, EF::ONE);
         den_r = fold_normal_with_padding(&den_r, r, EF::ONE);
 
+        let new_eq_len = eq_table.len() / 2;
+        if new_eq_len > 0 {
+            let mut new_eq = unsafe { uninitialized_vec(new_eq_len) };
+            let fold_eq = |(i, slot): (usize, &mut EF)| {
+                *slot = eq_table[2 * i] + eq_table[2 * i + 1];
+            };
+            if new_eq_len >= PARALLEL_THRESHOLD {
+                new_eq.par_iter_mut().enumerate().for_each(fold_eq);
+            } else {
+                new_eq.iter_mut().enumerate().for_each(fold_eq);
+            }
+            eq_table = new_eq;
+        }
+
         q_natural.push(r);
         remaining_eq.pop();
     }

From 98eb62e91ea2212b310dae5d6349b9fff7cbc5d8 Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Sun, 24 May 2026 19:18:40 +0200
Subject: [PATCH 2/7] perf: sumcheck optimizations (fused dual-eq, packed SIMD,
 zero_vec skip)

- Fused dual-point eq computation: process both full-domain eq polynomials
  in a single recursive pass, eliminating a 1.28GB DRAM round-trip (h42b)
- Packed SIMD first-round product sumcheck (h68)
- combine_statement zero_vec skip: when the first OOD statement covers the
  full array, allocate the combined-weights buffer uninitialized and write
  into it (STORE path) instead of zero-filling and accumulating (h37)

Measured: -2.54% (h42b), -1.18% (h68), combined ~-3.7%.

Origin: pw5-clean (3e951179), independent of blake3.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 .../koala-bear/src/quintic_extension/mod.rs   |   2 +
 crates/backend/poly/src/eq_mle.rs             | 138 +++++++++++++++++-
 .../sumcheck/src/product_computation.rs       |   6 +-
 crates/whir/src/open.rs                       |  47 +++++-
 4 files changed, 185 insertions(+), 8 deletions(-)

diff --git a/crates/backend/koala-bear/src/quintic_extension/mod.rs b/crates/backend/koala-bear/src/quintic_extension/mod.rs
index 6ccdca4f7..11a47cbf5 100644
--- a/crates/backend/koala-bear/src/quintic_extension/mod.rs
+++ b/crates/backend/koala-bear/src/quintic_extension/mod.rs
@@ -11,6 +11,8 @@ use crate::{KoalaBear, KoalaBearParameters};
 pub mod extension;
 pub(crate) mod packed_extension;
 pub(crate) mod packing;
+#[cfg(test)]
+mod tests;
 
 pub type QuinticExtensionFieldKB = QuinticExtensionField<KoalaBear>;
 pub type PackedQuinticExtensionFieldKB = PackedQuinticExtensionField<KoalaBear, <KoalaBear as Field>::Packing>;
diff --git a/crates/backend/poly/src/eq_mle.rs b/crates/backend/poly/src/eq_mle.rs
index 978330001..b6cbe6ae5 100644
--- a/crates/backend/poly/src/eq_mle.rs
+++ b/crates/backend/poly/src/eq_mle.rs
@@ -881,6 +881,109 @@ fn eval_eq_with_packed_output<F: Field, EF: ExtensionField<F>, const INITIALIZED
     }
 }
 
+#[inline]
+fn eval_eq_with_packed_output_dual<F: Field, EF: ExtensionField<F>>(
+    eval_a: &[EF],
+    eval_b: &[EF],
+    out: &mut [EF::ExtensionPacking],
+    scalar_a: EF::ExtensionPacking,
+    scalar_b: EF::ExtensionPacking,
+) {
+    debug_assert_eq!(eval_a.len(), eval_b.len());
+    debug_assert_eq!(out.len(), 1 << eval_a.len());
+
+    match eval_a.len() {
+        0 => {
+            out[0] = scalar_a + scalar_b;
+        }
+        1 => {
+            let [a0, a1] = eval_eq_1(eval_a, scalar_a);
+            let [b0, b1] = eval_eq_1(eval_b, scalar_b);
+            out[0] = a0 + b0;
+            out[1] = a1 + b1;
+        }
+        2 => {
+            let eq_a = eval_eq_2(eval_a, scalar_a);
+            let eq_b = eval_eq_2(eval_b, scalar_b);
+            for i in 0..4 {
+                out[i] = eq_a[i] + eq_b[i];
+            }
+        }
+        3 => {
+            let eq_a = eval_eq_3(eval_a, scalar_a);
+            let eq_b = eval_eq_3(eval_b, scalar_b);
+            for i in 0..8 {
+                out[i] = eq_a[i] + eq_b[i];
+            }
+        }
+        _ => {
+            let (low, high) = out.split_at_mut(out.len() / 2);
+            let sa1 = scalar_a * eval_a[0];
+            let sa0 = scalar_a - sa1;
+            let sb1 = scalar_b * eval_b[0];
+            let sb0 = scalar_b - sb1;
+            eval_eq_with_packed_output_dual::<F, EF>(
+                &eval_a[1..], &eval_b[1..], low, sa0, sb0,
+            );
+            eval_eq_with_packed_output_dual::<F, EF>(
+                &eval_a[1..], &eval_b[1..], high, sa1, sb1,
+            );
+        }
+    }
+}
+
+pub fn compute_eval_eq_packed_dual<EF>(
+    eval_a: &[EF],
+    eval_b: &[EF],
+    out: &mut [EF::ExtensionPacking],
+    scalar_a: EF,
+    scalar_b: EF,
+) where
+    EF: ExtensionField<PF<EF>>,
+{
+    let packing_width = packing_width::<EF>();
+    let log_packing_width = log2_strict_usize(packing_width);
+
+    assert_eq!(eval_a.len(), eval_b.len());
+    assert!(log_packing_width <= eval_a.len());
+    assert_eq!(out.len(), 1 << (eval_a.len() - log_packing_width));
+
+    if eval_a.len() <= log_packing_width + 1 + LOG_NUM_THREADS {
+        let mut output_no_packing = EF::zero_vec(1 << eval_a.len());
+        eval_eq_basic::<_, _, _, false>(eval_a, &mut output_no_packing, scalar_a);
+        eval_eq_basic::<_, _, _, true>(eval_b, &mut output_no_packing, scalar_b);
+        out.par_iter_mut()
+            .zip(output_no_packing.par_chunks_exact(packing_width))
+            .for_each(|(out_elem, chunk)| {
+                *out_elem = EF::ExtensionPacking::from_ext_slice(chunk);
+            });
+    } else {
+        let eval_len_min_packing = eval_a.len() - log_packing_width;
+
+        let mut parallel_buffer_a = EF::ExtensionPacking::zero_vec(NUM_THREADS_PADDED);
+        let mut parallel_buffer_b = EF::ExtensionPacking::zero_vec(NUM_THREADS_PADDED);
+        let out_chunk_size = out.len() / NUM_THREADS_PADDED;
+
+        parallel_buffer_a[0] = packed_eq_poly(&eval_a[eval_len_min_packing..], scalar_a);
+        fill_buffer(eval_a[..LOG_NUM_THREADS].iter().rev(), &mut parallel_buffer_a);
+
+        parallel_buffer_b[0] = packed_eq_poly(&eval_b[eval_len_min_packing..], scalar_b);
+        fill_buffer(eval_b[..LOG_NUM_THREADS].iter().rev(), &mut parallel_buffer_b);
+
+        out.par_chunks_exact_mut(out_chunk_size)
+            .enumerate()
+            .for_each(|(i, out_chunk)| {
+                eval_eq_with_packed_output_dual::<PF<EF>, EF>(
+                    &eval_a[LOG_NUM_THREADS..eval_len_min_packing],
+                    &eval_b[LOG_NUM_THREADS..eval_len_min_packing],
+                    out_chunk,
+                    parallel_buffer_a[i],
+                    parallel_buffer_b[i],
+                );
+            });
+    }
+}
+
 /// Computes the equality polynomial evaluations via a simple recursive algorithm.
 ///
 /// Unlike [`eval_eq_basic`], this function makes heavy use of packed values to speed up computations.
@@ -968,10 +1071,19 @@ fn base_eval_eq_packed_with_packed_output<F, EF, const INITIALIZED: bool>(
     F: Field,
     EF: ExtensionField<F>,
 {
+    // Ensure that the output buffer size is correct:
+    // It should be of size `2^n`, where `n` is the number of variables.
+    let width = F::Packing::WIDTH;
+    let log_packing_width = log2_strict_usize(width);
     debug_assert_eq!(out.len(), 1 << eval_points.len());
+    debug_assert!(log_packing_width <= eval_points.len());
 
     match eval_points.len() {
-        0 => unreachable!(),
+        0 => {
+            debug_assert_eq!(F::Packing::WIDTH, 1);
+            let base_vals = F::Packing::pack_slice(eq_evals.as_slice());
+            scale_and_add_pf::<F, EF, INITIALIZED>(out, base_vals, packed_scalar);
+        }
         1 => {
             let eq_evaluations = eval_eq_1(eval_points, eq_evals);
             scale_and_add_pf::<F, EF, INITIALIZED>(out, eq_evaluations.as_slice(), packed_scalar);
@@ -1248,4 +1360,28 @@ mod tests {
             }
         }
     }
+
+    #[test]
+    fn test_compute_eval_eq_packed_dual() {
+        let packing_width = <F as Field>::Packing::WIDTH;
+        let log_packing_width = log2_strict_usize(packing_width);
+        let mut rng = StdRng::seed_from_u64(42);
+
+        for n_vars in log_packing_width..22 {
+            let eval_a: Vec<EF> = (0..n_vars).map(|_| rng.random()).collect();
+            let eval_b: Vec<EF> = (0..n_vars).map(|_| rng.random()).collect();
+            let scalar_a: EF = rng.random();
+            let scalar_b: EF = rng.random();
+
+            let packed_len = 1 << (n_vars - log_packing_width);
+            let mut out_dual = EFPacking::<EF>::zero_vec(packed_len);
+            compute_eval_eq_packed_dual::<EF>(&eval_a, &eval_b, &mut out_dual, scalar_a, scalar_b);
+
+            let mut out_separate = EFPacking::<EF>::zero_vec(packed_len);
+            compute_eval_eq_packed::<EF, false>(&eval_a, &mut out_separate, scalar_a);
+            compute_eval_eq_packed::<EF, true>(&eval_b, &mut out_separate, scalar_b);
+
+            assert_eq!(out_dual, out_separate, "Mismatch at n_vars={}", n_vars);
+        }
+    }
 }
diff --git a/crates/backend/sumcheck/src/product_computation.rs b/crates/backend/sumcheck/src/product_computation.rs
index ecce379fb..2828af039 100644
--- a/crates/backend/sumcheck/src/product_computation.rs
+++ b/crates/backend/sumcheck/src/product_computation.rs
@@ -45,11 +45,7 @@ pub fn run_product_sumcheck<EF: ExtensionField<PF<EF>>>(
     assert!(n_rounds >= 1);
     let first_sumcheck_poly = match (pol_a, pol_b) {
         (MleRef::BasePacked(evals), MleRef::ExtensionPacked(weights)) => {
-            if EF::DIMENSION == 5 {
-                compute_product_sumcheck_polynomial_base_ext_packed::<5, _, _, _, EF>(evals, weights, sum)
-            } else {
-                unimplemented!()
-            }
+            compute_product_sumcheck_polynomial(evals, weights, sum, |e| EFPacking::<EF>::to_ext_iter([e]).collect())
         }
         (MleRef::ExtensionPacked(evals), MleRef::ExtensionPacked(weights)) => {
             compute_product_sumcheck_polynomial(evals, weights, sum, |e| EFPacking::<EF>::to_ext_iter([e]).collect())
diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs
index f9634c918..0919b2cff 100644
--- a/crates/whir/src/open.rs
+++ b/crates/whir/src/open.rs
@@ -522,12 +522,55 @@ where
     let num_variables = statements[0].total_num_variables;
     assert!(statements.iter().all(|e| e.total_num_variables == num_variables));
 
-    let mut combined_weights = EFPacking::<EF>::zero_vec(1 << (num_variables - packing_log_width::<EF>()));
+    let out_len = 1 << (num_variables - packing_log_width::<EF>());
 
+    let first = &statements[0];
+    let first_is_full_initializer = !first.is_next
+        && first.values.len() == 1
+        && first.values[0].selector == 0
+        && first.inner_num_variables() == num_variables;
+
+    let mut combined_weights: Vec<EFPacking<EF>>;
     let mut combined_sum = EF::ZERO;
     let mut gamma_pow = EF::ONE;
+    let start_idx;
+
+    if first_is_full_initializer {
+        combined_weights = unsafe { uninitialized_vec(out_len) };
+        let first_scalar = gamma_pow;
+        combined_sum += first.values[0].value * gamma_pow;
+        gamma_pow *= gamma;
+
+        let second = statements.get(1);
+        let second_is_full_domain = second.is_some_and(|s| {
+            !s.is_next
+                && s.values.len() == 1
+                && s.values[0].selector == 0
+                && s.inner_num_variables() == num_variables
+        });
+
+        if second_is_full_domain {
+            let second = &statements[1];
+            compute_eval_eq_packed_dual::<EF>(
+                &first.point.0,
+                &second.point.0,
+                &mut combined_weights,
+                first_scalar,
+                gamma_pow,
+            );
+            combined_sum += second.values[0].value * gamma_pow;
+            gamma_pow *= gamma;
+            start_idx = 2;
+        } else {
+            compute_eval_eq_packed::<EF, false>(&first.point.0, &mut combined_weights, first_scalar);
+            start_idx = 1;
+        }
+    } else {
+        combined_weights = EFPacking::<EF>::zero_vec(out_len);
+        start_idx = 0;
+    }
 
-    for smt in statements {
+    for smt in &statements[start_idx..] {
         if !smt.is_next && (smt.values.len() == 1 || smt.inner_num_variables() < packing_log_width::<EF>()) {
             for evaluation in &smt.values {
                 compute_sparse_eval_eq_packed::<EF>(evaluation.selector, &smt.point, &mut combined_weights, gamma_pow);

From 64b77b349a4620d64c6968e3f51b80fda2ce3476 Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Sun, 24 May 2026 19:18:57 +0200
Subject: [PATCH 3/7] =?UTF-8?q?perf:=20WHIR=20initial=20folding=20factor?=
 =?UTF-8?q?=207=E2=86=928?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Doubles the initial WHIR folding width (2^7=128 → 2^8=256 evaluation
points per fold), which halves FFT rows (2^20 → 2^19), halves Merkle
tree leaves, and eliminates one subsequent WHIR round (3 → 2).

Adds num_chunks=32 support to decompose_and_verify_merkle_batch in the
recursion circuit verifier (256/8=32 chunks per Merkle leaf).

Measured: -2.73% e2e.

Origin: blake3-autoresearch h14v2 (3aab3dbc), independent of blake3.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/lean_prover/src/lib.rs               | 2 +-
 crates/rec_aggregation/zkdsl_implem/whir.py | 3 +++
 2 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/crates/lean_prover/src/lib.rs b/crates/lean_prover/src/lib.rs
index 143474bb8..58605bc47 100644
--- a/crates/lean_prover/src/lib.rs
+++ b/crates/lean_prover/src/lib.rs
@@ -23,7 +23,7 @@ pub const SECURITY_BITS: usize = 124; // TODO 128 bits security
 
 pub const GRINDING_BITS: usize = 16;
 pub const MAX_NUM_VARIABLES_TO_SEND_COEFFS: usize = 8;
-pub const WHIR_INITIAL_FOLDING_FACTOR: usize = 7;
+pub const WHIR_INITIAL_FOLDING_FACTOR: usize = 8;
 pub const WHIR_SUBSEQUENT_FOLDING_FACTOR: usize = 5;
 pub const RS_DOMAIN_INITIAL_REDUCTION_FACTOR: usize = 5;
 
diff --git a/crates/rec_aggregation/zkdsl_implem/whir.py b/crates/rec_aggregation/zkdsl_implem/whir.py
index 3124f2534..c2a00c1ad 100644
--- a/crates/rec_aggregation/zkdsl_implem/whir.py
+++ b/crates/rec_aggregation/zkdsl_implem/whir.py
@@ -269,6 +269,9 @@ def decompose_and_verify_merkle_batch_with_height(
     if num_chunks == 5:
         decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 5, circle_values, answers)
         return
+    if num_chunks == 32:
+        decompose_and_verify_merkle_batch_const(num_queries, sampled, root, height, 32, circle_values, answers)
+        return
     print(num_chunks)
     assert False, "decompose_and_verify_merkle_batch called with unsupported num_chunks"
 

From c756d82c992f4e3531210c91b14bb33ae24ffe31 Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Sun, 24 May 2026 19:19:39 +0200
Subject: [PATCH 4/7] perf: fix GKR pivot computation in logup

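Fix min_section_log calculation: log_bytecode is first raised with
.max() to tables_log_heights_sorted[0].1 and only then combined with
.min() against the last table's log height, ensuring the bytecode
section doesn't unnecessarily pull down the GKR pivot. This also removes
the "TODO: suboptimal GKR pivot" info log.
Also updates the surface assertion for WHIR folding factor 8 (bound
raised from 1 << 30 to 1 << 31).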

The MIN_LOG_N_ROWS_PER_TABLE stays at 8 (no blake3 small table to pad).

Measured: -8.1% e2e on blake3 branch (primarily from the pivot fix
enabling the ENDIANNESS_PIVOT_GKR=12 fast path).

Origin: blake3-autoresearch h24 (bc3bd7e3), logup fix independent of blake3.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/lean_vm/src/core/constants.rs | 2 +-
 crates/sub_protocols/src/logup.rs    | 6 ++----
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/crates/lean_vm/src/core/constants.rs b/crates/lean_vm/src/core/constants.rs
index 50b9371e1..800be717a 100644
--- a/crates/lean_vm/src/core/constants.rs
+++ b/crates/lean_vm/src/core/constants.rs
@@ -77,6 +77,6 @@ mod tests {
         for (table, max_log_n_rows) in MAX_LOG_N_ROWS_PER_TABLE {
             max_surface += (table.n_columns() as u64) << (max_log_n_rows as u64);
         }
-        assert!(max_surface <= 1 << 30); // Maximum data we can commit via WHIR using an initial folding factor of 7, and rate = 1/2
+        assert!(max_surface <= 1 << 31); // Maximum data we can commit via WHIR using an initial folding factor of 8, and rate = 1/2
     }
 }
diff --git a/crates/sub_protocols/src/logup.rs b/crates/sub_protocols/src/logup.rs
index 55af0a320..8bf38368f 100644
--- a/crates/sub_protocols/src/logup.rs
+++ b/crates/sub_protocols/src/logup.rs
@@ -56,10 +56,8 @@ pub fn prove_generic_logup(
     let memory_domainsep_packed = PFPacking::<EF>::from(F::from_usize(LOGUP_MEMORY_DOMAINSEP));
     let bytecode_domainsep_packed = PFPacking::<EF>::from(F::from_usize(LOGUP_BYTECODE_DOMAINSEP));
 
-    let min_section_log = log_bytecode.min(tables_log_heights_sorted.last().unwrap().1);
-    if min_section_log < ENDIANNESS_PIVOT_GKR {
-        tracing::info!("TODO: suboptimal GKR pivot (could be improved).");
-    }
+    let log_bytecode_section = log_bytecode.max(tables_log_heights_sorted[0].1);
+    let min_section_log = log_bytecode_section.min(tables_log_heights_sorted.last().unwrap().1);
     let pivot = ENDIANNESS_PIVOT_GKR.min(min_section_log);
     let chunk_size = 1usize << pivot;
     let chunk_shift = usize::BITS as usize - pivot;

From e16ce0a61320f9287678ddaa27b33ea17b313e99 Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Sun, 24 May 2026 19:21:23 +0200
Subject: [PATCH 5/7] =?UTF-8?q?fix:=20println=E2=86=92eprintln=20in=20self?=
 =?UTF-8?q?-referential=20compilation=20retry?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Compilation retry messages should go to stderr, not stdout, to avoid
polluting JSON benchmark output.

Origin: blake3-autoresearch (89a8b612), independent of blake3.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/rec_aggregation/src/compilation.rs | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/crates/rec_aggregation/src/compilation.rs b/crates/rec_aggregation/src/compilation.rs
index 5db63d36a..7a1b99e5f 100644
--- a/crates/rec_aggregation/src/compilation.rs
+++ b/crates/rec_aggregation/src/compilation.rs
@@ -89,7 +89,7 @@ fn compile_main_program_self_referential() -> Bytecode {
         if actual_log_size == log_size_guess {
             return bytecode;
         }
-        println!(
+        eprintln!(
             "Wrong guess at `compile_main_program_self_referential` (log_size {log_size_guess}->{actual_log_size})"
         );
         log_size_guess = actual_log_size;

From 74f9e0592ed844cb84b42b07924d3cea142c7de1 Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Mon, 25 May 2026 19:40:58 +0200
Subject: [PATCH 6/7] perf: remove 8 dead Poseidon outputs_right columns (h31)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Remove outputs_right from Poseidon1Cols16 struct, reducing committed
columns by 8. The right half of the permutation output is no longer
committed or looked up.

Changes:
- Poseidon1Cols16: removed outputs_right field (-8 columns)
- bus_interactions: result lookup reduced from DIGEST_LEN*2 to DIGEST_LEN
- eval: removed 8 flag_permute*(state[i+8]-outputs_right[i]) constraints
- n_constraints: 99 → 91
- trace_gen: removed outputs_right generation
- trace override: simplified to only handle half_output for outputs_left

The lookup now covers only outputs_left (8 values) in memory.
For permute rows: outputs_left = state (matches permuted output in memory).
For compression rows: outputs_left = state + input (matches output in memory).
For half_output rows: outputs_left[4..=7] are overridden with memory values.

ALL 5 TESTS PASS.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/lean_prover/src/trace_gen.rs           | 22 +++++++------------
 crates/lean_vm/src/tables/poseidon/mod.rs     | 12 +++-------
 .../lean_vm/src/tables/poseidon/trace_gen.rs  |  4 +---
 3 files changed, 12 insertions(+), 26 deletions(-)

diff --git a/crates/lean_prover/src/trace_gen.rs b/crates/lean_prover/src/trace_gen.rs
index c6d6429a5..caabf81ea 100644
--- a/crates/lean_prover/src/trace_gen.rs
+++ b/crates/lean_prover/src/trace_gen.rs
@@ -112,31 +112,25 @@ pub fn get_execution_trace(
     let poseidon_trace = traces.get_mut(&Table::poseidon16()).unwrap();
     fill_trace_poseidon_16(&mut poseidon_trace.columns);
 
-    // For permute=0 rows, override unconstrained output columns with memory values
-    // so the lookup matches. Same when half_output=1.
+    // For half_output=1 rows: overwrite the HALF_DIGEST_LEN columns starting at
+    // outputs_left[HALF_DIGEST_LEN] with memory[res + HALF_DIGEST_LEN + j] so the lookup passes.
+    // outputs_right no longer exists, so there is nothing else to override.
     {
         let split = POSEIDON_16_COL_OUTPUT_LEFT + HALF_DIGEST_LEN;
         let (left, right) = poseidon_trace.columns.split_at_mut(split);
         let half_output_col = &left[POSEIDON_16_COL_FLAG_HALF_OUTPUT];
-        let permute_col = &left[POSEIDON_16_COL_FLAG_PERMUTE];
         let res_col = &left[POSEIDON_16_COL_INDEX_INPUT_RES];
-        const N: usize = HALF_DIGEST_LEN + DIGEST_LEN;
+        const N: usize = HALF_DIGEST_LEN;
         let cols: &mut [Vec<F>; N] = (&mut right[..N]).try_into().unwrap();
 
         transposed_par_iter_mut(cols)
             .zip(half_output_col)
-            .zip(permute_col)
             .zip(res_col)
-            .for_each(|(((row, &half), &permute), &res)| {
-                if permute == F::ZERO {
+            .for_each(|((row, &half), &res)| {
+                if half == F::ONE {
                     let base = res.to_usize();
-                    if half == F::ONE {
-                        for j in 0..HALF_DIGEST_LEN {
-                            *row[j] = memory_padded[base + HALF_DIGEST_LEN + j];
-                        }
-                    }
-                    for j in 0..DIGEST_LEN {
-                        *row[HALF_DIGEST_LEN + j] = memory_padded[base + DIGEST_LEN + j];
+                    for j in 0..HALF_DIGEST_LEN {
+                        *row[j] = memory_padded[base + HALF_DIGEST_LEN + j];
                     }
                 }
             });
diff --git a/crates/lean_vm/src/tables/poseidon/mod.rs b/crates/lean_vm/src/tables/poseidon/mod.rs
index 25f3279c5..28aeabd33 100644
--- a/crates/lean_vm/src/tables/poseidon/mod.rs
+++ b/crates/lean_vm/src/tables/poseidon/mod.rs
@@ -106,8 +106,7 @@ pub const POSEIDON_16_COL_EFFECTIVE_INDEX_LEFT_FIRST: ColIndex = 6;
 pub const POSEIDON_16_COL_EFFECTIVE_INDEX_LEFT_SECOND: ColIndex = 7;
 pub const POSEIDON_16_COL_FLAG_PERMUTE: ColIndex = 8;
 pub const POSEIDON_16_COL_INPUT_START: ColIndex = 9;
-pub const POSEIDON_16_COL_OUTPUT_LEFT: ColIndex = num_cols_poseidon_16() - 16;
-pub const POSEIDON_16_COL_OUTPUT_RIGHT: ColIndex = num_cols_poseidon_16() - 8;
+pub const POSEIDON_16_COL_OUTPUT_LEFT: ColIndex = num_cols_poseidon_16() - 8;
 /// Non-committed columns ("virtual"):
 pub const POSEIDON_16_COL_INDEX_INPUT_LEFT: ColIndex = num_cols_poseidon_16();
 pub const POSEIDON_16_COL_DOMAINSEP: ColIndex = num_cols_poseidon_16() + 1;
@@ -171,7 +170,7 @@ impl<const BUS: bool> TableT for Poseidon16Precompile<BUS> {
         buses.extend(memory_lookups_consecutive(
             POSEIDON_16_COL_INDEX_INPUT_RES,
             POSEIDON_16_COL_OUTPUT_LEFT,
-            DIGEST_LEN * 2,
+            DIGEST_LEN, // was DIGEST_LEN * 2 (included outputs_right)
         ));
         buses
     }
@@ -193,7 +192,6 @@ impl<const BUS: bool> TableT for Poseidon16Precompile<BUS> {
         *perm.effective_index_left_first = F::from_usize(zero_vec_ptr);
         *perm.effective_index_left_second = F::from_usize(zero_vec_ptr + HALF_DIGEST_LEN);
         *perm.flag_permute = F::ZERO;
-        perm.outputs_right.iter_mut().for_each(|x| **x = F::ZERO);
         row[POSEIDON_16_COL_INDEX_INPUT_LEFT] = F::from_usize(zero_vec_ptr);
         row[POSEIDON_16_COL_DOMAINSEP] = F::from_usize(POSEIDON_DOMAINSEP_BASE);
 
@@ -308,7 +306,7 @@ impl<const BUS: bool> Air for Poseidon16Precompile<BUS> {
         0
     }
     fn n_constraints(&self) -> usize {
-        2 * BUS as usize + 99
+        2 * BUS as usize + 91 // was 99, removed 8 flag_permute * (state[i+8] - outputs_right[i]) constraints
     }
     fn eval<AB: AirBuilder>(&self, builder: &mut AB, extra_data: &Self::ExtraData) {
         let cols: Poseidon1Cols16<AB::IF> = {
@@ -378,7 +376,6 @@ pub(super) struct Poseidon1Cols16<T> {
     pub partial_rounds: [T; PARTIAL_ROUNDS],
     pub ending_full_rounds: [[T; WIDTH]; HALF_FINAL_FULL_ROUNDS - 1],
     pub outputs_left: [T; WIDTH / 2],
-    pub outputs_right: [T; WIDTH / 2],
 }
 
 fn eval_poseidon1_16<AB: AirBuilder>(builder: &mut AB, local: &Poseidon1Cols16<AB::IF>) {
@@ -438,7 +435,6 @@ fn eval_poseidon1_16<AB: AirBuilder>(builder: &mut AB, local: &Poseidon1Cols16<A
         &local.inputs,
         &mut state,
         &local.outputs_left,
-        &local.outputs_right,
         &final_constants[2 * (HALF_FINAL_FULL_ROUNDS - 1)],
         &final_constants[2 * (HALF_FINAL_FULL_ROUNDS - 1) + 1],
         local.flag_half_output,
@@ -486,7 +482,6 @@ fn eval_last_2_full_rounds_16<AB: AirBuilder>(
     initial_state: &[AB::IF; WIDTH],
     state: &mut [AB::IF; WIDTH],
     outputs_left: &[AB::IF; WIDTH / 2],
-    outputs_right: &[AB::IF; WIDTH / 2],
     round_constants_1: &[F; WIDTH],
     round_constants_2: &[F; WIDTH],
     flag_half_output: AB::IF,
@@ -513,7 +508,6 @@ fn eval_last_2_full_rounds_16<AB: AirBuilder>(
         };
         builder.assert_zero(compression_gate * (state[i] + initial_state[i] - outputs_left[i]));
         builder.assert_zero(flag_permute * (state[i] - outputs_left[i]));
-        builder.assert_zero(flag_permute * (state[i + WIDTH / 2] - outputs_right[i]));
     }
 }
 
diff --git a/crates/lean_vm/src/tables/poseidon/trace_gen.rs b/crates/lean_vm/src/tables/poseidon/trace_gen.rs
index c7b93cf56..faec41400 100644
--- a/crates/lean_vm/src/tables/poseidon/trace_gen.rs
+++ b/crates/lean_vm/src/tables/poseidon/trace_gen.rs
@@ -104,7 +104,6 @@ pub(super) fn generate_trace_rows_for_perm<F: Algebra<KoalaBear> + Copy>(perm: &
         &mut state,
         &inputs,
         &mut perm.outputs_left,
-        &mut perm.outputs_right,
         flag_permute,
         &poseidon1_final_constants()[2 * n_ending_full_rounds],
         &poseidon1_final_constants()[2 * n_ending_full_rounds + 1],
@@ -140,7 +139,6 @@ fn generate_last_2_full_rounds<F: Algebra<KoalaBear> + Copy>(
     state: &mut [F; WIDTH],
     inputs: &[F; WIDTH],
     outputs_left: &mut [&mut F; WIDTH / 2],
-    outputs_right: &mut [&mut F; WIDTH / 2],
     flag_permute: F,
     round_constants_1: &[KoalaBear; WIDTH],
     round_constants_2: &[KoalaBear; WIDTH],
@@ -160,6 +158,6 @@ fn generate_last_2_full_rounds<F: Algebra<KoalaBear> + Copy>(
     for i in 0..(WIDTH / 2) {
         let compression_value = state[i] + inputs[i];
         *outputs_left[i] = (F::ONE - flag_permute) * compression_value + flag_permute * state[i];
-        *outputs_right[i] = flag_permute * state[i + WIDTH / 2];
+        // outputs_right removed — only outputs_left is committed
     }
 }

From ec0d7a713e289c15a83535ca50f096d6cf7df7bd Mon Sep 17 00:00:00 2001
From: Barnadrot <kbarna.drot@gmail.com>
Date: Mon, 25 May 2026 21:04:33 +0200
Subject: [PATCH 7/7] style: fix rustfmt and remove dead test module reference

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 crates/backend/koala-bear/src/quintic_extension/mod.rs | 2 --
 crates/backend/poly/src/eq_mle.rs                      | 8 ++------
 crates/whir/src/open.rs                                | 5 +----
 3 files changed, 3 insertions(+), 12 deletions(-)

diff --git a/crates/backend/koala-bear/src/quintic_extension/mod.rs b/crates/backend/koala-bear/src/quintic_extension/mod.rs
index 11a47cbf5..6ccdca4f7 100644
--- a/crates/backend/koala-bear/src/quintic_extension/mod.rs
+++ b/crates/backend/koala-bear/src/quintic_extension/mod.rs
@@ -11,8 +11,6 @@ use crate::{KoalaBear, KoalaBearParameters};
 pub mod extension;
 pub(crate) mod packed_extension;
 pub(crate) mod packing;
-#[cfg(test)]
-mod tests;
 
 pub type QuinticExtensionFieldKB = QuinticExtensionField<KoalaBear>;
 pub type PackedQuinticExtensionFieldKB = PackedQuinticExtensionField<KoalaBear, <KoalaBear as Field>::Packing>;
diff --git a/crates/backend/poly/src/eq_mle.rs b/crates/backend/poly/src/eq_mle.rs
index b6cbe6ae5..64d3733f5 100644
--- a/crates/backend/poly/src/eq_mle.rs
+++ b/crates/backend/poly/src/eq_mle.rs
@@ -922,12 +922,8 @@ fn eval_eq_with_packed_output_dual<F: Field, EF: ExtensionField<F>>(
             let sa0 = scalar_a - sa1;
             let sb1 = scalar_b * eval_b[0];
             let sb0 = scalar_b - sb1;
-            eval_eq_with_packed_output_dual::<F, EF>(
-                &eval_a[1..], &eval_b[1..], low, sa0, sb0,
-            );
-            eval_eq_with_packed_output_dual::<F, EF>(
-                &eval_a[1..], &eval_b[1..], high, sa1, sb1,
-            );
+            eval_eq_with_packed_output_dual::<F, EF>(&eval_a[1..], &eval_b[1..], low, sa0, sb0);
+            eval_eq_with_packed_output_dual::<F, EF>(&eval_a[1..], &eval_b[1..], high, sa1, sb1);
         }
     }
 }
diff --git a/crates/whir/src/open.rs b/crates/whir/src/open.rs
index 0919b2cff..dec608061 100644
--- a/crates/whir/src/open.rs
+++ b/crates/whir/src/open.rs
@@ -543,10 +543,7 @@ where
 
         let second = statements.get(1);
         let second_is_full_domain = second.is_some_and(|s| {
-            !s.is_next
-                && s.values.len() == 1
-                && s.values[0].selector == 0
-                && s.inner_num_variables() == num_variables
+            !s.is_next && s.values.len() == 1 && s.values[0].selector == 0 && s.inner_num_variables() == num_variables
         });
 
         if second_is_full_domain {