From 26b6c92c8e8d720914fbb00736807a5f225a2e48 Mon Sep 17 00:00:00 2001 From: Barnadrot Date: Mon, 11 May 2026 11:48:13 +0200 Subject: [PATCH] fix: enforce flat-phase contract via assert (no depth counter) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the PHASE_DEPTH counter with an unconditional assert in begin_phase() that panics when a phase is already active. The counter masked correctness bugs: a panic between paired calls orphaned the depth and silently shifted subsequent phases into nested mode, where the slab kept growing instead of recycling. Mirrors the pattern Emile cherry-picked into leanMultisig as 'zk-alloc: assert begin_phase has flat lifetime' (684a526c). Bringing it back to the standalone repo so downstream consumers see the same contract. - src/lib.rs: drop PHASE_DEPTH, swap-based assert in begin_phase, unconditional store in end_phase. Doc-comment now states the contract explicitly. - tests/test_nested_phase.rs: rewritten as a #[should_panic] contract test pinning the assert message. - tests/test_panic_phase.rs: pinned to 'next begin_phase trips the assert loudly' after an orphaned phase, instead of the old 'depth-counter preserves data' assertion. - tests/test_phase_guard.rs: nested_phase_guards_compose → nested_phase_guards_panic (#[should_panic]). Lock acquires ignore poisoning so the should_panic test does not cascade-fail the binary. - tests/correctness.rs, tests/test_crossbeam_epoch.rs: add a per-binary PHASE_LOCK around phase-touching tests so they don't race on ARENA_ACTIVE under parallel test execution. Add the missing end_phase() in phase_boundary_does_not_crash. - tests/test_realloc_overlap.rs: reflow a doc-comment ASCII diagram the new clippy 1.94 doc_overindented_list_items lint rejected. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/lib.rs | 52 +++++++++++----------------- tests/correctness.rs | 10 +++++- tests/test_crossbeam_epoch.rs | 7 ++++ tests/test_nested_phase.rs | 49 +++++++------------------- tests/test_panic_phase.rs | 65 ++++++++--------------------------- tests/test_phase_guard.rs | 17 +++++---- tests/test_realloc_overlap.rs | 8 ++--- 7 files changed, 76 insertions(+), 132 deletions(-) diff --git a/src/lib.rs b/src/lib.rs index 0327247..2804e8a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -112,13 +112,6 @@ static MAX_THREADS: AtomicUsize = AtomicUsize::new(0); static OVERFLOW_COUNT: AtomicUsize = AtomicUsize::new(0); static OVERFLOW_BYTES: AtomicUsize = AtomicUsize::new(0); -/// Nested-phase counter. `begin_phase()` increments it; `end_phase()` -/// decrements. Only the outermost begin (0 → 1) bumps GENERATION and -/// flips ARENA_ACTIVE; only the outermost end (1 → 0) deactivates. -/// This makes nested `begin_phase()` calls compose without the inner -/// begin recycling the outer phase's slab data. -static PHASE_DEPTH: AtomicUsize = AtomicUsize::new(0); - /// Allocations smaller than this go to System even during active phases. /// Routes registry / hashmap / injector-block-sized allocations away from /// the arena, so library state that outlives a phase doesn't land in @@ -183,6 +176,17 @@ fn ensure_region() -> usize { /// Activates the arena and resets every thread's slab. All allocations until the next /// `end_phase()` go to the arena; the previous phase's data is overwritten in place. /// +/// ## Phases must not nest +/// +/// Calling `begin_phase()` while another phase is already active panics. The +/// arena is a flat lifetime — nested phases were previously tolerated via a +/// depth counter, but the depth counter masked correctness bugs (panics +/// orphaning the count, accidental double-begin recycling the outer phase's +/// slab on the next allocation). The contract is now: every `begin_phase()` +/// is paired with one `end_phase()` (or use [`PhaseGuard`] / [`phase`] for +/// panic-safe pairing), and no second `begin_phase()` is reachable from +/// within an active phase. +/// /// ## Retention is unsafe /// /// Allocations made during phase N that are still held when phase N+1 begins @@ -203,14 +207,12 @@ fn ensure_region() -> usize { /// or copy into a `Vec` allocated outside any phase). pub fn begin_phase() { ensure_region(); - // Only the outermost begin bumps GENERATION and activates the arena. - // Nested begin_phase() calls just bump the depth counter; without this - // guard, an inner begin would recycle the slab and silently overwrite - // the outer phase's bump-allocated data. - if PHASE_DEPTH.fetch_add(1, Ordering::AcqRel) == 0 { - GENERATION.fetch_add(1, Ordering::Release); - ARENA_ACTIVE.store(true, Ordering::Release); - } + let prev_active = ARENA_ACTIVE.swap(true, Ordering::Release); + assert!( + !prev_active, + "begin_phase() called while another phase is already active — phases must not nest" + ); + GENERATION.fetch_add(1, Ordering::Release); } /// Deactivates the arena. New allocations go to the system allocator; existing arena @@ -219,23 +221,11 @@ pub fn begin_phase() { /// With the `rayon-flush` feature (default), this also drains rayon's internal /// queues to release any crossbeam-deque blocks allocated during the phase. /// -/// Calls beyond the matching `begin_phase()` (i.e., when no phase is -/// active) are tolerated and have no effect. +/// Idempotent: calling `end_phase()` while no phase is active is a no-op. pub fn end_phase() { - // Only the outermost end (depth 1 → 0) tears down. Saturate at zero so - // unbalanced end_phase() calls are no-ops rather than wrapping around. - let prev = PHASE_DEPTH.fetch_update(Ordering::AcqRel, Ordering::Acquire, |d| { - if d == 0 { - None - } else { - Some(d - 1) - } - }); - if let Ok(1) = prev { - ARENA_ACTIVE.store(false, Ordering::Release); - #[cfg(feature = "rayon-flush")] - flush_rayon(); - } + ARENA_ACTIVE.store(false, Ordering::Release); + #[cfg(feature = "rayon-flush")] + flush_rayon(); } /// Drains rayon's crossbeam-deque injector to release blocks allocated during diff --git a/tests/correctness.rs b/tests/correctness.rs index 48d43f8..f64d120 100644 --- a/tests/correctness.rs +++ b/tests/correctness.rs @@ -1,8 +1,14 @@ use std::alloc::{GlobalAlloc, Layout}; +use std::sync::Mutex; use zk_alloc::ZkAllocator; static ZK: ZkAllocator = ZkAllocator; +// Serializes tests that touch begin_phase()/end_phase(). The flat-phase +// contract makes ARENA_ACTIVE shared state, so parallel test threads would +// race and trip the nested-phase assert. +static PHASE_LOCK: Mutex<()> = Mutex::new(()); + #[test] fn small_alloc_returns_aligned_nonnull() { for size in [1, 8, 32, 64, 128, 256, 512] { @@ -85,6 +91,7 @@ fn realloc_preserves_data() { #[test] fn phase_boundary_does_not_crash() { + let _lock = PHASE_LOCK.lock().unwrap(); let layout = Layout::from_size_align(128, 8).unwrap(); for _ in 0..10 { let ptr = unsafe { ZK.alloc(layout) }; @@ -97,6 +104,7 @@ fn phase_boundary_does_not_crash() { assert!(!ptr.is_null()); unsafe { ZK.dealloc(ptr, layout) }; } + zk_alloc::end_phase(); } #[test] @@ -126,7 +134,7 @@ fn cross_thread_dealloc_does_not_crash() { #[test] fn arena_active_allocation() { - zk_alloc::begin_phase(); + let _lock = PHASE_LOCK.lock().unwrap(); zk_alloc::begin_phase(); let layout = Layout::from_size_align(4096, 8).unwrap(); diff --git a/tests/test_crossbeam_epoch.rs b/tests/test_crossbeam_epoch.rs index eb3f450..3234c59 100644 --- a/tests/test_crossbeam_epoch.rs +++ b/tests/test_crossbeam_epoch.rs @@ -13,10 +13,15 @@ //! drive more retires, and assert program integrity over many cycles. use rayon::prelude::*; +use std::sync::Mutex; #[global_allocator] static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; +// Serialize phase-touching tests within this binary so they don't race on +// ARENA_ACTIVE and trip the flat-phase assert. +static PHASE_LOCK: Mutex<()> = Mutex::new(()); + /// Force per-worker crossbeam-deque buffer growth via deep recursion. Each /// growth retires the prior buffer to crossbeam-epoch. fn nested_join(depth: usize) { @@ -28,6 +33,7 @@ fn nested_join(depth: usize) { #[test] fn crossbeam_epoch_garbage_survives_phase_cycles() { + let _lock = PHASE_LOCK.lock().unwrap(); let _: u64 = (0..1_000_000_u64).into_par_iter().sum(); const CYCLES: usize = 50; @@ -58,6 +64,7 @@ fn crossbeam_epoch_garbage_survives_phase_cycles() { /// rayon-heavy workloads survive 100 cycles. #[test] fn crossbeam_in_par_iter_collect_survives_cycles() { + let _lock = PHASE_LOCK.lock().unwrap(); let _: u64 = (0..1_000_000_u64).into_par_iter().sum(); for _ in 0..100 { diff --git a/tests/test_nested_phase.rs b/tests/test_nested_phase.rs index 1b123a3..3b022d7 100644 --- a/tests/test_nested_phase.rs +++ b/tests/test_nested_phase.rs @@ -1,48 +1,25 @@ -//! hunt-1: Nested phases silently corrupt outer phase data. +//! Contract test: nested `begin_phase()` calls panic. //! -//! Hypothesis: nested `begin_phase()` calls bump GENERATION, which marks -//! every thread's slab as needing reset. On the next allocation in the -//! inner phase, ARENA_PTR is reset to the slab base — overwriting any -//! data the outer phase had bump-allocated there. +//! Phases are flat. A nested `begin_phase()` previously corrupted the outer +//! phase's slab (later masked by a depth counter, which itself had failure +//! modes — a panic between matched calls left the depth orphaned). The +//! library now enforces the flat-phase contract with an assertion in +//! `begin_phase()`: any second begin while a phase is active is a panic. //! -//! There is no nesting counter. The inner `end_phase()` flips -//! ARENA_ACTIVE to false, after which any allocations in the remainder -//! of the outer phase silently land in System (a different correctness -//! issue covered separately). -//! -//! Existing `tests/test_phase_guard.rs::nested_phase_guards_compose` -//! only verifies that nested guards don't panic — it never allocates -//! anything in the outer phase, so this corruption is invisible there. +//! This test pins that behavior so a future depth-counter regression is +//! caught at `cargo test`. #[global_allocator] static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; #[test] -fn nested_phase_does_not_corrupt_outer() { - // Allocation big enough to land in arena (>= min_arena_bytes = 4096). +#[should_panic(expected = "phases must not nest")] +fn nested_begin_phase_panics() { zk_alloc::begin_phase(); - let outer: Vec = vec![0xA1_u8; 8192]; - let outer_ptr = outer.as_ptr() as usize; - - // Nested begin_phase. GENERATION bumps; the next arena alloc on - // this thread will reset ARENA_PTR back to slab base. + // Second begin_phase while the first is still active must panic. zk_alloc::begin_phase(); - let inner: Vec = vec![0xB2_u8; 8192]; - let inner_ptr = inner.as_ptr() as usize; - let inner_first_byte = inner[0]; + // Unreachable. Tearing down so a hypothetical non-panic doesn't leave + // ARENA_ACTIVE set across other tests. zk_alloc::end_phase(); - zk_alloc::end_phase(); - - eprintln!("outer_ptr=0x{outer_ptr:x} inner_ptr=0x{inner_ptr:x}"); - - // If nested phases are sound, outer's bytes are still 0xA1. - let pos = outer.iter().position(|&b| b != 0xA1); - let _ = inner_first_byte; - assert!( - pos.is_none(), - "outer phase data corrupted at offset {}: nested begin_phase bumped \ - GENERATION and the inner allocation recycled the outer's slab region", - pos.unwrap() - ); } diff --git a/tests/test_panic_phase.rs b/tests/test_panic_phase.rs index 4cef194..77c938b 100644 --- a/tests/test_panic_phase.rs +++ b/tests/test_panic_phase.rs @@ -1,73 +1,38 @@ //! Scenario 3: panic unwinding through a phase boundary. //! -//! There is no RAII guard around bare begin_phase()/end_phase(). If a -//! panic propagates out of phase code without reaching end_phase(), -//! PHASE_DEPTH stays > 0 and ARENA_ACTIVE stays true. +//! There is no RAII guard around bare begin_phase()/end_phase(). If a panic +//! propagates out of phase code without reaching end_phase(), ARENA_ACTIVE +//! stays true. //! -//! After the depth-counter fix, this means: subsequent begin_phase() calls -//! in the recovery path nest under the orphaned phase rather than recycling -//! the slab. The slab keeps growing across "iterations" until it overflows -//! to System — a memory leak, not silent corruption. Either way the -//! recovery path is broken; PhaseGuard is the supported way to make panic -//! recovery safe (see test_phase_guard.rs). +//! Under the flat-phase contract, the next `begin_phase()` in the recovery +//! path then panics with the "phases must not nest" message — i.e. the +//! failure is loud, not silent. `PhaseGuard` / `phase()` is the supported +//! way to make panic recovery safe; see test_phase_guard.rs. #[global_allocator] static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; #[test] -fn panic_without_phase_guard_orphans_phase_depth() { +fn panic_without_phase_guard_leaves_arena_active_and_trips_next_begin() { use std::panic; - // Suppress default panic print to minimize incidental allocations between - // the panic and our observation point. panic::set_hook(Box::new(|_| {})); let _ = vec![0u8; 1024]; // warm up zk_alloc::begin_phase(); let r = panic::catch_unwind(panic::AssertUnwindSafe(|| panic!("simulated"))); assert!(r.is_err()); - // No end_phase reached. PHASE_DEPTH still 1, ARENA_ACTIVE still true. - - // Lands in arena (active=true, 8192 >= MIN_ARENA_BYTES default 4096). - let post_panic: Vec = vec![0xCC; 8192]; - let post_panic_ptr = post_panic.as_ptr() as usize; - - // Begin the next "iteration". With the depth-counter fix this nests - // under the orphaned phase (depth 1 → 2) instead of bumping - // GENERATION, so the slab is NOT reset and post_panic survives. - zk_alloc::begin_phase(); - let big: Vec = vec![0x33; 1 << 20]; - let big_ptr = big.as_ptr() as usize; - let big_end = big_ptr + big.len(); - zk_alloc::end_phase(); + // No end_phase reached. ARENA_ACTIVE still true. + // The next begin_phase must panic (flat-phase assert), surfacing the + // missing end_phase loudly rather than silently corrupting state. + let next = panic::catch_unwind(panic::AssertUnwindSafe(zk_alloc::begin_phase)); let _ = panic::take_hook(); - - let post_in_big = post_panic_ptr >= big_ptr && post_panic_ptr < big_end; - let big_overlaps_post = - big_ptr < post_panic_ptr + post_panic.len() && big_ptr + big.len() > post_panic_ptr; - let observed = post_panic[0]; - - eprintln!( - "post_panic_ptr=0x{post_panic_ptr:x} big=[0x{big_ptr:x}, 0x{big_end:x}); \ - post_in_big={post_in_big} big_overlaps_post={big_overlaps_post} \ - observed=0x{observed:02x}" - ); - - // With the depth-counter fix, the second begin_phase nests rather than - // recycling. post_panic is preserved and big is allocated *after* it - // in the same slab, so they do not overlap. assert!( - !big_overlaps_post, - "depth-counter fix failed: nested begin_phase recycled the slab and \ - big overlapped post_panic" - ); - assert_eq!( - observed, 0xCC, - "post-panic Vec was corrupted; depth-counter fix should prevent the \ - next begin_phase from recycling the slab. got 0x{observed:02x}" + next.is_err(), + "begin_phase after an orphaned phase must panic, but it returned normally" ); - // Drain the orphaned depth so subsequent tests see a clean state. + // Restore a clean state for subsequent tests. zk_alloc::end_phase(); } diff --git a/tests/test_phase_guard.rs b/tests/test_phase_guard.rs index a914815..85351fb 100644 --- a/tests/test_phase_guard.rs +++ b/tests/test_phase_guard.rs @@ -14,7 +14,7 @@ static A: zk_alloc::ZkAllocator = zk_alloc::ZkAllocator; #[test] fn phase_guard_runs_end_phase_on_panic() { - let _lock = PHASE_LOCK.lock().unwrap(); + let _lock = PHASE_LOCK.lock().unwrap_or_else(|e| e.into_inner()); use std::panic; panic::set_hook(Box::new(|_| {})); @@ -65,7 +65,7 @@ fn phase_guard_runs_end_phase_on_panic() { #[test] fn phase_guard_runs_end_phase_on_normal_return() { - let _lock = PHASE_LOCK.lock().unwrap(); + let _lock = PHASE_LOCK.lock().unwrap_or_else(|e| e.into_inner()); let v = zk_alloc::phase(|| vec![0xAB_u8; 8192]); // After phase, arena is inactive. Subsequent allocations go to System. let after: Vec = vec![0xCD_u8; 8192]; @@ -86,11 +86,10 @@ fn phase_guard_runs_end_phase_on_normal_return() { } #[test] -fn nested_phase_guards_compose() { - let _lock = PHASE_LOCK.lock().unwrap(); - // Outer phase + inner phase. Inner phase end_phases (sets active=false), - // then outer phase end_phases again. Sequence: begin, begin, end, end. - // Final state: active=false. No panic. - let result = zk_alloc::phase(|| zk_alloc::phase(|| 42_u64)); - assert_eq!(result, 42); +#[should_panic(expected = "phases must not nest")] +fn nested_phase_guards_panic() { + let _lock = PHASE_LOCK.lock().unwrap_or_else(|e| e.into_inner()); + // Phases are flat. `phase(|| phase(...))` calls begin_phase while + // the outer phase is still active, which trips the flat-phase assert. + let _ = zk_alloc::phase(|| zk_alloc::phase(|| 42_u64)); } diff --git a/tests/test_realloc_overlap.rs b/tests/test_realloc_overlap.rs index e680654..c366e45 100644 --- a/tests/test_realloc_overlap.rs +++ b/tests/test_realloc_overlap.rs @@ -12,11 +12,9 @@ //! slab_base; we make it 4 KiB and fill with 0xD0. ARENA_PTR now sits //! at slab_base + 4 KiB. //! 5. Realloc p1 to grow to 16 KiB. The arena fast path returns -//! ARENA_PTR = slab_base + 4 KiB, so: -//! src = [base, base+8K) — bytes: 0xD0 (first 4K, py overwrote) -//! + 0xC1 (last 4K, untouched) -//! dst = [base+4K, base+12K) -//! overlap region = [base+4K, base+8K) — 4 KiB +//! ARENA_PTR = slab_base + 4 KiB, so src = [base, base+8K) with bytes +//! 0xD0 (first 4K, py overwrote) + 0xC1 (last 4K, untouched); +//! dst = [base+4K, base+12K); overlap = [base+4K, base+8K) (4 KiB). //! 6. Realloc executes `copy_nonoverlapping(src, dst, 8192)`. With glibc's //! forward-direction memcpy this writes src's first half (0xD0) into //! the overlap region, clobbering the upper half of src BEFORE it's