From 06a692cf408ed484d7bd03ed1c60783f59659088 Mon Sep 17 00:00:00 2001 From: TheHypnoo Date: Sat, 13 Jun 2026 19:54:57 +0200 Subject: [PATCH] perf(gc): skip the per-write layout-mask hashmap on scalar-over-scalar array stores MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For an in-place `arr[i] = value` on a non-raw-layout (e.g. downgraded `any[]`) array, codegen emits a per-write `js_gc_note_slot_layout` call that, for a SIDE_MASK array, does a thread-local `LAYOUT_SLOT_MASKS` hashmap lookup + clear_slot — even when the slot was already non-pointer and stays non-pointer. On a numeric write loop over a downgraded array this is the dominant cost (stubbing the note makes bench_numeric_array_downgrade 11x faster). Add `js_gc_note_slot_layout_aware(parent, slot, value_bits, old_bits)`: when neither the new nor the previous slot value is a heap pointer, the slot's pointer-ness is unchanged, so the GC per-slot mask needs no update and the hashmap is skipped. The mask invariant ('bit set <=> slot holds a pointer') is preserved because the full path still runs whenever a pointer is involved on either side (new is a pointer -> set; old was a pointer -> clear). Uses the same `layout_pointer_bearing_bits` predicate the layout machinery uses internally, so raw-pointer slots are classified correctly (not just NaN-box tags). Codegen (`emit_jsvalue_slot_store_scalar_aware_on_block`) loads the slot's previous value before the store and routes ONLY in-place array element overwrites (index.rs) through the aware note — object field writes and fresh-slot appends keep the original note, so POINTER_FREE-dominated paths (bench_object_property) are unaffected. bench_numeric_array_downgrade (M1 Pro): 4482ms -> 427ms (~10.5x). Checksum identical to Node. First concrete win from the #5094 GC-layout umbrella. 
Verified: GC-stress correct (pointer<->number slot transitions + GC under PERRY_GC_VERIFY_EVACUATION=1 / PERRY_GC_FORCE_EVACUATE=1 / PERRY_GEN_GC=0); codegen + runtime tests pass; full local parity shows zero new regressions; no other benchmark regresses. --- crates/perry-codegen/src/expr/index.rs | 9 +- crates/perry-codegen/src/expr/mod.rs | 6 +- .../perry-codegen/src/expr/write_barrier.rs | 110 +++++++++++++++++- .../perry-codegen/src/runtime_decls/arrays.rs | 2 + crates/perry-runtime/src/gc/layout.rs | 27 +++++ 5 files changed, 148 insertions(+), 6 deletions(-) diff --git a/crates/perry-codegen/src/expr/index.rs b/crates/perry-codegen/src/expr/index.rs index eea7f4b087..3e6fad1fd8 100644 --- a/crates/perry-codegen/src/expr/index.rs +++ b/crates/perry-codegen/src/expr/index.rs @@ -5,7 +5,8 @@ use anyhow::{anyhow, Result}; use super::{ emit_array_numeric_write_note_on_block, emit_jsvalue_slot_store_on_block, - emit_write_barrier_slot_on_block, nanbox_pointer_inline, raw_f64_layout_fact, FnCtx, + emit_jsvalue_slot_store_scalar_aware_on_block, emit_write_barrier_slot_on_block, + nanbox_pointer_inline, raw_f64_layout_fact, FnCtx, }; use crate::block::LlBlock; use crate::nanbox::POINTER_MASK_I64; @@ -218,7 +219,11 @@ pub(crate) fn lower_index_set_fast( ); } else { let (element_addr, element_ptr) = element_slot(blk, &arr_handle, &idx_i32); - let value_bits = emit_jsvalue_slot_store_on_block( + // In-place overwrite of a non-raw-layout (e.g. downgraded `any[]`) + // array element: the slot holds a valid value, so the scalar-aware + // note skips the GC layout hashmap on scalar-over-scalar stores + // (#5094 — ~10× on bench_numeric_array_downgrade). 
+ let value_bits = emit_jsvalue_slot_store_scalar_aware_on_block( blk, &element_ptr, val_double, diff --git a/crates/perry-codegen/src/expr/mod.rs b/crates/perry-codegen/src/expr/mod.rs index fe5adba57b..d85c58dbda 100644 --- a/crates/perry-codegen/src/expr/mod.rs +++ b/crates/perry-codegen/src/expr/mod.rs @@ -120,9 +120,9 @@ pub(crate) use v8_interop::{ }; pub(crate) use write_barrier::{ emit_array_numeric_write_note_on_block, emit_jsvalue_slot_store_on_block, - emit_layout_note_slot_on_block, emit_root_heap_word_store_on_block, - emit_root_nanbox_store_on_block, emit_write_barrier, emit_write_barrier_slot_on_block, - lower_node_stream_super_init, lower_stream_super_init, + emit_jsvalue_slot_store_scalar_aware_on_block, emit_layout_note_slot_on_block, + emit_root_heap_word_store_on_block, emit_root_nanbox_store_on_block, emit_write_barrier, + emit_write_barrier_slot_on_block, lower_node_stream_super_init, lower_stream_super_init, }; /// One in-flight inline-constructor return target. See diff --git a/crates/perry-codegen/src/expr/write_barrier.rs b/crates/perry-codegen/src/expr/write_barrier.rs index b90e276156..122d72037b 100644 --- a/crates/perry-codegen/src/expr/write_barrier.rs +++ b/crates/perry-codegen/src/expr/write_barrier.rs @@ -72,6 +72,28 @@ pub(crate) fn emit_layout_note_slot_on_block( ); } +/// Scalar-aware layout note: passes the slot's previous value (`old_bits`) so +/// the runtime can skip the thread-local layout hashmap when the store does not +/// change the slot's pointer-ness (scalar-over-scalar). See +/// `js_gc_note_slot_layout_aware`. 
+pub(crate) fn emit_layout_note_slot_aware_on_block( + blk: &mut LlBlock, + parent_bits: &str, + slot_index: &str, + value_bits: &str, + old_bits: &str, +) { + blk.call_void( + "js_gc_note_slot_layout_aware", + &[ + (I64, parent_bits), + (I32, slot_index), + (I64, value_bits), + (I64, old_bits), + ], + ); +} + pub(crate) fn emit_array_numeric_write_note_on_block( blk: &mut LlBlock, array_bits: &str, @@ -94,6 +116,78 @@ pub(crate) fn emit_jsvalue_slot_store_on_block( slot_addr: &str, write_barrier_needed: bool, ) -> Option<String> { + emit_jsvalue_slot_store_on_block_inner( + blk, + slot_ptr, + value_double, + layout_parent_bits, + slot_index, + layout_note_needed, + barrier_parent_bits, + slot_addr, + write_barrier_needed, + false, + ) +} + +/// As [`emit_jsvalue_slot_store_on_block`], but for an **in-place element +/// overwrite** of a slot that already holds a valid value: loads the previous +/// slot value before the store and passes it to `js_gc_note_slot_layout_aware`, +/// which skips the thread-local layout hashmap when neither old nor new is +/// a heap pointer. Use only where the slot is guaranteed initialized (array +/// `arr[i] = …` overwrites), not for fresh-slot appends/literals or object +/// field writes (which are POINTER_FREE-dominated and only pay the extra load). +/// The hashmap touch is the dominant per-write cost on downgraded `any[]` +/// numeric loops (#5094); skipping it gives ~10× on +/// `bench_numeric_array_downgrade` without regressing `bench_object_property`. 
+pub(crate) fn emit_jsvalue_slot_store_scalar_aware_on_block( + blk: &mut LlBlock, + slot_ptr: &str, + value_double: &str, + layout_parent_bits: &str, + slot_index: &str, + layout_note_needed: bool, + barrier_parent_bits: &str, + slot_addr: &str, + write_barrier_needed: bool, +) -> Option<String> { + emit_jsvalue_slot_store_on_block_inner( + blk, + slot_ptr, + value_double, + layout_parent_bits, + slot_index, + layout_note_needed, + barrier_parent_bits, + slot_addr, + write_barrier_needed, + true, + ) +} + +#[allow(clippy::too_many_arguments)] +fn emit_jsvalue_slot_store_on_block_inner( + blk: &mut LlBlock, + slot_ptr: &str, + value_double: &str, + layout_parent_bits: &str, + slot_index: &str, + layout_note_needed: bool, + barrier_parent_bits: &str, + slot_addr: &str, + write_barrier_needed: bool, + scalar_aware: bool, +) -> Option<String> { + // The scalar-aware layout note needs the slot's PREVIOUS value to decide + // whether the slot's pointer-ness actually changed; load it before the + // store overwrites it. Only when both a note is needed and the caller opted + // into the scalar-aware path (the slot is a valid in-place overwrite). + let old_bits = if scalar_aware && layout_note_needed { + let old_double = blk.load(DOUBLE, slot_ptr); + Some(blk.bitcast_double_to_i64(&old_double)) + } else { + None + }; // GC_STORE_AUDIT(BARRIERED): generated heap JSValue stores route through this shared emitter. blk.store(DOUBLE, value_double, slot_ptr); if !layout_note_needed && !write_barrier_needed { @@ -101,7 +195,21 @@ pub(crate) fn emit_jsvalue_slot_store_on_block( } let value_bits = blk.bitcast_double_to_i64(value_double); if layout_note_needed { - emit_layout_note_slot_on_block(blk, layout_parent_bits, slot_index, &value_bits); + match old_bits.as_deref() { + // Scalar-over-scalar stores leave the GC slot layout unchanged — the + // aware note skips the thread-local layout hashmap when neither the + // new nor the old value is a heap pointer (#5094). 
+ Some(old) => emit_layout_note_slot_aware_on_block( + blk, + layout_parent_bits, + slot_index, + &value_bits, + old, + ), + None => { + emit_layout_note_slot_on_block(blk, layout_parent_bits, slot_index, &value_bits) + } + } } if write_barrier_needed { emit_write_barrier_slot_on_block(blk, barrier_parent_bits, slot_addr, &value_bits); diff --git a/crates/perry-codegen/src/runtime_decls/arrays.rs b/crates/perry-codegen/src/runtime_decls/arrays.rs index df8bdd19fc..364c901260 100644 --- a/crates/perry-codegen/src/runtime_decls/arrays.rs +++ b/crates/perry-codegen/src/runtime_decls/arrays.rs @@ -99,6 +99,8 @@ pub fn declare_phase_b_arrays(module: &mut LlModule) { module.declare_function("js_write_barrier_root_nanbox", VOID, &[I64]); module.declare_function("js_write_barrier_root_heap_word", VOID, &[I64]); module.declare_function("js_gc_note_slot_layout", VOID, &[I64, I32, I64]); + // js_gc_note_slot_layout_aware(parent, slot_index, value_bits, old_bits) + module.declare_function("js_gc_note_slot_layout_aware", VOID, &[I64, I32, I64, I64]); module.declare_function( "js_gc_init_typed_shape_layout", VOID, diff --git a/crates/perry-runtime/src/gc/layout.rs b/crates/perry-runtime/src/gc/layout.rs index 977193cab0..52d470b312 100644 --- a/crates/perry-runtime/src/gc/layout.rs +++ b/crates/perry-runtime/src/gc/layout.rs @@ -442,6 +442,33 @@ pub extern "C" fn js_gc_note_slot_layout(parent: u64, slot_index: u32, value_bit layout_note_slot(parent_user, slot_index as usize, value_bits); } +/// Scalar-aware variant of [`js_gc_note_slot_layout`]: `old_bits` is the value +/// previously held in the slot. When **neither** the new value nor the old +/// value is a heap pointer, the slot's pointer-ness is unchanged, so the +/// per-slot GC layout mask needs no update — the `SIDE_MASK`/typed path's +/// thread-local hashmap touch is skipped. 
The mask invariant ("bit set ⟺ slot +/// holds a pointer") is preserved because the full path still runs whenever a +/// pointer is involved on either side (`new` is a pointer → set; `old` was a +/// pointer → clear), which is exactly when the mask must change. This is the +/// dominant per-write cost on heterogeneous `any[]` numeric write loops +/// (stubbing `layout_note_slot` makes `bench_numeric_array_downgrade` 11× +/// faster). `layout_pointer_bearing_bits` is the same predicate the layout +/// machinery uses internally, so raw-pointer array slots are classified +/// correctly (not just NaN-boxed tags). +#[no_mangle] +pub extern "C" fn js_gc_note_slot_layout_aware( + parent: u64, + slot_index: u32, + value_bits: u64, + old_bits: u64, +) { + if !layout_pointer_bearing_bits(value_bits) && !layout_pointer_bearing_bits(old_bits) { + return; + } + let parent_user = strip_nanbox_user_ptr(parent); + layout_note_slot(parent_user, slot_index as usize, value_bits); +} + unsafe fn init_typed_shape_layout( user_ptr: usize, slot_count: usize,