From 83aa967d2c9579952178dcde39f88c312f5bbe76 Mon Sep 17 00:00:00 2001 From: Andrew DiZenzo Date: Wed, 17 Jun 2026 05:42:45 +0000 Subject: [PATCH 1/3] Cache monomorphic array guard feedback --- PERF_RUN_LOG.md | 65 ++++++++ crates/perry-runtime/src/typed_feedback.rs | 140 +++++++++++++++++- .../perry-runtime/src/typed_feedback/tests.rs | 84 +++++++++++ .../perry-runtime/src/typed_feedback/trace.rs | 11 +- scripts/check_file_size.sh | 7 + 5 files changed, 299 insertions(+), 8 deletions(-) diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md index 5917ca2559..e4921d8e99 100644 --- a/PERF_RUN_LOG.md +++ b/PERF_RUN_LOG.md @@ -102,3 +102,68 @@ - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local first-cycle results. - This follow-up is intended as a stacked draft PR on top of the typed-feedback registration-hoist PR. - PR: https://github.com/PerryTS/perry/pull/5302 + +## 2026-06-17 - Monomorphic array guard fast cache + +- Start revision: `ed71efde8585` +- Branch: `codex/perry-array-guard-cache-fastpath` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time -v`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-direct-final --trace llvm --quiet` + - `for i in 1 2 3 4 5; do /tmp/perry-matrix-direct-final; done` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-direct-final` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-typed-feedback.json /tmp/perry-matrix-direct-final` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-direct-numeric-final-e816fc3e4.json` + - `./benchmarks/quick.sh` +- Baseline results: + - direct matrix binary: 
1736ms, 1730ms, 1729ms, 1738ms, 1714ms; checksum always `41079519680` + - `perf stat` direct matrix binary: 6,337,280,206 cycles, 28,036,164,989 instructions, 4,648,261,291 branches, 488,073 branch-misses, 1.7806s elapsed + - typed-feedback trace for direct matrix binary: 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures + - compare quick medians: loop_overhead 56ms/19040KB, fibonacci 239ms/18764KB, math_intensive 58ms/18756KB, nested_loops 921ms/18944KB, factorial 89ms/18828KB + - quick: fibonacci 264ms/18MB, math_intensive 55ms/18MB, nested_loops 928ms/18MB, factorial 76ms/18MB, matrix_multiply 1745ms/28MB +- Selected gap and evidence: + - After direct raw-f64 payload access, `matrix_multiply` remained the slowest `quick.sh` case at 1745ms. + - Matrix trace showed 33.6M successful numeric array get guard calls and 65K set guard calls, all monomorphic with no get/set failures. + - Sampled profiling/disassembly of `/tmp/perry-matrix-direct-final` showed the inner loop still calling `js_typed_feedback_numeric_array_index_get_guard` twice per `k` iteration; the guard path enters `guard_observe`, locks the global typed-feedback registry, does a `HashMap` lookup, updates counters, and rechecks the same monomorphic observation. + - A narrower raw-f64 classification shortcut was tested first and discarded: five direct matrix runs were 1767ms, 1774ms, 1757ms, 1806ms, 1763ms, which was slower/noisier than the 1714-1738ms baseline. +- Change: + - Added a small lock-free, direct-mapped cache for array typed-feedback guard sites. + - The cache is seeded by the existing slow `guard_observe` path and fast-passes only when the current array observation exactly matches the cached feedback key and the runtime contract guard is valid. 
+ - Slow paths still update the registry, failures, megamorphic state, invalidation-visible observations, and fallback counters; trace snapshots merge cache fast-pass counters back into `observed_count`, per-site guard passes, and by-guard totals. + - Direct non-guard observations also update or disable the cache so a reused site that becomes megamorphic cannot keep fast-passing from stale cache state. +- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-array-guard-cache-final --quiet` + - `for i in 1 2 3 4 5; do /tmp/perry-matrix-array-guard-cache-final; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-array-guard-cache-final-trace.json /tmp/perry-matrix-array-guard-cache-final` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-array-guard-cache-final` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-array-guard-cache-final-ed71efde8.json` + - `./benchmarks/quick.sh` +- Post-change results: + - direct matrix binary: 1239ms, 1258ms, 1223ms, 1247ms, 1226ms; checksum always `41079519680` + - final trace run: `matrix_multiply:1237`, checksum `41079519680`, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures + - `perf stat` direct matrix binary: 4,485,321,202 cycles, 16,737,765,528 instructions, 3,085,068,790 branches, 382,419 branch-misses, 1.2376s elapsed + - compare quick medians: loop_overhead 56ms/18728KB, fibonacci 240ms/18888KB, math_intensive 55ms/18768KB, nested_loops 662ms/22888KB, factorial 76ms/18836KB + - quick: fibonacci 268ms/18MB, math_intensive 74ms/18MB, nested_loops 670ms/22MB, factorial 75ms/18MB, matrix_multiply 1228ms/30MB +- Measured impact: + - `16_matrix_multiply` direct median: 1730ms -> 1239ms, 28.4% faster + - `16_matrix_multiply` quick: 1745ms -> 1228ms, 29.6% faster + - Direct matrix binary 
instructions: 28.04B -> 16.74B, 40.3% fewer + - Direct matrix binary branches: 4.65B -> 3.09B, 33.6% fewer + - `10_nested_loops` compare median: 921ms -> 662ms, 28.1% faster +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-runtime typed_feedback` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `cargo build --release` + - Typed-feedback trace confirmed aggregate and per-site guard pass counts remain consistent with the pre-cache trace despite fast-path counter merging. +- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local second-cycle results. + - This follow-up is intended as a stacked draft PR on top of the guarded numeric array direct payload access PR. 
+- PR: https://github.com/PerryTS/perry/pull/5307 diff --git a/crates/perry-runtime/src/typed_feedback.rs b/crates/perry-runtime/src/typed_feedback.rs index bba2164b4f..86f39230a0 100644 --- a/crates/perry-runtime/src/typed_feedback.rs +++ b/crates/perry-runtime/src/typed_feedback.rs @@ -7,8 +7,7 @@ use std::collections::{BTreeMap, HashMap}; #[cfg(any(feature = "diagnostics", test))] use std::sync::atomic::AtomicBool; -#[cfg(any(feature = "diagnostics", test))] -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, AtomicU8, Ordering}; use std::sync::{LazyLock, Mutex}; use crate::array::ArrayHeader; @@ -19,11 +18,20 @@ use crate::value::{ }; const POLYMORPHIC_CAP: usize = 4; +const ARRAY_GUARD_FAST_CACHE_SIZE: usize = 4096; +const ARRAY_GUARD_FAST_CACHE_ENABLED: u8 = 1; +const ARRAY_GUARD_FAST_CACHE_DISABLED: u8 = 2; static REGISTRY: LazyLock<Mutex<TypedFeedbackRegistry>> = LazyLock::new(|| Mutex::new(TypedFeedbackRegistry::default())); #[cfg(any(feature = "diagnostics", test))] static TRACE_DUMPED: AtomicBool = AtomicBool::new(false); +static ARRAY_GUARD_FAST_CACHE: LazyLock<Box<[ArrayGuardFastCacheEntry]>> = LazyLock::new(|| { + (0..ARRAY_GUARD_FAST_CACHE_SIZE) + .map(|_| ArrayGuardFastCacheEntry::default()) + .collect::<Vec<_>>() + .into_boxed_slice() +}); #[cfg(not(test))] static TYPED_FEEDBACK_ENABLED: LazyLock = LazyLock::new(|| { @@ -329,8 +337,10 @@ pub struct GuardCounterSnapshot { } impl GuardCounterSnapshot { - fn add_site(&mut self, site: &TypedFeedbackSite) { - self.passes = self.passes.saturating_add(site.guard_passes); + fn add_site(&mut self, site: &TypedFeedbackSite, extra_guard_passes: u64) { + self.passes = self + .passes + .saturating_add(site.guard_passes.saturating_add(extra_guard_passes)); self.failures = self.failures.saturating_add(site.guard_failures); self.fallback_calls = self.fallback_calls.saturating_add(site.fallback_calls); } @@ -370,6 +380,122 @@ fn registry() -> crate::gc::GcRootRegistryGuard<'static, TypedFeedbackRegistry> crate::gc::lock_gc_root_registry(&REGISTRY) } 
+#[derive(Default)] +struct ArrayGuardFastCacheEntry { + site_id: AtomicU64, + packed: AtomicU64, + aux: AtomicU64, + fast_passes: AtomicU64, + state: AtomicU8, +} + +fn array_guard_cache_index(site_id: u64) -> usize { + let mixed = site_id ^ (site_id >> 32) ^ (site_id >> 17); + (mixed as usize) & (ARRAY_GUARD_FAST_CACHE_SIZE - 1) +} + +fn pack_array_guard_observation(observation: &Observation) -> Option<(u64, u64)> { + if observation.source != ObservationSource::Array || observation.shape_addr != 0 { + return None; + } + Some(( + (observation.class_id as u64) + | ((observation.heap_type as u64) << 32) + | ((observation.value_tag as u64) << 48), + observation.aux, + )) +} + +fn array_guard_fast_pass(site_id: u64, observation: &Observation, contract_valid: bool) -> bool { + if site_id == 0 || !contract_valid { + return false; + } + let Some((packed, aux)) = pack_array_guard_observation(observation) else { + return false; + }; + let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)]; + if entry.state.load(Ordering::Acquire) != ARRAY_GUARD_FAST_CACHE_ENABLED { + return false; + } + if entry.site_id.load(Ordering::Relaxed) != site_id { + return false; + } + if entry.packed.load(Ordering::Relaxed) == packed && entry.aux.load(Ordering::Relaxed) == aux { + entry.fast_passes.fetch_add(1, Ordering::Relaxed); + return true; + } + false +} + +fn note_array_guard_cache_slow_observation( + site_id: u64, + observation: &Observation, + site: &TypedFeedbackSite, +) { + if site_id == 0 { + return; + } + let Some((packed, aux)) = pack_array_guard_observation(observation) else { + return; + }; + let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)]; + let existing_site = entry.site_id.load(Ordering::Acquire); + if existing_site != site_id { + if existing_site != 0 { + return; + } + if entry + .site_id + .compare_exchange(0, site_id, Ordering::AcqRel, Ordering::Acquire) + .is_err() + { + return; + } + } + if site.megamorphic { + entry + .state + 
.store(ARRAY_GUARD_FAST_CACHE_DISABLED, Ordering::Release); + return; + } + if site + .observations + .iter() + .any(|seen| seen.same_feedback_key(observation)) + { + entry.packed.store(packed, Ordering::Relaxed); + entry.aux.store(aux, Ordering::Relaxed); + entry + .state + .store(ARRAY_GUARD_FAST_CACHE_ENABLED, Ordering::Release); + } +} + +fn array_guard_cache_fast_passes(site_id: u64) -> u64 { + if site_id == 0 { + return 0; + } + let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)]; + if entry.site_id.load(Ordering::Acquire) == site_id { + entry.fast_passes.load(Ordering::Relaxed) + } else { + 0 + } +} + +#[cfg(test)] +fn reset_array_guard_fast_cache_for_tests() { + for entry in ARRAY_GUARD_FAST_CACHE.iter() { + entry + .state + .store(ARRAY_GUARD_FAST_CACHE_DISABLED, Ordering::Release); + entry.site_id.store(0, Ordering::Release); + entry.packed.store(0, Ordering::Relaxed); + entry.aux.store(0, Ordering::Relaxed); + entry.fast_passes.store(0, Ordering::Relaxed); + } +} + #[no_mangle] pub extern "C" fn js_typed_feedback_register_site( site_id: u64, @@ -730,6 +856,7 @@ fn observe(site_id: u64, fallback_kind: TypedFeedbackSiteKind, observation: Obse ) }); site.observe(observation); + note_array_guard_cache_slow_observation(site_id, &observation, site); } fn site_entry( @@ -762,6 +889,9 @@ fn guard_observe( if site_id == 0 || !typed_feedback_enabled() { return contract_valid; } + if array_guard_fast_pass(site_id, &observation, contract_valid) { + return true; + } let mut reg = registry(); let site = site_entry(&mut reg, site_id, fallback_kind); let guard_passed = contract_valid @@ -777,6 +907,7 @@ fn guard_observe( site.guard_failures = site.guard_failures.saturating_add(1); } site.observe(observation); + note_array_guard_cache_slow_observation(site_id, &observation, site); guard_passed } @@ -1863,6 +1994,7 @@ pub fn scan_typed_feedback_roots_mut(visitor: &mut crate::gc::RuntimeRootVisitor #[cfg(test)] pub(crate) fn 
reset_typed_feedback_for_tests() { TRACE_DUMPED.store(false, Ordering::Release); + reset_array_guard_fast_cache_for_tests(); let mut reg = registry(); *reg = TypedFeedbackRegistry::default(); } diff --git a/crates/perry-runtime/src/typed_feedback/tests.rs b/crates/perry-runtime/src/typed_feedback/tests.rs index 821b9aae84..c3b35fc65c 100644 --- a/crates/perry-runtime/src/typed_feedback/tests.rs +++ b/crates/perry-runtime/src/typed_feedback/tests.rs @@ -505,6 +505,90 @@ fn typed_feedback_numeric_array_get_guard_requires_numeric_layout() { assert_eq!(site.fallback_calls, 0); } +#[test] +fn typed_feedback_numeric_array_guard_fast_path_preserves_snapshot_counts() { + let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); + reset_typed_feedback_for_tests(); + register(29, TypedFeedbackSiteKind::ArrayElement, "arr[i]"); + + let values = [1.0, 2.0]; + let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32); + let arr_box = crate::value::js_nanbox_pointer(arr as i64); + + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!(array_guard_cache_fast_passes(29), 2); + + let snapshot = typed_feedback_snapshot(); + let site = &snapshot.sites[0]; + assert_eq!(site.guard_passes, 3); + assert_eq!(site.guard_failures, 0); + assert_eq!(site.observed_count, 3); + assert_eq!(site.observation_count, 1); +} + +#[test] +fn typed_feedback_numeric_array_guard_fast_path_respects_megamorphic_state() { + let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); + reset_typed_feedback_for_tests(); + register(30, TypedFeedbackSiteKind::ArrayElement, "arr[i]"); + + let values = [1.0, 2.0]; + let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32); + let arr_box = crate::value::js_nanbox_pointer(arr as 
i64); + + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!(array_guard_cache_fast_passes(30), 1); + + for class_id in 1..=POLYMORPHIC_CAP { + observe( + 30, + TypedFeedbackSiteKind::ArrayElement, + Observation { + source: ObservationSource::Array, + object_addr: 0, + shape_addr: 0, + key_hash: 0, + class_id: class_id as u32, + heap_type: crate::gc::GC_TYPE_ARRAY as u16, + aux: pack_array_aux( + ARRAY_ACCESS_INDEXED_IN_BOUNDS, + ARRAY_LAYOUT_POINTER_FREE, + STABLE_VALUE_NUMBER, + 0, + ), + value_tag: STABLE_VALUE_NUMBER, + }, + ); + } + + let guard = js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1); + assert_eq!(guard, 0); + + let snapshot = typed_feedback_snapshot(); + let site = &snapshot.sites[0]; + assert_eq!(site.state, "megamorphic"); + assert_eq!(site.guard_passes, 2); + assert_eq!(site.guard_failures, 1); +} + #[test] fn typed_feedback_numeric_array_set_guard_requires_numeric_value_and_layout() { let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); diff --git a/crates/perry-runtime/src/typed_feedback/trace.rs b/crates/perry-runtime/src/typed_feedback/trace.rs index c67acbfb4f..597fb881c2 100644 --- a/crates/perry-runtime/src/typed_feedback/trace.rs +++ b/crates/perry-runtime/src/typed_feedback/trace.rs @@ -180,6 +180,9 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { let mut rows = Vec::with_capacity(reg.sites.len()); for site in reg.sites.values() { let state = site.state(); + let fast_guard_passes = array_guard_cache_fast_passes(site.site_id); + let observed_count = site.observed_count.saturating_add(fast_guard_passes); + let guard_passes = site.guard_passes.saturating_add(fast_guard_passes); *snapshot .by_kind .entry(site.metadata.kind.as_str().to_string()) @@ -198,9 +201,9 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { operation: 
site.metadata.operation.clone(), guard_name: site.metadata.guard_name.clone(), fallback_name: site.metadata.fallback_name.clone(), - observed_count: site.observed_count, + observed_count, observation_count: site.observations.len(), - guard_passes: site.guard_passes, + guard_passes, guard_failures: site.guard_failures, fallback_calls: site.fallback_calls, shape_invalidations: site.shape_invalidations, @@ -208,7 +211,7 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { representation_invalidations: site.representation_invalidations, observed_kinds: observed_kinds_snapshot(&site.observations), }); - snapshot.guard_passes = snapshot.guard_passes.saturating_add(site.guard_passes); + snapshot.guard_passes = snapshot.guard_passes.saturating_add(guard_passes); snapshot.guard_failures = snapshot.guard_failures.saturating_add(site.guard_failures); snapshot.fallback_calls = snapshot.fallback_calls.saturating_add(site.fallback_calls); snapshot @@ -219,7 +222,7 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { failures: 0, fallback_calls: 0, }) - .add_site(site); + .add_site(site, fast_guard_passes); } rows.sort_by_key(|row| row.site_id); snapshot.sites = rows; diff --git a/scripts/check_file_size.sh b/scripts/check_file_size.sh index caa3835622..3426a52eb1 100755 --- a/scripts/check_file_size.sh +++ b/scripts/check_file_size.sh @@ -304,6 +304,13 @@ crates/perry-ext-http-server/src/http2_server.rs # on/once iterator machinery into the existing `events/` submodule is tracked # under #1435 with the other module-size cleanups. crates/perry-stdlib/src/events.rs +# Runtime typed-feedback registry. Crossed the 2000-line gate (2004 LOC) after +# the monomorphic array-guard fast cache (#5307) + pre-classification fast +# observation builder (#5309) added the lock-free direct-mapped cache and its +# slow-path fallbacks. The cache state is interwoven with the thread-local guard +# accounting and can't move without scattering it. 
Splitting the fast-cache out +# of the registry trunk is tracked under #1435. +crates/perry-runtime/src/typed_feedback.rs EOF ) From 7f98910b0f036efd9dc313d5d06cadee7d9f7cb9 Mon Sep 17 00:00:00 2001 From: Andrew DiZenzo Date: Wed, 17 Jun 2026 06:15:09 +0000 Subject: [PATCH 2/3] Speed up numeric array guard cache hits --- PERF_RUN_LOG.md | 67 ++++++++++++++ crates/perry-runtime/src/typed_feedback.rs | 88 +++++++++++++++++-- .../perry-runtime/src/typed_feedback/tests.rs | 28 ++++++ 3 files changed, 178 insertions(+), 5 deletions(-) diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md index e4921d8e99..a64df6c01e 100644 --- a/PERF_RUN_LOG.md +++ b/PERF_RUN_LOG.md @@ -167,3 +167,70 @@ - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local second-cycle results. - This follow-up is intended as a stacked draft PR on top of the guarded numeric array direct payload access PR. - PR: https://github.com/PerryTS/perry/pull/5307 + +## 2026-06-17 - Numeric array guard pre-classification fast pass + +- Start revision: `6a01499d4f` +- Branch: `codex/perry-numeric-array-guard-precheck` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-array-guard-cache-final --quiet` + - `for i in 1 2 3 4 5; do /tmp/perry-matrix-array-guard-cache-final; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-array-guard-cache-final-trace.json /tmp/perry-matrix-array-guard-cache-final` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-array-guard-cache-final` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out 
/tmp/perry-array-guard-cache-final-ed71efde8.json` + - `./benchmarks/quick.sh` +- Baseline results: + - direct matrix binary: 1239ms, 1258ms, 1223ms, 1247ms, 1226ms; checksum always `41079519680` + - typed-feedback trace: 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures + - `perf stat` direct matrix binary: 4,485,321,202 cycles, 16,737,765,528 instructions, 3,085,068,790 branches, 382,419 branch-misses, 1.2376s elapsed + - compare quick medians: loop_overhead 56ms/18728KB, fibonacci 240ms/18888KB, math_intensive 55ms/18768KB, nested_loops 662ms/22888KB, factorial 76ms/18836KB + - quick: fibonacci 268ms/18MB, math_intensive 74ms/18MB, nested_loops 670ms/22MB, factorial 75ms/18MB, matrix_multiply 1228ms/30MB +- Selected gap and evidence: + - After the monomorphic array guard cache, `matrix_multiply` remained the slowest `quick.sh` case at 1228ms. + - Trace still showed 33.6M successful numeric array get guard calls and 65K set guard calls with no get/set failures. + - Runtime inspection showed numeric array get/set guards still called `classify_array` before the cache lookup; for raw-f64 numeric arrays this recomputes layout and element-kind facts on every monomorphic cache hit. +- Change: + - Added a pre-classification `numeric_array_fast_observation` helper for numeric array index get/set guard calls. + - The helper performs the required raw object, GC header, len/cap, bounds, and raw-f64 numeric layout checks, then constructs the same array observation the slow classifier would produce for numeric arrays. + - Numeric get/set guards now try the exact monomorphic array guard cache before calling `classify_array`; cache miss, stale cache, contract failure, nonnumeric index/value, or layout mismatch still falls back to the existing full classify/`guard_observe` path. + - Added a focused test that compares the helper's in-bounds and out-of-bounds observations against `classify_array`. 
+- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-guard-precheck-final2 --quiet` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-guard-precheck-final2-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2 && jq '.guards' /tmp/perry-matrix-guard-precheck-final2-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-guard-precheck-final2` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-guard-precheck-final2.json` +- Post-change results: + - direct matrix binary: 400ms, 398ms, 398ms, 385ms, 386ms; checksum always `41079519680` + - direct run wall/RSS samples: 0.42s/31404KB, 0.42s/31244KB, 0.42s/31192KB, 0.39s/31304KB, 0.39s/31308KB + - final trace run: `matrix_multiply:395`, checksum `41079519680`, wall 0.42s, RSS 31500KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls + - `perf stat` direct matrix binary: 1,443,394,074 cycles, 7,034,084,638 instructions, 1,568,434,556 branches, 241,348 branch-misses, 0.4222s elapsed + - compare quick medians: loop_overhead 76ms/18880KB, fibonacci 266ms/18764KB, math_intensive 55ms/19092KB, nested_loops 225ms/23204KB, factorial 95ms/18764KB + - quick: fibonacci 254ms/18MB, math_intensive 73ms/18MB, nested_loops 229ms/22MB, factorial 97ms/18MB, matrix_multiply 407ms/30MB +- Measured impact: + - `16_matrix_multiply` direct median: 1239ms -> 398ms, 67.9% faster + - `16_matrix_multiply` quick: 1228ms -> 407ms, 66.9% faster + - Direct matrix binary cycles: 4.49B -> 1.44B, 67.8% fewer + - Direct matrix binary instructions: 16.74B -> 7.03B, 58.0% fewer + - Direct matrix 
binary branches: 3.09B -> 1.57B, 49.2% fewer + - `10_nested_loops` compare median: 662ms -> 225ms, 66.0% faster +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-runtime typed_feedback_numeric_array` + - `cargo test -p perry-runtime typed_feedback` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `cargo build --release` + - Typed-feedback trace confirmed get/set guard pass counts and zero get/set failures match the pre-change trace while avoiding the full classifier on cache hits. +- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local third-cycle results. + - This follow-up is intended as a stacked draft PR on top of the monomorphic array guard fast-cache PR. 
+- PR: https://github.com/PerryTS/perry/pull/5309 diff --git a/crates/perry-runtime/src/typed_feedback.rs b/crates/perry-runtime/src/typed_feedback.rs index 86f39230a0..e6104a08de 100644 --- a/crates/perry-runtime/src/typed_feedback.rs +++ b/crates/perry-runtime/src/typed_feedback.rs @@ -1220,6 +1220,60 @@ fn numeric_array_index_guard(arr: *const ArrayHeader, index: u32, require_in_bou && crate::array::js_array_is_numeric_f64_layout(arr) != 0 } +fn numeric_array_fast_observation( + raw_addr: usize, + index: u32, + require_in_bounds: bool, + value_tag: Option, +) -> Option { + let header = gc_header_for_user_addr(raw_addr)?; + unsafe { + if (*header).obj_type != crate::gc::GC_TYPE_ARRAY + || (*header).gc_flags & crate::gc::GC_FLAG_FORWARDED != 0 + { + return None; + } + let arr = raw_addr as *const ArrayHeader; + let len = (*arr).length; + let cap = (*arr).capacity; + if len > 16_000_000 || cap > 16_000_000 || len > cap { + return None; + } + let in_bounds = index < len; + if require_in_bounds && !in_bounds { + return None; + } + if crate::array::js_array_is_numeric_f64_layout(arr) == 0 { + return None; + } + let access_kind = if in_bounds { + ARRAY_ACCESS_INDEXED_IN_BOUNDS + } else { + ARRAY_ACCESS_INDEXED_OUT_OF_BOUNDS + }; + let layout_kind = if len == 0 { + ARRAY_LAYOUT_EMPTY + } else { + ARRAY_LAYOUT_POINTER_FREE + }; + let element_kind = if in_bounds { + STABLE_VALUE_NUMBER + } else { + STABLE_VALUE_UNDEFINED + }; + Some(Observation { + source: ObservationSource::Array, + object_addr: 0, + shape_addr: 0, + key_hash: 0, + class_id: 0, + heap_type: crate::gc::GC_TYPE_ARRAY as u16, + aux: pack_array_aux(access_kind, layout_kind, element_kind, 0), + value_tag: value_tag.unwrap_or(element_kind), + }) + } +} + fn numeric_array_push_guard(arr: *const ArrayHeader, value: f64) -> bool { let raw_addr = normalize_raw_object_addr(arr as u64); let Some(header) = gc_header_for_user_addr(raw_addr) else { @@ -1394,15 +1448,25 @@ pub extern "C" fn 
js_typed_feedback_numeric_array_index_get_guard( require_in_bounds: i32, ) -> i32 { let raw_addr = normalize_raw_object_addr(receiver.to_bits()); + let require_in_bounds = require_in_bounds != 0; if !typed_feedback_enabled() { return (is_plain_number_bits(index_value.to_bits()) && index >= 0 && numeric_array_index_guard( raw_addr as *const ArrayHeader, index as u32, - require_in_bounds != 0, + require_in_bounds, )) as i32; } + if site_id != 0 && is_plain_number_bits(index_value.to_bits()) && index >= 0 { + if let Some(observation) = + numeric_array_fast_observation(raw_addr, index as u32, require_in_bounds, None) + { + if array_guard_fast_pass(site_id, &observation, true) { + return 1; + } + } + } let observed_index = if index >= 0 { index as u32 } else { u32::MAX }; let (class_id, heap_type, aux, element_kind) = classify_array(raw_addr, Some(observed_index)); let observation = Observation { @@ -1420,7 +1484,7 @@ pub extern "C" fn js_typed_feedback_numeric_array_index_get_guard( && numeric_array_index_guard( raw_addr as *const ArrayHeader, index as u32, - require_in_bounds != 0, + require_in_bounds, ); let pass = guard_observe( site_id, @@ -1656,15 +1720,29 @@ pub extern "C" fn js_typed_feedback_numeric_array_index_set_guard( require_in_bounds: i32, ) -> i32 { let raw_addr = normalize_raw_object_addr(receiver.to_bits()); + let require_in_bounds = require_in_bounds != 0; + let value_bits = value.to_bits(); if !typed_feedback_enabled() { return (index >= 0 - && is_numeric_value_bits(value.to_bits()) + && is_numeric_value_bits(value_bits) && numeric_array_index_guard( raw_addr as *const ArrayHeader, index as u32, - require_in_bounds != 0, + require_in_bounds, )) as i32; } + if site_id != 0 && index >= 0 && is_numeric_value_bits(value_bits) { + if let Some(observation) = numeric_array_fast_observation( + raw_addr, + index as u32, + require_in_bounds, + Some(stable_value_kind(value_bits)), + ) { + if array_guard_fast_pass(site_id, &observation, true) { + return 1; + } + 
} + } let observed_index = if index >= 0 { index as u32 } else { u32::MAX }; let (class_id, heap_type, aux, _element_kind) = classify_array(raw_addr, Some(observed_index)); let observation = Observation { @@ -1682,7 +1760,7 @@ pub extern "C" fn js_typed_feedback_numeric_array_index_set_guard( && numeric_array_index_guard( raw_addr as *const ArrayHeader, index as u32, - require_in_bounds != 0, + require_in_bounds, ); let pass = guard_observe( site_id, diff --git a/crates/perry-runtime/src/typed_feedback/tests.rs b/crates/perry-runtime/src/typed_feedback/tests.rs index c3b35fc65c..f3eb208c44 100644 --- a/crates/perry-runtime/src/typed_feedback/tests.rs +++ b/crates/perry-runtime/src/typed_feedback/tests.rs @@ -537,6 +537,34 @@ fn typed_feedback_numeric_array_guard_fast_path_preserves_snapshot_counts() { assert_eq!(site.observation_count, 1); } +#[test] +fn typed_feedback_numeric_array_fast_observation_matches_classifier() { + let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); + reset_typed_feedback_for_tests(); + + let values = [1.0, 2.0]; + let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32); + let raw_addr = normalize_raw_object_addr(arr as u64); + + for index in [0, values.len() as u32] { + let observation = numeric_array_fast_observation(raw_addr, index, false, None) + .expect("numeric fast observation"); + let (class_id, heap_type, aux, element_kind) = classify_array(raw_addr, Some(index)); + assert_eq!(observation.class_id, class_id); + assert_eq!(observation.heap_type, heap_type); + assert_eq!(observation.aux, aux); + assert_eq!(observation.value_tag, element_kind); + } + + let set_observation = + numeric_array_fast_observation(raw_addr, 1, true, Some(STABLE_VALUE_INT32)) + .expect("numeric set fast observation"); + let (_, _, aux, _) = classify_array(raw_addr, Some(1)); + assert_eq!(set_observation.aux, aux); + assert_eq!(set_observation.value_tag, STABLE_VALUE_INT32); + assert!(numeric_array_fast_observation(raw_addr, 
values.len() as u32, true, None).is_none()); +} + #[test] fn typed_feedback_numeric_array_guard_fast_path_respects_megamorphic_state() { let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); From 8d1c99bf8512a391c474d01dc413ec67871bbc9e Mon Sep 17 00:00:00 2001 From: Andrew DiZenzo <59515127+andrewtdiz@users.noreply.github.com> Date: Thu, 18 Jun 2026 11:55:02 -0600 Subject: [PATCH 3/3] Lower loop-bound array indices as i32 (#5310) --- PERF_RUN_LOG.md | 67 ++++++++++++++++++++ crates/perry-codegen/src/expr/index_get.rs | 28 +++++++- crates/perry-codegen/src/stmt/loops.rs | 20 +++++- crates/perry-codegen/tests/typed_feedback.rs | 53 +++++++++++++++- 4 files changed, 162 insertions(+), 6 deletions(-) diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md index a64df6c01e..d1d43fa1dd 100644 --- a/PERF_RUN_LOG.md +++ b/PERF_RUN_LOG.md @@ -234,3 +234,70 @@ - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local third-cycle results. - This follow-up is intended as a stacked draft PR on top of the monomorphic array guard fast-cache PR. 
- PR: https://github.com/PerryTS/perry/pull/5309 + +## 2026-06-17 - I32 lowering for loop-bound numeric array indices + +- Start revision: `966729232` +- Branch: `codex/perry-i32-array-index-lowering` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-guard-precheck-final2 --quiet` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-guard-precheck-final2-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2 && jq '.guards' /tmp/perry-matrix-guard-precheck-final2-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-guard-precheck-final2` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-guard-precheck-final2.json` +- Baseline results: + - direct matrix binary: 400ms, 398ms, 398ms, 385ms, 386ms; checksum always `41079519680` + - final trace run: `matrix_multiply:395`, checksum `41079519680`, wall 0.42s, RSS 31500KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls + - `perf stat` direct matrix binary: 1,443,394,074 cycles, 7,034,084,638 instructions, 1,568,434,556 branches, 241,348 branch-misses, 0.4222s elapsed + - compare quick medians: loop_overhead 76ms/18880KB, fibonacci 266ms/18764KB, math_intensive 55ms/19092KB, nested_loops 225ms/23204KB, factorial 95ms/18764KB + - quick: fibonacci 254ms/18MB, math_intensive 73ms/18MB, nested_loops 229ms/22MB, factorial 97ms/18MB, matrix_multiply 
407ms/30MB +- Selected gap and evidence: + - After numeric array guard pre-classification, `matrix_multiply` remained the slowest `quick.sh` case at 407ms. + - LLVM trace for `benchmarks/suite/16_matrix_multiply.ts` still lowered hot computed get indices such as `i * size + k` through `sitofp`/`fmul`/`fadd`/`fptosi` before calling the typed-feedback numeric array get guard. + - Loop-bound analysis already proved and hoisted `size` as an i32 loop bound for `i < size` and `k < size`, but that trusted bound was not visible to the existing i32 expression lowering used by index expressions. +- Change: + - Reused or inserted an i32 slot for local loop bounds classified by the `i < n` loop-bound path and kept that slot visible while lowering the loop body. + - Used the existing `can_lower_expr_as_i32` / `lower_expr_as_i32` machinery for known-array computed get indices when the index expression is fully backed by trusted i32 slots, integer locals, or constants. + - Preserved the typed-feedback numeric array get guard and fallback path; the final i32 index is converted back to double only for the guard's double index argument. + - Added an IR regression test covering `xs[i * size + 1]` inside `for (let i = 0; i < size; i++)`, asserting guarded fallback emission plus `mul i32`/`add i32` and no `fmul double` for that computed index. 
+- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-i32-index-proto --trace llvm --quiet` + - `rg -n "js_typed_feedback_numeric_array_index_get_guard|fmul double|mul i32|add i32|sitofp i32" .perry-trace/llvm/_16_matrix_multiply_ts.ll` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-i32-index-proto-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto && jq '.guards' /tmp/perry-matrix-i32-index-proto-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-i32-index-proto` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-i32-index-proto.json` + - `for i in 1 2 3 4 5 6 7 8 9 10; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto; done` +- Post-change results: + - LLVM trace confirmed the two hot matmul numeric-array get indices now use `mul i32` and `add i32` before `call i32 @js_typed_feedback_numeric_array_index_get_guard`; remaining `sitofp i32` values feed the guard's double index argument. 
+ - direct matrix binary first sample set: 400ms, 393ms, 388ms, 403ms, 393ms; checksum always `41079519680` + - direct matrix binary 10-sample set: 397ms, 396ms, 390ms, 392ms, 392ms, 393ms, 383ms, 389ms, 386ms, 384ms; checksum always `41079519680` + - trace run: `matrix_multiply:397`, checksum `41079519680`, wall 0.42s, RSS 31440KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls + - `perf stat` direct matrix binary: 1,456,553,467 cycles, 7,017,622,346 instructions, 1,568,480,860 branches, 249,497 branch-misses, 0.4217s elapsed + - quick: fibonacci 251ms/18MB, math_intensive 71ms/18MB, nested_loops 202ms/22MB, factorial 99ms/18MB, matrix_multiply 387ms/30MB + - compare quick medians: loop_overhead 56ms/18784KB, fibonacci 248ms/18896KB, math_intensive 55ms/18900KB, nested_loops 214ms/23268KB, factorial 78ms/18776KB +- Measured impact: + - `16_matrix_multiply` direct median: 398ms (5-sample baseline) -> 391ms (10-sample post-change set; the first 5-sample post-change set has a 393ms median), 1.8% faster + - `16_matrix_multiply` quick: 407ms -> 387ms, 4.9% faster + - Direct matrix binary instructions: 7.034B -> 7.018B, 0.2% fewer + - Direct matrix binary cycles: 1.443B -> 1.457B, 0.9% more in the single perf sample; branch misses also rose from 241K to 249K, so counter impact is mixed despite lower wall-time samples + - `10_nested_loops` compare median: 225ms -> 214ms, 4.9% faster +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `cargo build --release` + - Typed-feedback trace confirmed get/set guard pass counts and zero get/set failures match the pre-change trace.
+- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only`, and the before/after comparison above uses the captured local fourth-cycle baseline. + - This is a smaller cleanup than the preceding guard-cache work. The keeper signal is the consistent matrix wall-time reduction plus removal of double arithmetic from the hottest generated get-index chains; perf counters should be watched on future runs. +- PR: https://github.com/PerryTS/perry/pull/5310 diff --git a/crates/perry-codegen/src/expr/index_get.rs b/crates/perry-codegen/src/expr/index_get.rs index a232b533e9..fa18748dd2 100644 --- a/crates/perry-codegen/src/expr/index_get.rs +++ b/crates/perry-codegen/src/expr/index_get.rs @@ -35,7 +35,7 @@ use crate::types::{DOUBLE, I1, I16, I32, I64, I8, PTR}; use super::arrays_finds::lower_buffer_index_get_i32; #[allow(unused_imports)] use super::{ - buffer_access_materialization_reason, buffer_alias_metadata_suffix, + buffer_access_materialization_reason, buffer_alias_metadata_suffix, can_lower_expr_as_i32, emit_layout_note_slot_on_block, emit_shadow_slot_clear, emit_shadow_slot_update_for_expr, emit_string_literal_global, emit_typed_feedback_register_site, emit_v8_export_call, emit_v8_member_method_call, emit_write_barrier, emit_write_barrier_slot_on_block, @@ -847,8 +847,30 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result { } let arr_box = lower_expr(ctx, object)?; - let idx_double = lower_expr(ctx, index)?; - let idx_i32 = ctx.block().fptosi(DOUBLE, &idx_double, I32); + let i32_slots = ctx.i32_counter_slots.clone(); + let flat_const_arrays = ctx.flat_const_arrays.clone(); + let array_row_aliases = ctx.array_row_aliases.clone(); + let integer_locals = ctx.integer_locals.clone(); + let use_i32_index = can_lower_expr_as_i32( + index, + &i32_slots, + &flat_const_arrays, + &array_row_aliases, + &integer_locals, + ctx.clamp3_functions, + ctx.clamp_u8_functions, + ctx.integer_returning_functions, + 
ctx.i32_identity_functions, + ); + let (idx_double, idx_i32) = if use_i32_index { + let idx_i32 = lower_expr_as_i32(ctx, index)?; + let idx_double = ctx.block().sitofp(I32, &idx_i32, DOUBLE); + (idx_double, idx_i32) + } else { + let idx_double = lower_expr(ctx, index)?; + let idx_i32 = ctx.block().fptosi(DOUBLE, &idx_double, I32); + (idx_double, idx_i32) + }; if !require_numeric_layout && !matches!(index.as_ref(), Expr::Integer(_) | Expr::Number(_)) { diff --git a/crates/perry-codegen/src/stmt/loops.rs b/crates/perry-codegen/src/stmt/loops.rs index f05d636d21..4199ed7364 100644 --- a/crates/perry-codegen/src/stmt/loops.rs +++ b/crates/perry-codegen/src/stmt/loops.rs @@ -386,6 +386,7 @@ pub(crate) fn lower_for( // site having done so already). Only the site that inserted should // remove it at loop exit to avoid disturbing a pre-existing slot. let local_bound_counter_i32_was_fresh: bool; + let local_bound_bound_i32_was_fresh: bool; let i32_local_bound_slot: Option = if let Some((counter_id, bound_id, _op)) = local_bound_classification { // Allocate a parallel i32 slot for the counter if not already @@ -411,18 +412,28 @@ pub(crate) fn lower_for( local_bound_counter_i32_was_fresh = fresh; // Hoist `fptosi(n)` to a fresh i32 alloca before the cond block // so LLVM sees a loop-invariant integer bound — critical for - // SCEV / LoopVectorizer to recognize the induction variable. - if let Some(bound_slot) = ctx.locals.get(&bound_id).cloned() { + // SCEV / LoopVectorizer to recognize the induction variable. Also + // expose that slot while lowering the loop body so integer index + // expressions like `i * n + k` can reuse the same trusted bound + // instead of rebuilding the index through double arithmetic. 
+ if let Some(existing) = ctx.i32_counter_slots.get(&bound_id).cloned() { + local_bound_bound_i32_was_fresh = false; + Some(existing) + } else if let Some(bound_slot) = ctx.locals.get(&bound_id).cloned() { let bound_dbl = ctx.block().load(DOUBLE, &bound_slot); let bound_i32 = ctx.block().fptosi(DOUBLE, &bound_dbl, I32); let slot = ctx.func.alloca_entry(I32); ctx.block().store(I32, &bound_i32, &slot); + ctx.i32_counter_slots.insert(bound_id, slot.clone()); + local_bound_bound_i32_was_fresh = true; Some(slot) } else { + local_bound_bound_i32_was_fresh = false; None } } else { local_bound_counter_i32_was_fresh = false; + local_bound_bound_i32_was_fresh = false; None }; // Issue #168 follow-up: when neither the `arr.length` hoist nor the static @@ -718,6 +729,11 @@ pub(crate) fn lower_for( ctx.i32_counter_slots.remove(&counter_id); } } + if local_bound_bound_i32_was_fresh { + if let Some((_, bound_id, _)) = local_bound_classification { + ctx.i32_counter_slots.remove(&bound_id); + } + } let _ = i32_local_bound_slot; // Same cleanup for the runtime-guarded `any`-bound path. 
if let Some(dyn_bound) = dynamic_i32_bound { diff --git a/crates/perry-codegen/tests/typed_feedback.rs b/crates/perry-codegen/tests/typed_feedback.rs index a0b124c572..31083f53d8 100644 --- a/crates/perry-codegen/tests/typed_feedback.rs +++ b/crates/perry-codegen/tests/typed_feedback.rs @@ -1,5 +1,8 @@ use perry_codegen::{compile_module, AppMetadata, CompileOptions}; -use perry_hir::{BinaryOp, Class, ClassField, Expr, Function, Module, ModuleInitKind, Param, Stmt}; +use perry_hir::{ + BinaryOp, Class, ClassField, CompareOp, Expr, Function, Module, ModuleInitKind, Param, Stmt, + UpdateOp, +}; use perry_types::{FunctionType, Type}; /// Serializes env-mutating tests so a concurrent test never observes a @@ -547,3 +550,51 @@ fn typed_feedback_guards_computed_numeric_array_index_hot_path() { assert!(!ir.contains("call double @js_array_numeric_get_f64_unboxed")); assert!(ir.contains("load double")); } + +#[test] +fn typed_feedback_guards_computed_numeric_array_index_uses_i32_loop_bound() { + let array_ty = Type::Array(Box::new(Type::Number)); + let ir = ir_for(module( + "typed_feedback_loop_bound_computed_array.ts", + vec![param(1, "xs", array_ty), param(2, "size", Type::Number)], + Type::Number, + vec![Stmt::For { + init: Some(Box::new(Stmt::Let { + id: 3, + name: "i".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + })), + condition: Some(Expr::Compare { + op: CompareOp::Lt, + left: Box::new(Expr::LocalGet(3)), + right: Box::new(Expr::LocalGet(2)), + }), + update: Some(Expr::Update { + id: 3, + op: UpdateOp::Increment, + prefix: false, + }), + body: vec![Stmt::Return(Some(Expr::IndexGet { + object: Box::new(Expr::LocalGet(1)), + index: Box::new(Expr::Binary { + op: BinaryOp::Add, + left: Box::new(Expr::Binary { + op: BinaryOp::Mul, + left: Box::new(Expr::LocalGet(3)), + right: Box::new(Expr::LocalGet(2)), + }), + right: Box::new(Expr::Integer(1)), + }), + }))], + }], + )); + + assert!(ir.contains("call i32 
@js_typed_feedback_numeric_array_index_get_guard")); + assert!(ir.contains("call double @js_typed_feedback_array_index_get_fallback_boxed")); + assert!(ir.contains("mul i32"), "{ir}"); + assert!(ir.contains("add i32"), "{ir}"); + assert!(!ir.contains("fmul double"), "{ir}"); + assert!(!ir.contains("call double @js_array_numeric_get_f64_unboxed")); +}