From 83aa967d2c9579952178dcde39f88c312f5bbe76 Mon Sep 17 00:00:00 2001
From: Andrew DiZenzo <andrewdizenzojhu@gmail.com>
Date: Wed, 17 Jun 2026 05:42:45 +0000
Subject: [PATCH 1/3] Cache monomorphic array guard feedback

---
 PERF_RUN_LOG.md                               |  65 ++++++++
 crates/perry-runtime/src/typed_feedback.rs    | 140 +++++++++++++++++-
 .../perry-runtime/src/typed_feedback/tests.rs |  84 +++++++++++
 .../perry-runtime/src/typed_feedback/trace.rs |  11 +-
 scripts/check_file_size.sh                    |   7 +
 5 files changed, 299 insertions(+), 8 deletions(-)

diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md
index 5917ca2559..e4921d8e99 100644
--- a/PERF_RUN_LOG.md
+++ b/PERF_RUN_LOG.md
@@ -102,3 +102,68 @@
   - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local first-cycle results.
   - This follow-up is intended as a stacked draft PR on top of the typed-feedback registration-hoist PR.
 - PR: https://github.com/PerryTS/perry/pull/5302
+
+## 2026-06-17 - Monomorphic array guard fast cache
+
+- Start revision: `ed71efde8585`
+- Branch: `codex/perry-array-guard-cache-fastpath`
+- Worker assignment: single Codex pass in this worktree
+- Benchmark environment: Linux `/usr/bin/time -v`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness
+- Baseline commands:
+  - `cargo build --release`
+  - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-direct-final --trace llvm --quiet`
+  - `for i in 1 2 3 4 5; do /tmp/perry-matrix-direct-final; done`
+  - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-direct-final`
+  - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-typed-feedback.json /tmp/perry-matrix-direct-final`
+  - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-direct-numeric-final-e816fc3e4.json`
+  - `./benchmarks/quick.sh`
+- Baseline results:
+  - direct matrix binary: 1736ms, 1730ms, 1729ms, 1738ms, 1714ms; checksum always `41079519680`
+  - `perf stat` direct matrix binary: 6,337,280,206 cycles, 28,036,164,989 instructions, 4,648,261,291 branches, 488,073 branch-misses, 1.7806s elapsed
+  - typed-feedback trace for direct matrix binary: 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures
+  - compare quick medians: loop_overhead 56ms/19040KB, fibonacci 239ms/18764KB, math_intensive 58ms/18756KB, nested_loops 921ms/18944KB, factorial 89ms/18828KB
+  - quick: fibonacci 264ms/18MB, math_intensive 55ms/18MB, nested_loops 928ms/18MB, factorial 76ms/18MB, matrix_multiply 1745ms/28MB
+- Selected gap and evidence:
+  - After direct raw-f64 payload access, `matrix_multiply` remained the slowest `quick.sh` case at 1745ms.
+  - Matrix trace showed 33.6M successful numeric array get guard calls and 65K set guard calls, all monomorphic with no get/set failures.
+  - Sampled profiling/disassembly of `/tmp/perry-matrix-direct-final` showed the inner loop still calling `js_typed_feedback_numeric_array_index_get_guard` twice per `k` iteration; the guard path enters `guard_observe`, locks the global typed-feedback registry, does a `HashMap` lookup, updates counters, and rechecks the same monomorphic observation.
+  - A narrower raw-f64 classification shortcut was tested first and discarded: five direct matrix runs were 1767ms, 1774ms, 1757ms, 1806ms, 1763ms, which was slower/noisier than the 1714-1738ms baseline.
+- Change:
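+  - Added a small lock-free, direct-mapped cache for array typed-feedback guard sites: 4096 entries indexed by a hash of the site id. A slot is claimed by the first site that maps to it and is not reassigned afterwards, so a later site that collides with an occupied slot always takes the slow path.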
+  - The cache is seeded by the existing slow `guard_observe` path and fast-passes only when the current array observation exactly matches the cached feedback key and the runtime contract guard is valid.
+  - Slow paths still update the registry, failures, megamorphic state, invalidation-visible observations, and fallback counters; trace snapshots merge cache fast-pass counters back into `observed_count`, per-site guard passes, and by-guard totals.
+  - Direct non-guard observations also update or disable the cache so a reused site that becomes megamorphic cannot keep fast-passing from stale cache state.
+- Post-change benchmark commands:
+  - `cargo build --release`
+  - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-array-guard-cache-final --quiet`
+  - `for i in 1 2 3 4 5; do /tmp/perry-matrix-array-guard-cache-final; done`
+  - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-array-guard-cache-final-trace.json /tmp/perry-matrix-array-guard-cache-final`
+  - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-array-guard-cache-final`
+  - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-array-guard-cache-final-ed71efde8.json`
+  - `./benchmarks/quick.sh`
+- Post-change results:
+  - direct matrix binary: 1239ms, 1258ms, 1223ms, 1247ms, 1226ms; checksum always `41079519680`
+  - final trace run: `matrix_multiply:1237`, checksum `41079519680`, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures
+  - `perf stat` direct matrix binary: 4,485,321,202 cycles, 16,737,765,528 instructions, 3,085,068,790 branches, 382,419 branch-misses, 1.2376s elapsed
+  - compare quick medians: loop_overhead 56ms/18728KB, fibonacci 240ms/18888KB, math_intensive 55ms/18768KB, nested_loops 662ms/22888KB, factorial 76ms/18836KB
+  - quick: fibonacci 268ms/18MB, math_intensive 74ms/18MB, nested_loops 670ms/22MB, factorial 75ms/18MB, matrix_multiply 1228ms/30MB
+- Measured impact:
+  - `16_matrix_multiply` direct median: 1730ms -> 1239ms, 28.4% faster
+  - `16_matrix_multiply` quick: 1745ms -> 1228ms, 29.6% faster
+  - Direct matrix binary instructions: 28.04B -> 16.74B, 40.3% fewer
+  - Direct matrix binary branches: 4.65B -> 3.09B, 33.6% fewer
+  - `10_nested_loops` compare median: 921ms -> 662ms, 28.1% faster
+- Verification:
+  - `cargo fmt --check`
+  - `git diff --check`
+  - `cargo test -p perry-runtime typed_feedback`
+  - `cargo test -p perry-codegen --test typed_feedback`
+  - `cargo test -p perry-codegen --test typed_shape_descriptors`
+  - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py`
+  - `tests/test_benchmark_output_verifier.sh`
+  - `cargo build --release`
+  - Typed-feedback trace confirmed aggregate and per-site guard pass counts remain consistent with the pre-cache trace despite fast-path counter merging.
+- Notes:
+  - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local second-cycle results.
+  - This follow-up is intended as a stacked draft PR on top of the guarded numeric array direct payload access PR.
+- PR: https://github.com/PerryTS/perry/pull/5307
diff --git a/crates/perry-runtime/src/typed_feedback.rs b/crates/perry-runtime/src/typed_feedback.rs
index bba2164b4f..86f39230a0 100644
--- a/crates/perry-runtime/src/typed_feedback.rs
+++ b/crates/perry-runtime/src/typed_feedback.rs
@@ -7,8 +7,7 @@
 use std::collections::{BTreeMap, HashMap};
 #[cfg(any(feature = "diagnostics", test))]
 use std::sync::atomic::AtomicBool;
-#[cfg(any(feature = "diagnostics", test))]
-use std::sync::atomic::Ordering;
+use std::sync::atomic::{AtomicU64, AtomicU8, Ordering};
 use std::sync::{LazyLock, Mutex};
 
 use crate::array::ArrayHeader;
@@ -19,11 +18,20 @@ use crate::value::{
 };
 
 const POLYMORPHIC_CAP: usize = 4;
+const ARRAY_GUARD_FAST_CACHE_SIZE: usize = 4096;
+const ARRAY_GUARD_FAST_CACHE_ENABLED: u8 = 1;
+const ARRAY_GUARD_FAST_CACHE_DISABLED: u8 = 2;
 
 static REGISTRY: LazyLock<Mutex<TypedFeedbackRegistry>> =
     LazyLock::new(|| Mutex::new(TypedFeedbackRegistry::default()));
 #[cfg(any(feature = "diagnostics", test))]
 static TRACE_DUMPED: AtomicBool = AtomicBool::new(false);
+static ARRAY_GUARD_FAST_CACHE: LazyLock<Box<[ArrayGuardFastCacheEntry]>> = LazyLock::new(|| {
+    (0..ARRAY_GUARD_FAST_CACHE_SIZE)
+        .map(|_| ArrayGuardFastCacheEntry::default())
+        .collect::<Vec<_>>()
+        .into_boxed_slice()
+});
 
 #[cfg(not(test))]
 static TYPED_FEEDBACK_ENABLED: LazyLock<bool> = LazyLock::new(|| {
@@ -329,8 +337,10 @@ pub struct GuardCounterSnapshot {
 }
 
 impl GuardCounterSnapshot {
-    fn add_site(&mut self, site: &TypedFeedbackSite) {
-        self.passes = self.passes.saturating_add(site.guard_passes);
+    fn add_site(&mut self, site: &TypedFeedbackSite, extra_guard_passes: u64) {
+        self.passes = self
+            .passes
+            .saturating_add(site.guard_passes.saturating_add(extra_guard_passes));
         self.failures = self.failures.saturating_add(site.guard_failures);
         self.fallback_calls = self.fallback_calls.saturating_add(site.fallback_calls);
     }
@@ -370,6 +380,122 @@ fn registry() -> crate::gc::GcRootRegistryGuard<'static, TypedFeedbackRegistry>
     crate::gc::lock_gc_root_registry(&REGISTRY)
 }
 
+#[derive(Default)]
+struct ArrayGuardFastCacheEntry {
+    site_id: AtomicU64,
+    packed: AtomicU64,
+    aux: AtomicU64,
+    fast_passes: AtomicU64,
+    state: AtomicU8,
+}
+
+fn array_guard_cache_index(site_id: u64) -> usize {
+    let mixed = site_id ^ (site_id >> 32) ^ (site_id >> 17);
+    (mixed as usize) & (ARRAY_GUARD_FAST_CACHE_SIZE - 1)
+}
+
+fn pack_array_guard_observation(observation: &Observation) -> Option<(u64, u64)> {
+    if observation.source != ObservationSource::Array || observation.shape_addr != 0 {
+        return None;
+    }
+    Some((
+        (observation.class_id as u64)
+            | ((observation.heap_type as u64) << 32)
+            | ((observation.value_tag as u64) << 48),
+        observation.aux,
+    ))
+}
+
+fn array_guard_fast_pass(site_id: u64, observation: &Observation, contract_valid: bool) -> bool {
+    if site_id == 0 || !contract_valid {
+        return false;
+    }
+    let Some((packed, aux)) = pack_array_guard_observation(observation) else {
+        return false;
+    };
+    let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)];
+    if entry.state.load(Ordering::Acquire) != ARRAY_GUARD_FAST_CACHE_ENABLED {
+        return false;
+    }
+    if entry.site_id.load(Ordering::Relaxed) != site_id {
+        return false;
+    }
+    if entry.packed.load(Ordering::Relaxed) == packed && entry.aux.load(Ordering::Relaxed) == aux {
+        entry.fast_passes.fetch_add(1, Ordering::Relaxed);
+        return true;
+    }
+    false
+}
+
+fn note_array_guard_cache_slow_observation(
+    site_id: u64,
+    observation: &Observation,
+    site: &TypedFeedbackSite,
+) {
+    if site_id == 0 {
+        return;
+    }
+    let Some((packed, aux)) = pack_array_guard_observation(observation) else {
+        return;
+    };
+    let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)];
+    let existing_site = entry.site_id.load(Ordering::Acquire);
+    if existing_site != site_id {
+        if existing_site != 0 {
+            return;
+        }
+        if entry
+            .site_id
+            .compare_exchange(0, site_id, Ordering::AcqRel, Ordering::Acquire)
+            .is_err()
+        {
+            return;
+        }
+    }
+    if site.megamorphic {
+        entry
+            .state
+            .store(ARRAY_GUARD_FAST_CACHE_DISABLED, Ordering::Release);
+        return;
+    }
+    if site
+        .observations
+        .iter()
+        .any(|seen| seen.same_feedback_key(observation))
+    {
+        entry.packed.store(packed, Ordering::Relaxed);
+        entry.aux.store(aux, Ordering::Relaxed);
+        entry
+            .state
+            .store(ARRAY_GUARD_FAST_CACHE_ENABLED, Ordering::Release);
+    }
+}
+
+fn array_guard_cache_fast_passes(site_id: u64) -> u64 {
+    if site_id == 0 {
+        return 0;
+    }
+    let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)];
+    if entry.site_id.load(Ordering::Acquire) == site_id {
+        entry.fast_passes.load(Ordering::Relaxed)
+    } else {
+        0
+    }
+}
+
+#[cfg(test)]
+fn reset_array_guard_fast_cache_for_tests() {
+    for entry in ARRAY_GUARD_FAST_CACHE.iter() {
+        entry
+            .state
+            .store(ARRAY_GUARD_FAST_CACHE_DISABLED, Ordering::Release);
+        entry.site_id.store(0, Ordering::Release);
+        entry.packed.store(0, Ordering::Relaxed);
+        entry.aux.store(0, Ordering::Relaxed);
+        entry.fast_passes.store(0, Ordering::Relaxed);
+    }
+}
+
 #[no_mangle]
 pub extern "C" fn js_typed_feedback_register_site(
     site_id: u64,
@@ -730,6 +856,7 @@ fn observe(site_id: u64, fallback_kind: TypedFeedbackSiteKind, observation: Obse
         )
     });
     site.observe(observation);
+    note_array_guard_cache_slow_observation(site_id, &observation, site);
 }
 
 fn site_entry(
@@ -762,6 +889,9 @@ fn guard_observe(
     if site_id == 0 || !typed_feedback_enabled() {
         return contract_valid;
     }
+    if array_guard_fast_pass(site_id, &observation, contract_valid) {
+        return true;
+    }
     let mut reg = registry();
     let site = site_entry(&mut reg, site_id, fallback_kind);
     let guard_passed = contract_valid
@@ -777,6 +907,7 @@ fn guard_observe(
         site.guard_failures = site.guard_failures.saturating_add(1);
     }
     site.observe(observation);
+    note_array_guard_cache_slow_observation(site_id, &observation, site);
     guard_passed
 }
 
@@ -1863,6 +1994,7 @@ pub fn scan_typed_feedback_roots_mut(visitor: &mut crate::gc::RuntimeRootVisitor
 #[cfg(test)]
 pub(crate) fn reset_typed_feedback_for_tests() {
     TRACE_DUMPED.store(false, Ordering::Release);
+    reset_array_guard_fast_cache_for_tests();
     let mut reg = registry();
     *reg = TypedFeedbackRegistry::default();
 }
diff --git a/crates/perry-runtime/src/typed_feedback/tests.rs b/crates/perry-runtime/src/typed_feedback/tests.rs
index 821b9aae84..c3b35fc65c 100644
--- a/crates/perry-runtime/src/typed_feedback/tests.rs
+++ b/crates/perry-runtime/src/typed_feedback/tests.rs
@@ -505,6 +505,90 @@ fn typed_feedback_numeric_array_get_guard_requires_numeric_layout() {
     assert_eq!(site.fallback_calls, 0);
 }
 
+#[test]
+fn typed_feedback_numeric_array_guard_fast_path_preserves_snapshot_counts() {
+    let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap();
+    reset_typed_feedback_for_tests();
+    register(29, TypedFeedbackSiteKind::ArrayElement, "arr[i]");
+
+    let values = [1.0, 2.0];
+    let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32);
+    let arr_box = crate::value::js_nanbox_pointer(arr as i64);
+
+    assert_eq!(
+        js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1),
+        1
+    );
+    assert_eq!(
+        js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1),
+        1
+    );
+    assert_eq!(
+        js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1),
+        1
+    );
+    assert_eq!(array_guard_cache_fast_passes(29), 2);
+
+    let snapshot = typed_feedback_snapshot();
+    let site = &snapshot.sites[0];
+    assert_eq!(site.guard_passes, 3);
+    assert_eq!(site.guard_failures, 0);
+    assert_eq!(site.observed_count, 3);
+    assert_eq!(site.observation_count, 1);
+}
+
+#[test]
+fn typed_feedback_numeric_array_guard_fast_path_respects_megamorphic_state() {
+    let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap();
+    reset_typed_feedback_for_tests();
+    register(30, TypedFeedbackSiteKind::ArrayElement, "arr[i]");
+
+    let values = [1.0, 2.0];
+    let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32);
+    let arr_box = crate::value::js_nanbox_pointer(arr as i64);
+
+    assert_eq!(
+        js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1),
+        1
+    );
+    assert_eq!(
+        js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1),
+        1
+    );
+    assert_eq!(array_guard_cache_fast_passes(30), 1);
+
+    for class_id in 1..=POLYMORPHIC_CAP {
+        observe(
+            30,
+            TypedFeedbackSiteKind::ArrayElement,
+            Observation {
+                source: ObservationSource::Array,
+                object_addr: 0,
+                shape_addr: 0,
+                key_hash: 0,
+                class_id: class_id as u32,
+                heap_type: crate::gc::GC_TYPE_ARRAY as u16,
+                aux: pack_array_aux(
+                    ARRAY_ACCESS_INDEXED_IN_BOUNDS,
+                    ARRAY_LAYOUT_POINTER_FREE,
+                    STABLE_VALUE_NUMBER,
+                    0,
+                ),
+                value_tag: STABLE_VALUE_NUMBER,
+            },
+        );
+    }
+
+    let guard = js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1);
+    assert_eq!(guard, 0);
+
+    let snapshot = typed_feedback_snapshot();
+    let site = &snapshot.sites[0];
+    assert_eq!(site.state, "megamorphic");
+    assert_eq!(site.guard_passes, 2);
+    assert_eq!(site.guard_failures, 1);
+}
+
 #[test]
 fn typed_feedback_numeric_array_set_guard_requires_numeric_value_and_layout() {
     let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap();
diff --git a/crates/perry-runtime/src/typed_feedback/trace.rs b/crates/perry-runtime/src/typed_feedback/trace.rs
index c67acbfb4f..597fb881c2 100644
--- a/crates/perry-runtime/src/typed_feedback/trace.rs
+++ b/crates/perry-runtime/src/typed_feedback/trace.rs
@@ -180,6 +180,9 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot {
     let mut rows = Vec::with_capacity(reg.sites.len());
     for site in reg.sites.values() {
         let state = site.state();
+        let fast_guard_passes = array_guard_cache_fast_passes(site.site_id);
+        let observed_count = site.observed_count.saturating_add(fast_guard_passes);
+        let guard_passes = site.guard_passes.saturating_add(fast_guard_passes);
         *snapshot
             .by_kind
             .entry(site.metadata.kind.as_str().to_string())
@@ -198,9 +201,9 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot {
             operation: site.metadata.operation.clone(),
             guard_name: site.metadata.guard_name.clone(),
             fallback_name: site.metadata.fallback_name.clone(),
-            observed_count: site.observed_count,
+            observed_count,
             observation_count: site.observations.len(),
-            guard_passes: site.guard_passes,
+            guard_passes,
             guard_failures: site.guard_failures,
             fallback_calls: site.fallback_calls,
             shape_invalidations: site.shape_invalidations,
@@ -208,7 +211,7 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot {
             representation_invalidations: site.representation_invalidations,
             observed_kinds: observed_kinds_snapshot(&site.observations),
         });
-        snapshot.guard_passes = snapshot.guard_passes.saturating_add(site.guard_passes);
+        snapshot.guard_passes = snapshot.guard_passes.saturating_add(guard_passes);
         snapshot.guard_failures = snapshot.guard_failures.saturating_add(site.guard_failures);
         snapshot.fallback_calls = snapshot.fallback_calls.saturating_add(site.fallback_calls);
         snapshot
@@ -219,7 +222,7 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot {
                 failures: 0,
                 fallback_calls: 0,
             })
-            .add_site(site);
+            .add_site(site, fast_guard_passes);
     }
     rows.sort_by_key(|row| row.site_id);
     snapshot.sites = rows;
diff --git a/scripts/check_file_size.sh b/scripts/check_file_size.sh
index caa3835622..3426a52eb1 100755
--- a/scripts/check_file_size.sh
+++ b/scripts/check_file_size.sh
@@ -304,6 +304,13 @@ crates/perry-ext-http-server/src/http2_server.rs
 # on/once iterator machinery into the existing `events/` submodule is tracked
 # under #1435 with the other module-size cleanups.
 crates/perry-stdlib/src/events.rs
+# Runtime typed-feedback registry. Crossed the 2000-line gate (2004 LOC) after
+# the monomorphic array-guard fast cache (#5307) + pre-classification fast
+# observation builder (#5309) added the lock-free direct-mapped cache and its
+# slow-path fallbacks. The cache state is interwoven with the thread-local guard
+# accounting and can't move without scattering it. Splitting the fast-cache out
+# of the registry trunk is tracked under #1435.
+crates/perry-runtime/src/typed_feedback.rs
 EOF
 )
 

From 7f98910b0f036efd9dc313d5d06cadee7d9f7cb9 Mon Sep 17 00:00:00 2001
From: Andrew DiZenzo <andrewdizenzojhu@gmail.com>
Date: Wed, 17 Jun 2026 06:15:09 +0000
Subject: [PATCH 2/3] Speed up numeric array guard cache hits

---
 PERF_RUN_LOG.md                               | 67 ++++++++++++++
 crates/perry-runtime/src/typed_feedback.rs    | 88 +++++++++++++++++--
 .../perry-runtime/src/typed_feedback/tests.rs | 28 ++++++
 3 files changed, 178 insertions(+), 5 deletions(-)

diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md
index e4921d8e99..a64df6c01e 100644
--- a/PERF_RUN_LOG.md
+++ b/PERF_RUN_LOG.md
@@ -167,3 +167,70 @@
   - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local second-cycle results.
   - This follow-up is intended as a stacked draft PR on top of the guarded numeric array direct payload access PR.
 - PR: https://github.com/PerryTS/perry/pull/5307
+
+## 2026-06-17 - Numeric array guard pre-classification fast pass
+
+- Start revision: `6a01499d4f`
+- Branch: `codex/perry-numeric-array-guard-precheck`
+- Worker assignment: single Codex pass in this worktree
+- Benchmark environment: Linux `/usr/bin/time`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness
+- Baseline commands:
+  - `cargo build --release`
+  - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-array-guard-cache-final --quiet`
+  - `for i in 1 2 3 4 5; do /tmp/perry-matrix-array-guard-cache-final; done`
+  - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-array-guard-cache-final-trace.json /tmp/perry-matrix-array-guard-cache-final`
+  - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-array-guard-cache-final`
+  - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-array-guard-cache-final-ed71efde8.json`
+  - `./benchmarks/quick.sh`
+- Baseline results:
+  - direct matrix binary: 1239ms, 1258ms, 1223ms, 1247ms, 1226ms; checksum always `41079519680`
+  - typed-feedback trace: 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures
+  - `perf stat` direct matrix binary: 4,485,321,202 cycles, 16,737,765,528 instructions, 3,085,068,790 branches, 382,419 branch-misses, 1.2376s elapsed
+  - compare quick medians: loop_overhead 56ms/18728KB, fibonacci 240ms/18888KB, math_intensive 55ms/18768KB, nested_loops 662ms/22888KB, factorial 76ms/18836KB
+  - quick: fibonacci 268ms/18MB, math_intensive 74ms/18MB, nested_loops 670ms/22MB, factorial 75ms/18MB, matrix_multiply 1228ms/30MB
+- Selected gap and evidence:
+  - After the monomorphic array guard cache, `matrix_multiply` remained the slowest `quick.sh` case at 1228ms.
+  - Trace still showed 33.6M successful numeric array get guard calls and 65K set guard calls with no get/set failures.
+  - Runtime inspection showed numeric array get/set guards still called `classify_array` before the cache lookup; for raw-f64 numeric arrays this recomputes layout and element-kind facts on every monomorphic cache hit.
+- Change:
+  - Added a pre-classification `numeric_array_fast_observation` helper for numeric array index get/set guard calls.
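+  - The helper resolves the GC header for the raw address, rejects non-array or forwarded objects, sanity-checks len/cap (each at most 16,000,000, with len <= cap), applies the bounds requirement, and requires the raw-f64 numeric layout; it then constructs the same array observation the slow classifier would produce for numeric arrays.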
+  - Numeric get/set guards now try the exact monomorphic array guard cache before calling `classify_array`; cache miss, stale cache, contract failure, nonnumeric index/value, or layout mismatch still falls back to the existing full classify/`guard_observe` path.
+  - Added a focused test that compares the helper's in-bounds and out-of-bounds observations against `classify_array`.
+- Post-change benchmark commands:
+  - `cargo build --release`
+  - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-guard-precheck-final2 --quiet`
+  - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2; done`
+  - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-guard-precheck-final2-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2 && jq '.guards' /tmp/perry-matrix-guard-precheck-final2-trace.json`
+  - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-guard-precheck-final2`
+  - `benchmarks/quick.sh`
+  - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-guard-precheck-final2.json`
+- Post-change results:
+  - direct matrix binary: 400ms, 398ms, 398ms, 385ms, 386ms; checksum always `41079519680`
+  - direct run wall/RSS samples: 0.42s/31404KB, 0.42s/31244KB, 0.42s/31192KB, 0.39s/31304KB, 0.39s/31308KB
+  - final trace run: `matrix_multiply:395`, checksum `41079519680`, wall 0.42s, RSS 31500KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls
+  - `perf stat` direct matrix binary: 1,443,394,074 cycles, 7,034,084,638 instructions, 1,568,434,556 branches, 241,348 branch-misses, 0.4222s elapsed
+  - compare quick medians: loop_overhead 76ms/18880KB, fibonacci 266ms/18764KB, math_intensive 55ms/19092KB, nested_loops 225ms/23204KB, factorial 95ms/18764KB
+  - quick: fibonacci 254ms/18MB, math_intensive 73ms/18MB, nested_loops 229ms/22MB, factorial 97ms/18MB, matrix_multiply 407ms/30MB
+- Measured impact:
+  - `16_matrix_multiply` direct median: 1239ms -> 398ms, 67.9% faster
+  - `16_matrix_multiply` quick: 1228ms -> 407ms, 66.9% faster
+  - Direct matrix binary cycles: 4.49B -> 1.44B, 67.8% fewer
+  - Direct matrix binary instructions: 16.74B -> 7.03B, 58.0% fewer
+  - Direct matrix binary branches: 3.09B -> 1.57B, 49.2% fewer
+  - `10_nested_loops` compare median: 662ms -> 225ms, 66.0% faster
+- Verification:
+  - `cargo fmt --check`
+  - `git diff --check`
+  - `cargo test -p perry-runtime typed_feedback_numeric_array`
+  - `cargo test -p perry-runtime typed_feedback`
+  - `cargo test -p perry-codegen --test typed_feedback`
+  - `cargo test -p perry-codegen --test typed_shape_descriptors`
+  - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py`
+  - `tests/test_benchmark_output_verifier.sh`
+  - `cargo build --release`
+  - Typed-feedback trace confirmed get/set guard pass counts and zero get/set failures match the pre-change trace while avoiding the full classifier on cache hits.
+- Notes:
+  - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local third-cycle results.
+  - This follow-up is intended as a stacked draft PR on top of the monomorphic array guard fast-cache PR.
+- PR: https://github.com/PerryTS/perry/pull/5309
diff --git a/crates/perry-runtime/src/typed_feedback.rs b/crates/perry-runtime/src/typed_feedback.rs
index 86f39230a0..e6104a08de 100644
--- a/crates/perry-runtime/src/typed_feedback.rs
+++ b/crates/perry-runtime/src/typed_feedback.rs
@@ -1220,6 +1220,60 @@ fn numeric_array_index_guard(arr: *const ArrayHeader, index: u32, require_in_bou
         && crate::array::js_array_is_numeric_f64_layout(arr) != 0
 }
 
+fn numeric_array_fast_observation(
+    raw_addr: usize,
+    index: u32,
+    require_in_bounds: bool,
+    value_tag: Option<u16>,
+) -> Option<Observation> {
+    let header = gc_header_for_user_addr(raw_addr)?;
+    unsafe {
+        if (*header).obj_type != crate::gc::GC_TYPE_ARRAY
+            || (*header).gc_flags & crate::gc::GC_FLAG_FORWARDED != 0
+        {
+            return None;
+        }
+        let arr = raw_addr as *const ArrayHeader;
+        let len = (*arr).length;
+        let cap = (*arr).capacity;
+        if len > 16_000_000 || cap > 16_000_000 || len > cap {
+            return None;
+        }
+        let in_bounds = index < len;
+        if require_in_bounds && !in_bounds {
+            return None;
+        }
+        if crate::array::js_array_is_numeric_f64_layout(arr) == 0 {
+            return None;
+        }
+        let access_kind = if in_bounds {
+            ARRAY_ACCESS_INDEXED_IN_BOUNDS
+        } else {
+            ARRAY_ACCESS_INDEXED_OUT_OF_BOUNDS
+        };
+        let layout_kind = if len == 0 {
+            ARRAY_LAYOUT_EMPTY
+        } else {
+            ARRAY_LAYOUT_POINTER_FREE
+        };
+        let element_kind = if in_bounds {
+            STABLE_VALUE_NUMBER
+        } else {
+            STABLE_VALUE_UNDEFINED
+        };
+        Some(Observation {
+            source: ObservationSource::Array,
+            object_addr: 0,
+            shape_addr: 0,
+            key_hash: 0,
+            class_id: 0,
+            heap_type: crate::gc::GC_TYPE_ARRAY as u16,
+            aux: pack_array_aux(access_kind, layout_kind, element_kind, 0),
+            value_tag: value_tag.unwrap_or(element_kind),
+        })
+    }
+}
+
 fn numeric_array_push_guard(arr: *const ArrayHeader, value: f64) -> bool {
     let raw_addr = normalize_raw_object_addr(arr as u64);
     let Some(header) = gc_header_for_user_addr(raw_addr) else {
@@ -1394,15 +1448,25 @@ pub extern "C" fn js_typed_feedback_numeric_array_index_get_guard(
     require_in_bounds: i32,
 ) -> i32 {
     let raw_addr = normalize_raw_object_addr(receiver.to_bits());
+    let require_in_bounds = require_in_bounds != 0;
     if !typed_feedback_enabled() {
         return (is_plain_number_bits(index_value.to_bits())
             && index >= 0
             && numeric_array_index_guard(
                 raw_addr as *const ArrayHeader,
                 index as u32,
-                require_in_bounds != 0,
+                require_in_bounds,
             )) as i32;
     }
+    if site_id != 0 && is_plain_number_bits(index_value.to_bits()) && index >= 0 {
+        if let Some(observation) =
+            numeric_array_fast_observation(raw_addr, index as u32, require_in_bounds, None)
+        {
+            if array_guard_fast_pass(site_id, &observation, true) {
+                return 1;
+            }
+        }
+    }
     let observed_index = if index >= 0 { index as u32 } else { u32::MAX };
     let (class_id, heap_type, aux, element_kind) = classify_array(raw_addr, Some(observed_index));
     let observation = Observation {
@@ -1420,7 +1484,7 @@ pub extern "C" fn js_typed_feedback_numeric_array_index_get_guard(
         && numeric_array_index_guard(
             raw_addr as *const ArrayHeader,
             index as u32,
-            require_in_bounds != 0,
+            require_in_bounds,
         );
     let pass = guard_observe(
         site_id,
@@ -1656,15 +1720,29 @@ pub extern "C" fn js_typed_feedback_numeric_array_index_set_guard(
     require_in_bounds: i32,
 ) -> i32 {
     let raw_addr = normalize_raw_object_addr(receiver.to_bits());
+    let require_in_bounds = require_in_bounds != 0;
+    let value_bits = value.to_bits();
     if !typed_feedback_enabled() {
         return (index >= 0
-            && is_numeric_value_bits(value.to_bits())
+            && is_numeric_value_bits(value_bits)
             && numeric_array_index_guard(
                 raw_addr as *const ArrayHeader,
                 index as u32,
-                require_in_bounds != 0,
+                require_in_bounds,
             )) as i32;
     }
+    if site_id != 0 && index >= 0 && is_numeric_value_bits(value_bits) {
+        if let Some(observation) = numeric_array_fast_observation(
+            raw_addr,
+            index as u32,
+            require_in_bounds,
+            Some(stable_value_kind(value_bits)),
+        ) {
+            if array_guard_fast_pass(site_id, &observation, true) {
+                return 1;
+            }
+        }
+    }
     let observed_index = if index >= 0 { index as u32 } else { u32::MAX };
     let (class_id, heap_type, aux, _element_kind) = classify_array(raw_addr, Some(observed_index));
     let observation = Observation {
@@ -1682,7 +1760,7 @@ pub extern "C" fn js_typed_feedback_numeric_array_index_set_guard(
         && numeric_array_index_guard(
             raw_addr as *const ArrayHeader,
             index as u32,
-            require_in_bounds != 0,
+            require_in_bounds,
         );
     let pass = guard_observe(
         site_id,
diff --git a/crates/perry-runtime/src/typed_feedback/tests.rs b/crates/perry-runtime/src/typed_feedback/tests.rs
index c3b35fc65c..f3eb208c44 100644
--- a/crates/perry-runtime/src/typed_feedback/tests.rs
+++ b/crates/perry-runtime/src/typed_feedback/tests.rs
@@ -537,6 +537,34 @@ fn typed_feedback_numeric_array_guard_fast_path_preserves_snapshot_counts() {
     assert_eq!(site.observation_count, 1);
 }
 
+#[test] // Fast-path observation must agree with `classify_array` for a numeric array.
+fn typed_feedback_numeric_array_fast_observation_matches_classifier() {
+    let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); // hold the shared test lock
+    reset_typed_feedback_for_tests();
+
+    let values = [1.0, 2.0]; // length 2, so index 2 below is one past the end
+    let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32);
+    let raw_addr = normalize_raw_object_addr(arr as u64);
+
+    for index in [0, values.len() as u32] {
+        let observation = numeric_array_fast_observation(raw_addr, index, false, None)
+            .expect("numeric fast observation");
+        let (class_id, heap_type, aux, element_kind) = classify_array(raw_addr, Some(index));
+        assert_eq!(observation.class_id, class_id);
+        assert_eq!(observation.heap_type, heap_type);
+        assert_eq!(observation.aux, aux);
+        assert_eq!(observation.value_tag, element_kind); // no explicit tag: element kind
+    }
+
+    let set_observation =
+        numeric_array_fast_observation(raw_addr, 1, true, Some(STABLE_VALUE_INT32))
+            .expect("numeric set fast observation");
+    let (_, _, aux, _) = classify_array(raw_addr, Some(1));
+    assert_eq!(set_observation.aux, aux);
+    assert_eq!(set_observation.value_tag, STABLE_VALUE_INT32); // explicit tag is kept
+    assert!(numeric_array_fast_observation(raw_addr, values.len() as u32, true, None).is_none());
+}
+
 #[test]
 fn typed_feedback_numeric_array_guard_fast_path_respects_megamorphic_state() {
     let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap();

From 8d1c99bf8512a391c474d01dc413ec67871bbc9e Mon Sep 17 00:00:00 2001
From: Andrew DiZenzo <59515127+andrewtdiz@users.noreply.github.com>
Date: Thu, 18 Jun 2026 11:55:02 -0600
Subject: [PATCH 3/3] Lower loop-bound array indices as i32 (#5310)

---
 PERF_RUN_LOG.md                              | 67 ++++++++++++++++++++
 crates/perry-codegen/src/expr/index_get.rs   | 28 +++++++-
 crates/perry-codegen/src/stmt/loops.rs       | 20 +++++-
 crates/perry-codegen/tests/typed_feedback.rs | 53 +++++++++++++++-
 4 files changed, 162 insertions(+), 6 deletions(-)

diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md
index a64df6c01e..d1d43fa1dd 100644
--- a/PERF_RUN_LOG.md
+++ b/PERF_RUN_LOG.md
@@ -234,3 +234,70 @@
   - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local third-cycle results.
   - This follow-up is intended as a stacked draft PR on top of the monomorphic array guard fast-cache PR.
 - PR: https://github.com/PerryTS/perry/pull/5309
+
+## 2026-06-17 - I32 lowering for loop-bound numeric array indices
+
+- Start revision: `966729232`
+- Branch: `codex/perry-i32-array-index-lowering`
+- Worker assignment: single Codex pass in this worktree
+- Benchmark environment: Linux `/usr/bin/time`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness
+- Baseline commands:
+  - `cargo build --release`
+  - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-guard-precheck-final2 --quiet`
+  - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2; done`
+  - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-guard-precheck-final2-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2 && jq '.guards' /tmp/perry-matrix-guard-precheck-final2-trace.json`
+  - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-guard-precheck-final2`
+  - `benchmarks/quick.sh`
+  - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-guard-precheck-final2.json`
+- Baseline results:
+  - direct matrix binary: 400ms, 398ms, 398ms, 385ms, 386ms; checksum always `41079519680`
+  - final trace run: `matrix_multiply:395`, checksum `41079519680`, wall 0.42s, RSS 31500KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls
+  - `perf stat` direct matrix binary: 1,443,394,074 cycles, 7,034,084,638 instructions, 1,568,434,556 branches, 241,348 branch-misses, 0.4222s elapsed
+  - compare quick medians: loop_overhead 76ms/18880KB, fibonacci 266ms/18764KB, math_intensive 55ms/19092KB, nested_loops 225ms/23204KB, factorial 95ms/18764KB
+  - quick: fibonacci 254ms/18MB, math_intensive 73ms/18MB, nested_loops 229ms/22MB, factorial 97ms/18MB, matrix_multiply 407ms/30MB
+- Selected gap and evidence:
+  - After numeric array guard pre-classification, `matrix_multiply` remained the slowest `quick.sh` case at 407ms.
+  - LLVM trace for `benchmarks/suite/16_matrix_multiply.ts` still lowered hot computed get indices such as `i * size + k` through `sitofp`/`fmul`/`fadd`/`fptosi` before calling the typed-feedback numeric array get guard.
+  - Loop-bound analysis already proved and hoisted `size` as an i32 loop bound for `i < size` and `k < size`, but that trusted bound was not visible to the existing i32 expression lowering used by index expressions.
+- Change:
+  - Reused or inserted an i32 slot for local loop bounds classified by the `i < n` loop-bound path and kept that slot visible while lowering the loop body.
+  - Used the existing `can_lower_expr_as_i32` / `lower_expr_as_i32` machinery for known-array computed get indices when the index expression is fully backed by trusted i32 slots, integer locals, or constants.
+  - Preserved the typed-feedback numeric array get guard and fallback path; the final i32 index is converted back to double only for the guard's double index argument.
+  - Added an IR regression test covering `xs[i * size + 1]` inside `for (let i = 0; i < size; i++)`, asserting guarded fallback emission plus `mul i32`/`add i32` and no `fmul double` for that computed index.
+- Post-change benchmark commands:
+  - `cargo build --release`
+  - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-i32-index-proto --trace llvm --quiet`
+  - `rg -n "js_typed_feedback_numeric_array_index_get_guard|fmul double|mul i32|add i32|sitofp i32" .perry-trace/llvm/_16_matrix_multiply_ts.ll`
+  - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto; done`
+  - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-i32-index-proto-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto && jq '.guards' /tmp/perry-matrix-i32-index-proto-trace.json`
+  - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-i32-index-proto`
+  - `benchmarks/quick.sh`
+  - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-i32-index-proto.json`
+  - `for i in 1 2 3 4 5 6 7 8 9 10; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto; done`
+- Post-change results:
+  - LLVM trace confirmed the two hot matmul numeric-array get indices now use `mul i32` and `add i32` before `call i32 @js_typed_feedback_numeric_array_index_get_guard`; remaining `sitofp i32` values feed the guard's double index argument.
+  - direct matrix binary first sample set: 400ms, 393ms, 388ms, 403ms, 393ms; checksum always `41079519680`
+  - direct matrix binary 10-sample set: 397ms, 396ms, 390ms, 392ms, 392ms, 393ms, 383ms, 389ms, 386ms, 384ms; checksum always `41079519680`
+  - trace run: `matrix_multiply:397`, checksum `41079519680`, wall 0.42s, RSS 31440KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls
+  - `perf stat` direct matrix binary: 1,456,553,467 cycles, 7,017,622,346 instructions, 1,568,480,860 branches, 249,497 branch-misses, 0.4217s elapsed
+  - quick: fibonacci 251ms/18MB, math_intensive 71ms/18MB, nested_loops 202ms/22MB, factorial 99ms/18MB, matrix_multiply 387ms/30MB
+  - compare quick medians: loop_overhead 56ms/18784KB, fibonacci 248ms/18896KB, math_intensive 55ms/18900KB, nested_loops 214ms/23268KB, factorial 78ms/18776KB
+- Measured impact:
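+  - `16_matrix_multiply` direct median: 398ms (5-sample baseline) -> 391ms (10-sample post-change set), 1.8% faster; the 5-sample post-change set has a median of 393ms (1.3% faster)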
+  - `16_matrix_multiply` quick: 407ms -> 387ms, 4.9% faster
+  - Direct matrix binary instructions: 7.034B -> 7.018B, 0.2% fewer
+  - Direct matrix binary cycles: 1.443B -> 1.457B, 0.9% more in the single perf sample; branch misses also rose from 241K to 249K, so counter impact is mixed despite lower wall-time samples
+  - `10_nested_loops` compare median: 225ms -> 214ms, 4.9% faster
+- Verification:
+  - `cargo fmt --check`
+  - `git diff --check`
+  - `cargo test -p perry-codegen --test typed_feedback`
+  - `cargo test -p perry-codegen --test typed_shape_descriptors`
+  - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py`
+  - `tests/test_benchmark_output_verifier.sh`
+  - `cargo build --release`
+  - Typed-feedback trace confirmed get/set guard pass counts and zero get/set failures match the pre-change trace.
+- Notes:
+  - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only`, and the before/after comparison above uses the captured local fourth-cycle baseline.
+  - This is a smaller cleanup than the preceding guard-cache work. The keeper signal is the consistent matrix wall-time reduction plus removal of double arithmetic from the hottest generated get-index chains; perf counters should be watched on future runs.
+- PR: https://github.com/PerryTS/perry/pull/5310
diff --git a/crates/perry-codegen/src/expr/index_get.rs b/crates/perry-codegen/src/expr/index_get.rs
index a232b533e9..fa18748dd2 100644
--- a/crates/perry-codegen/src/expr/index_get.rs
+++ b/crates/perry-codegen/src/expr/index_get.rs
@@ -35,7 +35,7 @@ use crate::types::{DOUBLE, I1, I16, I32, I64, I8, PTR};
 use super::arrays_finds::lower_buffer_index_get_i32;
 #[allow(unused_imports)]
 use super::{
-    buffer_access_materialization_reason, buffer_alias_metadata_suffix,
+    buffer_access_materialization_reason, buffer_alias_metadata_suffix, can_lower_expr_as_i32,
     emit_layout_note_slot_on_block, emit_shadow_slot_clear, emit_shadow_slot_update_for_expr,
     emit_string_literal_global, emit_typed_feedback_register_site, emit_v8_export_call,
     emit_v8_member_method_call, emit_write_barrier, emit_write_barrier_slot_on_block,
@@ -847,8 +847,30 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result<String> {
                 }
 
                 let arr_box = lower_expr(ctx, object)?;
-                let idx_double = lower_expr(ctx, index)?;
-                let idx_i32 = ctx.block().fptosi(DOUBLE, &idx_double, I32);
+                let i32_slots = ctx.i32_counter_slots.clone();
+                let flat_const_arrays = ctx.flat_const_arrays.clone();
+                let array_row_aliases = ctx.array_row_aliases.clone();
+                let integer_locals = ctx.integer_locals.clone();
+                let use_i32_index = can_lower_expr_as_i32(
+                    index,
+                    &i32_slots,
+                    &flat_const_arrays,
+                    &array_row_aliases,
+                    &integer_locals,
+                    ctx.clamp3_functions,
+                    ctx.clamp_u8_functions,
+                    ctx.integer_returning_functions,
+                    ctx.i32_identity_functions,
+                );
+                let (idx_double, idx_i32) = if use_i32_index {
+                    let idx_i32 = lower_expr_as_i32(ctx, index)?;
+                    let idx_double = ctx.block().sitofp(I32, &idx_i32, DOUBLE);
+                    (idx_double, idx_i32)
+                } else {
+                    let idx_double = lower_expr(ctx, index)?;
+                    let idx_i32 = ctx.block().fptosi(DOUBLE, &idx_double, I32);
+                    (idx_double, idx_i32)
+                };
                 if !require_numeric_layout
                     && !matches!(index.as_ref(), Expr::Integer(_) | Expr::Number(_))
                 {
diff --git a/crates/perry-codegen/src/stmt/loops.rs b/crates/perry-codegen/src/stmt/loops.rs
index f05d636d21..4199ed7364 100644
--- a/crates/perry-codegen/src/stmt/loops.rs
+++ b/crates/perry-codegen/src/stmt/loops.rs
@@ -386,6 +386,7 @@ pub(crate) fn lower_for(
     // site having done so already).  Only the site that inserted should
     // remove it at loop exit to avoid disturbing a pre-existing slot.
     let local_bound_counter_i32_was_fresh: bool;
+    let local_bound_bound_i32_was_fresh: bool;
     let i32_local_bound_slot: Option<String> =
         if let Some((counter_id, bound_id, _op)) = local_bound_classification {
             // Allocate a parallel i32 slot for the counter if not already
@@ -411,18 +412,28 @@ pub(crate) fn lower_for(
             local_bound_counter_i32_was_fresh = fresh;
             // Hoist `fptosi(n)` to a fresh i32 alloca before the cond block
             // so LLVM sees a loop-invariant integer bound — critical for
-            // SCEV / LoopVectorizer to recognize the induction variable.
-            if let Some(bound_slot) = ctx.locals.get(&bound_id).cloned() {
+            // SCEV / LoopVectorizer to recognize the induction variable. Also
+            // expose that slot while lowering the loop body so integer index
+            // expressions like `i * n + k` can reuse the same trusted bound
+            // instead of rebuilding the index through double arithmetic.
+            if let Some(existing) = ctx.i32_counter_slots.get(&bound_id).cloned() {
+                local_bound_bound_i32_was_fresh = false;
+                Some(existing)
+            } else if let Some(bound_slot) = ctx.locals.get(&bound_id).cloned() {
                 let bound_dbl = ctx.block().load(DOUBLE, &bound_slot);
                 let bound_i32 = ctx.block().fptosi(DOUBLE, &bound_dbl, I32);
                 let slot = ctx.func.alloca_entry(I32);
                 ctx.block().store(I32, &bound_i32, &slot);
+                ctx.i32_counter_slots.insert(bound_id, slot.clone());
+                local_bound_bound_i32_was_fresh = true;
                 Some(slot)
             } else {
+                local_bound_bound_i32_was_fresh = false;
                 None
             }
         } else {
             local_bound_counter_i32_was_fresh = false;
+            local_bound_bound_i32_was_fresh = false;
             None
         };
     // Issue #168 follow-up: when neither the `arr.length` hoist nor the static
@@ -718,6 +729,11 @@ pub(crate) fn lower_for(
             ctx.i32_counter_slots.remove(&counter_id);
         }
     }
+    if local_bound_bound_i32_was_fresh {
+        if let Some((_, bound_id, _)) = local_bound_classification {
+            ctx.i32_counter_slots.remove(&bound_id);
+        }
+    }
     let _ = i32_local_bound_slot;
     // Same cleanup for the runtime-guarded `any`-bound path.
     if let Some(dyn_bound) = dynamic_i32_bound {
diff --git a/crates/perry-codegen/tests/typed_feedback.rs b/crates/perry-codegen/tests/typed_feedback.rs
index a0b124c572..31083f53d8 100644
--- a/crates/perry-codegen/tests/typed_feedback.rs
+++ b/crates/perry-codegen/tests/typed_feedback.rs
@@ -1,5 +1,8 @@
 use perry_codegen::{compile_module, AppMetadata, CompileOptions};
-use perry_hir::{BinaryOp, Class, ClassField, Expr, Function, Module, ModuleInitKind, Param, Stmt};
+use perry_hir::{
+    BinaryOp, Class, ClassField, CompareOp, Expr, Function, Module, ModuleInitKind, Param, Stmt,
+    UpdateOp,
+};
 use perry_types::{FunctionType, Type};
 
 /// Serializes env-mutating tests so a concurrent test never observes a
@@ -547,3 +550,51 @@ fn typed_feedback_guards_computed_numeric_array_index_hot_path() {
     assert!(!ir.contains("call double @js_array_numeric_get_f64_unboxed"));
     assert!(ir.contains("load double"));
 }
+
+#[test] // IR check: `xs[i * size + 1]` under `i < size` lowers its index with i32 math.
+fn typed_feedback_guards_computed_numeric_array_index_uses_i32_loop_bound() {
+    let array_ty = Type::Array(Box::new(Type::Number)); // type given to the `xs` param
+    let ir = ir_for(module(
+        "typed_feedback_loop_bound_computed_array.ts",
+        vec![param(1, "xs", array_ty), param(2, "size", Type::Number)],
+        Type::Number,
+        vec![Stmt::For {
+            init: Some(Box::new(Stmt::Let {
+                id: 3,
+                name: "i".to_string(),
+                ty: Type::Number,
+                mutable: true,
+                init: Some(Expr::Integer(0)), // loop counter starts at 0
+            })),
+            condition: Some(Expr::Compare {
+                op: CompareOp::Lt,
+                left: Box::new(Expr::LocalGet(3)), // i
+                right: Box::new(Expr::LocalGet(2)), // size
+            }),
+            update: Some(Expr::Update {
+                id: 3,
+                op: UpdateOp::Increment,
+                prefix: false, // postfix form: i++
+            }),
+            body: vec![Stmt::Return(Some(Expr::IndexGet {
+                object: Box::new(Expr::LocalGet(1)),
+                index: Box::new(Expr::Binary {
+                    op: BinaryOp::Add,
+                    left: Box::new(Expr::Binary {
+                        op: BinaryOp::Mul,
+                        left: Box::new(Expr::LocalGet(3)), // i
+                        right: Box::new(Expr::LocalGet(2)), // size
+                    }),
+                    right: Box::new(Expr::Integer(1)), // constant offset
+                }),
+            }))],
+        }],
+    ));
+
+    assert!(ir.contains("call i32 @js_typed_feedback_numeric_array_index_get_guard"));
+    assert!(ir.contains("call double @js_typed_feedback_array_index_get_fallback_boxed"));
+    assert!(ir.contains("mul i32"), "{ir}"); // failure message dumps the generated IR
+    assert!(ir.contains("add i32"), "{ir}");
+    assert!(!ir.contains("fmul double"), "{ir}"); // no double multiply anywhere in the IR
+    assert!(!ir.contains("call double @js_array_numeric_get_f64_unboxed"));
+}