diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md index e395ec7481..e4921d8e99 100644 --- a/PERF_RUN_LOG.md +++ b/PERF_RUN_LOG.md @@ -44,3 +44,126 @@ - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local baseline JSON. - Follow-up candidates remain in typed array and numeric array hot paths, but this cycle stopped at the isolated registration-hoist optimization. - PR: https://github.com/PerryTS/perry/pull/5295 + +## 2026-06-17 - Guarded numeric array direct payload access + +- Start revision: `8d953ca7ad6f` +- Branch: `codex/perry-performance-20260617` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time -v`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `cargo build --release` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-final-e816fc3e4.json` + - `./benchmarks/quick.sh` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-multiply-final --quiet` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-multiply-final` +- Baseline results: + - compare quick medians: loop_overhead 74ms/18768KB, fibonacci 261ms/18920KB, math_intensive 69ms/18944KB, nested_loops 956ms/19152KB, factorial 94ms/18896KB + - quick: fibonacci 262ms/18MB, math_intensive 55ms/18MB, nested_loops 965ms/18MB, factorial 75ms/18MB, matrix_multiply 1842ms/28MB + - direct matrix binary: `matrix_multiply:1778`, `checksum:41079519680` + - `perf stat` direct matrix binary: 6,569,183,197 cycles, 30,876,077,204 instructions, 5,501,828,073 branches, 2,178,745 branch-misses, 1.8236s elapsed +- Selected gap and evidence: + - After the registration hoist, `matrix_multiply` was still the slowest `quick.sh` case at 1842ms. 
+ - LLVM trace for `benchmarks/suite/16_matrix_multiply.ts` showed hot-path calls to `js_array_numeric_get_f64_unboxed` and `js_array_numeric_set_f64_unboxed` after the existing typed-feedback numeric array guards. + - The guards prove a live, non-forwarded array, in-bounds index where required, raw-f64 numeric layout, and numeric set values; the runtime helpers then only repeat checks before loading or storing the raw-f64 payload. +- Change: + - Inlined raw-f64 array element loads/stores in guarded numeric array index get/set lowering after the typed-feedback guard and codegen length checks. + - Recorded direct-load/direct-store native proof consumers and taught the verifier to accept them only with the existing consumed raw-f64 layout fact. + - Updated typed-feedback, typed-shape, and native-proof tests to expect direct payload access instead of helper calls on the guarded fast paths. +- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-direct-final --trace llvm --quiet` + - `for i in 1 2 3 4 5; do /tmp/perry-matrix-direct-final; done` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-direct-final` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-direct-numeric-final-e816fc3e4.json` + - `./benchmarks/quick.sh` +- Post-change results: + - traced matrix binary: 1736ms, 1730ms, 1729ms, 1738ms, 1714ms; checksum always `41079519680` + - `perf stat` direct matrix binary: 6,337,280,206 cycles, 28,036,164,989 instructions, 4,648,261,291 branches, 488,073 branch-misses, 1.7806s elapsed + - compare quick medians: loop_overhead 56ms/19040KB, fibonacci 239ms/18764KB, math_intensive 58ms/18756KB, nested_loops 921ms/18944KB, factorial 89ms/18828KB + - quick: fibonacci 264ms/18MB, math_intensive 55ms/18MB, nested_loops 928ms/18MB, factorial 76ms/18MB, matrix_multiply 1745ms/28MB +- Measured impact: + - 
`16_matrix_multiply` quick: 1842ms -> 1745ms, 5.3% faster + - Direct matrix binary instructions: 30.88B -> 28.04B, 9.2% fewer + - Direct matrix binary branches: 5.50B -> 4.65B, 15.5% fewer + - `10_nested_loops` compare median: 956ms -> 921ms, 3.7% faster +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + - `cargo test -p perry-codegen --test native_proof_regressions artifact_records_numeric_array_f64_fast_paths_and_fallback_reasons` + - `cargo test -p perry-codegen native_value::verify::tests` + - `cargo build --release` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - Trace check confirmed `js_array_numeric_get_f64_unboxed` and `js_array_numeric_set_f64_unboxed` are declared but have no `call` sites in the generated matrix module; raw-f64 `load double` and `store double` operations remain in the guarded paths. +- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local first-cycle results. + - This follow-up is intended as a stacked draft PR on top of the typed-feedback registration-hoist PR. 
+- PR: https://github.com/PerryTS/perry/pull/5302 + +## 2026-06-17 - Monomorphic array guard fast cache + +- Start revision: `ed71efde8585` +- Branch: `codex/perry-array-guard-cache-fastpath` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time -v`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-direct-final --trace llvm --quiet` + - `for i in 1 2 3 4 5; do /tmp/perry-matrix-direct-final; done` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-direct-final` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-typed-feedback.json /tmp/perry-matrix-direct-final` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-direct-numeric-final-e816fc3e4.json` + - `./benchmarks/quick.sh` +- Baseline results: + - direct matrix binary: 1736ms, 1730ms, 1729ms, 1738ms, 1714ms; checksum always `41079519680` + - `perf stat` direct matrix binary: 6,337,280,206 cycles, 28,036,164,989 instructions, 4,648,261,291 branches, 488,073 branch-misses, 1.7806s elapsed + - typed-feedback trace for direct matrix binary: 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures + - compare quick medians: loop_overhead 56ms/19040KB, fibonacci 239ms/18764KB, math_intensive 58ms/18756KB, nested_loops 921ms/18944KB, factorial 89ms/18828KB + - quick: fibonacci 264ms/18MB, math_intensive 55ms/18MB, nested_loops 928ms/18MB, factorial 76ms/18MB, matrix_multiply 1745ms/28MB +- Selected gap and evidence: + - After direct raw-f64 payload access, `matrix_multiply` remained the slowest `quick.sh` case at 1745ms. 
+ - Matrix trace showed 33.6M successful numeric array get guard calls and 65K set guard calls, all monomorphic with no get/set failures. + - Sampled profiling/disassembly of `/tmp/perry-matrix-direct-final` showed the inner loop still calling `js_typed_feedback_numeric_array_index_get_guard` twice per `k` iteration; the guard path enters `guard_observe`, locks the global typed-feedback registry, does a `HashMap` lookup, updates counters, and rechecks the same monomorphic observation. + - A narrower raw-f64 classification shortcut was tested first and discarded: five direct matrix runs were 1767ms, 1774ms, 1757ms, 1806ms, 1763ms, which was slower/noisier than the 1714-1738ms baseline. +- Change: + - Added a small lock-free, direct-mapped cache for array typed-feedback guard sites. + - The cache is seeded by the existing slow `guard_observe` path and fast-passes only when the current array observation exactly matches the cached feedback key and the runtime contract guard is valid. + - Slow paths still update the registry, failures, megamorphic state, invalidation-visible observations, and fallback counters; trace snapshots merge cache fast-pass counters back into `observed_count`, per-site guard passes, and by-guard totals. + - Direct non-guard observations also update or disable the cache so a reused site that becomes megamorphic cannot keep fast-passing from stale cache state. 
+- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-array-guard-cache-final --quiet` + - `for i in 1 2 3 4 5; do /tmp/perry-matrix-array-guard-cache-final; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-array-guard-cache-final-trace.json /tmp/perry-matrix-array-guard-cache-final` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-array-guard-cache-final` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-array-guard-cache-final-ed71efde8.json` + - `./benchmarks/quick.sh` +- Post-change results: + - direct matrix binary: 1239ms, 1258ms, 1223ms, 1247ms, 1226ms; checksum always `41079519680` + - final trace run: `matrix_multiply:1237`, checksum `41079519680`, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures + - `perf stat` direct matrix binary: 4,485,321,202 cycles, 16,737,765,528 instructions, 3,085,068,790 branches, 382,419 branch-misses, 1.2376s elapsed + - compare quick medians: loop_overhead 56ms/18728KB, fibonacci 240ms/18888KB, math_intensive 55ms/18768KB, nested_loops 662ms/22888KB, factorial 76ms/18836KB + - quick: fibonacci 268ms/18MB, math_intensive 74ms/18MB, nested_loops 670ms/22MB, factorial 75ms/18MB, matrix_multiply 1228ms/30MB +- Measured impact: + - `16_matrix_multiply` direct median: 1730ms -> 1239ms, 28.4% faster + - `16_matrix_multiply` quick: 1745ms -> 1228ms, 29.6% faster + - Direct matrix binary instructions: 28.04B -> 16.74B, 40.3% fewer + - Direct matrix binary branches: 4.65B -> 3.09B, 33.6% fewer + - `10_nested_loops` compare median: 921ms -> 662ms, 28.1% faster +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-runtime typed_feedback` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + 
- `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `cargo build --release` + - Typed-feedback trace confirmed aggregate and per-site guard pass counts remain consistent with the pre-cache trace despite fast-path counter merging. +- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local second-cycle results. + - This follow-up is intended as a stacked draft PR on top of the guarded numeric array direct payload access PR. +- PR: https://github.com/PerryTS/perry/pull/5307 diff --git a/benchmarks/compiler_output/workloads.toml b/benchmarks/compiler_output/workloads.toml index 38dac5600f..74a3d5c07b 100644 --- a/benchmarks/compiler_output/workloads.toml +++ b/benchmarks/compiler_output/workloads.toml @@ -632,8 +632,10 @@ detail = "numeric indexed read takes the guarded raw-f64 fast path and loads the [[workloads.numeric_arrays.ir_checks]] name = "numeric_array_uses_unboxed_set" -contains = "js_array_numeric_set_f64_unboxed" -detail = "numeric indexed write uses the guarded raw-f64 helper" +contains = "js_typed_feedback_numeric_array_index_set_guard" +regex = '''idxset\.inbounds\.\d+:[\s\S]*?inttoptr i64 %\w+ to ptr\s*\n\s*store double %\w+, ptr %\w+[^\n]*\n\s*br label %idxset\.merge''' +regex_none = ["call i32 @js_array_numeric_set_f64_unboxed"] +detail = "numeric indexed write takes the guarded raw-f64 fast path and stores the slot inline (inttoptr + store double in idxset.inbounds; helper call elided)" [[workloads.numeric_arrays.stdout_checks]] name = "numeric_arrays_checksum" @@ -674,7 +676,7 @@ rejected_fact_state = "invalidated" [[workloads.numeric_arrays.native_rep_checks.require_records]] name = "numeric_array_get_fast_f64" expr_kind = "NumericArrayIndexGet" -consumer = "js_array_numeric_get_f64_unboxed" +consumer = "numeric_array_index_get.raw_f64_load" 
native_rep_name = "f64" access_mode = "checked_native" bounds_state = "proven_or_guarded" @@ -702,7 +704,7 @@ rejected_fact_state = "invalidated" [[workloads.numeric_arrays.native_rep_checks.require_records]] name = "numeric_array_set_fast_f64" expr_kind = "NumericArrayIndexSet" -consumer = "js_array_numeric_set_f64_unboxed" +consumer = "numeric_array_index_set.raw_f64_store" native_rep_name = "f64" access_mode = "checked_native" bounds_state = "proven_or_guarded" diff --git a/crates/perry-codegen/src/expr/index.rs b/crates/perry-codegen/src/expr/index.rs index 3e6fad1fd8..cd158f08cc 100644 --- a/crates/perry-codegen/src/expr/index.rs +++ b/crates/perry-codegen/src/expr/index.rs @@ -212,11 +212,15 @@ pub(crate) fn lower_index_set_fast( { let blk = ctx.block(); if require_numeric_layout { - blk.call( - I32, - "js_array_numeric_set_f64_unboxed", - &[(I64, &arr_handle), (I32, &idx_i32), (DOUBLE, val_double)], - ); + let (_element_addr, element_ptr) = element_slot(blk, &arr_handle, &idx_i32); + // The numeric-array guard proves the receiver has raw-f64 numeric + // layout and the value is numeric; the preceding length check + // proves this specific store is in-bounds. Store the numeric + // payload directly instead of calling the runtime helper. + // GC_STORE_AUDIT(POINTER_FREE): the stored value is a guard-proven + // numeric f64 written into a raw-f64 array payload slot — no GC + // pointer is stored, so no write barrier is required. + blk.store(DOUBLE, val_double, &element_ptr); } else { let (element_addr, element_ptr) = element_slot(blk, &arr_handle, &idx_i32); // In-place overwrite of a non-raw-layout (e.g. 
downgraded `any[]`) @@ -251,7 +255,7 @@ pub(crate) fn lower_index_set_fast( ctx.record_lowered_value_with_access_mode_and_facts( "NumericArrayIndexSet", Some(local_id), - "js_array_numeric_set_f64_unboxed", + "numeric_array_index_set.raw_f64_store", &stored, Some(BoundsState::Guarded { guard_id: "numeric_array_index_set_guard".to_string(), diff --git a/crates/perry-codegen/src/expr/index_get.rs b/crates/perry-codegen/src/expr/index_get.rs index 6ba83f993e..a232b533e9 100644 --- a/crates/perry-codegen/src/expr/index_get.rs +++ b/crates/perry-codegen/src/expr/index_get.rs @@ -289,6 +289,11 @@ fn lower_guarded_array_index_get( let fast_blk = ctx.block(); let arr_bits = fast_blk.bitcast_double_to_i64(arr_box); let arr_handle = fast_blk.and(I64, &arr_bits, POINTER_MASK_I64); + let idx_i64 = fast_blk.zext(I32, idx_i32, I64); + let byte_offset = fast_blk.shl(I64, &idx_i64, "3"); + let with_header = fast_blk.add(I64, &byte_offset, "8"); + let element_addr = fast_blk.add(I64, &arr_handle, &with_header); + let element_ptr = fast_blk.inttoptr(I64, &element_addr); let fast_val = if require_numeric_layout { // The `numeric_array_index_get_guard` on the way into this block already // proved: a plain, non-forwarded `Array`, in raw-f64 numeric layout, @@ -297,19 +302,10 @@ fn lower_guarded_array_index_get( // of calling `js_array_numeric_get_f64_unboxed`, whose hot path // re-validates exactly those same conditions and then does this load. // Raw-f64 arrays are dense (no HOLE slots) and the slot holds a raw f64, - // matching the runtime helper's `return *elements_ptr.add(index)`. - let idx_i64 = fast_blk.zext(I32, idx_i32, I64); - let byte_offset = fast_blk.shl(I64, &idx_i64, "3"); - let with_header = fast_blk.add(I64, &byte_offset, "8"); - let element_addr = fast_blk.add(I64, &arr_handle, &with_header); - let element_ptr = fast_blk.inttoptr(I64, &element_addr); + // matching the runtime helper's `return *elements_ptr.add(index)`. 
The + // `element_ptr` is hoisted above the branch since both arms reuse it. fast_blk.load(DOUBLE, &element_ptr) } else { - let idx_i64 = fast_blk.zext(I32, idx_i32, I64); - let byte_offset = fast_blk.shl(I64, &idx_i64, "3"); - let with_header = fast_blk.add(I64, &byte_offset, "8"); - let element_addr = fast_blk.add(I64, &arr_handle, &with_header); - let element_ptr = fast_blk.inttoptr(I64, &element_addr); let fast_raw = fast_blk.load(DOUBLE, &element_ptr); // `new Array(n)` slots are TAG_HOLE internally; JavaScript reads expose // `undefined`. @@ -330,7 +326,7 @@ fn lower_guarded_array_index_get( ctx.record_lowered_value_with_access_mode_and_facts( "NumericArrayIndexGet", None, - "js_array_numeric_get_f64_unboxed", + "numeric_array_index_get.raw_f64_load", &fast, Some(BoundsState::Guarded { guard_id: "numeric_array_index_get_guard".to_string(), diff --git a/crates/perry-codegen/src/expr/index_set.rs b/crates/perry-codegen/src/expr/index_set.rs index 0ede0171f6..0a0aa02db3 100644 --- a/crates/perry-codegen/src/expr/index_set.rs +++ b/crates/perry-codegen/src/expr/index_set.rs @@ -439,11 +439,15 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result { let blk = ctx.block(); let arr_bits = blk.bitcast_double_to_i64(&arr_box); let arr_handle = blk.and(I64, &arr_bits, POINTER_MASK_I64); - blk.call( - I32, - "js_array_numeric_set_f64_unboxed", - &[(I64, &arr_handle), (I32, &idx_i32), (DOUBLE, &val_double)], - ); + let idx_i64 = blk.zext(I32, &idx_i32, I64); + let byte_offset = blk.shl(I64, &idx_i64, "3"); + let with_header = blk.add(I64, &byte_offset, "8"); + let element_addr = blk.add(I64, &arr_handle, &with_header); + let element_ptr = blk.inttoptr(I64, &element_addr); + // GC_STORE_AUDIT(POINTER_FREE): guard-proven + // numeric f64 stored into a raw-f64 array + // payload slot — no GC pointer, no barrier. 
+ blk.store(DOUBLE, &val_double, &element_ptr); blk.br(&merge_label); } let stored = LoweredValue { @@ -455,7 +459,7 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result { ctx.record_lowered_value_with_access_mode_and_facts( "NumericArrayIndexSet", Some(*arr_id), - "js_array_numeric_set_f64_unboxed", + "numeric_array_index_set.raw_f64_store", &stored, Some(BoundsState::Guarded { guard_id: "numeric_array_index_set_guard".to_string(), diff --git a/crates/perry-codegen/src/native_value/verify.rs b/crates/perry-codegen/src/native_value/verify.rs index 4a14024d4a..436427e23c 100644 --- a/crates/perry-codegen/src/native_value/verify.rs +++ b/crates/perry-codegen/src/native_value/verify.rs @@ -255,6 +255,8 @@ fn raw_f64_checked_native_consumer(record: &NativeRepRecord) -> bool { record.consumer.as_str(), "js_array_numeric_get_f64_unboxed" | "js_array_numeric_set_f64_unboxed" + | "numeric_array_index_get.raw_f64_load" + | "numeric_array_index_set.raw_f64_store" | "js_array_numeric_push_f64_unboxed" | "class_field_get.raw_f64_load" | "class_field_set.raw_f64_store" @@ -1257,6 +1259,14 @@ mod tests { for (expr_kind, consumer) in [ ("NumericArrayIndexGet", "js_array_numeric_get_f64_unboxed"), ("NumericArrayIndexSet", "js_array_numeric_set_f64_unboxed"), + ( + "NumericArrayIndexGet", + "numeric_array_index_get.raw_f64_load", + ), + ( + "NumericArrayIndexSet", + "numeric_array_index_set.raw_f64_store", + ), ("NumericArrayPush", "js_array_numeric_push_f64_unboxed"), ("ClassFieldGet", "class_field_get.raw_f64_load"), ("ClassFieldSet", "class_field_set.raw_f64_store"), diff --git a/crates/perry-codegen/tests/native_proof_regressions.rs b/crates/perry-codegen/tests/native_proof_regressions.rs index 53b692ab1a..55d0ff47ee 100644 --- a/crates/perry-codegen/tests/native_proof_regressions.rs +++ b/crates/perry-codegen/tests/native_proof_regressions.rs @@ -1852,7 +1852,7 @@ fn artifact_records_numeric_array_f64_fast_paths_and_fallback_reasons() { assert!( 
records.iter().any(|record| { record["expr_kind"] == "NumericArrayIndexSet" - && record["consumer"] == "js_array_numeric_set_f64_unboxed" + && record["consumer"] == "numeric_array_index_set.raw_f64_store" && record["native_rep_name"] == "f64" && record["access_mode"] == "checked_native" && record_has_raw_f64_layout_fact(record, "consumed_facts", "consumed") @@ -1862,7 +1862,7 @@ fn artifact_records_numeric_array_f64_fast_paths_and_fallback_reasons() { assert!( records.iter().any(|record| { record["expr_kind"] == "NumericArrayIndexGet" - && record["consumer"] == "js_array_numeric_get_f64_unboxed" + && record["consumer"] == "numeric_array_index_get.raw_f64_load" && record["native_rep_name"] == "f64" && record["access_mode"] == "checked_native" && record_has_raw_f64_layout_fact(record, "consumed_facts", "consumed") diff --git a/crates/perry-codegen/tests/typed_feedback.rs b/crates/perry-codegen/tests/typed_feedback.rs index 8543cadf22..a0b124c572 100644 --- a/crates/perry-codegen/tests/typed_feedback.rs +++ b/crates/perry-codegen/tests/typed_feedback.rs @@ -422,8 +422,8 @@ fn typed_feedback_guards_array_index_specialization() { assert!(ir.contains("js_typed_feedback_array_index_set_fallback_boxed")); assert!(ir.contains("js_typed_feedback_numeric_array_index_get_guard")); assert!(ir.contains("js_typed_feedback_array_index_get_fallback_boxed")); - assert!(ir.contains("js_array_numeric_set_f64_unboxed")); - assert!(ir.contains("js_array_numeric_get_f64_unboxed")); + assert!(!ir.contains("call i32 @js_array_numeric_set_f64_unboxed")); + assert!(!ir.contains("call double @js_array_numeric_get_f64_unboxed")); } #[test] diff --git a/crates/perry-codegen/tests/typed_shape_descriptors.rs b/crates/perry-codegen/tests/typed_shape_descriptors.rs index 1b86972f18..62992262fd 100644 --- a/crates/perry-codegen/tests/typed_shape_descriptors.rs +++ b/crates/perry-codegen/tests/typed_shape_descriptors.rs @@ -443,9 +443,14 @@ fn 
bounded_integer_array_store_omits_layout_note_and_barrier() { let ir = ir_for(module); + // The bounded numeric store is inlined after the guard (no per-element + // `js_array_numeric_set_f64_unboxed` helper call): this commit inlines the + // guarded raw-f64 payload store directly, and a later commit's loop + // set-preguard does the same. The raw-f64 store still happens inline; the + // layout-note / barrier invariants below are what this test pins down. assert!( - ir.contains("call i32 @js_array_numeric_set_f64_unboxed"), - "bounded numeric array store should route through the raw-f64 payload helper" + !ir.contains("call i32 @js_array_numeric_set_f64_unboxed"), + "bounded numeric store should inline the raw-f64 payload store (no per-element helper call)" ); assert!( ir.contains("call i32 @js_typed_feedback_numeric_array_index_set_guard"), diff --git a/crates/perry-runtime/src/typed_feedback.rs b/crates/perry-runtime/src/typed_feedback.rs index bba2164b4f..86f39230a0 100644 --- a/crates/perry-runtime/src/typed_feedback.rs +++ b/crates/perry-runtime/src/typed_feedback.rs @@ -7,8 +7,7 @@ use std::collections::{BTreeMap, HashMap}; #[cfg(any(feature = "diagnostics", test))] use std::sync::atomic::AtomicBool; -#[cfg(any(feature = "diagnostics", test))] -use std::sync::atomic::Ordering; +use std::sync::atomic::{AtomicU64, AtomicU8, Ordering}; use std::sync::{LazyLock, Mutex}; use crate::array::ArrayHeader; @@ -19,11 +18,20 @@ use crate::value::{ }; const POLYMORPHIC_CAP: usize = 4; +const ARRAY_GUARD_FAST_CACHE_SIZE: usize = 4096; +const ARRAY_GUARD_FAST_CACHE_ENABLED: u8 = 1; +const ARRAY_GUARD_FAST_CACHE_DISABLED: u8 = 2; static REGISTRY: LazyLock<Mutex<TypedFeedbackRegistry>> = LazyLock::new(|| Mutex::new(TypedFeedbackRegistry::default())); #[cfg(any(feature = "diagnostics", test))] static TRACE_DUMPED: AtomicBool = AtomicBool::new(false); +static ARRAY_GUARD_FAST_CACHE: LazyLock<Box<[ArrayGuardFastCacheEntry]>> = LazyLock::new(|| { + (0..ARRAY_GUARD_FAST_CACHE_SIZE) + .map(|_| ArrayGuardFastCacheEntry::default()) + 
.collect::<Vec<_>>() + .into_boxed_slice() +}); #[cfg(not(test))] static TYPED_FEEDBACK_ENABLED: LazyLock = LazyLock::new(|| { @@ -329,8 +337,10 @@ pub struct GuardCounterSnapshot { } impl GuardCounterSnapshot { - fn add_site(&mut self, site: &TypedFeedbackSite) { - self.passes = self.passes.saturating_add(site.guard_passes); + fn add_site(&mut self, site: &TypedFeedbackSite, extra_guard_passes: u64) { + self.passes = self + .passes + .saturating_add(site.guard_passes.saturating_add(extra_guard_passes)); self.failures = self.failures.saturating_add(site.guard_failures); self.fallback_calls = self.fallback_calls.saturating_add(site.fallback_calls); } @@ -370,6 +380,122 @@ fn registry() -> crate::gc::GcRootRegistryGuard<'static, TypedFeedbackRegistry> crate::gc::lock_gc_root_registry(&REGISTRY) } +#[derive(Default)] +struct ArrayGuardFastCacheEntry { + site_id: AtomicU64, + packed: AtomicU64, + aux: AtomicU64, + fast_passes: AtomicU64, + state: AtomicU8, +} + +fn array_guard_cache_index(site_id: u64) -> usize { + let mixed = site_id ^ (site_id >> 32) ^ (site_id >> 17); + (mixed as usize) & (ARRAY_GUARD_FAST_CACHE_SIZE - 1) +} + +fn pack_array_guard_observation(observation: &Observation) -> Option<(u64, u64)> { + if observation.source != ObservationSource::Array || observation.shape_addr != 0 { + return None; + } + Some(( + (observation.class_id as u64) + | ((observation.heap_type as u64) << 32) + | ((observation.value_tag as u64) << 48), + observation.aux, + )) +} + +fn array_guard_fast_pass(site_id: u64, observation: &Observation, contract_valid: bool) -> bool { + if site_id == 0 || !contract_valid { + return false; + } + let Some((packed, aux)) = pack_array_guard_observation(observation) else { + return false; + }; + let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)]; + if entry.state.load(Ordering::Acquire) != ARRAY_GUARD_FAST_CACHE_ENABLED { + return false; + } + if entry.site_id.load(Ordering::Relaxed) != site_id { + return false; + } + if 
entry.packed.load(Ordering::Relaxed) == packed && entry.aux.load(Ordering::Relaxed) == aux { + entry.fast_passes.fetch_add(1, Ordering::Relaxed); + return true; + } + false +} + +fn note_array_guard_cache_slow_observation( + site_id: u64, + observation: &Observation, + site: &TypedFeedbackSite, +) { + if site_id == 0 { + return; + } + let Some((packed, aux)) = pack_array_guard_observation(observation) else { + return; + }; + let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)]; + let existing_site = entry.site_id.load(Ordering::Acquire); + if existing_site != site_id { + if existing_site != 0 { + return; + } + if entry + .site_id + .compare_exchange(0, site_id, Ordering::AcqRel, Ordering::Acquire) + .is_err() + { + return; + } + } + if site.megamorphic { + entry + .state + .store(ARRAY_GUARD_FAST_CACHE_DISABLED, Ordering::Release); + return; + } + if site + .observations + .iter() + .any(|seen| seen.same_feedback_key(observation)) + { + entry.packed.store(packed, Ordering::Relaxed); + entry.aux.store(aux, Ordering::Relaxed); + entry + .state + .store(ARRAY_GUARD_FAST_CACHE_ENABLED, Ordering::Release); + } +} + +fn array_guard_cache_fast_passes(site_id: u64) -> u64 { + if site_id == 0 { + return 0; + } + let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)]; + if entry.site_id.load(Ordering::Acquire) == site_id { + entry.fast_passes.load(Ordering::Relaxed) + } else { + 0 + } +} + +#[cfg(test)] +fn reset_array_guard_fast_cache_for_tests() { + for entry in ARRAY_GUARD_FAST_CACHE.iter() { + entry + .state + .store(ARRAY_GUARD_FAST_CACHE_DISABLED, Ordering::Release); + entry.site_id.store(0, Ordering::Release); + entry.packed.store(0, Ordering::Relaxed); + entry.aux.store(0, Ordering::Relaxed); + entry.fast_passes.store(0, Ordering::Relaxed); + } +} + #[no_mangle] pub extern "C" fn js_typed_feedback_register_site( site_id: u64, @@ -730,6 +856,7 @@ fn observe(site_id: u64, fallback_kind: TypedFeedbackSiteKind, observation: Obse ) }); 
site.observe(observation); + note_array_guard_cache_slow_observation(site_id, &observation, site); } fn site_entry( @@ -762,6 +889,9 @@ fn guard_observe( if site_id == 0 || !typed_feedback_enabled() { return contract_valid; } + if array_guard_fast_pass(site_id, &observation, contract_valid) { + return true; + } let mut reg = registry(); let site = site_entry(&mut reg, site_id, fallback_kind); let guard_passed = contract_valid @@ -777,6 +907,7 @@ fn guard_observe( site.guard_failures = site.guard_failures.saturating_add(1); } site.observe(observation); + note_array_guard_cache_slow_observation(site_id, &observation, site); guard_passed } @@ -1863,6 +1994,7 @@ pub fn scan_typed_feedback_roots_mut(visitor: &mut crate::gc::RuntimeRootVisitor #[cfg(test)] pub(crate) fn reset_typed_feedback_for_tests() { TRACE_DUMPED.store(false, Ordering::Release); + reset_array_guard_fast_cache_for_tests(); let mut reg = registry(); *reg = TypedFeedbackRegistry::default(); } diff --git a/crates/perry-runtime/src/typed_feedback/tests.rs b/crates/perry-runtime/src/typed_feedback/tests.rs index 821b9aae84..c3b35fc65c 100644 --- a/crates/perry-runtime/src/typed_feedback/tests.rs +++ b/crates/perry-runtime/src/typed_feedback/tests.rs @@ -505,6 +505,90 @@ fn typed_feedback_numeric_array_get_guard_requires_numeric_layout() { assert_eq!(site.fallback_calls, 0); } +#[test] +fn typed_feedback_numeric_array_guard_fast_path_preserves_snapshot_counts() { + let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); + reset_typed_feedback_for_tests(); + register(29, TypedFeedbackSiteKind::ArrayElement, "arr[i]"); + + let values = [1.0, 2.0]; + let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32); + let arr_box = crate::value::js_nanbox_pointer(arr as i64); + + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!( + 
js_typed_feedback_numeric_array_index_get_guard(29, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!(array_guard_cache_fast_passes(29), 2); + + let snapshot = typed_feedback_snapshot(); + let site = &snapshot.sites[0]; + assert_eq!(site.guard_passes, 3); + assert_eq!(site.guard_failures, 0); + assert_eq!(site.observed_count, 3); + assert_eq!(site.observation_count, 1); +} + +#[test] +fn typed_feedback_numeric_array_guard_fast_path_respects_megamorphic_state() { + let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); + reset_typed_feedback_for_tests(); + register(30, TypedFeedbackSiteKind::ArrayElement, "arr[i]"); + + let values = [1.0, 2.0]; + let arr = crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32); + let arr_box = crate::value::js_nanbox_pointer(arr as i64); + + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!( + js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1), + 1 + ); + assert_eq!(array_guard_cache_fast_passes(30), 1); + + for class_id in 1..=POLYMORPHIC_CAP { + observe( + 30, + TypedFeedbackSiteKind::ArrayElement, + Observation { + source: ObservationSource::Array, + object_addr: 0, + shape_addr: 0, + key_hash: 0, + class_id: class_id as u32, + heap_type: crate::gc::GC_TYPE_ARRAY as u16, + aux: pack_array_aux( + ARRAY_ACCESS_INDEXED_IN_BOUNDS, + ARRAY_LAYOUT_POINTER_FREE, + STABLE_VALUE_NUMBER, + 0, + ), + value_tag: STABLE_VALUE_NUMBER, + }, + ); + } + + let guard = js_typed_feedback_numeric_array_index_get_guard(30, arr_box, 0.0, 0, 1); + assert_eq!(guard, 0); + + let snapshot = typed_feedback_snapshot(); + let site = &snapshot.sites[0]; + assert_eq!(site.state, "megamorphic"); + assert_eq!(site.guard_passes, 2); + assert_eq!(site.guard_failures, 1); +} + #[test] fn typed_feedback_numeric_array_set_guard_requires_numeric_value_and_layout() { let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); diff --git a/crates/perry-runtime/src/typed_feedback/trace.rs 
b/crates/perry-runtime/src/typed_feedback/trace.rs index c67acbfb4f..597fb881c2 100644 --- a/crates/perry-runtime/src/typed_feedback/trace.rs +++ b/crates/perry-runtime/src/typed_feedback/trace.rs @@ -180,6 +180,9 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { let mut rows = Vec::with_capacity(reg.sites.len()); for site in reg.sites.values() { let state = site.state(); + let fast_guard_passes = array_guard_cache_fast_passes(site.site_id); + let observed_count = site.observed_count.saturating_add(fast_guard_passes); + let guard_passes = site.guard_passes.saturating_add(fast_guard_passes); *snapshot .by_kind .entry(site.metadata.kind.as_str().to_string()) @@ -198,9 +201,9 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { operation: site.metadata.operation.clone(), guard_name: site.metadata.guard_name.clone(), fallback_name: site.metadata.fallback_name.clone(), - observed_count: site.observed_count, + observed_count, observation_count: site.observations.len(), - guard_passes: site.guard_passes, + guard_passes, guard_failures: site.guard_failures, fallback_calls: site.fallback_calls, shape_invalidations: site.shape_invalidations, @@ -208,7 +211,7 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { representation_invalidations: site.representation_invalidations, observed_kinds: observed_kinds_snapshot(&site.observations), }); - snapshot.guard_passes = snapshot.guard_passes.saturating_add(site.guard_passes); + snapshot.guard_passes = snapshot.guard_passes.saturating_add(guard_passes); snapshot.guard_failures = snapshot.guard_failures.saturating_add(site.guard_failures); snapshot.fallback_calls = snapshot.fallback_calls.saturating_add(site.fallback_calls); snapshot @@ -219,7 +222,7 @@ pub fn typed_feedback_snapshot() -> TypedFeedbackSnapshot { failures: 0, fallback_calls: 0, }) - .add_site(site); + .add_site(site, fast_guard_passes); } rows.sort_by_key(|row| row.site_id); snapshot.sites = rows; diff --git 
a/scripts/check_file_size.sh b/scripts/check_file_size.sh index caa3835622..3426a52eb1 100755 --- a/scripts/check_file_size.sh +++ b/scripts/check_file_size.sh @@ -304,6 +304,13 @@ crates/perry-ext-http-server/src/http2_server.rs # on/once iterator machinery into the existing `events/` submodule is tracked # under #1435 with the other module-size cleanups. crates/perry-stdlib/src/events.rs +# Runtime typed-feedback registry. Crossed the 2000-line gate (2004 LOC) after +# the monomorphic array-guard fast cache (#5307) + pre-classification fast +# observation builder (#5309) added the lock-free direct-mapped cache and its +# slow-path fallbacks. The cache state is interwoven with the thread-local guard +# accounting and can't move without scattering it. Splitting the fast-cache out +# of the registry trunk is tracked under #1435. +crates/perry-runtime/src/typed_feedback.rs EOF ) diff --git a/tests/test_compiler_output_regression.py b/tests/test_compiler_output_regression.py index 479b4e52de..54c46751c6 100644 --- a/tests/test_compiler_output_regression.py +++ b/tests/test_compiler_output_regression.py @@ -338,7 +338,7 @@ def numeric_array_native_records(): native_record( rep="f64", expr_kind="NumericArrayIndexGet", - consumer="js_array_numeric_get_f64_unboxed", + consumer="numeric_array_index_get.raw_f64_load", access_mode="checked_native", bounds_state={"guarded": {"guard_id": "numeric_array_index_get_guard"}}, ), @@ -353,7 +353,7 @@ def numeric_array_native_records(): native_record( rep="f64", expr_kind="NumericArrayIndexSet", - consumer="js_array_numeric_set_f64_unboxed", + consumer="numeric_array_index_set.raw_f64_store", access_mode="checked_native", bounds_state={"guarded": {"guard_id": "numeric_array_index_set_guard"}}, ), @@ -553,7 +553,7 @@ def test_numeric_arrays_requires_runtime_api_fallback_reasons(self): native_record( rep="f64", expr_kind="NumericArrayIndexGet", - consumer="js_array_numeric_get_f64_unboxed", + 
consumer="numeric_array_index_get.raw_f64_load", access_mode="checked_native", bounds_state={"guarded": {"guard_id": "numeric_array_index_get_guard"}}, ), @@ -568,7 +568,7 @@ def test_numeric_arrays_requires_runtime_api_fallback_reasons(self): native_record( rep="f64", expr_kind="NumericArrayIndexSet", - consumer="js_array_numeric_set_f64_unboxed", + consumer="numeric_array_index_set.raw_f64_store", access_mode="checked_native", bounds_state={"guarded": {"guard_id": "numeric_array_index_set_guard"}}, ), @@ -928,10 +928,11 @@ def test_native_rep_unchecked_unknown_bounds_fails_gate(self): ) def test_generic_native_rep_checks_require_configured_records(self): - # The numeric indexed read is inlined: a guarded fast block computes the - # element pointer (inttoptr) and performs a direct `load double` instead - # of calling js_array_numeric_get_f64_unboxed. Push/set still go through - # their guarded raw-f64 helpers. + # The numeric indexed read and write are both inlined: guarded fast + # blocks compute the element pointer (inttoptr) and perform a direct + # `load double` / `store double` instead of calling + # js_array_numeric_get_f64_unboxed / js_array_numeric_set_f64_unboxed. + # Push still goes through its guarded raw-f64 helper. 
ir = """ define i32 @main() { entry: @@ -950,7 +951,18 @@ def test_generic_native_rep_checks_require_configured_records(self): br label %bidx.num.merge.3 bidx.num.merge.3: - call i32 @js_array_numeric_set_f64_unboxed(i64 1, i32 0, double 3.0) + %sg = call i32 @js_typed_feedback_numeric_array_index_set_guard(i64 1, double 0.0, i32 0, double 3.0, i32 0) + %sgc = icmp ne i32 %sg, 0 + br i1 %sgc, label %idxset.inbounds.4, label %idxset.merge.5 + +idxset.inbounds.4: + %sv = fadd double 3.0, 0.0 + %saddr = add i64 1, 8 + %sp = inttoptr i64 %saddr to ptr + store double %sv, ptr %sp, align 8 + br label %idxset.merge.5 + +idxset.merge.5: ret i32 0 } """ @@ -973,7 +985,7 @@ def test_generic_native_rep_checks_require_configured_records(self): native_record( rep="f64", expr_kind="NumericArrayIndexGet", - consumer="js_array_numeric_get_f64_unboxed", + consumer="numeric_array_index_get.raw_f64_load", access_mode="checked_native", bounds_state={"guarded": {"guard_id": "numeric_array_index_get_guard"}}, ), @@ -988,7 +1000,7 @@ def test_generic_native_rep_checks_require_configured_records(self): native_record( rep="f64", expr_kind="NumericArrayIndexSet", - consumer="js_array_numeric_set_f64_unboxed", + consumer="numeric_array_index_set.raw_f64_store", access_mode="checked_native", bounds_state={"guarded": {"guard_id": "numeric_array_index_set_guard"}}, ),