diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md index a64df6c01..3ef87f99f 100644 --- a/PERF_RUN_LOG.md +++ b/PERF_RUN_LOG.md @@ -234,3 +234,205 @@ - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local third-cycle results. - This follow-up is intended as a stacked draft PR on top of the monomorphic array guard fast-cache PR. - PR: https://github.com/PerryTS/perry/pull/5309 + +## 2026-06-17 - I32 lowering for loop-bound numeric array indices + +- Start revision: `966729232` +- Branch: `codex/perry-i32-array-index-lowering` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-guard-precheck-final2 --quiet` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-guard-precheck-final2-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-guard-precheck-final2 && jq '.guards' /tmp/perry-matrix-guard-precheck-final2-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-guard-precheck-final2` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-guard-precheck-final2.json` +- Baseline results: + - direct matrix binary: 400ms, 398ms, 398ms, 385ms, 386ms; checksum always `41079519680` + - final trace run: `matrix_multiply:395`, checksum `41079519680`, wall 0.42s, RSS 31500KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls 
+ - `perf stat` direct matrix binary: 1,443,394,074 cycles, 7,034,084,638 instructions, 1,568,434,556 branches, 241,348 branch-misses, 0.4222s elapsed + - compare quick medians: loop_overhead 76ms/18880KB, fibonacci 266ms/18764KB, math_intensive 55ms/19092KB, nested_loops 225ms/23204KB, factorial 95ms/18764KB + - quick: fibonacci 254ms/18MB, math_intensive 73ms/18MB, nested_loops 229ms/22MB, factorial 97ms/18MB, matrix_multiply 407ms/30MB +- Selected gap and evidence: + - After numeric array guard pre-classification, `matrix_multiply` remained the slowest `quick.sh` case at 407ms. + - LLVM trace for `benchmarks/suite/16_matrix_multiply.ts` still lowered hot computed get indices such as `i * size + k` through `sitofp`/`fmul`/`fadd`/`fptosi` before calling the typed-feedback numeric array get guard. + - Loop-bound analysis already proved and hoisted `size` as an i32 loop bound for `i < size` and `k < size`, but that trusted bound was not visible to the existing i32 expression lowering used by index expressions. +- Change: + - Reused or inserted an i32 slot for local loop bounds classified by the `i < n` loop-bound path and kept that slot visible while lowering the loop body. + - Used the existing `can_lower_expr_as_i32` / `lower_expr_as_i32` machinery for known-array computed get indices when the index expression is fully backed by trusted i32 slots, integer locals, or constants. + - Preserved the typed-feedback numeric array get guard and fallback path; the final i32 index is converted back to double only for the guard's double index argument. + - Added an IR regression test covering `xs[i * size + 1]` inside `for (let i = 0; i < size; i++)`, asserting guarded fallback emission plus `mul i32`/`add i32` and no `fmul double` for that computed index. 
+- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-i32-index-proto --trace llvm --quiet` + - `rg -n "js_typed_feedback_numeric_array_index_get_guard|fmul double|mul i32|add i32|sitofp i32" .perry-trace/llvm/_16_matrix_multiply_ts.ll` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-i32-index-proto-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto && jq '.guards' /tmp/perry-matrix-i32-index-proto-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-i32-index-proto` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-i32-index-proto.json` + - `for i in 1 2 3 4 5 6 7 8 9 10; do /usr/bin/time -f "sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-i32-index-proto; done` +- Post-change results: + - LLVM trace confirmed the two hot matmul numeric-array get indices now use `mul i32` and `add i32` before `call i32 @js_typed_feedback_numeric_array_index_get_guard`; remaining `sitofp i32` values feed the guard's double index argument. 
+ - direct matrix binary first sample set: 400ms, 393ms, 388ms, 403ms, 393ms; checksum always `41079519680` + - direct matrix binary 10-sample set: 397ms, 396ms, 390ms, 392ms, 392ms, 393ms, 383ms, 389ms, 386ms, 384ms; checksum always `41079519680` + - trace run: `matrix_multiply:397`, checksum `41079519680`, wall 0.42s, RSS 31440KB, 33,619,968 numeric array index-get guard passes, 65,536 numeric array index-set guard passes, 0 get/set guard failures; push guard retained 39 fallback calls + - `perf stat` direct matrix binary: 1,456,553,467 cycles, 7,017,622,346 instructions, 1,568,480,860 branches, 249,497 branch-misses, 0.4217s elapsed + - quick: fibonacci 251ms/18MB, math_intensive 71ms/18MB, nested_loops 202ms/22MB, factorial 99ms/18MB, matrix_multiply 387ms/30MB + - compare quick medians: loop_overhead 56ms/18784KB, fibonacci 248ms/18896KB, math_intensive 55ms/18900KB, nested_loops 214ms/23268KB, factorial 78ms/18776KB +- Measured impact: + - `16_matrix_multiply` direct median: 398ms -> 391ms, 1.8% faster + - `16_matrix_multiply` quick: 407ms -> 387ms, 4.9% faster + - Direct matrix binary instructions: 7.034B -> 7.018B, 0.2% fewer + - Direct matrix binary cycles: 1.443B -> 1.457B, 0.9% more in the single perf sample; branch misses also rose from 241K to 249K, so counter impact is mixed despite lower wall-time samples + - `10_nested_loops` compare median: 225ms -> 214ms, 4.9% faster +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `cargo build --release` + - Typed-feedback trace confirmed get/set guard pass counts and zero get/set failures match the pre-change trace. 
+- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only`, and the before/after comparison above uses the captured local fourth-cycle baseline. + - This is a smaller cleanup than the preceding guard-cache work. The keeper signal is the consistent matrix wall-time reduction plus removal of double arithmetic from the hottest generated get-index chains; perf counters should be watched on future runs. +- PR: https://github.com/PerryTS/perry/pull/5310 + +## 2026-06-17 - Hoist invariant numeric array reads out of inner loops + +- Start revision: `ec79e68ff` +- Branch: `codex/perry-invariant-array-read-hoist` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time` and `perf stat`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `target/release/perry compile --no-cache benchmarks/suite/10_nested_loops.ts -o /tmp/perry-nested-i32-index-baseline --trace llvm --quiet` + - `/usr/bin/time -f "nested wall=%e rss_kb=%M" /tmp/perry-nested-i32-index-baseline` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-nested-i32-index-baseline-trace.json /usr/bin/time -f "wall=%e rss_kb=%M" /tmp/perry-nested-i32-index-baseline && jq '[.sites[] | select(.kind=="array_element") | {site_id, guard_name, observed_count, guard_passes, guard_failures, fallback_calls}]' /tmp/perry-nested-i32-index-baseline-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-nested-i32-index-baseline` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-i32-index-proto.json` +- Baseline results: + - direct nested binary: `nested_loops:221`, sum `26991000000`, wall 0.23s, RSS 19396KB + - trace run: `nested_loops:223`, wall 0.22s, RSS 23068KB, both numeric array index-get sites reported 9,000,000 observations/passes, 0 failures, and 0 
fallback calls + - `perf stat` direct nested binary: 813,901,295 cycles, 3,643,681,343 instructions, 820,579,143 branches, 158,322 branch-misses, 0.2303s elapsed + - quick: fibonacci 251ms/18MB, math_intensive 71ms/18MB, nested_loops 202ms/22MB, factorial 99ms/18MB, matrix_multiply 387ms/30MB + - compare quick medians: loop_overhead 56ms/18784KB, fibonacci 248ms/18896KB, math_intensive 55ms/18900KB, nested_loops 214ms/23268KB, factorial 78ms/18776KB +- Selected gap and evidence: + - After i32 index lowering, `10_nested_loops.ts` had become the clearest remaining tight-loop bottleneck: two guarded raw-f64 array reads inside the 9M-iteration inner loop. + - `perf record`/`perf report` on `/tmp/perry-nested-i32-index-baseline` showed the hottest generated instructions were two payload loads at offsets `0x853f0` and `0x85384`. + - Disassembly showed the first hot load was `arr[i]`, invariant across the inner `j` loop, while `arr[j]` remained truly inner-loop variant. + - The typed-feedback trace confirmed both get sites were monomorphic and fallback-free with 9,000,000 successful guard passes each. +- Change: + - Added a narrowly scoped loop-lowering peephole for the shape `for (...; j < arr.length; j++)` nested under an already bounded `i < arr.length` loop. + - When the body is a single eager expression statement containing `arr[i]`, the body/update do not mutate `arr` or `i`, and the array is a numeric pointer-free array, the inner loop now guard-loads `arr[i]` once in a first-entry prebody and reuses a stack slot in the loop body. + - Split the loop CFG so only the initial true edge goes through `for.prebody`; backedges branch through `for.cond.backedge` directly to `for.body`, avoiding accidental reloading every inner iteration. + - Added `js_typed_feedback_record_array_guard_fast_passes(site_id, count)` so the skipped fast-path passes are bulk-accounted in the trace while preserving per-site observed/pass totals. 
+ - Restricted candidate discovery to eager expression forms and added a negative test proving branch-only conditional reads are not hoisted. +- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/10_nested_loops.ts -o /tmp/perry-nested-invariant-hoist-final --trace llvm --quiet` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "nested final direct wall=%e rss_kb=%M" /tmp/perry-nested-invariant-hoist-final; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-nested-invariant-hoist-final-trace.json /usr/bin/time -f "nested final trace wall=%e rss_kb=%M" /tmp/perry-nested-invariant-hoist-final && jq '[.sites[] | select(.kind=="array_element") | {site_id, guard_name, observed_count, guard_passes, guard_failures, fallback_calls}]' /tmp/perry-nested-invariant-hoist-final-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-nested-invariant-hoist-final` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-compare-invariant-hoist-final.json` +- Post-change results: + - direct nested binary: 103ms, 102ms, 102ms, 104ms, 102ms; sum always `26991000000` + - direct run wall/RSS samples: 0.10s/23028KB, 0.10s/22812KB, 0.10s/22780KB, 0.10s/22852KB, 0.10s/23140KB + - trace run: `nested_loops:125`, sum `26991000000`, wall 0.13s, RSS 23324KB, both numeric array index-get sites reported 9,000,000 observations/passes, 0 failures, and 0 fallback calls + - `perf stat` direct nested binary: 422,566,715 cycles, 1,842,815,590 instructions, 415,434,773 branches, 121,411 branch-misses, 0.1194s elapsed + - quick: fibonacci 237ms/18MB, math_intensive 75ms/18MB, nested_loops 124ms/22MB, factorial 76ms/18MB, matrix_multiply 390ms/30MB + - compare quick medians: loop_overhead 72ms/18856KB, fibonacci 258ms/18876KB, math_intensive 56ms/18948KB, nested_loops 101ms/23176KB, factorial 76ms/18744KB +- Measured impact: + - `10_nested_loops` direct binary: 221ms 
-> 102ms median, 53.8% faster + - `10_nested_loops` compare median: 214ms -> 101ms, 52.8% faster + - Direct nested binary cycles: 813.9M -> 422.6M, 48.1% fewer + - Direct nested binary instructions: 3.64B -> 1.84B, 49.4% fewer + - Direct nested binary branches: 820.6M -> 415.4M, 49.4% fewer + - `16_matrix_multiply` quick stayed in the expected range: 387ms -> 390ms +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `cargo build --release` + - Final typed-feedback trace confirmed the bulk-accounted hoisted read still reports the original 9,000,000 get observations/passes and zero fallback calls. +- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only`, and the before/after comparison above uses the captured local fifth-cycle baseline. + - This follow-up is intended as a stacked draft PR on top of the i32 array index lowering PR. 
+- PR: https://github.com/PerryTS/perry/pull/5312 + +## 2026-06-17 - Use i32 numeric array get guards for trusted integer indices + +- Start revision: `f8454f6bb` +- Branch: `codex/perry-i32-numeric-get-guard` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time` and `perf stat`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-invariant-hoist-baseline --trace llvm --quiet` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "matrix baseline sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-invariant-hoist-baseline; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-invariant-hoist-baseline-trace.json /usr/bin/time -f "matrix baseline trace wall=%e rss_kb=%M" /tmp/perry-matrix-invariant-hoist-baseline && jq '[.sites[] | select(.kind=="array_element") | {site_id, operation, guard_name, observed_count, guard_passes, guard_failures, fallback_calls}]' /tmp/perry-matrix-invariant-hoist-baseline-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-invariant-hoist-baseline` + - `perf record -F 999 -g --call-graph fp -o /tmp/perry-matrix-invariant-hoist-baseline.perf /tmp/perry-matrix-invariant-hoist-baseline` +- Baseline results: + - direct matrix binary samples: 391ms, 406ms, 385ms, 397ms, 407ms; checksum always `41079519680` + - trace run: `matrix_multiply:394`, wall 0.42s, RSS 31528KB + - hot get trace sites `3181980809628221440` and `3181980809628221441`: 16,777,216 observations/passes each, 0 failures, 0 fallback calls + - final checksum get trace site `3181980809628221446`: 65,536 observations/passes, 0 failures, 0 fallback calls + - direct matrix `perf stat`: 1,462,371,864 cycles, 7,017,654,871 instructions, 1,568,480,811 branches, 253,622 branch-misses, 0.4253s elapsed + - `perf 
report` showed roughly 48.7% and 45.6% at the two generated hot get fast-path load/address-mask regions + - `benchmarks/quick.sh` from the previous cycle: matrix_multiply 390ms/30MB +- Selected gap and evidence: + - After invariant-read hoisting, `16_matrix_multiply.ts` had two fully hot numeric array index-get sites in the innermost multiply loop. + - LLVM IR still emitted `sitofp i32` for each trusted computed get index before calling `js_typed_feedback_numeric_array_index_get_guard`, even though codegen had already proven and lowered the index as i32. + - The runtime guard then rechecked the boxed index bits with `is_plain_number_bits(index_value)`, adding avoidable work on every hot get. +- Change: + - Added `js_typed_feedback_numeric_array_index_get_guard_i32(site_id, receiver, index, require_in_bounds)` for codegen paths that already have a trusted i32 index. + - The new guard preserves numeric-layout, bounds, observation, pass/fail, and fallback accounting, but skips the boxed-index argument and plain-number bit check. + - Changed computed numeric array gets and bounded-index numeric gets to call the trusted-i32 guard when the index is already lowered as i32. + - Moved the i32-to-double boxing for trusted-i32 gets into the fallback block only, so the hot fast path does not materialize the boxed index. + - Reused the trusted-i32 helper from the invariant numeric array read hoist prebody. 
+- Post-change benchmark commands: + - `cargo build --release` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-i32-get-guard --trace llvm --quiet` + - `rg -n "numeric_array_index_get_guard_i32|numeric_array_index_get_guard\\(|sitofp i32" .perry-trace/llvm/_16_matrix_multiply_ts.ll` + - `for i in 1 2 3 4 5; do /usr/bin/time -f "matrix i32-guard sample=$i wall=%e rss_kb=%M" /tmp/perry-matrix-i32-get-guard; done` + - `PERRY_TYPED_FEEDBACK_TRACE=/tmp/perry-matrix-i32-get-guard-trace.json /usr/bin/time -f "matrix i32-guard trace wall=%e rss_kb=%M" /tmp/perry-matrix-i32-get-guard && jq '[.sites[] | select(.kind=="array_element") | {site_id, operation, guard_name, observed_count, guard_passes, guard_failures, fallback_calls}]' /tmp/perry-matrix-i32-get-guard-trace.json` + - `perf stat -e cycles,instructions,branches,branch-misses /tmp/perry-matrix-i32-get-guard` + - `benchmarks/quick.sh` + - `benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-compare-i32-get-guard.json` +- Post-change results: + - LLVM IR uses `js_typed_feedback_numeric_array_index_get_guard_i32` for the two hot matrix get sites and the final checksum get site; the remaining get-index `sitofp` instructions are in fallback blocks. 
+ - direct matrix binary samples: 378ms, 381ms, 385ms, 365ms, 376ms; checksum always `41079519680` + - trace run: `matrix_multiply:384`, wall 0.41s, RSS 31436KB + - hot get trace sites `3181980809628221440` and `3181980809628221441`: 16,777,216 observations/passes each, 0 failures, 0 fallback calls + - final checksum get trace site `3181980809628221446`: 65,536 observations/passes, 0 failures, 0 fallback calls + - direct matrix `perf stat`: 1,372,519,931 cycles, 6,614,211,059 instructions, 1,434,035,989 branches, 244,367 branch-misses, 0.4080s elapsed + - quick: fibonacci 262ms/18MB, math_intensive 61ms/18MB, nested_loops 118ms/22MB, factorial 93ms/18MB, matrix_multiply 368ms/30MB + - compare quick medians: loop_overhead 78ms/18828KB, fibonacci 246ms/18944KB, math_intensive 56ms/18904KB, nested_loops 120ms/23176KB, factorial 85ms/18904KB +- Measured impact: + - `16_matrix_multiply` direct binary: 397ms -> 378ms median, 4.8% faster + - `16_matrix_multiply` quick: 390ms -> 368ms, 5.6% faster + - Direct matrix binary cycles: 1.462B -> 1.373B, 6.1% fewer + - Direct matrix binary instructions: 7.018B -> 6.614B, 5.7% fewer + - Direct matrix binary branches: 1.568B -> 1.434B, 8.6% fewer +- Verification: + - `cargo fmt --check` + - `git diff --check` + - `cargo test -p perry-runtime typed_feedback_numeric_array_get_guard_i32_requires_numeric_layout` + - `cargo test -p perry-codegen --test typed_feedback` + - `cargo test -p perry-codegen --test typed_shape_descriptors` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `cargo build --release` + - Final typed-feedback trace confirmed unchanged get observation/pass totals and zero fallback calls. +- Notes: + - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only`, and the matrix before/after comparison above uses the captured local sixth-cycle baseline. 
+ - This is a deliberately narrow runtime ABI and codegen change: only paths with an already trusted i32 index use the new guard, and fallback boxing remains available for correctness. +- PR: https://github.com/PerryTS/perry/pull/5313 diff --git a/crates/perry-codegen/src/codegen/closure.rs b/crates/perry-codegen/src/codegen/closure.rs index 9cd9f3e8c..5ddfaad5f 100644 --- a/crates/perry-codegen/src/codegen/closure.rs +++ b/crates/perry-codegen/src/codegen/closure.rs @@ -323,6 +323,7 @@ pub(super) fn compile_closure( arena_state_slot: None, class_keys_slots: HashMap::new(), cached_lengths: HashMap::new(), + hoisted_array_index_gets: HashMap::new(), bounded_index_pairs: Vec::new(), i32_counter_slots: HashMap::new(), index_used_locals: native_facts.index_used_locals(), diff --git a/crates/perry-codegen/src/codegen/entry.rs b/crates/perry-codegen/src/codegen/entry.rs index af441609e..46d0f3b49 100644 --- a/crates/perry-codegen/src/codegen/entry.rs +++ b/crates/perry-codegen/src/codegen/entry.rs @@ -425,6 +425,7 @@ pub(super) fn compile_module_entry( arena_state_slot: None, class_keys_slots: HashMap::new(), cached_lengths: HashMap::new(), + hoisted_array_index_gets: HashMap::new(), bounded_index_pairs: Vec::new(), i32_counter_slots: HashMap::new(), index_used_locals: main_native_facts.index_used_locals(), @@ -864,6 +865,7 @@ pub(super) fn compile_module_entry( arena_state_slot: None, class_keys_slots: HashMap::new(), cached_lengths: HashMap::new(), + hoisted_array_index_gets: HashMap::new(), bounded_index_pairs: Vec::new(), i32_counter_slots: HashMap::new(), index_used_locals: init_native_facts.index_used_locals(), diff --git a/crates/perry-codegen/src/codegen/function.rs b/crates/perry-codegen/src/codegen/function.rs index 6228ec749..47f235e4d 100644 --- a/crates/perry-codegen/src/codegen/function.rs +++ b/crates/perry-codegen/src/codegen/function.rs @@ -225,6 +225,7 @@ pub(super) fn compile_function( arena_state_slot: None, class_keys_slots: HashMap::new(), 
cached_lengths: HashMap::new(), + hoisted_array_index_gets: HashMap::new(), bounded_index_pairs: Vec::new(), i32_counter_slots: HashMap::new(), index_used_locals: native_facts.index_used_locals(), diff --git a/crates/perry-codegen/src/codegen/method.rs b/crates/perry-codegen/src/codegen/method.rs index d5ee5d0c8..3f9fce4b9 100644 --- a/crates/perry-codegen/src/codegen/method.rs +++ b/crates/perry-codegen/src/codegen/method.rs @@ -212,6 +212,7 @@ pub(super) fn compile_method( arena_state_slot: None, class_keys_slots: HashMap::new(), cached_lengths: HashMap::new(), + hoisted_array_index_gets: HashMap::new(), bounded_index_pairs: Vec::new(), i32_counter_slots: HashMap::new(), index_used_locals: native_facts.index_used_locals(), @@ -722,6 +723,7 @@ pub(super) fn compile_static_method( arena_state_slot: None, class_keys_slots: HashMap::new(), cached_lengths: HashMap::new(), + hoisted_array_index_gets: HashMap::new(), bounded_index_pairs: Vec::new(), i32_counter_slots: HashMap::new(), index_used_locals: native_facts.index_used_locals(), diff --git a/crates/perry-codegen/src/expr/index_get.rs b/crates/perry-codegen/src/expr/index_get.rs index a232b533e..433d40bcb 100644 --- a/crates/perry-codegen/src/expr/index_get.rs +++ b/crates/perry-codegen/src/expr/index_get.rs @@ -35,7 +35,7 @@ use crate::types::{DOUBLE, I1, I16, I32, I64, I8, PTR}; use super::arrays_finds::lower_buffer_index_get_i32; #[allow(unused_imports)] use super::{ - buffer_access_materialization_reason, buffer_alias_metadata_suffix, + buffer_access_materialization_reason, buffer_alias_metadata_suffix, can_lower_expr_as_i32, emit_layout_note_slot_on_block, emit_shadow_slot_clear, emit_shadow_slot_update_for_expr, emit_string_literal_global, emit_typed_feedback_register_site, emit_v8_export_call, emit_v8_member_method_call, emit_write_barrier, emit_write_barrier_slot_on_block, @@ -186,13 +186,55 @@ fn lower_class_method_bind( )) } -fn lower_guarded_array_index_get( +pub(crate) fn lower_guarded_array_index_get( 
ctx: &mut FnCtx<'_>, arr_box: &str, idx_box: &str, idx_i32: &str, block_prefix: &str, require_numeric_layout: bool, + skipped_fast_pass_count: Option<&str>, +) -> Result { + lower_guarded_array_index_get_impl( + ctx, + arr_box, + Some(idx_box), + idx_i32, + block_prefix, + require_numeric_layout, + skipped_fast_pass_count, + false, + ) +} + +pub(crate) fn lower_guarded_array_index_get_trusted_i32( + ctx: &mut FnCtx<'_>, + arr_box: &str, + idx_i32: &str, + block_prefix: &str, + skipped_fast_pass_count: Option<&str>, +) -> Result { + lower_guarded_array_index_get_impl( + ctx, + arr_box, + None, + idx_i32, + block_prefix, + true, + skipped_fast_pass_count, + true, + ) +} + +fn lower_guarded_array_index_get_impl( + ctx: &mut FnCtx<'_>, + arr_box: &str, + idx_box: Option<&str>, + idx_i32: &str, + block_prefix: &str, + require_numeric_layout: bool, + skipped_fast_pass_count: Option<&str>, + trusted_i32_index: bool, ) -> Result { let contract = if require_numeric_layout { TypedFeedbackContract::numeric_array_get_index() @@ -214,34 +256,58 @@ fn lower_guarded_array_index_get( let guard_ok = { let blk = ctx.block(); - let guard_fn = if require_numeric_layout { - "js_typed_feedback_numeric_array_index_get_guard" + let use_i32_numeric_guard = require_numeric_layout && trusted_i32_index; + let guard_i32 = if use_i32_numeric_guard { + blk.call( + I32, + "js_typed_feedback_numeric_array_index_get_guard_i32", + &[ + (I64, &feedback_site_id), + (DOUBLE, arr_box), + (I32, idx_i32), + (I32, "1"), + ], + ) } else { - "js_typed_feedback_plain_array_index_get_guard" + let guard_fn = if require_numeric_layout { + "js_typed_feedback_numeric_array_index_get_guard" + } else { + "js_typed_feedback_plain_array_index_get_guard" + }; + let idx_box = + idx_box.expect("non-i32 array index guard path requires a boxed index value"); + blk.call( + I32, + guard_fn, + &[ + (I64, &feedback_site_id), + (DOUBLE, arr_box), + (DOUBLE, idx_box), + (I32, idx_i32), + (I32, "1"), + ], + ) }; - let guard_i32 = 
blk.call( - I32, - guard_fn, - &[ - (I64, &feedback_site_id), - (DOUBLE, arr_box), - (DOUBLE, idx_box), - (I32, idx_i32), - (I32, "1"), - ], - ); blk.icmp_ne(I32, &guard_i32, "0") }; ctx.block().cond_br(&guard_ok, &fast_label, &fallback_label); ctx.current_block = fallback_idx; + let fallback_idx_box; + let fallback_idx_ref = match idx_box { + Some(idx_box) => idx_box, + None => { + fallback_idx_box = ctx.block().sitofp(I32, idx_i32, DOUBLE); + &fallback_idx_box + } + }; let fallback_val = ctx.block().call( DOUBLE, "js_typed_feedback_array_index_get_fallback_boxed", &[ (I64, &feedback_site_id), (DOUBLE, arr_box), - (DOUBLE, idx_box), + (DOUBLE, fallback_idx_ref), ], ); let fallback_end_label = ctx.block().label.clone(); @@ -287,6 +353,12 @@ fn lower_guarded_array_index_get( ctx.current_block = fast_idx; let fast_blk = ctx.block(); + if let Some(count) = skipped_fast_pass_count { + fast_blk.call_void( + "js_typed_feedback_record_array_guard_fast_passes", + &[(I64, &feedback_site_id), (I64, count)], + ); + } let arr_bits = fast_blk.bitcast_double_to_i64(arr_box); let arr_handle = fast_blk.and(I64, &arr_bits, POINTER_MASK_I64); let idx_i64 = fast_blk.zext(I32, idx_i32, I64); @@ -809,6 +881,17 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result { } let require_numeric_layout = expr_has_numeric_pointer_free_array_layout(ctx, object); + if let (Expr::LocalGet(arr_id), Expr::LocalGet(idx_id)) = + (object.as_ref(), index.as_ref()) + { + if let Some(slot) = ctx + .hoisted_array_index_gets + .get(&(*arr_id, *idx_id)) + .cloned() + { + return Ok(ctx.block().load(DOUBLE, &slot)); + } + } // Bounded-index fast path (mirrors the IndexSet // optimization in the same file): if the surrounding // for-loop registered `(counter_id, arr_id)` as @@ -832,14 +915,8 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result { ctx.block().fptosi(DOUBLE, &idx_double, I32) }; if require_numeric_layout { - let idx_double = ctx.block().sitofp(I32, &idx_i32, DOUBLE); - 
return lower_guarded_array_index_get( - ctx, - &arr_box, - &idx_double, - &idx_i32, - "bidx.num", - true, + return lower_guarded_array_index_get_trusted_i32( + ctx, &arr_box, &idx_i32, "bidx.num", None, ); } return lower_bounded_array_index_get(ctx, &arr_box, &idx_i32); @@ -847,8 +924,36 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result { } let arr_box = lower_expr(ctx, object)?; - let idx_double = lower_expr(ctx, index)?; - let idx_i32 = ctx.block().fptosi(DOUBLE, &idx_double, I32); + let i32_slots = ctx.i32_counter_slots.clone(); + let flat_const_arrays = ctx.flat_const_arrays.clone(); + let array_row_aliases = ctx.array_row_aliases.clone(); + let integer_locals = ctx.integer_locals.clone(); + let use_i32_index = can_lower_expr_as_i32( + index, + &i32_slots, + &flat_const_arrays, + &array_row_aliases, + &integer_locals, + ctx.clamp3_functions, + ctx.clamp_u8_functions, + ctx.integer_returning_functions, + ctx.i32_identity_functions, + ); + if use_i32_index && require_numeric_layout { + let idx_i32 = lower_expr_as_i32(ctx, index)?; + return lower_guarded_array_index_get_trusted_i32( + ctx, &arr_box, &idx_i32, "arr", None, + ); + } + let (idx_double, idx_i32) = if use_i32_index { + let idx_i32 = lower_expr_as_i32(ctx, index)?; + let idx_double = ctx.block().sitofp(I32, &idx_i32, DOUBLE); + (idx_double, idx_i32) + } else { + let idx_double = lower_expr(ctx, index)?; + let idx_i32 = ctx.block().fptosi(DOUBLE, &idx_double, I32); + (idx_double, idx_i32) + }; if !require_numeric_layout && !matches!(index.as_ref(), Expr::Integer(_) | Expr::Number(_)) { @@ -861,6 +966,7 @@ pub(crate) fn lower(ctx: &mut FnCtx<'_>, expr: &Expr) -> Result { &idx_i32, "arr", require_numeric_layout, + None, ); } // Generic dynamic object access: stringify the index (no-op diff --git a/crates/perry-codegen/src/expr/mod.rs b/crates/perry-codegen/src/expr/mod.rs index 3c8800f0b..52de849c8 100644 --- a/crates/perry-codegen/src/expr/mod.rs +++ b/crates/perry-codegen/src/expr/mod.rs 
@@ -93,6 +93,7 @@ pub(crate) use i32_fast_path::{ try_flat_const_2d_int, try_lower_flat_const_index_get, }; pub(crate) use index::lower_index_set_fast; +pub(crate) use index_get::lower_guarded_array_index_get_trusted_i32; pub(crate) use nanbox_inline::{ i32_bool_to_nanbox, nanbox_bigint_inline, nanbox_pointer_inline, nanbox_pointer_inline_pub, nanbox_string_inline, @@ -599,6 +600,12 @@ pub(crate) struct FnCtx<'a> { /// call that LLVM can't prove won't modify the length). pub cached_lengths: std::collections::HashMap, + /// Loop-local replacements for an invariant numeric array read that was + /// guarded once in a loop prebody. Keyed as `(array_local_id, + /// index_local_id)` and active only while lowering the loop body whose + /// prebody filled the slot. + pub hoisted_array_index_gets: std::collections::HashMap<(u32, u32), String>, + /// `(counter_local_id, array_local_id)` pairs that are guaranteed /// inbounds inside the current loop nest — populated by /// `lower_for` when it detects the same `for (...; i < arr.length; diff --git a/crates/perry-codegen/src/runtime_decls/objects.rs b/crates/perry-codegen/src/runtime_decls/objects.rs index 8bbfa981d..4cfe8495a 100644 --- a/crates/perry-codegen/src/runtime_decls/objects.rs +++ b/crates/perry-codegen/src/runtime_decls/objects.rs @@ -82,6 +82,16 @@ pub fn declare_phase_b_objects(module: &mut LlModule) { module.declare_function("js_typed_feedback_record_guard_pass", VOID, &[I64]); module.declare_function("js_typed_feedback_record_guard_fail", VOID, &[I64]); module.declare_function("js_typed_feedback_record_fallback_call", VOID, &[I64]); + module.declare_function( + "js_typed_feedback_record_array_guard_fast_passes", + VOID, + &[I64, I64], + ); + module.declare_function( + "js_typed_feedback_numeric_array_index_get_guard_i32", + I32, + &[I64, DOUBLE, I32, I32], + ); module.declare_function( "js_typed_feedback_observe_property_get", VOID, diff --git a/crates/perry-codegen/src/stmt/loops.rs 
b/crates/perry-codegen/src/stmt/loops.rs index f05d636d2..84ac9191f 100644 --- a/crates/perry-codegen/src/stmt/loops.rs +++ b/crates/perry-codegen/src/stmt/loops.rs @@ -2,7 +2,10 @@ use super::*; -use crate::expr::{nanbox_pointer_inline, BoundedIndexPair, IntRangeFact}; +use crate::expr::{ + expr_has_numeric_pointer_free_array_layout, lower_guarded_array_index_get_trusted_i32, + nanbox_pointer_inline, BoundedIndexPair, IntRangeFact, +}; use crate::loop_purity::body_needs_asm_barrier; use crate::lower_conditional::lower_truthy; use crate::native_value::{BoundedBufferIndex, BoundsProof, BoundsState, LengthSource}; @@ -209,6 +212,13 @@ fn lower_numeric_bulk_fill_loop(ctx: &mut FnCtx<'_>, matched: NumericBulkFillLoo Ok(true) } +#[derive(Clone, Copy, Debug)] +struct InvariantArrayIndexGetHoist { + array_local_id: u32, + index_local_id: u32, + inner_counter_local_id: u32, +} + /// For-loop lowering: classic init / cond / body / update / exit CFG. /// /// ```text @@ -386,6 +396,7 @@ pub(crate) fn lower_for( // site having done so already). Only the site that inserted should // remove it at loop exit to avoid disturbing a pre-existing slot. let local_bound_counter_i32_was_fresh: bool; + let local_bound_bound_i32_was_fresh: bool; let i32_local_bound_slot: Option = if let Some((counter_id, bound_id, _op)) = local_bound_classification { // Allocate a parallel i32 slot for the counter if not already @@ -411,18 +422,28 @@ pub(crate) fn lower_for( local_bound_counter_i32_was_fresh = fresh; // Hoist `fptosi(n)` to a fresh i32 alloca before the cond block // so LLVM sees a loop-invariant integer bound — critical for - // SCEV / LoopVectorizer to recognize the induction variable. - if let Some(bound_slot) = ctx.locals.get(&bound_id).cloned() { + // SCEV / LoopVectorizer to recognize the induction variable. 
Also + // expose that slot while lowering the loop body so integer index + // expressions like `i * n + k` can reuse the same trusted bound + // instead of rebuilding the index through double arithmetic. + if let Some(existing) = ctx.i32_counter_slots.get(&bound_id).cloned() { + local_bound_bound_i32_was_fresh = false; + Some(existing) + } else if let Some(bound_slot) = ctx.locals.get(&bound_id).cloned() { let bound_dbl = ctx.block().load(DOUBLE, &bound_slot); let bound_i32 = ctx.block().fptosi(DOUBLE, &bound_dbl, I32); let slot = ctx.func.alloca_entry(I32); ctx.block().store(I32, &bound_i32, &slot); + ctx.i32_counter_slots.insert(bound_id, slot.clone()); + local_bound_bound_i32_was_fresh = true; Some(slot) } else { + local_bound_bound_i32_was_fresh = false; None } } else { local_bound_counter_i32_was_fresh = false; + local_bound_bound_i32_was_fresh = false; None }; // Issue #168 follow-up: when neither the `arr.length` hoist nor the static @@ -543,15 +564,53 @@ pub(crate) fn lower_for( ctx.int_range_facts.push(fact); } + let invariant_array_get_hoist: Option = + if let (Some((arr_id, inner_counter_id, op)), Some(_)) = + (hoist_classification, i32_length_slot.as_ref()) + { + if matches!(op, perry_hir::CompareOp::Lt) + && hoisted_index_bounds_are_safe + && ctx.i32_counter_slots.contains_key(&inner_counter_id) + { + classify_invariant_numeric_array_index_get_hoist( + ctx, + arr_id, + inner_counter_id, + update, + body, + ) + } else { + None + } + } else { + None + }; + let invariant_array_get_slot: Option = invariant_array_get_hoist + .as_ref() + .map(|_| ctx.func.alloca_entry(DOUBLE)); + let cond_idx = ctx.new_block("for.cond"); + let prebody_idx = if invariant_array_get_hoist.is_some() { + Some(ctx.new_block("for.prebody")) + } else { + None + }; let body_idx = ctx.new_block("for.body"); let update_idx = ctx.new_block("for.update"); + let backedge_cond_idx = if invariant_array_get_hoist.is_some() { + Some(ctx.new_block("for.cond.backedge")) + } else { + None + }; 
let exit_idx = ctx.new_block("for.exit"); let cond_label = ctx.block_label(cond_idx); + let prebody_label = prebody_idx.map(|idx| ctx.block_label(idx)); let body_label = ctx.block_label(body_idx); let update_label = ctx.block_label(update_idx); + let backedge_cond_label = backedge_cond_idx.map(|idx| ctx.block_label(idx)); let exit_label = ctx.block_label(exit_idx); + let cond_true_label = prebody_label.as_ref().unwrap_or(&body_label); // Branch from the block holding the init into the cond block. ctx.block().br(&cond_label); @@ -570,7 +629,7 @@ pub(crate) fn lower_for( perry_hir::CompareOp::Le => ctx.block().icmp_sle(I32, &ctr, &len), _ => ctx.block().icmp_slt(I32, &ctr, &len), }; - ctx.block().cond_br(&cmp, &body_label, &exit_label); + ctx.block().cond_br(&cmp, cond_true_label, &exit_label); true } else { false @@ -587,7 +646,7 @@ pub(crate) fn lower_for( perry_hir::CompareOp::Le => ctx.block().icmp_sle(I32, &ctr, &bound), _ => ctx.block().icmp_slt(I32, &ctr, &bound), }; - ctx.block().cond_br(&cmp, &body_label, &exit_label); + ctx.block().cond_br(&cmp, cond_true_label, &exit_label); true } else { false @@ -637,11 +696,11 @@ pub(crate) fn lower_for( if let Some(cond_expr) = condition { let cv = lower_expr(ctx, cond_expr)?; let i1 = lower_truthy(ctx, &cv, cond_expr); - ctx.block().cond_br(&i1, &body_label, &exit_label); + ctx.block().cond_br(&i1, cond_true_label, &exit_label); } else { // `for (;;)` — unconditional jump into the body. May be an // infinite loop unless the body contains a `break`. 
- ctx.block().br(&body_label); + ctx.block().br(cond_true_label); } } @@ -664,6 +723,25 @@ pub(crate) fn lower_for( ctx.active_region_id = Some(ctx.region_id_for_label(lbl)); } + if let (Some(pre_idx), Some(hoist), Some(slot)) = ( + prebody_idx, + invariant_array_get_hoist, + invariant_array_get_slot.as_ref(), + ) { + ctx.current_block = pre_idx; + let value = emit_invariant_numeric_array_index_get_hoist( + ctx, + hoist, + i32_length_slot + .as_ref() + .expect("invariant array get hoist requires an i32 length slot"), + )?; + ctx.block().store(DOUBLE, &value, slot); + if !ctx.block().is_terminated() { + ctx.block().br(&body_label); + } + } + // Body block. ctx.current_block = body_idx; if let Some(cond) = condition { @@ -672,7 +750,26 @@ pub(crate) fn lower_for( guarded.retain(|fact| loop_counter_bounds_are_safe(ctx, fact.index_local_id, update, body)); ctx.guarded_buffer_index_pairs.extend(guarded); } - lower_stmts(ctx, body)?; + let hoisted_replacement = if let (Some(hoist), Some(slot)) = + (invariant_array_get_hoist, invariant_array_get_slot.as_ref()) + { + Some(( + (hoist.array_local_id, hoist.index_local_id), + ctx.hoisted_array_index_gets + .insert((hoist.array_local_id, hoist.index_local_id), slot.clone()), + )) + } else { + None + }; + let lower_result = lower_stmts(ctx, body); + if let Some((key, previous)) = hoisted_replacement { + if let Some(previous) = previous { + ctx.hoisted_array_index_gets.insert(key, previous); + } else { + ctx.hoisted_array_index_gets.remove(&key); + } + } + lower_result?; clear_loop_body_shadow_slots(ctx, body); // Issue #74: insert an empty `asm sideeffect` in bodies whose // statements are all LLVM-pure (local-only arithmetic, no calls, @@ -695,7 +792,32 @@ pub(crate) fn lower_for( let _ = lower_expr(ctx, update_expr)?; } if !ctx.block().is_terminated() { - ctx.block().br(&cond_label); + ctx.block() + .br(backedge_cond_label.as_ref().unwrap_or(&cond_label)); + } + + if let Some(backedge_idx) = backedge_cond_idx { + 
ctx.current_block = backedge_idx; + if let (Some((_, counter_id, op)), Some(ref len_i32_slot)) = + (hoist_classification, &i32_length_slot) + { + if !emit_i32_length_loop_condition( + ctx, + counter_id, + op, + len_i32_slot, + &body_label, + &exit_label, + ) { + return Err(anyhow::anyhow!( + "invariant array get hoist missing backedge i32 condition" + )); + } + } else { + return Err(anyhow::anyhow!( + "invariant array get hoist missing backedge condition inputs" + )); + } } ctx.active_region_id = previous_region_id; @@ -718,6 +840,11 @@ pub(crate) fn lower_for( ctx.i32_counter_slots.remove(&counter_id); } } + if local_bound_bound_i32_was_fresh { + if let Some((_, bound_id, _)) = local_bound_classification { + ctx.i32_counter_slots.remove(&bound_id); + } + } let _ = i32_local_bound_slot; // Same cleanup for the runtime-guarded `any`-bound path. if let Some(dyn_bound) = dynamic_i32_bound { @@ -751,6 +878,203 @@ pub(crate) fn clear_loop_body_shadow_slots(ctx: &mut FnCtx<'_>, body: &[Stmt]) { emit_shadow_slot_clears(ctx, &slots); } +fn emit_invariant_numeric_array_index_get_hoist( + ctx: &mut FnCtx<'_>, + hoist: InvariantArrayIndexGetHoist, + len_i32_slot: &str, +) -> Result { + let arr_expr = perry_hir::Expr::LocalGet(hoist.array_local_id); + let idx_expr = perry_hir::Expr::LocalGet(hoist.index_local_id); + let arr_box = lower_expr(ctx, &arr_expr)?; + let idx_i32 = + if let Some(idx_i32_slot) = ctx.i32_counter_slots.get(&hoist.index_local_id).cloned() { + ctx.block().load(I32, &idx_i32_slot) + } else { + let idx_double = lower_expr(ctx, &idx_expr)?; + ctx.block().fptosi(DOUBLE, &idx_double, I32) + }; + let inner_counter_slot = ctx + .i32_counter_slots + .get(&hoist.inner_counter_local_id) + .cloned() + .ok_or_else(|| anyhow::anyhow!("invariant array get hoist missing inner i32 counter"))?; + let len_i32 = ctx.block().load(I32, len_i32_slot); + let inner_i32 = ctx.block().load(I32, &inner_counter_slot); + let remaining_i32 = ctx.block().sub(I32, &len_i32, &inner_i32); 
+ let skipped_i32 = ctx.block().sub(I32, &remaining_i32, "1"); + let skipped_i64 = ctx.block().zext(I32, &skipped_i32, I64); + + lower_guarded_array_index_get_trusted_i32( + ctx, + &arr_box, + &idx_i32, + "hoist.num", + Some(&skipped_i64), + ) +} + +fn emit_i32_length_loop_condition( + ctx: &mut FnCtx<'_>, + counter_id: u32, + op: perry_hir::CompareOp, + len_i32_slot: &str, + true_label: &str, + false_label: &str, +) -> bool { + let Some(ctr_i32_slot) = ctx.i32_counter_slots.get(&counter_id).cloned() else { + return false; + }; + let ctr = ctx.block().load(I32, &ctr_i32_slot); + let len = ctx.block().load(I32, len_i32_slot); + let cmp = match op { + perry_hir::CompareOp::Le => ctx.block().icmp_sle(I32, &ctr, &len), + _ => ctx.block().icmp_slt(I32, &ctr, &len), + }; + ctx.block().cond_br(&cmp, true_label, false_label); + true +} + +fn classify_invariant_numeric_array_index_get_hoist( + ctx: &crate::expr::FnCtx<'_>, + arr_id: u32, + inner_counter_id: u32, + update: Option<&perry_hir::Expr>, + body: &[perry_hir::Stmt], +) -> Option { + if !expr_has_numeric_pointer_free_array_layout(ctx, &perry_hir::Expr::LocalGet(arr_id)) { + return None; + } + let [perry_hir::Stmt::Expr(expr)] = body else { + return None; + }; + let index_id = find_invariant_array_index_get_candidate(ctx, expr, arr_id, inner_counter_id)?; + if update + .is_some_and(|expr| expr_mutates_local(expr, arr_id) || expr_mutates_local(expr, index_id)) + { + return None; + } + if expr_mutates_local(expr, arr_id) || expr_mutates_local(expr, index_id) { + return None; + } + if !expr_preserves_invariant_array_read(expr, arr_id, index_id) { + return None; + } + Some(InvariantArrayIndexGetHoist { + array_local_id: arr_id, + index_local_id: index_id, + inner_counter_local_id: inner_counter_id, + }) +} + +fn find_invariant_array_index_get_candidate( + ctx: &crate::expr::FnCtx<'_>, + expr: &perry_hir::Expr, + arr_id: u32, + inner_counter_id: u32, +) -> Option { + use perry_hir::{ArrayElement, Expr}; + + let find = + 
|expr: &Expr| find_invariant_array_index_get_candidate(ctx, expr, arr_id, inner_counter_id); + let find_in_order = |exprs: &[&Expr]| exprs.iter().find_map(|expr| find(expr)); + + match expr { + Expr::IndexGet { object, index } => { + if let (Expr::LocalGet(candidate_arr_id), Expr::LocalGet(candidate_index_id)) = + (object.as_ref(), index.as_ref()) + { + if *candidate_arr_id == arr_id + && *candidate_index_id != inner_counter_id + && ctx.bounded_index_pairs.iter().any(|fact| { + fact.array_local_id == arr_id && fact.index_local_id == *candidate_index_id + }) + { + return Some(*candidate_index_id); + } + } + find_in_order(&[object.as_ref(), index.as_ref()]) + } + Expr::LocalSet(_, value) => find(value), + Expr::Binary { left, right, .. } | Expr::Compare { left, right, .. } => { + find_in_order(&[left.as_ref(), right.as_ref()]) + } + Expr::Unary { operand, .. } + | Expr::Void(operand) + | Expr::TypeOf(operand) + | Expr::StringCoerce(operand) + | Expr::ObjectCoerce(operand) + | Expr::BooleanCoerce(operand) + | Expr::NumberCoerce(operand) => find(operand), + Expr::Array(elements) => elements.iter().find_map(find), + Expr::ArraySpread(elements) => elements.iter().find_map(|element| match element { + ArrayElement::Expr(expr) | ArrayElement::Spread(expr) => find(expr), + ArrayElement::Hole => None, + }), + Expr::MathImul(left, right) | Expr::MathPow(left, right) => { + find_in_order(&[left.as_ref(), right.as_ref()]) + } + Expr::MathMin(elements) | Expr::MathMax(elements) => elements.iter().find_map(find), + Expr::MathAbs(expr) + | Expr::MathSqrt(expr) + | Expr::MathFloor(expr) + | Expr::MathCeil(expr) + | Expr::MathRound(expr) + | Expr::MathF16round(expr) => find(expr), + // Avoid changing semantics for short-circuited or branch-only reads. + Expr::Conditional { .. } | Expr::Logical { .. 
} => None, + _ => None, + } +} + +fn expr_preserves_invariant_array_read(expr: &perry_hir::Expr, arr_id: u32, index_id: u32) -> bool { + use perry_hir::{ArrayElement, Expr}; + let walk = |expr: &Expr| expr_preserves_invariant_array_read(expr, arr_id, index_id); + match expr { + Expr::LocalSet(id, value) => *id != arr_id && *id != index_id && walk(value), + Expr::Update { id, .. } => *id != arr_id && *id != index_id, + Expr::Binary { left, right, .. } + | Expr::Compare { left, right, .. } + | Expr::Logical { left, right, .. } => walk(left) && walk(right), + Expr::Unary { operand, .. } + | Expr::Void(operand) + | Expr::TypeOf(operand) + | Expr::StringCoerce(operand) + | Expr::ObjectCoerce(operand) + | Expr::BooleanCoerce(operand) + | Expr::NumberCoerce(operand) => walk(operand), + Expr::Conditional { + condition, + then_expr, + else_expr, + } => walk(condition) && walk(then_expr) && walk(else_expr), + Expr::IndexGet { object, index } => walk(object) && walk(index), + Expr::Array(elements) => elements.iter().all(&walk), + Expr::ArraySpread(elements) => elements.iter().all(|element| match element { + ArrayElement::Expr(expr) | ArrayElement::Spread(expr) => walk(expr), + ArrayElement::Hole => true, + }), + Expr::MathImul(left, right) | Expr::MathPow(left, right) => walk(left) && walk(right), + Expr::MathMin(elements) | Expr::MathMax(elements) => elements.iter().all(&walk), + Expr::MathAbs(expr) + | Expr::MathSqrt(expr) + | Expr::MathFloor(expr) + | Expr::MathCeil(expr) + | Expr::MathRound(expr) + | Expr::MathF16round(expr) => walk(expr), + Expr::LocalGet(_) + | Expr::GlobalGet(_) + | Expr::FuncRef(_) + | Expr::Number(_) + | Expr::Integer(_) + | Expr::Bool(_) + | Expr::Null + | Expr::Undefined + | Expr::String(_) + | Expr::WtfString(_) => true, + _ => false, + } +} + /// Inspect a `for` loop's condition expression and body, and return /// `Some((arr_local_id, counter_local_id, op))` if the loop is the /// well-known shape `for (let i = ...; i < .length; ...) 
{ body }` diff --git a/crates/perry-codegen/tests/typed_feedback.rs b/crates/perry-codegen/tests/typed_feedback.rs index a0b124c57..b43d0e815 100644 --- a/crates/perry-codegen/tests/typed_feedback.rs +++ b/crates/perry-codegen/tests/typed_feedback.rs @@ -1,5 +1,8 @@ use perry_codegen::{compile_module, AppMetadata, CompileOptions}; -use perry_hir::{BinaryOp, Class, ClassField, Expr, Function, Module, ModuleInitKind, Param, Stmt}; +use perry_hir::{ + BinaryOp, Class, ClassField, CompareOp, Expr, Function, Module, ModuleInitKind, Param, Stmt, + UpdateOp, +}; use perry_types::{FunctionType, Type}; /// Serializes env-mutating tests so a concurrent test never observes a @@ -547,3 +550,234 @@ fn typed_feedback_guards_computed_numeric_array_index_hot_path() { assert!(!ir.contains("call double @js_array_numeric_get_f64_unboxed")); assert!(ir.contains("load double")); } + +#[test] +fn typed_feedback_guards_computed_numeric_array_index_uses_i32_loop_bound() { + let array_ty = Type::Array(Box::new(Type::Number)); + let ir = ir_for(module( + "typed_feedback_loop_bound_computed_array.ts", + vec![param(1, "xs", array_ty), param(2, "size", Type::Number)], + Type::Number, + vec![Stmt::For { + init: Some(Box::new(Stmt::Let { + id: 3, + name: "i".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + })), + condition: Some(Expr::Compare { + op: CompareOp::Lt, + left: Box::new(Expr::LocalGet(3)), + right: Box::new(Expr::LocalGet(2)), + }), + update: Some(Expr::Update { + id: 3, + op: UpdateOp::Increment, + prefix: false, + }), + body: vec![Stmt::Return(Some(Expr::IndexGet { + object: Box::new(Expr::LocalGet(1)), + index: Box::new(Expr::Binary { + op: BinaryOp::Add, + left: Box::new(Expr::Binary { + op: BinaryOp::Mul, + left: Box::new(Expr::LocalGet(3)), + right: Box::new(Expr::LocalGet(2)), + }), + right: Box::new(Expr::Integer(1)), + }), + }))], + }], + )); + + assert!(ir.contains("call i32 @js_typed_feedback_numeric_array_index_get_guard_i32")); + 
assert!(ir.contains("call double @js_typed_feedback_array_index_get_fallback_boxed")); + assert!(ir.contains("mul i32"), "{ir}"); + assert!(ir.contains("add i32"), "{ir}"); + assert!(!ir.contains("fmul double"), "{ir}"); + assert!(!ir.contains("call double @js_array_numeric_get_f64_unboxed")); +} + +#[test] +fn typed_feedback_hoists_invariant_numeric_array_get_out_of_inner_loop() { + let array_ty = Type::Array(Box::new(Type::Number)); + let arr_i = Expr::IndexGet { + object: Box::new(Expr::LocalGet(1)), + index: Box::new(Expr::LocalGet(3)), + }; + let arr_j = Expr::IndexGet { + object: Box::new(Expr::LocalGet(1)), + index: Box::new(Expr::LocalGet(4)), + }; + let ir = ir_for(module( + "typed_feedback_nested_array_hoist.ts", + vec![param(1, "xs", array_ty)], + Type::Number, + vec![ + Stmt::Let { + id: 2, + name: "sum".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + }, + Stmt::For { + init: Some(Box::new(Stmt::Let { + id: 3, + name: "i".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + })), + condition: Some(Expr::Compare { + op: CompareOp::Lt, + left: Box::new(Expr::LocalGet(3)), + right: Box::new(Expr::PropertyGet { + object: Box::new(Expr::LocalGet(1)), + property: "length".to_string(), + }), + }), + update: Some(Expr::Update { + id: 3, + op: UpdateOp::Increment, + prefix: false, + }), + body: vec![Stmt::For { + init: Some(Box::new(Stmt::Let { + id: 4, + name: "j".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + })), + condition: Some(Expr::Compare { + op: CompareOp::Lt, + left: Box::new(Expr::LocalGet(4)), + right: Box::new(Expr::PropertyGet { + object: Box::new(Expr::LocalGet(1)), + property: "length".to_string(), + }), + }), + update: Some(Expr::Update { + id: 4, + op: UpdateOp::Increment, + prefix: false, + }), + body: vec![Stmt::Expr(Expr::LocalSet( + 2, + Box::new(Expr::Binary { + op: BinaryOp::Add, + left: Box::new(Expr::Binary { + op: BinaryOp::Add, + 
left: Box::new(Expr::LocalGet(2)), + right: Box::new(arr_i), + }), + right: Box::new(arr_j), + }), + ))], + }], + }, + Stmt::Return(Some(Expr::LocalGet(2))), + ], + )); + + assert!(ir.contains("for.prebody"), "{ir}"); + assert!(ir.contains("hoist.num.fast"), "{ir}"); + assert!( + ir.contains("call void @js_typed_feedback_record_array_guard_fast_passes"), + "{ir}" + ); + assert!(ir.contains("call i32 @js_typed_feedback_numeric_array_index_get_guard_i32")); +} + +#[test] +fn typed_feedback_does_not_hoist_branch_only_invariant_numeric_array_get() { + let array_ty = Type::Array(Box::new(Type::Number)); + let arr_i = Expr::IndexGet { + object: Box::new(Expr::LocalGet(1)), + index: Box::new(Expr::LocalGet(3)), + }; + let arr_j = Expr::IndexGet { + object: Box::new(Expr::LocalGet(1)), + index: Box::new(Expr::LocalGet(4)), + }; + let ir = ir_for(module( + "typed_feedback_nested_array_no_conditional_hoist.ts", + vec![param(1, "xs", array_ty)], + Type::Number, + vec![ + Stmt::Let { + id: 2, + name: "sum".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + }, + Stmt::For { + init: Some(Box::new(Stmt::Let { + id: 3, + name: "i".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + })), + condition: Some(Expr::Compare { + op: CompareOp::Lt, + left: Box::new(Expr::LocalGet(3)), + right: Box::new(Expr::PropertyGet { + object: Box::new(Expr::LocalGet(1)), + property: "length".to_string(), + }), + }), + update: Some(Expr::Update { + id: 3, + op: UpdateOp::Increment, + prefix: false, + }), + body: vec![Stmt::For { + init: Some(Box::new(Stmt::Let { + id: 4, + name: "j".to_string(), + ty: Type::Number, + mutable: true, + init: Some(Expr::Integer(0)), + })), + condition: Some(Expr::Compare { + op: CompareOp::Lt, + left: Box::new(Expr::LocalGet(4)), + right: Box::new(Expr::PropertyGet { + object: Box::new(Expr::LocalGet(1)), + property: "length".to_string(), + }), + }), + update: Some(Expr::Update { + id: 4, + op: 
UpdateOp::Increment, + prefix: false, + }), + body: vec![Stmt::Expr(Expr::LocalSet( + 2, + Box::new(Expr::Binary { + op: BinaryOp::Add, + left: Box::new(Expr::LocalGet(2)), + right: Box::new(Expr::Conditional { + condition: Box::new(Expr::Bool(false)), + then_expr: Box::new(arr_i), + else_expr: Box::new(arr_j), + }), + }), + ))], + }], + }, + Stmt::Return(Some(Expr::LocalGet(2))), + ], + )); + + assert!(!ir.contains("for.prebody"), "{ir}"); + assert!(!ir.contains("hoist.num.fast"), "{ir}"); + assert!( + !ir.contains("call void @js_typed_feedback_record_array_guard_fast_passes"), + "{ir}" + ); +} diff --git a/crates/perry-runtime/src/typed_feedback.rs b/crates/perry-runtime/src/typed_feedback.rs index e6104a08d..794a4e990 100644 --- a/crates/perry-runtime/src/typed_feedback.rs +++ b/crates/perry-runtime/src/typed_feedback.rs @@ -483,6 +483,24 @@ fn array_guard_cache_fast_passes(site_id: u64) -> u64 { } } +fn record_array_guard_fast_passes(site_id: u64, count: u64) { + if site_id == 0 || count == 0 { + return; + } + + let entry = &ARRAY_GUARD_FAST_CACHE[array_guard_cache_index(site_id)]; + if entry.site_id.load(Ordering::Acquire) == site_id { + entry.fast_passes.fetch_add(count, Ordering::Relaxed); + return; + } + + let mut reg = registry(); + if let Some(site) = reg.sites.get_mut(&site_id) { + site.observed_count = site.observed_count.saturating_add(count); + site.guard_passes = site.guard_passes.saturating_add(count); + } +} + #[cfg(test)] fn reset_array_guard_fast_cache_for_tests() { for entry in ARRAY_GUARD_FAST_CACHE.iter() { @@ -956,6 +974,11 @@ pub extern "C" fn js_typed_feedback_record_fallback_call(site_id: u64) { record_fallback_call(site_id); } +#[no_mangle] +pub extern "C" fn js_typed_feedback_record_array_guard_fast_passes(site_id: u64, count: u64) { + record_array_guard_fast_passes(site_id, count); +} + fn observe_property( site_id: u64, kind: TypedFeedbackSiteKind, @@ -1499,6 +1522,55 @@ pub extern "C" fn 
js_typed_feedback_numeric_array_index_get_guard( } } +#[no_mangle] +pub extern "C" fn js_typed_feedback_numeric_array_index_get_guard_i32( + site_id: u64, + receiver: f64, + index: i32, + require_in_bounds: i32, +) -> i32 { + let raw_addr = normalize_raw_object_addr(receiver.to_bits()); + let require_in_bounds = require_in_bounds != 0; + if site_id != 0 && index >= 0 { + if let Some(observation) = + numeric_array_fast_observation(raw_addr, index as u32, require_in_bounds, None) + { + if array_guard_fast_pass(site_id, &observation, true) { + return 1; + } + } + } + let observed_index = if index >= 0 { index as u32 } else { u32::MAX }; + let (class_id, heap_type, aux, element_kind) = classify_array(raw_addr, Some(observed_index)); + let observation = Observation { + source: ObservationSource::Array, + object_addr: 0, + shape_addr: 0, + key_hash: 0, + class_id, + heap_type, + aux, + value_tag: element_kind, + }; + let contract_valid = index >= 0 + && numeric_array_index_guard( + raw_addr as *const ArrayHeader, + index as u32, + require_in_bounds, + ); + let pass = guard_observe( + site_id, + TypedFeedbackSiteKind::ArrayElement, + observation, + contract_valid, + ); + if pass { + 1 + } else { + 0 + } +} + #[no_mangle] pub extern "C" fn js_typed_feedback_array_index_get_fallback_boxed( site_id: u64, diff --git a/crates/perry-runtime/src/typed_feedback/tests.rs b/crates/perry-runtime/src/typed_feedback/tests.rs index f3eb208c4..832181f6f 100644 --- a/crates/perry-runtime/src/typed_feedback/tests.rs +++ b/crates/perry-runtime/src/typed_feedback/tests.rs @@ -505,6 +505,33 @@ fn typed_feedback_numeric_array_get_guard_requires_numeric_layout() { assert_eq!(site.fallback_calls, 0); } +#[test] +fn typed_feedback_numeric_array_get_guard_i32_requires_numeric_layout() { + let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); + reset_typed_feedback_for_tests(); + register(27, TypedFeedbackSiteKind::ArrayElement, "arr[i]"); + + let values = [1.0, 2.0]; + let arr = 
crate::array::js_array_from_f64(values.as_ptr(), values.len() as u32); + let arr_box = crate::value::js_nanbox_pointer(arr as i64); + + let first = js_typed_feedback_numeric_array_index_get_guard_i32(27, arr_box, 0, 1); + assert_eq!(first, 1); + + let payload = crate::string::js_string_from_bytes(b"downgraded".as_ptr(), 10); + let payload_value = crate::value::js_nanbox_string(payload as i64); + crate::array::js_array_set_f64(arr, 0, payload_value); + assert_eq!(crate::array::js_array_is_numeric_f64_layout(arr), 0); + + let second = js_typed_feedback_numeric_array_index_get_guard_i32(27, arr_box, 0, 1); + assert_eq!(second, 0); + + let site = &typed_feedback_snapshot().sites[0]; + assert_eq!(site.guard_passes, 1); + assert_eq!(site.guard_failures, 1); + assert_eq!(site.fallback_calls, 0); +} + #[test] fn typed_feedback_numeric_array_guard_fast_path_preserves_snapshot_counts() { let _guard = TYPED_FEEDBACK_TEST_LOCK.lock().unwrap(); diff --git a/crates/perry-runtime/src/typed_feedback/trace.rs b/crates/perry-runtime/src/typed_feedback/trace.rs index 597fb881c..a77df0796 100644 --- a/crates/perry-runtime/src/typed_feedback/trace.rs +++ b/crates/perry-runtime/src/typed_feedback/trace.rs @@ -392,4 +392,5 @@ mod keep_typed_feedback { #[used] static K22: extern "C" fn(u64, f64) -> f64 = js_typed_feedback_observe_helper_return; #[cfg(feature = "diagnostics")] #[used] static K23: extern "C" fn() = js_typed_feedback_maybe_dump_trace; + #[used] static K24: extern "C" fn(u64, f64, i32, i32) -> i32 = js_typed_feedback_numeric_array_index_get_guard_i32; }