From 8d953ca7ad6f32611209e7f8219f2c313f5db81a Mon Sep 17 00:00:00 2001 From: Andrew DiZenzo Date: Wed, 17 Jun 2026 04:52:16 +0000 Subject: [PATCH] Hoist typed feedback site registration --- PERF_RUN_LOG.md | 46 ++++++++++++ benchmarks/compare.sh | 70 ++++++++++++----- benchmarks/quick.sh | 75 ++++++++++++++----- .../perry-codegen/src/expr/typed_feedback.rs | 26 ++++--- crates/perry-codegen/src/function.rs | 24 +++++- 5 files changed, 193 insertions(+), 48 deletions(-) create mode 100644 PERF_RUN_LOG.md diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md new file mode 100644 index 0000000000..e395ec7481 --- /dev/null +++ b/PERF_RUN_LOG.md @@ -0,0 +1,46 @@ +# Perry Performance Run Log + +## 2026-06-17 - Typed feedback registration hoist + +- Start revision: `e816fc3e4af1` +- Branch: `codex/perry-performance-20260617` +- Worker assignment: single Codex pass in this worktree +- Benchmark environment: Linux `/usr/bin/time -v`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness +- Baseline commands: + - `cargo build --release` + - `./benchmarks/quick.sh` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-baseline-e816fc3e4.json` +- Baseline results: + - quick: fibonacci 260ms/18MB, math_intensive 73ms/18MB, nested_loops 3508ms/17MB, factorial 95ms/18MB, matrix_multiply 6462ms/27MB + - compare quick medians: loop_overhead 74ms/18772KB, fibonacci 262ms/18696KB, math_intensive 70ms/18696KB, nested_loops 3383ms/17724KB, factorial 96ms/18836KB +- Selected gap and evidence: + - `nested_loops` dominated the quick compare set at 3383ms; `matrix_multiply` was the slowest `quick.sh` case at 6462ms. + - LLVM trace for `benchmarks/suite/10_nested_loops.ts` showed `js_typed_feedback_register_site(...)` emitted inside the hot `for.body.21` inner loop before each typed-feedback array guard. 
+- Change: + - Added `LlFunction::entry_setup_call_void` and changed typed-feedback site registration to emit once in function-entry setup instead of at every guard use site. + - Kept guard, fallback, pass, and counter calls at original use sites so runtime evidence semantics remain per-use. + - Updated benchmark harnesses to support Linux RSS measurement and skip Node `.ts` columns when the installed Node cannot run TypeScript directly. +- Post-change benchmark commands: + - `cargo build --release` + - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-final-e816fc3e4.json` + - `./benchmarks/quick.sh` +- Post-change results: + - compare quick medians: loop_overhead 74ms/18768KB, fibonacci 261ms/18920KB, math_intensive 69ms/18944KB, nested_loops 956ms/19152KB, factorial 94ms/18896KB + - quick: fibonacci 262ms/18MB, math_intensive 55ms/18MB, nested_loops 965ms/18MB, factorial 75ms/18MB, matrix_multiply 1842ms/28MB +- Measured impact: + - `10_nested_loops` compare median: 3383ms -> 956ms, 71.7% faster + - `16_matrix_multiply` quick: 6462ms -> 1842ms, 71.5% faster +- Verification: + - `bash -n benchmarks/quick.sh` + - `bash -n benchmarks/compare.sh` + - `cargo fmt --check` + - `cargo test -p perry-codegen --test typed_feedback` + - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py` + - `tests/test_benchmark_output_verifier.sh` + - `target/release/perry compile --no-cache benchmarks/suite/10_nested_loops.ts -o /tmp/perry-nested-loops-final --trace llvm --quiet`; trace confirmed registration calls in entry setup only and no registration calls in `for.body.21` + - `/tmp/perry-nested-loops-final` produced `nested_loops:963` and `sum:26991000000` + - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-multiply-final --quiet && /tmp/perry-matrix-multiply-final` produced `matrix_multiply:1778` and `checksum:41079519680` +- Notes: + - `benchmarks/baseline.json` is 
stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local baseline JSON. + - Follow-up candidates remain in typed array and numeric array hot paths, but this cycle stopped at the isolated registration-hoist optimization. +- PR: https://github.com/PerryTS/perry/pull/5295 diff --git a/benchmarks/compare.sh b/benchmarks/compare.sh index 183a3c861b..41ba586ff7 100755 --- a/benchmarks/compare.sh +++ b/benchmarks/compare.sh @@ -75,7 +75,36 @@ fi # Check for node HAS_NODE=0 -command -v node &>/dev/null && HAS_NODE=1 +NODE_CMD=(node) + +detect_node_ts_runner() { + command -v node &>/dev/null || return 1 + + local probe + probe=$(mktemp "${TMPDIR:-/tmp}/perry-node-ts-probe.XXXXXX.ts") + printf 'const x: number = 1;\nconsole.log("node_ts_probe:" + x);\n' >"$probe" + + if node "$probe" >/dev/null 2>&1; then + NODE_CMD=(node) + rm -f "$probe" + return 0 + fi + + if node --experimental-strip-types "$probe" >/dev/null 2>&1; then + NODE_CMD=(node --experimental-strip-types) + rm -f "$probe" + return 0 + fi + + rm -f "$probe" + return 1 +} + +if detect_node_ts_runner; then + HAS_NODE=1 +else + echo "Node.js is unavailable for .ts benchmark inputs; Node columns and correctness checks will be skipped." >&2 +fi echo -e "${BOLD}${CYAN}Perry Performance Comparison (speed + RAM)${NC}" echo "" @@ -87,32 +116,39 @@ RESULTS_FILE=$(mktemp) RUN_OUTPUT_DIR=$(mktemp -d) extract_time() { - echo "$1" | grep -E "^[a-z_]+:[0-9]+" | head -1 | cut -d: -f2 + awk -F: '/^[a-z_]+:[0-9]+/ {print $2; exit}' <<<"$1" } measure_rss() { - # macOS: /usr/bin/time -l reports "peak memory footprint" in bytes on stderr - # Linux: /usr/bin/time -v reports "Maximum resident set size" in KB on stderr + # macOS: /usr/bin/time -l reports peak RSS in bytes. + # Linux: /usr/bin/time -v reports peak RSS in KB. 
local stdout_file="$1" local binary="$2" shift 2 local tmp_err=$(mktemp) - /usr/bin/time -l "$binary" "$@" >"$stdout_file" 2>"$tmp_err" + if [[ -x /usr/bin/time ]]; then + if [[ "$(uname)" == "Darwin" ]]; then + /usr/bin/time -l "$binary" "$@" >"$stdout_file" 2>"$tmp_err" || true + else + /usr/bin/time -v "$binary" "$@" >"$stdout_file" 2>"$tmp_err" || true + fi + else + "$binary" "$@" >"$stdout_file" 2>"$tmp_err" || true + fi - local rss_bytes=0 - # macOS newer: "peak memory footprint" in bytes - local pmf - pmf=$(grep 'peak memory footprint' "$tmp_err" 2>/dev/null | awk '{print $1}' || true) - if [[ -n "$pmf" && "$pmf" != "0" ]]; then - rss_bytes=$pmf + local rss_kb=0 + if [[ "$(uname)" == "Darwin" ]]; then + local rss_bytes + rss_bytes=$(awk '/peak memory footprint/ {print $1; exit} /maximum resident set size/ {print $1; exit}' "$tmp_err" 2>/dev/null || true) + rss_bytes=${rss_bytes:-0} + rss_kb=$((rss_bytes / 1024)) else - # macOS older / some versions: "maximum resident set size" in bytes - local mrs - mrs=$(grep 'maximum resident set size' "$tmp_err" 2>/dev/null | awk '{print $1}' || true) - [[ -n "$mrs" ]] && rss_bytes=$mrs + local linux_kb + linux_kb=$(awk -F': ' '/Maximum resident set size/ {print $2; exit}' "$tmp_err" 2>/dev/null || true) + linux_kb=${linux_kb:-0} + rss_kb=$linux_kb fi - local rss_kb=$((rss_bytes / 1024)) rm -f "$tmp_err" @@ -203,7 +239,7 @@ for bench in $BENCHMARKS; do for (( run=0; run/dev/null && HAS_NODE=1 +NODE_CMD=(node) + +detect_node_ts_runner() { + command -v node &>/dev/null || return 1 + + local probe + probe=$(mktemp "${TMPDIR:-/tmp}/perry-node-ts-probe.XXXXXX.ts") + printf 'const x: number = 1;\nconsole.log("node_ts_probe:" + x);\n' >"$probe" + + if node "$probe" >/dev/null 2>&1; then + NODE_CMD=(node) + rm -f "$probe" + return 0 + fi + + if node --experimental-strip-types "$probe" >/dev/null 2>&1; then + NODE_CMD=(node --experimental-strip-types) + rm -f "$probe" + return 0 + fi + + rm -f "$probe" + return 1 +} + +if 
detect_node_ts_runner; then + HAS_NODE=1 +else + echo "Node.js is unavailable for .ts benchmark inputs; Node columns will be skipped." >&2 +fi extract_time() { - echo "$1" | grep -E "^[a-z_]+:[0-9]+" | head -1 | cut -d: -f2 + awk -F: '/^[a-z_]+:[0-9]+/ {print $2; exit}' <<<"$1" } measure() { local tmp_err=$(mktemp) tmp_out=$(mktemp) - /usr/bin/time -l "$@" >"$tmp_out" 2>"$tmp_err" - local rss=0 - local pmf - pmf=$(grep 'peak memory footprint' "$tmp_err" 2>/dev/null | awk '{print $1}') - if [[ -n "$pmf" && "$pmf" != "0" ]]; then - rss=$pmf + if [[ -x /usr/bin/time ]]; then + if [[ "$(uname)" == "Darwin" ]]; then + /usr/bin/time -l "$@" >"$tmp_out" 2>"$tmp_err" || true + else + /usr/bin/time -v "$@" >"$tmp_out" 2>"$tmp_err" || true + fi + else + "$@" >"$tmp_out" 2>"$tmp_err" || true + fi + local rss_mb=0 + if [[ "$(uname)" == "Darwin" ]]; then + local rss_bytes + rss_bytes=$(awk '/peak memory footprint/ {print $1; exit} /maximum resident set size/ {print $1; exit}' "$tmp_err" 2>/dev/null || true) + rss_bytes=${rss_bytes:-0} + rss_mb=$((rss_bytes / 1024 / 1024)) else - local mrs - mrs=$(grep 'maximum resident set size' "$tmp_err" 2>/dev/null | awk '{print $1}') - [[ -n "$mrs" ]] && rss=$mrs + local rss_kb + rss_kb=$(awk -F': ' '/Maximum resident set size/ {print $2; exit}' "$tmp_err" 2>/dev/null || true) + rss_kb=${rss_kb:-0} + rss_mb=$((rss_kb / 1024)) fi - [[ -z "$rss" ]] && rss=0 - local rss_mb=$((rss / 1024 / 1024)) local output output=$(cat "$tmp_out") rm -f "$tmp_err" "$tmp_out" - echo "${rss_mb}|${output}" + printf '%s\n%s\n' "$rss_mb" "$output" } echo -e "${BOLD}${CYAN}Quick Bench (5 benchmarks)${NC}" @@ -71,17 +108,17 @@ for bench in $BENCHMARKS; do # Perry result=$(measure "./$name") - p_rss=$(echo "$result" | cut -d'|' -f1) - p_out=$(echo "$result" | cut -d'|' -f2-) + p_rss=$(sed -n '1p' <<<"$result") + p_out=$(sed '1d' <<<"$result") p_ms=$(extract_time "$p_out") # Node n_ms="-"; n_rss="-" ratio="-"; mratio="-" if [[ $HAS_NODE -eq 1 ]]; then - 
result=$(measure node "$bench") - n_rss=$(echo "$result" | cut -d'|' -f1) - n_out=$(echo "$result" | cut -d'|' -f2-) + result=$(measure "${NODE_CMD[@]}" "$bench") + n_rss=$(sed -n '1p' <<<"$result") + n_out=$(sed '1d' <<<"$result") n_ms=$(extract_time "$n_out") if [[ "$p_ms" =~ ^[0-9]+$ && "$n_ms" =~ ^[0-9]+$ && "$n_ms" -gt 0 ]]; then diff --git a/crates/perry-codegen/src/expr/typed_feedback.rs b/crates/perry-codegen/src/expr/typed_feedback.rs index 891592deb7..b2c6d0d336 100644 --- a/crates/perry-codegen/src/expr/typed_feedback.rs +++ b/crates/perry-codegen/src/expr/typed_feedback.rs @@ -228,23 +228,31 @@ pub(crate) fn emit_typed_feedback_register_site( emit_typed_feedback_bytes_global(ctx, local_site_id, "guard", contract.guard_name); let fallback_global = emit_typed_feedback_bytes_global(ctx, local_site_id, "fallback", contract.fallback_name); - ctx.block().call_void( + let site_id_arg = site_id.to_string(); + let kind_arg = kind.raw().to_string(); + let module_len_arg = module.len().to_string(); + let function_len_arg = function.len().to_string(); + let source_len_arg = source_label.len().to_string(); + let operation_len_arg = operation.len().to_string(); + let guard_len_arg = contract.guard_name.len().to_string(); + let fallback_len_arg = contract.fallback_name.len().to_string(); + ctx.func.entry_setup_call_void( "js_typed_feedback_register_site", &[ - (I64, &site_id.to_string()), - (I32, &kind.raw().to_string()), + (I64, &site_id_arg), + (I32, &kind_arg), (PTR, &module_global), - (I64, &module.len().to_string()), + (I64, &module_len_arg), (PTR, &function_global), - (I64, &function.len().to_string()), + (I64, &function_len_arg), (PTR, &source_global), - (I64, &source_label.len().to_string()), + (I64, &source_len_arg), (PTR, &operation_global), - (I64, &operation.len().to_string()), + (I64, &operation_len_arg), (PTR, &guard_global), - (I64, &contract.guard_name.len().to_string()), + (I64, &guard_len_arg), (PTR, &fallback_global), - (I64, 
&contract.fallback_name.len().to_string()), + (I64, &fallback_len_arg), ], ); site_id.to_string() diff --git a/crates/perry-codegen/src/function.rs b/crates/perry-codegen/src/function.rs index f7501ba2f4..afa753baf2 100644 --- a/crates/perry-codegen/src/function.rs +++ b/crates/perry-codegen/src/function.rs @@ -61,9 +61,9 @@ pub struct LlFunction { /// /// `to_ir()` splices these instructions into block 0 at the /// `entry_init_boundary` instruction index. If no boundary is set - /// (e.g. user functions, which have no init prelude), they're - /// appended to `entry_allocas` instead so the dominance guarantee - /// still holds. + /// (e.g. user functions, which have no init prelude), they are + /// emitted immediately after entry allocas and before the first + /// block instruction so the dominance guarantee still holds. entry_post_init_setup: Vec<String>, /// Index in block 0's instruction list where `entry_post_init_setup` /// should be spliced in. Set by `mark_entry_init_boundary` after @@ -252,6 +252,24 @@ impl LlFunction { .push(format!(" store {} {}, ptr {}", ty, val, ptr)); } + /// Emit a one-time void call in the function-entry setup region. + /// + /// Use this for metadata/registration work that must happen before + /// any reachable hot-path use but does not need to run at each use + /// site. If the function has an init prelude boundary, the call is + /// spliced after runtime/string initialization; otherwise it is + /// emitted at the top of the entry block with the other entry setup. 
+ pub fn entry_setup_call_void(&mut self, func_name: &str, args: &[(LlvmType, &str)]) { + crate::ext_registry::record_ffi_call(func_name); + let arg_str = args + .iter() + .map(|(ty, value)| format!("{} {}", ty, value)) + .collect::<Vec<_>>() + .join(", "); + let line = format!(" call void @{}({})", func_name, arg_str); + self.entry_post_init_setup.push(line); + } + /// Emit a one-time function-entry init sequence: allocate a `ptr` /// slot, call `func_name()` (no args), store the result in the /// slot, return the slot pointer name. Used by the inline bump