From 8d953ca7ad6f32611209e7f8219f2c313f5db81a Mon Sep 17 00:00:00 2001
From: Andrew DiZenzo <andrewdizenzojhu@gmail.com>
Date: Wed, 17 Jun 2026 04:52:16 +0000
Subject: [PATCH] Hoist typed feedback site registration

---
 PERF_RUN_LOG.md                               | 46 ++++++++++++
 benchmarks/compare.sh                         | 70 ++++++++++++-----
 benchmarks/quick.sh                           | 75 ++++++++++++++-----
 .../perry-codegen/src/expr/typed_feedback.rs  | 26 ++++---
 crates/perry-codegen/src/function.rs          | 24 +++++-
 5 files changed, 193 insertions(+), 48 deletions(-)
 create mode 100644 PERF_RUN_LOG.md

diff --git a/PERF_RUN_LOG.md b/PERF_RUN_LOG.md
new file mode 100644
index 0000000000..e395ec7481
--- /dev/null
+++ b/PERF_RUN_LOG.md
@@ -0,0 +1,46 @@
+# Perry Performance Run Log
+
+## 2026-06-17 - Typed feedback registration hoist
+
+- Start revision: `e816fc3e4af1`
+- Branch: `codex/perry-performance-20260617`
+- Worker assignment: single Codex pass in this worktree
+- Benchmark environment: Linux `/usr/bin/time -v`; local `node` cannot execute `.ts` benchmark inputs, so Node columns and correctness comparisons were skipped by the harness
+- Baseline commands:
+  - `cargo build --release`
+  - `./benchmarks/quick.sh`
+  - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-baseline-e816fc3e4.json`
+- Baseline results:
+  - quick: fibonacci 260ms/18MB, math_intensive 73ms/18MB, nested_loops 3508ms/17MB, factorial 95ms/18MB, matrix_multiply 6462ms/27MB
+  - compare quick medians: loop_overhead 74ms/18772KB, fibonacci 262ms/18696KB, math_intensive 70ms/18696KB, nested_loops 3383ms/17724KB, factorial 96ms/18836KB
+- Selected gap and evidence:
+  - `nested_loops` dominated the quick compare set at 3383ms; `matrix_multiply` was the slowest `quick.sh` case at 6462ms.
+  - LLVM trace for `benchmarks/suite/10_nested_loops.ts` showed `js_typed_feedback_register_site(...)` emitted inside the hot `for.body.21` inner loop before each typed-feedback array guard.
+- Change:
+  - Added `LlFunction::entry_setup_call_void` and changed typed-feedback site registration to emit once per function invocation, in function-entry setup, instead of at every guard use site.
+  - Kept guard, fallback, pass, and counter calls at original use sites so runtime evidence semantics remain per-use.
+  - Updated benchmark harnesses to support Linux RSS measurement and to skip Node `.ts` columns when the installed Node cannot run TypeScript either directly or with `--experimental-strip-types`.
+- Post-change benchmark commands:
+  - `cargo build --release`
+  - `./benchmarks/compare.sh --quick --runs 3 --warn-only --json-out /tmp/perry-final-e816fc3e4.json`
+  - `./benchmarks/quick.sh`
+- Post-change results:
+  - compare quick medians: loop_overhead 74ms/18768KB, fibonacci 261ms/18920KB, math_intensive 69ms/18944KB, nested_loops 956ms/19152KB, factorial 94ms/18896KB
+  - quick: fibonacci 262ms/18MB, math_intensive 55ms/18MB, nested_loops 965ms/18MB, factorial 75ms/18MB, matrix_multiply 1842ms/28MB
+- Measured impact:
+  - `10_nested_loops` compare median: 3383ms -> 956ms, 71.7% faster
+  - `16_matrix_multiply` quick: 6462ms -> 1842ms, 71.5% faster
+- Verification:
+  - `bash -n benchmarks/quick.sh`
+  - `bash -n benchmarks/compare.sh`
+  - `cargo fmt --check`
+  - `cargo test -p perry-codegen --test typed_feedback`
+  - `PERRY_BIN=target/release/perry python3 tests/test_typed_feedback_runtime_evidence.py`
+  - `tests/test_benchmark_output_verifier.sh`
+  - `target/release/perry compile --no-cache benchmarks/suite/10_nested_loops.ts -o /tmp/perry-nested-loops-final --trace llvm --quiet`; trace confirmed registration calls in entry setup only and no registration calls in `for.body.21`
+  - `/tmp/perry-nested-loops-final` produced `nested_loops:963` and `sum:26991000000`
+  - `target/release/perry compile --no-cache benchmarks/suite/16_matrix_multiply.ts -o /tmp/perry-matrix-multiply-final --quiet && /tmp/perry-matrix-multiply-final` produced `matrix_multiply:1778` and `checksum:41079519680`
+- Notes:
+  - `benchmarks/baseline.json` is stale for this Linux environment; compare was run with `--warn-only` and the before/after comparison above uses the captured local baseline JSON.
+  - Follow-up candidates remain in typed array and numeric array hot paths, but this cycle stopped at the isolated registration-hoist optimization.
+- PR: https://github.com/PerryTS/perry/pull/5295
diff --git a/benchmarks/compare.sh b/benchmarks/compare.sh
index 183a3c861b..41ba586ff7 100755
--- a/benchmarks/compare.sh
+++ b/benchmarks/compare.sh
@@ -75,7 +75,36 @@ fi
 
 # Check for node
 HAS_NODE=0
-command -v node &>/dev/null && HAS_NODE=1
+NODE_CMD=(node)
+
+# Probe whether node can run a .ts file; on success sets NODE_CMD and returns 0.
+detect_node_ts_runner() {
+  command -v node &>/dev/null || return 1
+
+  # BSD/macOS mktemp only expands trailing X's, so make a unique directory
+  # and put the .ts probe inside it instead of using a suffixed template.
+  local probe_dir probe status=1
+  probe_dir=$(mktemp -d "${TMPDIR:-/tmp}/perry-node-ts-probe.XXXXXX") || return 1
+  probe="$probe_dir/probe.ts"
+  printf 'const x: number = 1;\nconsole.log("node_ts_probe:" + x);\n' >"$probe"
+
+  if node "$probe" >/dev/null 2>&1; then
+    NODE_CMD=(node)
+    status=0
+  elif node --experimental-strip-types "$probe" >/dev/null 2>&1; then
+    NODE_CMD=(node --experimental-strip-types)
+    status=0
+  fi
+
+  rm -rf "$probe_dir"
+  return "$status"
+}
+
+if detect_node_ts_runner; then
+  HAS_NODE=1
+else
+  echo "Node.js is unavailable for .ts benchmark inputs; Node columns and correctness checks will be skipped." >&2
+fi
 
 echo -e "${BOLD}${CYAN}Perry Performance Comparison (speed + RAM)${NC}"
 echo ""
@@ -87,32 +116,39 @@ RESULTS_FILE=$(mktemp)
 RUN_OUTPUT_DIR=$(mktemp -d)
 
 extract_time() {
-  echo "$1" | grep -E "^[a-z_]+:[0-9]+" | head -1 | cut -d: -f2
+  awk -F: '/^[a-z_]+:[0-9]+/ {print $2; exit}' <<<"$1"  # 2nd ':'-separated field of the first "name:<digits>" line
 }
 
 measure_rss() {
-  # macOS: /usr/bin/time -l reports "peak memory footprint" in bytes on stderr
-  # Linux: /usr/bin/time -v reports "Maximum resident set size" in KB on stderr
+  # macOS: /usr/bin/time -l reports peak RSS in bytes.
+  # Linux: /usr/bin/time -v reports peak RSS in KB.
   local stdout_file="$1"
   local binary="$2"
   shift 2
   local tmp_err=$(mktemp)
 
-  /usr/bin/time -l "$binary" "$@" >"$stdout_file" 2>"$tmp_err"
+  if [[ -x /usr/bin/time ]]; then
+    if [[ "$(uname)" == "Darwin" ]]; then
+      /usr/bin/time -l "$binary" "$@" >"$stdout_file" 2>"$tmp_err" || true
+    else
+      /usr/bin/time -v "$binary" "$@" >"$stdout_file" 2>"$tmp_err" || true
+    fi
+  else
+    "$binary" "$@" >"$stdout_file" 2>"$tmp_err" || true
+  fi
 
-  local rss_bytes=0
-  # macOS newer: "peak memory footprint" in bytes
-  local pmf
-  pmf=$(grep 'peak memory footprint' "$tmp_err" 2>/dev/null | awk '{print $1}' || true)
-  if [[ -n "$pmf" && "$pmf" != "0" ]]; then
-    rss_bytes=$pmf
+  local rss_kb=0
+  if [[ "$(uname)" == "Darwin" ]]; then
+    local rss_bytes
+    rss_bytes=$(awk '/peak memory footprint/ {print $1; exit} /maximum resident set size/ {print $1; exit}' "$tmp_err" 2>/dev/null || true)
+    rss_bytes=${rss_bytes:-0}
+    rss_kb=$((rss_bytes / 1024))
   else
-    # macOS older / some versions: "maximum resident set size" in bytes
-    local mrs
-    mrs=$(grep 'maximum resident set size' "$tmp_err" 2>/dev/null | awk '{print $1}' || true)
-    [[ -n "$mrs" ]] && rss_bytes=$mrs
+    local linux_kb
+    linux_kb=$(awk -F': ' '/Maximum resident set size/ {print $2; exit}' "$tmp_err" 2>/dev/null || true)
+    linux_kb=${linux_kb:-0}
+    rss_kb=$linux_kb
   fi
-  local rss_kb=$((rss_bytes / 1024))
 
   rm -f "$tmp_err"
 
@@ -203,7 +239,7 @@ for bench in $BENCHMARKS; do
     for (( run=0; run<RUNS; run++ )); do
       n_out="$RUN_OUTPUT_DIR/$name.node.$run.out"
       n_out_samples+=("$n_out")
-      r_rss=$(measure_rss "$n_out" node "$SUITE_DIR/$bench")
+      r_rss=$(measure_rss "$n_out" "${NODE_CMD[@]}" "$SUITE_DIR/$bench")
       r_out=$(cat "$n_out")
       r_ms=$(extract_time "$r_out")
       [[ -n "$r_ms" ]] && n_ms_samples+=("$r_ms")
diff --git a/benchmarks/quick.sh b/benchmarks/quick.sh
index dbee7470d5..8518fb981b 100755
--- a/benchmarks/quick.sh
+++ b/benchmarks/quick.sh
@@ -24,31 +24,68 @@ fi
 
 BENCHMARKS="05_fibonacci.ts 06_math_intensive.ts 10_nested_loops.ts 13_factorial.ts 16_matrix_multiply.ts"
 HAS_NODE=0
-command -v node &>/dev/null && HAS_NODE=1
+NODE_CMD=(node)
+
+# Probe whether node can run a .ts file; on success sets NODE_CMD and returns 0.
+detect_node_ts_runner() {
+  command -v node &>/dev/null || return 1
+
+  # BSD/macOS mktemp only expands trailing X's, so make a unique directory
+  # and put the .ts probe inside it instead of using a suffixed template.
+  local probe_dir probe status=1
+  probe_dir=$(mktemp -d "${TMPDIR:-/tmp}/perry-node-ts-probe.XXXXXX") || return 1
+  probe="$probe_dir/probe.ts"
+  printf 'const x: number = 1;\nconsole.log("node_ts_probe:" + x);\n' >"$probe"
+
+  if node "$probe" >/dev/null 2>&1; then
+    NODE_CMD=(node)
+    status=0
+  elif node --experimental-strip-types "$probe" >/dev/null 2>&1; then
+    NODE_CMD=(node --experimental-strip-types)
+    status=0
+  fi
+
+  rm -rf "$probe_dir"
+  return "$status"
+}
+
+if detect_node_ts_runner; then
+  HAS_NODE=1
+else
+  echo "Node.js is unavailable for .ts benchmark inputs; Node columns will be skipped." >&2
+fi
 
 extract_time() {
-  echo "$1" | grep -E "^[a-z_]+:[0-9]+" | head -1 | cut -d: -f2
+  awk -F: '/^[a-z_]+:[0-9]+/ {print $2; exit}' <<<"$1"  # 2nd ':'-separated field of the first "name:<digits>" line
 }
 
 measure() {
+  # Run "$@" (under /usr/bin/time when present); print peak MB, then its stdout.
   local tmp_err=$(mktemp) tmp_out=$(mktemp)
-  /usr/bin/time -l "$@" >"$tmp_out" 2>"$tmp_err"
-  local rss=0
-  local pmf
-  pmf=$(grep 'peak memory footprint' "$tmp_err" 2>/dev/null | awk '{print $1}')
-  if [[ -n "$pmf" && "$pmf" != "0" ]]; then
-    rss=$pmf
+  if [[ -x /usr/bin/time ]]; then
+    if [[ "$(uname)" == "Darwin" ]]; then
+      /usr/bin/time -l "$@" >"$tmp_out" 2>"$tmp_err" || true
+    else
+      /usr/bin/time -v "$@" >"$tmp_out" 2>"$tmp_err" || true
+    fi
+  else
+    "$@" >"$tmp_out" 2>"$tmp_err" || true
+  fi
+  local rss_mb=0 rss_raw
+  if [[ "$(uname)" == "Darwin" ]]; then
+    # Bytes. Prefer a non-zero "peak memory footprint"; it is printed after
+    # "maximum resident set size", so scan every line before choosing.
+    rss_raw=$(awk '/peak memory footprint/ {pmf=$1} /maximum resident set size/ {mrs=$1} END {print (pmf+0 ? pmf : mrs+0)}' "$tmp_err" 2>/dev/null || true)
+    rss_mb=$(( ${rss_raw:-0} / 1024 / 1024 ))
   else
-    local mrs
-    mrs=$(grep 'maximum resident set size' "$tmp_err" 2>/dev/null | awk '{print $1}')
-    [[ -n "$mrs" ]] && rss=$mrs
+    # GNU time -v reports "Maximum resident set size (kbytes): N".
+    rss_raw=$(awk -F': ' '/Maximum resident set size/ {print $2; exit}' "$tmp_err" 2>/dev/null || true)
+    rss_mb=$(( ${rss_raw:-0} / 1024 ))
   fi
-  [[ -z "$rss" ]] && rss=0
-  local rss_mb=$((rss / 1024 / 1024))
   local output
   output=$(cat "$tmp_out")
   rm -f "$tmp_err" "$tmp_out"
-  echo "${rss_mb}|${output}"
+  printf '%s\n%s\n' "$rss_mb" "$output"
 }
 
 echo -e "${BOLD}${CYAN}Quick Bench (5 benchmarks)${NC}"
@@ -71,17 +108,17 @@ for bench in $BENCHMARKS; do
 
   # Perry
   result=$(measure "./$name")
-  p_rss=$(echo "$result" | cut -d'|' -f1)
-  p_out=$(echo "$result" | cut -d'|' -f2-)
+  p_rss=$(sed -n '1p' <<<"$result")
+  p_out=$(sed '1d' <<<"$result")
   p_ms=$(extract_time "$p_out")
 
   # Node
   n_ms="-"; n_rss="-"
   ratio="-"; mratio="-"
   if [[ $HAS_NODE -eq 1 ]]; then
-    result=$(measure node "$bench")
-    n_rss=$(echo "$result" | cut -d'|' -f1)
-    n_out=$(echo "$result" | cut -d'|' -f2-)
+    result=$(measure "${NODE_CMD[@]}" "$bench")
+    n_rss=$(sed -n '1p' <<<"$result")
+    n_out=$(sed '1d' <<<"$result")
     n_ms=$(extract_time "$n_out")
 
     if [[ "$p_ms" =~ ^[0-9]+$ && "$n_ms" =~ ^[0-9]+$ && "$n_ms" -gt 0 ]]; then
diff --git a/crates/perry-codegen/src/expr/typed_feedback.rs b/crates/perry-codegen/src/expr/typed_feedback.rs
index 891592deb7..b2c6d0d336 100644
--- a/crates/perry-codegen/src/expr/typed_feedback.rs
+++ b/crates/perry-codegen/src/expr/typed_feedback.rs
@@ -228,23 +228,31 @@ pub(crate) fn emit_typed_feedback_register_site(
         emit_typed_feedback_bytes_global(ctx, local_site_id, "guard", contract.guard_name);
     let fallback_global =
         emit_typed_feedback_bytes_global(ctx, local_site_id, "fallback", contract.fallback_name);
-    ctx.block().call_void(
+    let site_id_arg = site_id.to_string();
+    let kind_arg = kind.raw().to_string();
+    let module_len_arg = module.len().to_string();
+    let function_len_arg = function.len().to_string();
+    let source_len_arg = source_label.len().to_string();
+    let operation_len_arg = operation.len().to_string();
+    let guard_len_arg = contract.guard_name.len().to_string();
+    let fallback_len_arg = contract.fallback_name.len().to_string();
+    ctx.func.entry_setup_call_void(
         "js_typed_feedback_register_site",
         &[
-            (I64, &site_id.to_string()),
-            (I32, &kind.raw().to_string()),
+            (I64, &site_id_arg),
+            (I32, &kind_arg),
             (PTR, &module_global),
-            (I64, &module.len().to_string()),
+            (I64, &module_len_arg),
             (PTR, &function_global),
-            (I64, &function.len().to_string()),
+            (I64, &function_len_arg),
             (PTR, &source_global),
-            (I64, &source_label.len().to_string()),
+            (I64, &source_len_arg),
             (PTR, &operation_global),
-            (I64, &operation.len().to_string()),
+            (I64, &operation_len_arg),
             (PTR, &guard_global),
-            (I64, &contract.guard_name.len().to_string()),
+            (I64, &guard_len_arg),
             (PTR, &fallback_global),
-            (I64, &contract.fallback_name.len().to_string()),
+            (I64, &fallback_len_arg),
         ],
     );
     site_id.to_string()
diff --git a/crates/perry-codegen/src/function.rs b/crates/perry-codegen/src/function.rs
index f7501ba2f4..afa753baf2 100644
--- a/crates/perry-codegen/src/function.rs
+++ b/crates/perry-codegen/src/function.rs
@@ -61,9 +61,9 @@ pub struct LlFunction {
     ///
     /// `to_ir()` splices these instructions into block 0 at the
     /// `entry_init_boundary` instruction index. If no boundary is set
-    /// (e.g. user functions, which have no init prelude), they're
-    /// appended to `entry_allocas` instead so the dominance guarantee
-    /// still holds.
+    /// (e.g. user functions, which have no init prelude), they are
+    /// emitted immediately after entry allocas and before the first
+    /// block instruction so the dominance guarantee still holds.
     entry_post_init_setup: Vec<String>,
     /// Index in block 0's instruction list where `entry_post_init_setup`
     /// should be spliced in. Set by `mark_entry_init_boundary` after
@@ -252,6 +252,24 @@ impl LlFunction {
             .push(format!("  store {} {}, ptr {}", ty, val, ptr));
     }
 
+    /// Queue a void call in the function-entry setup region.
+    ///
+    /// Intended for metadata/registration work that must precede every
+    /// reachable hot-path use but need not repeat at each use site. The
+    /// rendered call is pushed onto `entry_post_init_setup`, so it lands
+    /// wherever that list is spliced into the entry block and executes
+    /// on each entry to the function rather than at every use site.
+    pub fn entry_setup_call_void(&mut self, func_name: &str, args: &[(LlvmType, &str)]) {
+        crate::ext_registry::record_ffi_call(func_name);
+        // Render each argument as "<type> <value>" for the call operand list.
+        let mut rendered = Vec::with_capacity(args.len());
+        for (ty, value) in args {
+            rendered.push(format!("{} {}", ty, value));
+        }
+        let call = format!("  call void @{}({})", func_name, rendered.join(", "));
+        self.entry_post_init_setup.push(call);
+    }
+
     /// Emit a one-time function-entry init sequence: allocate a `ptr`
     /// slot, call `func_name()` (no args), store the result in the
     /// slot, return the slot pointer name. Used by the inline bump