From 9eec21894c603eeb359d4692a8e02cc77750d69a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Thu, 18 Jun 2026 11:21:37 +0200 Subject: [PATCH 1/5] feat(codegen): split large modules into N codegen units (#5391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A single LLVM translation unit per module makes clang OOM on large inputs (a 13MB minified bundle lowers to one ~1.1GB .ll; clang -c needs ~15GB RSS). This adds codegen-unit splitting: the populated LlModule renders to N independent .ll texts, each compiled by clang -c separately (peak RSS ~whole/N) and merged with `ld -r` into one object — so compile_module's single-Vec contract and the existing one-object link path are unchanged. Mechanism (LlModule::render_codegen_units): - functions partitioned into N contiguous buckets; - the full string-constant + global set carried in every unit with local and bare-external DEFINITIONS promoted to linkonce_odr (linker keeps one copy; globals are ~1.4% of a large module's IR, so duplication is cheap); - each unit gets external declares for every function it calls but does not define (deduped by name; existing declarations win), so cross-unit calls resolve; the lone internal init/wrapper is promoted to external; - shared attribute groups + metadata replicated so #N/!N references resolve. Gating (decide_codegen_units): PERRY_CODEGEN_UNITS=N forces N (1 disables); otherwise auto by callable count — 1 unit below a floor, then ceil(callables / PERRY_CODEGEN_UNIT_SIZE) capped at 48. Default is 1 unit for ordinary modules — zero behavior change. Validated: a 2-unit split of a multi-function program with cross-unit calls + shared string/class globals compiles, links, and runs identically to the 1-unit build (`hi world` / 20). Unit tests cover the partition (define-once, cross-unit declares, linkonce_odr promotion) and the n=1 == to_ir identity. Follow-on to the IR-efficiency roadmap #5334; tracks #5391. 
--- crates/perry-codegen/src/codegen/helpers.rs | 28 +++ crates/perry-codegen/src/codegen/mod.rs | 34 ++- crates/perry-codegen/src/linker.rs | 61 +++++ crates/perry-codegen/src/module.rs | 251 ++++++++++++++++++-- 4 files changed, 346 insertions(+), 28 deletions(-) diff --git a/crates/perry-codegen/src/codegen/helpers.rs b/crates/perry-codegen/src/codegen/helpers.rs index e23094d410..6db349f87e 100644 --- a/crates/perry-codegen/src/codegen/helpers.rs +++ b/crates/perry-codegen/src/codegen/helpers.rs @@ -179,6 +179,34 @@ pub(crate) fn decide_full_outline_ic(callable_count: usize) -> bool { callable_count >= threshold } +/// Decide how many codegen units to split a module's object compilation into +/// (#5391). A single huge translation unit makes `clang -c` OOM (~15GB on the +/// 13MB bundle); splitting bounds peak compiler memory to roughly whole/N. +/// +/// `PERRY_CODEGEN_UNITS=N` forces exactly N units (1 disables splitting). +/// Otherwise auto: 1 unit until the module's callable count crosses a floor, +/// then `ceil(callables / target_per_unit)`, capped — so ordinary per-file +/// modules stay on the single-unit path (default 1, zero behavior change). +/// `PERRY_CODEGEN_UNIT_SIZE` overrides the target callables-per-unit. 
+pub(crate) fn decide_codegen_units(callable_count: usize) -> usize { + if let Ok(v) = std::env::var("PERRY_CODEGEN_UNITS") { + if let Ok(n) = v.parse::() { + return n.max(1); + } + } + const MIN_CALLABLES_TO_SPLIT: usize = 8000; + const MAX_UNITS: usize = 48; + let target = std::env::var("PERRY_CODEGEN_UNIT_SIZE") + .ok() + .and_then(|v| v.parse::().ok()) + .filter(|&n| n > 0) + .unwrap_or(6000); + if callable_count < MIN_CALLABLES_TO_SPLIT { + return 1; + } + callable_count.div_ceil(target).clamp(1, MAX_UNITS) +} + pub(super) fn scoped_fn_name(module_prefix: &str, hir_name: &str) -> String { // Use the INJECTIVE sanitizer (same as scoped_static_method_name): plain // `sanitize` maps every non-`[A-Za-z0-9_]` char to `_`, so distinct minified diff --git a/crates/perry-codegen/src/codegen/mod.rs b/crates/perry-codegen/src/codegen/mod.rs index f88d4edbb9..bed5f221c5 100644 --- a/crates/perry-codegen/src/codegen/mod.rs +++ b/crates/perry-codegen/src/codegen/mod.rs @@ -51,8 +51,8 @@ mod string_pool; pub use helpers::resolve_target_triple; pub(crate) use helpers::{ - decide_full_outline_ic, default_target_triple, full_outline_ic_enabled, module_callable_count, - set_full_outline_ic, write_barriers_enabled, + decide_codegen_units, decide_full_outline_ic, default_target_triple, full_outline_ic_enabled, + module_callable_count, set_full_outline_ic, write_barriers_enabled, }; pub use opts::{ AppMetadata, CompileOptions, FpContractMode, ImportedClass, NamespaceEntry, NamespaceEntryKind, @@ -2521,6 +2521,32 @@ pub fn compile_module(hir: &HirModule, opts: CompileOptions) -> Result> crate::native_value::verify_native_rep_records(&llmod.native_rep_records)?; } + crate::native_value::write_native_rep_artifact_if_enabled( + &hir.name, + &llmod.native_rep_records, + )?; + + // #5391 codegen units: large modules split their object compilation into N + // independently-compiled units so clang's peak RSS stays ~whole/N instead of + // OOMing on one giant TU. 
Gated to large modules (default 1 unit = unchanged + // behavior). `emit_ir_only` and `PERRY_SAVE_LL` want the whole-module text, + // so they take the single-text path; the split path avoids materializing the + // full ~1GB IR string at all (which would defeat the memory win). + let n_units = if opts.emit_ir_only { + 1 + } else { + decide_codegen_units(module_callable_count(hir)) + }; + if n_units > 1 { + let units = llmod.render_codegen_units(n_units); + log::debug!( + "perry-codegen: split '{}' into {} codegen units", + hir.name, + units.len() + ); + return crate::linker::compile_units_to_object(&units, opts.target.as_deref()); + } + let ll_text = llmod.to_ir(); log::debug!( "perry-codegen: emitted {} bytes of LLVM IR for '{}' ({} interned strings)", @@ -2533,10 +2559,6 @@ pub fn compile_module(hir: &HirModule, opts: CompileOptions) -> Result> let filename = format!("{}/{}.ll", save_dir, module_prefix); let _ = std::fs::write(&filename, &ll_text); } - crate::native_value::write_native_rep_artifact_if_enabled( - &hir.name, - &llmod.native_rep_records, - )?; if opts.emit_ir_only { Ok(ll_text.into_bytes()) } else { diff --git a/crates/perry-codegen/src/linker.rs b/crates/perry-codegen/src/linker.rs index 97440dad12..f209704bde 100644 --- a/crates/perry-codegen/src/linker.rs +++ b/crates/perry-codegen/src/linker.rs @@ -280,6 +280,67 @@ pub fn compile_ll_to_object(ll_text: &str, target_triple: Option<&str>) -> Resul Ok(bytes) } +/// Compile a module that was split into codegen units (#5391) to a SINGLE +/// object file's bytes. Each unit `.ll` (from `LlModule::render_codegen_units`) +/// is compiled independently by `clang -c` — bounding peak compiler memory to +/// roughly one unit's worth instead of the whole module — and the resulting +/// objects are merged with a partial link (`ld -r`) into one object, preserving +/// `compile_module`'s single-`Vec` contract and the existing one-object +/// link path. 
Units are compiled sequentially so peak RSS stays at one unit. +pub fn compile_units_to_object(units: &[String], target_triple: Option<&str>) -> Result> { + match units { + [] => return compile_ll_to_object("", target_triple), + [only] => return compile_ll_to_object(only, target_triple), + _ => {} + } + + let tmp_dir = env::temp_dir(); + let pid = std::process::id(); + let nonce = TEMP_NONCE_COUNTER.fetch_add(1, Ordering::Relaxed); + + let mut obj_paths: Vec = Vec::with_capacity(units.len()); + for (i, unit) in units.iter().enumerate() { + let bytes = compile_ll_to_object(unit, target_triple).with_context(|| { + format!("codegen unit {}/{} failed to compile", i + 1, units.len()) + })?; + let p = tmp_dir.join(format!("perry_cgu_{}_{}_{}.o", pid, nonce, i)); + fs::write(&p, &bytes) + .with_context(|| format!("failed to write codegen-unit object {}", p.display()))?; + obj_paths.push(p); + } + + let combined = tmp_dir.join(format!("perry_cgu_{}_{}_combined.o", pid, nonce)); + let ld = env::var("PERRY_LD").unwrap_or_else(|_| "ld".to_string()); + let mut cmd = Command::new(&ld); + cmd.arg("-r").arg("-o").arg(&combined); + for p in &obj_paths { + cmd.arg(p); + } + let out = cmd + .output() + .with_context(|| format!("failed to invoke partial linker `{} -r`", ld))?; + let result = if out.status.success() { + fs::read(&combined) + .with_context(|| format!("failed to read merged object {}", combined.display())) + } else { + Err(anyhow!( + "partial link `{} -r` of {} codegen units failed (status={}).\nstderr:\n{}", + ld, + units.len(), + out.status, + String::from_utf8_lossy(&out.stderr) + )) + }; + + if env::var_os("PERRY_LLVM_KEEP_IR").is_none() { + for p in &obj_paths { + let _ = fs::remove_file(p); + } + let _ = fs::remove_file(&combined); + } + result +} + fn json_string(value: &str) -> String { let mut out = String::with_capacity(value.len() + 2); out.push('"'); diff --git a/crates/perry-codegen/src/module.rs b/crates/perry-codegen/src/module.rs index 
9e54cada4e..4dcb1cee88 100644 --- a/crates/perry-codegen/src/module.rs +++ b/crates/perry-codegen/src/module.rs @@ -10,13 +10,79 @@ //! `to_ir()` assembles the pieces into a complete `.ll` file with the target //! triple header. -use std::collections::HashSet; +use std::collections::{BTreeMap, HashSet}; use crate::block::FpFlags; use crate::function::LlFunction; use crate::native_value::NativeRepRecord; use crate::types::LlvmType; +/// Strip a leading LLVM linkage keyword from a global's post-`=` text, if +/// present. Linkage comes before `unnamed_addr`/`constant`/`global` in the +/// grammar, so this leaves the rest of the definition intact. +fn strip_leading_linkage(s: &str) -> &str { + for kw in [ + "private ", + "internal ", + "linkonce_odr ", + "linkonce ", + "weak_odr ", + "weak ", + "common ", + "available_externally ", + ] { + if let Some(rest) = s.strip_prefix(kw) { + return rest; + } + } + s +} + +/// Rewrite a module-global definition so it is safe to duplicate across +/// codegen units (#5391). Local-linkage (`private`/`internal`) and bare +/// external definitions are promoted to `linkonce_odr`, so the linker keeps a +/// single copy when the same global is emitted into multiple units. `external` +/// declarations (no initializer) are returned unchanged — duplicating a +/// declaration is harmless. +fn promote_global_for_units(line: &str) -> String { + if line.contains(" = external ") { + return line.to_string(); + } + match line.split_once(" = ") { + Some((lhs, rhs)) => format!("{} = linkonce_odr {}", lhs, strip_leading_linkage(rhs.trim_start())), + None => line.to_string(), + } +} + +/// Synthesize an external `declare` line matching a locally-defined function's +/// signature, so a codegen unit that calls it (but does not define it) resolves +/// the call at link time. 
+fn declare_line_for(f: &LlFunction) -> String { + let params = f + .params + .iter() + .map(|(t, _)| t.to_string()) + .collect::>() + .join(", "); + let attrs = if f.name == "setjmp" || f.name == "_setjmp" { + " #0" + } else { + "" + }; + format!("declare {} @{}({}){}", f.return_type, f.name, params, attrs) +} + +/// Render a function with external linkage forced, promoting an `internal` / +/// `private` definition so cross-unit calls can bind to it. Names are +/// module-prefixed and unique, so promotion never collides. +fn render_fn_external(f: &LlFunction) -> String { + let ir = f.to_ir(); + if f.linkage == "internal" || f.linkage == "private" { + return ir.replacen(&format!("define {} ", f.linkage), "define ", 1); + } + ir +} + pub struct LlModule { pub target_triple: String, declarations: Vec<(String, String)>, // (name, full "declare …" line) @@ -249,37 +315,128 @@ impl LlModule { ir.push('\n'); } - // Attribute group for setjmp's `returns_twice` marker. - // Only emit if setjmp (any variant) was actually declared in - // this module. Apple targets declare `_setjmp` (fast variant - // without signal-mask save), Windows declares `_setjmp` - // (2-arg ABI), Linux declares `setjmp` — all three need - // `returns_twice` on the call site. + self.push_attrs_and_metadata(&mut ir); + + ir + } + + /// Emit the shared setjmp attribute groups + the `!0`/buffer-alias metadata + /// tail. Factored out of [`to_ir`] so each codegen unit can replicate the + /// same attributes and metadata (so `#0`/`#1` and `!N` references resolve in + /// every unit). Over-emitting an unused attribute group is harmless. + fn push_attrs_and_metadata(&self, ir: &mut String) { + // Attribute group for setjmp's `returns_twice` marker. Only emit if + // setjmp (any variant) was declared. Apple declares `_setjmp`, Windows + // `_setjmp` (2-arg ABI), Linux `setjmp` — all need `returns_twice`. 
if self.declared_names.contains("setjmp") || self.declared_names.contains("_setjmp") { ir.push_str("\nattributes #0 = { returns_twice }\n"); - // Functions that contain a `try` statement are marked with `#1`. - // `optnone` forces LLVM to skip mem2reg/SROA inside the function, - // so allocas aren't promoted to SSA registers across the setjmp - // call — otherwise mutations in the try body are invisible to - // the catch block after longjmp. Pairs with `noinline` so the - // constraint isn't lost via inlining into a caller. + // Functions containing a `try` are marked `#1`. `optnone` skips + // mem2reg/SROA so allocas aren't promoted across the setjmp call + // (else try-body mutations are invisible to catch after longjmp); + // `noinline` keeps the constraint from being lost via inlining. ir.push_str("attributes #1 = { noinline optnone }\n"); } - - // Issue #52: `!0 = !{}` metadata node referenced by - // `load_invariant` (via `!invariant.load !0`). LLVM's GVN + LICM - // hoist loads tagged with `!invariant.load` out of their - // enclosing loops when the loop body can't write to the same - // address; without this, the per-access Buffer / Array length - // reload stays pinned inside every bounds check even when the - // buffer is loop-invariant. + // Issue #52: `!0 = !{}` referenced by `!invariant.load !0`, plus the + // buffer alias-scope metadata. LICM/GVN hoist invariant loads out of + // loops only with these present. ir.push_str("\n!0 = !{}\n"); for ml in &self.metadata_lines { ir.push_str(ml); ir.push('\n'); } + } - ir + /// Render this module as `n` independent codegen-unit `.ll` texts (#5391). + /// + /// Each unit is independently compilable by `clang -c`, so peak compiler + /// memory is bounded to ~1/n of the whole module — the structural fix for + /// the single giant translation unit that makes clang OOM on large bundles. + /// + /// The functions are split into `n` contiguous buckets. 
Every unit carries: + /// * the full string-constant + global set, with local-linkage and bare + /// external DEFINITIONS promoted to `linkonce_odr` (the linker keeps one + /// copy). Globals are a tiny fraction of a large module's IR, so the + /// duplication is cheap; `external` *declarations* are replicated as-is; + /// * the module's external `declare`s plus a synthesized `declare` for + /// every locally-defined function the unit does NOT itself define, so + /// cross-unit calls resolve at link time (deduped by name, existing + /// declarations win); + /// * each function rendered with external linkage forced (the lone + /// `internal` init/wrapper is promoted so cross-unit calls bind); + /// * the shared attribute groups + metadata (so `#N`/`!N` refs resolve). + /// + /// `n <= 1` (or a single-function module) returns one text identical to + /// [`to_ir`]. The caller compiles each text to an object and combines them + /// (`ld -r`) into one object, keeping `compile_module`'s single-object API. + pub fn render_codegen_units(&self, n: usize) -> Vec { + if n <= 1 || self.functions.len() <= 1 { + return vec![self.to_ir()]; + } + let n = n.min(self.functions.len()); + let chunk = self.functions.len().div_ceil(n); + + let shared_strings: Vec = self + .string_constants + .iter() + .map(|s| promote_global_for_units(s)) + .collect(); + let shared_globals: Vec = self + .globals + .iter() + .map(|g| promote_global_for_units(g)) + .collect(); + + // name -> declare line. Existing module declarations (runtime, FFI, + // cross-module) take precedence; every locally-defined function without + // one gets a synthesized declare. Deduped by name so no unit emits a + // duplicate declaration. BTreeMap for deterministic unit output. 
+ let mut decl_by_name: BTreeMap<&str, String> = BTreeMap::new(); + for (name, decl) in &self.declarations { + decl_by_name.insert(name.as_str(), decl.clone()); + } + for f in &self.functions { + decl_by_name + .entry(f.name.as_str()) + .or_insert_with(|| declare_line_for(f)); + } + + let mut units = Vec::with_capacity(n); + for bucket in self.functions.chunks(chunk) { + let defined: HashSet<&str> = bucket.iter().map(|f| f.name.as_str()).collect(); + let mut ir = String::new(); + ir.push_str("; Generated by perry-codegen (codegen unit)\n"); + ir.push_str(&format!("target triple = \"{}\"\n\n", self.target_triple)); + + for sc in &shared_strings { + ir.push_str(sc); + ir.push('\n'); + } + ir.push('\n'); + for g in &shared_globals { + ir.push_str(g); + ir.push('\n'); + } + ir.push('\n'); + + // Declares for everything this unit references but does not define. + for (name, decl) in &decl_by_name { + if defined.contains(name) { + continue; + } + ir.push_str(decl); + ir.push('\n'); + } + ir.push('\n'); + + for func in bucket { + ir.push_str(&render_fn_external(func)); + ir.push('\n'); + } + + self.push_attrs_and_metadata(&mut ir); + units.push(ir); + } + units } } @@ -288,6 +445,56 @@ mod tests { use super::*; use crate::types::{DOUBLE, I32, I64, PTR, VOID}; + #[test] + fn render_codegen_units_partitions_and_links() { + // #5391: a 2-unit split of a 2-function module must (a) define each + // function in exactly one unit, (b) declare the other so cross-unit + // calls resolve, and (c) carry the shared globals in BOTH units with + // local linkage promoted to linkonce_odr (linker dedups). 
+ let mut m = LlModule::new("arm64-apple-macosx15.0.0"); + m.declare_function("js_console_log_number", VOID, &[DOUBLE]); + m.add_internal_global("perry_global_x", DOUBLE, "0.0"); + let (_s, _l) = m.add_string_constant("hi"); + + // f() calls g() + let f = m.define_function("perry_fn_m__f", DOUBLE, vec![]); + let e = f.create_block("entry"); + let r = e.call(DOUBLE, "perry_fn_m__g", &[]); + e.ret(DOUBLE, &r); + let g = m.define_function("perry_fn_m__g", DOUBLE, vec![]); + let e2 = g.create_block("entry"); + e2.ret(DOUBLE, "0.0"); + + let units = m.render_codegen_units(2); + assert_eq!(units.len(), 2, "two functions → two units"); + + // Each function defined exactly once across all units. + let def_f = units.iter().filter(|u| u.contains("define double @perry_fn_m__f(")).count(); + let def_g = units.iter().filter(|u| u.contains("define double @perry_fn_m__g(")).count(); + assert_eq!(def_f, 1); + assert_eq!(def_g, 1); + + // The unit that DEFINES f (and calls g) must DECLARE g. + let u_with_f = units.iter().find(|u| u.contains("define double @perry_fn_m__f(")).unwrap(); + assert!(u_with_f.contains("declare double @perry_fn_m__g()")); + + // Shared globals appear in BOTH units, promoted to linkonce_odr. 
+ for u in &units { + assert!(u.contains("@perry_global_x = linkonce_odr global double 0.0")); + assert!(u.contains("@.str.0 = linkonce_odr unnamed_addr constant")); + assert!(u.contains("declare void @js_console_log_number(double)")); + assert!(u.contains("target triple = \"arm64-apple-macosx15.0.0\"")); + } + } + + #[test] + fn render_codegen_units_single_unit_matches_to_ir() { + let mut m = LlModule::new("arm64-apple-macosx15.0.0"); + let f = m.define_function("main", I32, vec![]); + f.create_block("entry").ret(I32, "0"); + assert_eq!(m.render_codegen_units(1), vec![m.to_ir()]); + } + #[test] fn hello_world_ir_is_well_formed() { let mut m = LlModule::new("arm64-apple-macosx15.0.0"); From c60e5ae6b2621089c33dd6475f77b9ddeee97c94 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Thu, 18 Jun 2026 11:57:31 +0200 Subject: [PATCH 2/5] fix(codegen): emit each function symbol once (duplicate-name collision) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Minified bundles can contain two distinct classes that sanitize to the same name (e.g. two classes `j`), so their mangled method symbols collide (`perry_method_..._j__getElementsByTagName` defined twice). LLVM rejects the redefinition and the whole module fails to compile — independent of codegen-unit splitting (the single-TU `to_ir` had the same latent bug; it only surfaced once a real bundle reached clang). Add `deduped_function_refs` (first occurrence per name) and route both `to_ir` and `render_codegen_units` through it, so each symbol is emitted once. Calls to the duplicate resolve to the first definition — a dispatch ambiguity limited to genuinely name-colliding members; proper disambiguation by class id is a separate concern. No-op for ordinary modules (no duplicate names). Unit test covers both render paths. 
--- crates/perry-codegen/src/module.rs | 58 ++++++++++++++++++++++++++---- 1 file changed, 51 insertions(+), 7 deletions(-) diff --git a/crates/perry-codegen/src/module.rs b/crates/perry-codegen/src/module.rs index 4dcb1cee88..fb6ccdc4b8 100644 --- a/crates/perry-codegen/src/module.rs +++ b/crates/perry-codegen/src/module.rs @@ -280,6 +280,24 @@ impl LlModule { (name, len) } + /// Functions to emit, each symbol AT MOST ONCE (first occurrence wins). + /// + /// Minified bundles can contain two distinct classes that sanitize to the + /// same name (e.g. two classes `j`), producing colliding mangled method + /// symbols (`perry_method_..._j__getElementsByTagName` defined twice). LLVM + /// rejects the redefinition. Emitting each symbol once lets the module + /// compile; calls to the duplicate resolve to the first definition (a + /// dispatch ambiguity limited to genuinely name-colliding members — proper + /// disambiguation by class id is a separate concern). Shared by [`to_ir`] + /// and [`render_codegen_units`] so both paths agree on the symbol set. + fn deduped_function_refs(&self) -> Vec<&LlFunction> { + let mut seen: HashSet<&str> = HashSet::with_capacity(self.functions.len()); + self.functions + .iter() + .filter(|f| seen.insert(f.name.as_str())) + .collect() + } + /// Serialize the module to a complete `.ll` file. pub fn to_ir(&self) -> String { let mut ir = String::new(); @@ -298,9 +316,11 @@ impl LlModule { } ir.push('\n'); + let funcs = self.deduped_function_refs(); + // Skip any `declare` whose name is also `define`d in this module — // LLVM rejects declare+define for the same symbol. 
- let defined: HashSet<&str> = self.functions.iter().map(|f| f.name.as_str()).collect(); + let defined: HashSet<&str> = funcs.iter().map(|f| f.name.as_str()).collect(); for (name, decl) in &self.declarations { if defined.contains(name.as_str()) { continue; @@ -310,7 +330,7 @@ impl LlModule { } ir.push('\n'); - for func in &self.functions { + for func in &funcs { ir.push_str(&func.to_ir()); ir.push('\n'); } @@ -369,11 +389,12 @@ impl LlModule { /// [`to_ir`]. The caller compiles each text to an object and combines them /// (`ld -r`) into one object, keeping `compile_module`'s single-object API. pub fn render_codegen_units(&self, n: usize) -> Vec { - if n <= 1 || self.functions.len() <= 1 { + let funcs = self.deduped_function_refs(); + if n <= 1 || funcs.len() <= 1 { return vec![self.to_ir()]; } - let n = n.min(self.functions.len()); - let chunk = self.functions.len().div_ceil(n); + let n = n.min(funcs.len()); + let chunk = funcs.len().div_ceil(n); let shared_strings: Vec = self .string_constants @@ -394,14 +415,14 @@ impl LlModule { for (name, decl) in &self.declarations { decl_by_name.insert(name.as_str(), decl.clone()); } - for f in &self.functions { + for f in &funcs { decl_by_name .entry(f.name.as_str()) .or_insert_with(|| declare_line_for(f)); } let mut units = Vec::with_capacity(n); - for bucket in self.functions.chunks(chunk) { + for bucket in funcs.chunks(chunk) { let defined: HashSet<&str> = bucket.iter().map(|f| f.name.as_str()).collect(); let mut ir = String::new(); ir.push_str("; Generated by perry-codegen (codegen unit)\n"); @@ -487,6 +508,29 @@ mod tests { } } + #[test] + fn duplicate_function_symbol_emitted_once() { + // Two classes that sanitize to the same name produce a colliding + // method symbol; it must be emitted once (LLVM rejects redefinition), + // in both the single-TU and the codegen-unit render paths. 
+ let mut m = LlModule::new("arm64-apple-macosx15.0.0"); + for _ in 0..2 { + let f = m.define_function("perry_method_j__foo", DOUBLE, vec![]); + f.create_block("entry").ret(DOUBLE, "0.0"); + } + assert_eq!( + m.to_ir().matches("define double @perry_method_j__foo(").count(), + 1, + "duplicate symbol must be defined once in to_ir" + ); + let units = m.render_codegen_units(4); + let defs: usize = units + .iter() + .map(|u| u.matches("define double @perry_method_j__foo(").count()) + .sum(); + assert_eq!(defs, 1, "duplicate symbol must be defined once across units"); + } + #[test] fn render_codegen_units_single_unit_matches_to_ir() { let mut m = LlModule::new("arm64-apple-macosx15.0.0"); From ea554550fa8436732e6f7f0fd46a3ca0c6b6a259 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Thu, 18 Jun 2026 13:57:28 +0200 Subject: [PATCH 3/5] perf(codegen): balance codegen units by estimated byte size (#5391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Contiguous chunk-by-function-count partitioning clumped functions unevenly: on the cli.js bundle one unit ballooned to 139MB (it caught the biggest functions) while others were ~50MB, and clang -O0 time is superlinear in unit size, so that one unit dominated the whole compile. Switch to greedy largest-first bin-packing by `LlFunction::estimated_ir_bytes` (a cheap instruction-byte sum, no second render): each function is placed into the currently-smallest unit, so big functions are isolated and the rest stay even. This bounds the max unit size to ~total/N + the single largest function. NB: a function larger than total/N is irreducible by inter-function splitting — the cli.js bundle has a single 68MB minified IIFE whose clang -O0 time is the residual bottleneck (the intra-function #4880 problem, distinct from the module memory wall this feature removes). Balancing isolates it so the other units compile fast and in bounded memory. 
Unit test: a giant function + several tiny ones splits so the giant is not clumped with the small set. --- crates/perry-codegen/src/function.rs | 18 +++++++++ crates/perry-codegen/src/module.rs | 60 +++++++++++++++++++++++++++- 2 files changed, 76 insertions(+), 2 deletions(-) diff --git a/crates/perry-codegen/src/function.rs b/crates/perry-codegen/src/function.rs index afa753baf2..b68660d595 100644 --- a/crates/perry-codegen/src/function.rs +++ b/crates/perry-codegen/src/function.rs @@ -361,6 +361,24 @@ impl LlFunction { self.blocks.last().map(|b| b.label.as_str()) } + /// Cheap estimate of this function's rendered IR size in bytes, used to + /// balance codegen-unit partitioning (#5391) without rendering twice. Sums + /// the byte length of every instruction + entry alloca (the dominant terms); + /// block labels/headers are a small fixed overhead per block. + pub fn estimated_ir_bytes(&self) -> usize { + let body: usize = self + .blocks + .iter() + .map(|b| { + b.instructions_iter().map(|i| i.len() + 1).sum::() + + b.label.len() + + 4 + }) + .sum(); + let allocas: usize = self.entry_allocas.iter().map(|a| a.len() + 1).sum(); + body + allocas + self.name.len() + 64 + } + pub fn to_ir(&self) -> String { let param_str = self .params diff --git a/crates/perry-codegen/src/module.rs b/crates/perry-codegen/src/module.rs index fb6ccdc4b8..d2dcb56152 100644 --- a/crates/perry-codegen/src/module.rs +++ b/crates/perry-codegen/src/module.rs @@ -394,7 +394,30 @@ impl LlModule { return vec![self.to_ir()]; } let n = n.min(funcs.len()); - let chunk = funcs.len().div_ceil(n); + + // Balance units by estimated byte size, not function count: minified + // bundles have a few enormous functions (a 68MB IIFE in the cli.js + // case), so contiguous count-chunking can clump them into one outsized + // unit whose clang -O0 time dominates. Greedy largest-first bin-packing + // assigns each function to the currently-smallest unit, isolating big + // functions and keeping the rest even. 
(A single function larger than + // total/n is irreducible here — that is the intra-function #4880 + // problem, not something inter-function splitting can divide.) + let sizes: Vec = funcs.iter().map(|f| f.estimated_ir_bytes()).collect(); + let mut order: Vec = (0..funcs.len()).collect(); + order.sort_by_key(|&i| std::cmp::Reverse(sizes[i])); + let mut buckets: Vec> = vec![Vec::new(); n]; + let mut bucket_bytes = vec![0usize; n]; + for &i in &order { + let target = bucket_bytes + .iter() + .enumerate() + .min_by_key(|&(_, &b)| b) + .map(|(idx, _)| idx) + .unwrap_or(0); + buckets[target].push(funcs[i]); + bucket_bytes[target] += sizes[i]; + } let shared_strings: Vec = self .string_constants @@ -422,7 +445,7 @@ impl LlModule { } let mut units = Vec::with_capacity(n); - for bucket in funcs.chunks(chunk) { + for bucket in &buckets { let defined: HashSet<&str> = bucket.iter().map(|f| f.name.as_str()).collect(); let mut ir = String::new(); ir.push_str("; Generated by perry-codegen (codegen unit)\n"); @@ -531,6 +554,39 @@ mod tests { assert_eq!(defs, 1, "duplicate symbol must be defined once across units"); } + #[test] + fn render_codegen_units_balances_by_size_isolating_a_giant_fn() { + // One huge function + several tiny ones, split into 2 units: greedy + // size bin-packing must isolate the giant function so it does NOT share + // a unit with the tiny ones (which would make that unit outsized). 
+ let mut m = LlModule::new("arm64-apple-macosx15.0.0"); + let big = m.define_function("perry_fn_m__big", DOUBLE, vec![]); + let be = big.create_block("entry"); + for _ in 0..2000 { + be.call_void("js_noop", &[]); + } + be.ret(DOUBLE, "0.0"); + for k in 0..6 { + let f = m.define_function(format!("perry_fn_m__small{k}"), DOUBLE, vec![]); + f.create_block("entry").ret(DOUBLE, "0.0"); + } + let units = m.render_codegen_units(2); + assert_eq!(units.len(), 2); + let big_unit = units + .iter() + .find(|u| u.contains("define double @perry_fn_m__big(")) + .unwrap(); + // The giant function's unit holds (essentially) only it — the six small + // functions land in the other unit to balance bytes. + let smalls_with_big = (0..6) + .filter(|k| big_unit.contains(&format!("define double @perry_fn_m__small{k}("))) + .count(); + assert!( + smalls_with_big <= 1, + "giant function should be isolated, not clumped with the small ones (got {smalls_with_big})" + ); + } + #[test] fn render_codegen_units_single_unit_matches_to_ir() { let mut m = LlModule::new("arm64-apple-macosx15.0.0"); From 7937a2c58c756580d6d29eee4bd03bb4f597e6a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Thu, 18 Jun 2026 14:54:32 +0200 Subject: [PATCH 4/5] perf(codegen): split the string-pool init into chunk functions (#5391) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A large bundle interns ~190K strings, and emitting every string's init (from-bytes + nanbox + global store + GC-root register) into one `__perry_init_strings` block produced a single ~68MB function. clang -O0 (forced for oversized modules, #4880) can't compile a function that large in practical time — its per-function passes are superlinear in size — so this was the residual wall after codegen-unit splitting removed the module MEMORY wall: a single giant FUNCTION that inter-function splitting can't divide. 
The string inits are independent (each handle is stored to its own global; no SSA value flows between iterations), so emit them into chunk functions of PERRY_STRING_INIT_CHUNK_SIZE entries (default 4000) and call those in sequence from `__perry_init_strings`. The ~48 chunks are each ~1.4MB and, combined with codegen-unit splitting (#5407), bin-pack evenly across units instead of one unit carrying the 68MB monolith. Order is preserved (chunks called in sequence, ops in order within a chunk), so GC-root registration and any ordering stay identical. Behavior-neutral for ordinary modules (a handful of strings → one chunk). Full codegen suite green. --- .../perry-codegen/src/codegen/string_pool.rs | 75 +++++++++++++++---- crates/perry-codegen/src/module.rs | 8 ++ 2 files changed, 68 insertions(+), 15 deletions(-) diff --git a/crates/perry-codegen/src/codegen/string_pool.rs b/crates/perry-codegen/src/codegen/string_pool.rs index 6f6b708516..012e1546ba 100644 --- a/crates/perry-codegen/src/codegen/string_pool.rs +++ b/crates/perry-codegen/src/codegen/string_pool.rs @@ -217,26 +217,71 @@ pub(super) fn emit_string_pool( } } + // #5391 function splitting: a large bundle interns ~190K strings, and + // emitting every string's init (from-bytes + nanbox + global store + GC-root + // register) into ONE `__perry_init_strings` block makes a single ~68MB + // function that `clang -O0` (forced for oversized modules, #4880) cannot + // compile in practical time — its per-function passes are superlinear in + // function size. The inits are independent (each handle is stored to its own + // global; no SSA value flows between iterations), so split them into chunk + // functions of `STRINGS_PER_CHUNK` each and call them in sequence from the + // init. Combined with codegen-unit splitting, the ~48 chunks bin-pack evenly + // across units instead of one unit carrying the monolith. 
+ let strings_per_chunk: usize = std::env::var("PERRY_STRING_INIT_CHUNK_SIZE") + .ok() + .and_then(|v| v.parse::<usize>().ok()) + .filter(|&n| n > 0) + .unwrap_or(4000); + let mut string_chunk_names: Vec<String> = Vec::new(); + { + let mut count_in_chunk = strings_per_chunk; // force a new chunk on entry 0 + let mut cur_idx = 0usize; + for entry in strings.iter() { + if count_in_chunk >= strings_per_chunk { + if !string_chunk_names.is_empty() { + llmod.function_mut(cur_idx).unwrap().block_mut(0).unwrap().ret_void(); + } + let cname = format!( + "__perry_init_strings_{}_chunk{}", + module_prefix, + string_chunk_names.len() + ); + llmod.define_function(&cname, VOID, vec![]).create_block("entry"); + cur_idx = llmod.function_count() - 1; + string_chunk_names.push(cname); + count_in_chunk = 0; + } + let blk = llmod.function_mut(cur_idx).unwrap().block_mut(0).unwrap(); + + let bytes_ref = format!("@{}", entry.bytes_global); + let handle_ref = format!("@{}", entry.handle_global); + let len_str = entry.byte_len.to_string(); + let from_bytes_fn = if entry.is_wtf8 { + "js_string_from_wtf8_bytes" + } else { + "js_string_from_bytes" + }; + let handle = blk.call(I64, from_bytes_fn, &[(PTR, &bytes_ref), (I32, &len_str)]); + let nanboxed = blk.call(DOUBLE, "js_nanbox_string", &[(I64, &handle)]); + crate::expr::emit_root_nanbox_store_on_block(blk, &nanboxed, &handle_ref); + let addr_i64 = blk.ptrtoint(&handle_ref, I64); + blk.call_void("js_gc_register_global_root", &[(I64, &addr_i64)]); + count_in_chunk += 1; + } + if !string_chunk_names.is_empty() { + llmod.function_mut(cur_idx).unwrap().block_mut(0).unwrap().ret_void(); + } + } + let init_name = format!("__perry_init_strings_{}", module_prefix); let init_fn = llmod.define_function(&init_name, VOID, vec![]); let _ = init_fn.create_block("entry"); let blk = init_fn.block_mut(0).unwrap(); - for entry in strings.iter() { - let bytes_ref = format!("@{}", entry.bytes_global); - let handle_ref = format!("@{}", entry.handle_global); - let len_str =
entry.byte_len.to_string(); - - let init_fn = if entry.is_wtf8 { - "js_string_from_wtf8_bytes" - } else { - "js_string_from_bytes" - }; - let handle = blk.call(I64, init_fn, &[(PTR, &bytes_ref), (I32, &len_str)]); - let nanboxed = blk.call(DOUBLE, "js_nanbox_string", &[(I64, &handle)]); - crate::expr::emit_root_nanbox_store_on_block(blk, &nanboxed, &handle_ref); - let addr_i64 = blk.ptrtoint(&handle_ref, I64); - blk.call_void("js_gc_register_global_root", &[(I64, &addr_i64)]); + // Run the string-init chunks first, in order (each populates its slice of + // the string-handle globals before any user code runs). + for cname in &string_chunk_names { + blk.call_void(cname, &[]); } // Register display names for top-level user functions so diff --git a/crates/perry-codegen/src/module.rs b/crates/perry-codegen/src/module.rs index d2dcb56152..89fc064b60 100644 --- a/crates/perry-codegen/src/module.rs +++ b/crates/perry-codegen/src/module.rs @@ -191,6 +191,14 @@ impl LlModule { self.functions.get_mut(idx) } + /// Number of functions defined so far. Used to recover the index of a + /// just-`define_function`ed function (whose `&mut` borrow must be released + /// before the index can be read) when emitting a sequence of functions — + /// e.g. the chunked string-pool init (#5391 function splitting). + pub fn function_count(&self) -> usize { + self.functions.len() + } + /// True if a function with the given name has already been *defined* /// in this module. 
Used by the #461 export-stub pass to avoid /// redefining a symbol that an earlier emission path (function body, From 05fd8ef73a7a5ea89193e9d19baa3a9c4d8981a0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Thu, 18 Jun 2026 21:03:45 +0200 Subject: [PATCH 5/5] style: cargo fmt (codegen string_pool/function/linker/module) --- .../perry-codegen/src/codegen/string_pool.rs | 18 +++++++++-- crates/perry-codegen/src/function.rs | 6 +--- crates/perry-codegen/src/linker.rs | 5 ++-- crates/perry-codegen/src/module.rs | 30 +++++++++++++++---- 4 files changed, 42 insertions(+), 17 deletions(-) diff --git a/crates/perry-codegen/src/codegen/string_pool.rs b/crates/perry-codegen/src/codegen/string_pool.rs index 012e1546ba..fad86b7a58 100644 --- a/crates/perry-codegen/src/codegen/string_pool.rs +++ b/crates/perry-codegen/src/codegen/string_pool.rs @@ -239,14 +239,21 @@ pub(super) fn emit_string_pool( for entry in strings.iter() { if count_in_chunk >= strings_per_chunk { if !string_chunk_names.is_empty() { - llmod.function_mut(cur_idx).unwrap().block_mut(0).unwrap().ret_void(); + llmod + .function_mut(cur_idx) + .unwrap() + .block_mut(0) + .unwrap() + .ret_void(); } let cname = format!( "__perry_init_strings_{}_chunk{}", module_prefix, string_chunk_names.len() ); - llmod.define_function(&cname, VOID, vec![]).create_block("entry"); + llmod + .define_function(&cname, VOID, vec![]) + .create_block("entry"); cur_idx = llmod.function_count() - 1; string_chunk_names.push(cname); count_in_chunk = 0; @@ -269,7 +276,12 @@ pub(super) fn emit_string_pool( count_in_chunk += 1; } if !string_chunk_names.is_empty() { - llmod.function_mut(cur_idx).unwrap().block_mut(0).unwrap().ret_void(); + llmod + .function_mut(cur_idx) + .unwrap() + .block_mut(0) + .unwrap() + .ret_void(); } } diff --git a/crates/perry-codegen/src/function.rs b/crates/perry-codegen/src/function.rs index b68660d595..d5cad4eefc 100644 --- a/crates/perry-codegen/src/function.rs +++ 
b/crates/perry-codegen/src/function.rs @@ -369,11 +369,7 @@ impl LlFunction { let body: usize = self .blocks .iter() - .map(|b| { - b.instructions_iter().map(|i| i.len() + 1).sum::<usize>() - + b.label.len() - + 4 - }) + .map(|b| b.instructions_iter().map(|i| i.len() + 1).sum::<usize>() + b.label.len() + 4) .sum(); let allocas: usize = self.entry_allocas.iter().map(|a| a.len() + 1).sum(); body + allocas + self.name.len() + 64 diff --git a/crates/perry-codegen/src/linker.rs b/crates/perry-codegen/src/linker.rs index f209704bde..1620c178ab 100644 --- a/crates/perry-codegen/src/linker.rs +++ b/crates/perry-codegen/src/linker.rs @@ -300,9 +300,8 @@ pub fn compile_units_to_object(units: &[String], target_triple: Option<&str>) -> let mut obj_paths: Vec<PathBuf> = Vec::with_capacity(units.len()); for (i, unit) in units.iter().enumerate() { - let bytes = compile_ll_to_object(unit, target_triple).with_context(|| { - format!("codegen unit {}/{} failed to compile", i + 1, units.len()) - })?; + let bytes = compile_ll_to_object(unit, target_triple) + .with_context(|| format!("codegen unit {}/{} failed to compile", i + 1, units.len()))?; let p = tmp_dir.join(format!("perry_cgu_{}_{}_{}.o", pid, nonce, i)); fs::write(&p, &bytes) .with_context(|| format!("failed to write codegen-unit object {}", p.display()))?; diff --git a/crates/perry-codegen/src/module.rs b/crates/perry-codegen/src/module.rs index 89fc064b60..c92dc167f3 100644 --- a/crates/perry-codegen/src/module.rs +++ b/crates/perry-codegen/src/module.rs @@ -49,7 +49,11 @@ fn promote_global_for_units(line: &str) -> String { return line.to_string(); } match line.split_once(" = ") { - Some((lhs, rhs)) => format!("{} = linkonce_odr {}", lhs, strip_leading_linkage(rhs.trim_start())), + Some((lhs, rhs)) => format!( + "{} = linkonce_odr {}", + lhs, + strip_leading_linkage(rhs.trim_start()) + ), None => line.to_string(), } } @@ -521,13 +525,22 @@ mod tests { assert_eq!(units.len(), 2, "two functions → two units"); // Each function defined exactly once
across all units. - let def_f = units.iter().filter(|u| u.contains("define double @perry_fn_m__f(")).count(); - let def_g = units.iter().filter(|u| u.contains("define double @perry_fn_m__g(")).count(); + let def_f = units + .iter() + .filter(|u| u.contains("define double @perry_fn_m__f(")) + .count(); + let def_g = units + .iter() + .filter(|u| u.contains("define double @perry_fn_m__g(")) + .count(); assert_eq!(def_f, 1); assert_eq!(def_g, 1); // The unit that DEFINES f (and calls g) must DECLARE g. - let u_with_f = units.iter().find(|u| u.contains("define double @perry_fn_m__f(")).unwrap(); + let u_with_f = units + .iter() + .find(|u| u.contains("define double @perry_fn_m__f(")) + .unwrap(); assert!(u_with_f.contains("declare double @perry_fn_m__g()")); // Shared globals appear in BOTH units, promoted to linkonce_odr. @@ -550,7 +563,9 @@ mod tests { f.create_block("entry").ret(DOUBLE, "0.0"); } assert_eq!( - m.to_ir().matches("define double @perry_method_j__foo(").count(), + m.to_ir() + .matches("define double @perry_method_j__foo(") + .count(), 1, "duplicate symbol must be defined once in to_ir" ); @@ -559,7 +574,10 @@ mod tests { .iter() .map(|u| u.matches("define double @perry_method_j__foo(").count()) .sum(); - assert_eq!(defs, 1, "duplicate symbol must be defined once across units"); + assert_eq!( + defs, 1, + "duplicate symbol must be defined once across units" + ); } #[test]