From 5a3226f8a55e5f59666f5dfc0006fd859eb0f7f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Ralph=20K=C3=BCpper?= Date: Sun, 14 Jun 2026 02:10:46 +0200 Subject: [PATCH] perf(compile): compile oversized modules at -O0 to fix wide-object-literal blowup (#4880) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A module dominated by a huge generated object literal (config / lookup table) lowers to one enormous function whose thousands of `alloca`s make LLVM's `-O1+` optimization pipeline (SROA / mem2reg / GVN) super-linear. Perry's own IR generation is fast (<1s); the cost is the external `clang -c -O3` on the generated `.ll`. Measured on a 2800-key literal (≈9.6 MB / 199K-line `.ll`): `clang -c` is 3.0s at `-O0`, 17.1s at `-O1`, 18.4s at `-O2`, 18.5s at `-O3` — i.e. the blowup is entirely in the `-O1+` pipeline and `-O0` is the only escape (`-O1`/`-O2` are no faster than `-O3`). Fix: in `build_clang_compile_plan`, compile a module whose IR exceeds a size threshold (default 6 MiB, override via `PERRY_LL_O0_THRESHOLD_BYTES`) at `-O0` instead of `-O3`, with a one-line note to stderr. Such modules are almost always static data where optimization is irrelevant, and the threshold is high enough that ordinary modules are unaffected (they stay `-O3`). End-to-end: the 2800-key repro drops from ~19s to ~5.4s and still runs correctly; a 400-key program is unchanged (stays `-O3`). New `compile_plan_downgrades_to_o0_for_oversized_module` test + existing linker tests pass. (The issue's headline 2100=114s is ~10x stale — current main compiles 2100 keys in ~11s — but the super-linear LLVM cost remained; this caps it for the pathological generated-data case.) 
--- crates/perry-codegen/src/linker.rs | 61 +++++++++++++++++++++++++++++- 1 file changed, 60 insertions(+), 1 deletion(-) diff --git a/crates/perry-codegen/src/linker.rs b/crates/perry-codegen/src/linker.rs index 319c86c40d..97440dad12 100644 --- a/crates/perry-codegen/src/linker.rs +++ b/crates/perry-codegen/src/linker.rs @@ -50,11 +50,31 @@ fn native_tuning_arg_for_host() -> &'static str { } } +/// Default IR-size cutoff above which a module is compiled at `-O0` instead +/// of `-O3` (#4880). A module dominated by a huge generated literal +/// (config / lookup table) lowers to one enormous function whose +/// thousands of `alloca`s make LLVM's `-O1+` pipeline (SROA / mem2reg / +/// GVN) super-linear: a 2800-key object literal is ~10 MB of IR that +/// `clang -c -O3` chews on for ~18 s (and multi-thousand-key literals were +/// reported taking minutes / getting killed), versus ~3 s at `-O0`. +/// `-O1`/`-O2` are no faster than `-O3` here, so `-O0` is the only escape. +/// Such modules are almost always static data where optimization is +/// irrelevant. Tunable via `PERRY_LL_O0_THRESHOLD_BYTES`. 
+const DEFAULT_LL_O0_THRESHOLD_BYTES: usize = 6 * 1024 * 1024; + +fn ll_o0_threshold_bytes() -> usize { + std::env::var("PERRY_LL_O0_THRESHOLD_BYTES") + .ok() + .and_then(|s| s.trim().parse::<usize>().ok()) + .unwrap_or(DEFAULT_LL_O0_THRESHOLD_BYTES) +} + fn build_clang_compile_plan( clang: PathBuf, ll_path: PathBuf, obj_path: PathBuf, target_triple: Option<&str>, + ll_byte_size: usize, ) -> ClangCompilePlan { let effective_target = target_triple .map(|s| s.to_string()) @@ -64,7 +84,24 @@ fn build_clang_compile_plan( .then(|| native_tuning_arg_for_host().to_string()); let stderr_remarks_path = PathBuf::from(format!("{}.clang-stderr", obj_path.display())); - let mut clang_args = vec!["-c".to_string(), "-O3".to_string()]; + // #4880: fall back to -O0 for pathologically-large modules so a giant + // generated literal doesn't make `clang -c` super-linear (see + // DEFAULT_LL_O0_THRESHOLD_BYTES). + let o0_threshold = ll_o0_threshold_bytes(); + let opt_flag = if o0_threshold > 0 && ll_byte_size > o0_threshold { + eprintln!( + "perry: module IR is {:.1} MB (> {:.1} MB); compiling it at -O0 instead of -O3 \ + so LLVM's -O1+ pipeline doesn't blow up on the oversized function (#4880). \ + Override with PERRY_LL_O0_THRESHOLD_BYTES.", + ll_byte_size as f64 / (1024.0 * 1024.0), + o0_threshold as f64 / (1024.0 * 1024.0), + ); + "-O0" + } else { + "-O3" + }; + + let mut clang_args = vec!["-c".to_string(), opt_flag.to_string()]; if std::env::var("PERRY_DEBUG_SYMBOLS").is_ok() { clang_args.push("-g".to_string()); } @@ -151,6 +188,7 @@ pub fn compile_ll_to_object(ll_text: &str, target_triple: Option<&str>) -> Resul ll_path.clone(), obj_path.clone(), target_triple, + ll_text.len(), ); // Pre-flight probe: capture clang's default Target: line once per process, @@ -822,8 +860,11 @@ mod tests { PathBuf::from("/tmp/input.ll"), PathBuf::from("/tmp/output.o"), None, + 0, ); assert!(plan.clang_args.contains(&"-fno-math-errno".to_string())); + // Small module → optimized at -O3 (#4880). 
+ assert!(plan.clang_args.contains(&"-O3".to_string())); assert!(plan.clang_args.contains(&"-target".to_string())); assert!(plan.analysis_clang_args.contains(&"-target".to_string())); assert_eq!( @@ -833,6 +874,22 @@ mod tests { assert!(!plan.effective_target.is_empty()); } + #[test] + fn compile_plan_downgrades_to_o0_for_oversized_module() { + // #4880: a module whose IR exceeds the threshold compiles at -O0 + // (avoiding LLVM's super-linear -O1+ pipeline on a giant function). + let huge = ll_o0_threshold_bytes() + 1; + let plan = build_clang_compile_plan( + PathBuf::from("clang"), + PathBuf::from("/tmp/input.ll"), + PathBuf::from("/tmp/output.o"), + None, + huge, + ); + assert!(plan.clang_args.contains(&"-O0".to_string())); + assert!(!plan.clang_args.contains(&"-O3".to_string())); + } + #[test] fn compile_plan_skips_native_tuning_for_explicit_target() { let plan = build_clang_compile_plan( @@ -840,6 +897,7 @@ mod tests { PathBuf::from("/tmp/input.ll"), PathBuf::from("/tmp/output.o"), Some("x86_64-unknown-linux-gnu"), + 0, ); assert_eq!(plan.effective_target, "x86_64-unknown-linux-gnu"); assert_eq!(plan.native_tuning_arg, None); @@ -861,6 +919,7 @@ mod tests { PathBuf::from("/tmp/input.ll"), PathBuf::from("/tmp/output.o"), Some("x86_64-unknown-linux-gnu"), + 0, ); write_compile_plan_metadata(&plan, &temp).unwrap(); let text = fs::read_to_string(&temp).unwrap();