From f0bff8f3730e04966927bbe2b14ab9856403f0e7 Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Tue, 26 May 2026 21:07:16 -0600
Subject: [PATCH 01/11] bench: extend perf-tracking infrastructure

- shake-bench: parse GC pause time, max live data, productivity,
  major/minor GC counts from +RTS -s; emit .hp.csv top-cost-centre
  summary alongside each .hp file
- ghcide-bench: capture per-sample latencies; emit p50/p95/p99/stddev
  CSV columns; add memory-pressure experiment
- bench: smoke phony target for fast local iteration; parameterise
  MultiLayerModules.sh with --depth/--width; add MultiLayerModulesXL
  (200x50 = 10k modules) for production-scale stress

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 bench/Main.hs                                 |  18 +-
 bench/MultiLayerModules.sh                    |  41 +++-
 bench/config.yaml                             |  23 ++-
 bench/config_overnight.yaml                   | 107 +++++++++++
 bench/run_pass2_costcentre.sh                 |  88 +++++++++
 ghcide-bench/src/Experiments.hs               |  95 +++++++++-
 shake-bench/shake-bench.cabal                 |   1 +
 .../src/Development/Benchmark/Rules.hs        | 177 ++++++++++++++++--
 8 files changed, 517 insertions(+), 33 deletions(-)
 create mode 100644 bench/config_overnight.yaml
 create mode 100755 bench/run_pass2_costcentre.sh

diff --git a/bench/Main.hs b/bench/Main.hs
index d1ac603b18..9fb4235a7c 100644
--- a/bench/Main.hs
+++ b/bench/Main.hs
@@ -64,7 +64,7 @@ import           Development.Shake           (Action,
                                               actionBracket, addOracle,
                                               askOracle, command, command_,
                                               getDirectoryFiles, liftIO, need,
-                                              newCache, shakeArgsWith,
+                                              newCache, phony, shakeArgsWith,
                                               shakeOptions, versioned, want)
 import           Development.Shake.Classes
 import           Experiments.Types           (Example (exampleName),
@@ -168,6 +168,22 @@ createBuildSystem config = do
   whenJust (profileInterval configStatic) $ \i -> do
     phonyRules "profiled-" binaryName (CheapHeapProfiling i) build (examples configStatic)
 
+  -- Fast smoke target: one small example, a few cheap experiments, only HEAD.
+  -- Intended for tight local iteration while improving perf/memory.
+  -- Sample count is still controlled by `samples:` in config.yaml — drop it
+  -- to ~10 for the fastest turnaround.
+  -- DummyLevel0M01NoTH is a single empty module so `searchSymbol` returns
+  -- Nothing for `identifierP`; pick experiments that don't require it.
+  let smokeExample = "DummyLevel0M01NoTH"
+      smokeExperiments = ["edit", "edit-header", "documentSymbols after edit"]
+      smokeConfigs = [ confName | ConfigurationDescriptor{..} <- configurations configStatic ]
+  phony "smoke" $ need
+    [ build </> "unprofiled" </> smokeExample </> "HEAD" </> conf
+            </> escaped (escapeExperiment (Unescaped experiment)) <.> "csv"
+    | conf <- smokeConfigs
+    , experiment <- smokeExperiments
+    ]
+
   return configStatic
 
 disableAllPluginsBut :: (PluginId -> Bool) -> Plugin.Config
diff --git a/bench/MultiLayerModules.sh b/bench/MultiLayerModules.sh
index 38d85ce9ed..146fc9cb1d 100755
--- a/bench/MultiLayerModules.sh
+++ b/bench/MultiLayerModules.sh
@@ -1,9 +1,34 @@
 #!/usr/bin/env bash
-# Generate $DEPTH layers of modules with $WIDTH modules on each layer
-# Every module on layer N imports all the modules on layer N-1
-# MultiLayerModules.hs imports all the modules from the last layer
+# Generate $DEPTH layers of modules with $WIDTH modules on each layer.
+# Every module on layer N imports all the modules on layer N-1.
+# MultiLayerModules.hs imports all the modules from the last layer.
+#
+# Flags (any order):
+#   --th             enable TemplateHaskell pragma on the top module
+#   --depth N        override DEPTH (default 15)
+#   --width N        override WIDTH (default 40)
 DEPTH=15
 WIDTH=40
+TH=
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --th)
+      TH=1
+      shift
+      ;;
+    --depth)
+      DEPTH=$2
+      shift 2
+      ;;
+    --width)
+      WIDTH=$2
+      shift 2
+      ;;
+    *)
+      shift
+      ;;
+  esac
+done
 cat >hie.yaml << EOF
 cradle:
   direct:
@@ -22,13 +47,11 @@ for l in $(seq 1 $DEPTH); do
     done
   done
 done
-case "$1" in
-  '--th')
-    echo "{-# LANGUAGE TemplateHaskell #-}" > MultiLayerModules.hs
-    ;;
-esac
+if [[ -n "$TH" ]]; then
+  echo "{-# LANGUAGE TemplateHaskell #-}" > MultiLayerModules.hs
+fi
 echo "module MultiLayerModules where" >> MultiLayerModules.hs
-  echo "          - MultiLayerModules.hs" >> hie.yaml;
+echo "          - MultiLayerModules.hs" >> hie.yaml;
 for j in $(seq -w 1 $WIDTH); do
   echo "import DummyLevel${DEPTH}M$j" >> MultiLayerModules.hs;
 done
diff --git a/bench/config.yaml b/bench/config.yaml
index b1218976ce..6616ad75b1 100644
--- a/bench/config.yaml
+++ b/bench/config.yaml
@@ -1,6 +1,7 @@
 # The number of samples to run per experiment.
-# At least 100 is recommended in order to observe space leaks
-samples: 50
+# At least 100 is recommended in order to observe space leaks.
+# Temporarily lowered to 10 for fast turnaround during initial analysis.
+samples: 10
 
 buildTool: cabal
 
@@ -77,6 +78,23 @@ examples:
     modules:
         - DummyLevel1M01.hs
 
+  # Large synthetic project: 200 layers × 50 modules/layer = 10000 modules,
+  # ~500k import edges (each module imports every module of the prior layer).
+  # This is closer in scale to a real production Haskell codebase than the
+  # default MultiLayerModules. Expect this to be slow: initial cradle load
+  # alone takes many minutes, and each experiment runs `samples:` rounds.
+  # Recommended: drop `samples:` to ~10 and run via single-experiment targets
+  # (e.g. `bench-results/unprofiled/MultiLayerModulesXL/HEAD/All/edit.csv`)
+  # rather than the full matrix.
+  - name: MultiLayerModulesXL
+    path: bench/MultiLayerModules.sh
+    script: True
+    script-args: ["--depth", "200", "--width", "50"]
+    modules:
+        - MultiLayerModules.hs
+        - DummyLevel0M01.hs
+        - DummyLevel100M01.hs
+
   # Small but heavily multi-component example
   # Disabled as it is far to slow. hie-bios >0.7.2 should help
   # - name: HLS
@@ -114,6 +132,7 @@ experiments:
     - "hole fit suggestions"
     - "eval execute single-line code lens"
     - "eval execute multi-line code lens"
+    - "memory pressure"
 
 # An ordered list of versions to analyze
 versions:
diff --git a/bench/config_overnight.yaml b/bench/config_overnight.yaml
new file mode 100644
index 0000000000..7933a6be1e
--- /dev/null
+++ b/bench/config_overnight.yaml
@@ -0,0 +1,107 @@
+# Overnight Pass 1: GC flag A/B/C across a subset of experiments.
+# Same HLS binary in all cases; only the +RTS args supplied via extra-args differ.
+# Outputs land in bench-results/unprofiled/<example>/HEAD/All/.
+
+samples: 50
+
+buildTool: cabal
+outputFolder: bench-results
+
+# Heap profiling disabled for this pass — we want clean timing.
+# profileInterval: 1
+
+parallelism: 1
+
+examples:
+  # ---------- cabal-3.16.1.0 variants (~235 transitive modules) ----------
+  - name: cabal-default
+    package: Cabal
+    version: 3.16.1.0
+    modules:
+        - src/Distribution/Simple.hs
+        - src/Distribution/Types/ComponentLocalBuildInfo.hs
+    extra-args: []
+
+  - name: cabal-A64m-F1
+    package: Cabal
+    version: 3.16.1.0
+    modules:
+        - src/Distribution/Simple.hs
+        - src/Distribution/Types/ComponentLocalBuildInfo.hs
+    extra-args: ["+RTS", "-A64m", "-F1", "-RTS"]
+
+  - name: cabal-A128m-F11
+    package: Cabal
+    version: 3.16.1.0
+    modules:
+        - src/Distribution/Simple.hs
+        - src/Distribution/Types/ComponentLocalBuildInfo.hs
+    extra-args: ["+RTS", "-A128m", "-F1.1", "-RTS"]
+
+  - name: cabal-chunked
+    package: Cabal
+    version: 3.16.1.0
+    modules:
+        - src/Distribution/Simple.hs
+        - src/Distribution/Types/ComponentLocalBuildInfo.hs
+    extra-args: ["+RTS", "-A32m", "-n4m", "-F1.1", "-RTS"]
+
+  # ---------- MultiLayerModulesNoTH (600 modules) ----------
+  - name: mlm-default
+    path: bench/MultiLayerModules.sh
+    script: True
+    script-args: []
+    modules:
+        - MultiLayerModules.hs
+        - DummyLevel0M01.hs
+        - DummyLevel1M01.hs
+    extra-args: []
+
+  - name: mlm-A64m-F1
+    path: bench/MultiLayerModules.sh
+    script: True
+    script-args: []
+    modules:
+        - MultiLayerModules.hs
+        - DummyLevel0M01.hs
+        - DummyLevel1M01.hs
+    extra-args: ["+RTS", "-A64m", "-F1", "-RTS"]
+
+# Experiment subset — picked to maximise signal in the slow paths.
+experiments:
+    - "edit"
+    - "semanticTokens after typing burst"
+    - "hover after typing burst"
+    - "completions after typing burst"
+    - "memory pressure"
+    - "eval execute multi-line code lens"
+
+versions:
+- HEAD
+
+configurations:
+- All:
+  - alternateNumberFormat
+  - callHierarchy
+  - changeTypeSignature
+  - class
+  - codeRange
+  - eval
+  - explicitFixity
+  - fourmolu
+  - gadt
+  - ghcide-code-actions-bindings
+  - ghcide-code-actions-fill-holes
+  - ghcide-code-actions-imports-exports
+  - ghcide-code-actions-type-signatures
+  - ghcide-completions
+  - ghcide-type-lenses
+  - hlint
+  - importLens
+  - moduleName
+  - ormolu
+  - pragmas
+  - qualifyImportedNames
+  - rename
+  - stylish-haskell
+  - semanticTokens
diff --git a/bench/run_pass2_costcentre.sh b/bench/run_pass2_costcentre.sh
new file mode 100755
index 0000000000..ff2cdf0750
--- /dev/null
+++ b/bench/run_pass2_costcentre.sh
@@ -0,0 +1,88 @@
+#!/usr/bin/env bash
+# Pass 2: cost-centre profiled benchmark.
+#
+# Requires bench-results/binaries/HEAD-prof/haskell-language-server to exist
+# (built with --enable-profiling --profiling-detail=late).
+#
+# For each (example, experiment) pair, invoke ghcide-bench directly with
+# +RTS -p -hc -i1 -po<stem> -RTS so we get:
+#   <stem>.prof       — time + allocation by cost-centre
+#   <stem>.hp         — heap residency by cost-centre over time
+#   <stem>.hp.csv     — top-N cost-centres (produced by parseHpProfile, but
+#                       NOTE: only the bench harness invokes that. For Pass 2
+#                       we'd need to run summarizeHpProfile separately. See
+#                       end of script.)
+set -euo pipefail
+
+HLS=bench-results/binaries/HEAD-prof/haskell-language-server
+OUTDIR=bench-results/prof-cc
+SAMPLES=20
+
+if [[ ! -x "$HLS" ]]; then
+    echo "ERROR: profiled HLS not found at $HLS" >&2
+    exit 1
+fi
+
+run_experiment() {
+    local example_name=$1
+    local experiment=$2
+    shift 2
+    local example_args=("$@")
+
+    local exp_slug=${experiment// /_}
+    local outdir="$OUTDIR/$example_name/$exp_slug"
+    mkdir -p "$outdir"
+    local stem="$(pwd)/$outdir/$exp_slug"
+
+    echo
+    echo "=== $example_name :: $experiment ==="
+
+    cabal exec -- ghcide-bench \
+        --timeout=600 \
+        --no-clean \
+        --samples="$SAMPLES" \
+        --csv="$outdir/$exp_slug.csv" \
+        --ghcide="$(pwd)/$HLS" \
+        --select "$experiment" \
+        --lsp-config \
+        "${example_args[@]}" \
+        --ghcide-options=+RTS \
+        --ghcide-options=-p \
+        --ghcide-options=-hc \
+        --ghcide-options=-i1 \
+        --ghcide-options=-po"$stem" \
+        --ghcide-options=-l \
+        --ghcide-options=-ol"$stem.eventlog" \
+        --ghcide-options=-S"$stem.gcStats.log" \
+        --ghcide-options=-RTS \
+        < /dev/null || echo "  FAILED — continuing"
+}
+
+# ---------- Example: cabal-3.16.1.0 ----------
+CABAL_ARGS=(
+    --example-package-name Cabal
+    --example-package-version 3.16.1.0
+    --example-name cabal
+    --example-module=src/Distribution/Simple.hs
+    --example-module=src/Distribution/Types/ComponentLocalBuildInfo.hs
+)
+for exp in "memory pressure" "completions after typing burst" "hover after typing burst" "edit" "semanticTokens after typing burst"; do
+    run_experiment cabal "$exp" "${CABAL_ARGS[@]}"
+done
+
+# ---------- Example: MultiLayerModulesNoTH ----------
+MLM_ARGS=(
+    --example-script "$(pwd)/bench/MultiLayerModules.sh"
+    --example-name MultiLayerModulesNoTH
+    --example-module=MultiLayerModules.hs
+    --example-module=DummyLevel0M01.hs
+    --example-module=DummyLevel1M01.hs
+)
+for exp in "memory pressure" "edit" "hover after typing burst"; do
+    run_experiment MultiLayerModulesNoTH "$exp" "${MLM_ARGS[@]}"
+done
+
+echo
+echo "=== Pass 2 complete ==="
+echo "Profile artefacts under $OUTDIR/"
+echo "Inspect .prof files for time+alloc attribution: head -50 $OUTDIR/cabal/memory_pressure/memory_pressure.prof"
diff --git a/ghcide-bench/src/Experiments.hs b/ghcide-bench/src/Experiments.hs
index 8e70483559..4e673e6993 100644
--- a/ghcide-bench/src/Experiments.hs
+++ b/ghcide-bench/src/Experiments.hs
@@ -352,6 +352,45 @@ experiments =
               )
         ),
       ---------------------------------------------------------------------------------------
+      ---------------------------------------------------------------------------------------
+      -- Memory-pressure experiment: hammers the server with a variety of
+      -- requests across every open document, interleaved with edits, to force
+      -- the IDE to retain caches/results across many rules. Each sample is one
+      -- "round" so the resulting peak/post-GC heap (via RTS -s) reflects the
+      -- worst case under sustained load. The default sample count makes this
+      -- much longer than other experiments — override with --samples when you
+      -- only need a quick smoke check.
+      bench "memory pressure" $ \docs -> do
+        -- Round 1: typing burst across all docs.
+        applyTypingBurst docs
+        _ <- waitForBuildQueue
+        -- Round 2: exercise hover + getDefinition + completions on every
+        -- doc that has an identifier position. These populate per-rule
+        -- caches across the whole project.
+        forM_ docs $ \DocumentPositions{..} -> do
+          forM_ identifierP $ \p -> do
+            _ <- getHover doc p
+            _ <- getDefinitions doc p
+            _ <- getCompletions doc p
+            return ()
+        _ <- waitForBuildQueue
+        -- Round 3: code actions across every doc — these tend to pull in
+        -- additional rules (suggestions, refactor previews, etc.).
+        forM_ docs $ \DocumentPositions{..} -> do
+          forM_ identifierP $ \p ->
+            void $ getCodeActions doc (Range p p)
+        _ <- waitForBuildQueue
+        -- Round 4: document symbols on every doc — populates the symbol cache.
+        forM_ docs $ \DocumentPositions{..} ->
+          void $ getDocumentSymbols doc
+        _ <- waitForBuildQueue
+        -- Round 5: another typing burst, to ensure the caches above survive
+        -- invalidation rather than getting dropped — this is what catches
+        -- "we recompute everything from scratch on every edit" regressions.
+        applyTypingBurst docs
+        _ <- waitForBuildQueue
+        return True,
+      ---------------------------------------------------------------------------------------
       benchWithSetup
         "eval execute multi-line code lens"
         ( mapM_ $ \DocumentPositions{..} -> do
@@ -533,6 +572,13 @@ runBenchmarksFun dir allBenchmarks = do
         , "rulesTotal"
         , "ruleEdges"
         , "ghcRebuilds"
+        , "userP50"
+        , "userP95"
+        , "userP99"
+        , "userStdDev"
+        , "delayedP50"
+        , "delayedP95"
+        , "delayedP99"
         ]
       rows =
         [ [ name,
@@ -552,7 +598,14 @@ runBenchmarksFun dir allBenchmarks = do
             show rulesVisited,
             show rulesTotal,
             show edgesTotal,
-            show rebuildsTotal
+            show rebuildsTotal,
+            showMs (quantile 0.50 userWaitsSamples),
+            showMs (quantile 0.95 userWaitsSamples),
+            showMs (quantile 0.99 userWaitsSamples),
+            showMs (stdDev   userWaitsSamples),
+            showMs (quantile 0.50 delayedWorkSamples),
+            showMs (quantile 0.95 delayedWorkSamples),
+            showMs (quantile 0.99 delayedWorkSamples)
           ]
           | (Bench {name, samples}, BenchRun {..}) <- results,
             let runSetup' = if runSetup < 0.01 then 0 else runSetup
@@ -625,6 +678,10 @@ data BenchRun = BenchRun
     runExperiment        :: !Seconds,
     userWaits            :: !Seconds,
     delayedWork          :: !Seconds,
+    -- | Per-sample user-time samples (length == samples on success).
+    userWaitsSamples     :: ![Seconds],
+    -- | Per-sample delayed-work samples (length == samples on success).
+    delayedWorkSamples   :: ![Seconds],
     firstResponse        :: !Seconds,
     firstResponseDelayed :: !Seconds,
     rulesBuilt           :: !Int,
@@ -637,7 +694,26 @@ data BenchRun = BenchRun
   }
 
 badRun :: BenchRun
-badRun = BenchRun 0 0 0 0 0 0 0 0 0 0 0 0 0 False
+badRun = BenchRun 0 0 0 0 0 [] [] 0 0 0 0 0 0 0 0 False
+
+-- | Approximate quantile of a list of samples. Uses the nearest-rank
+-- method; good enough for benchmark reporting at n>=20.
+quantile :: Double -> [Seconds] -> Seconds
+quantile _ [] = 0
+quantile q xs =
+    let sorted = sort xs
+        n      = length sorted
+        idx    = max 0 $ min (n - 1) $ floor (q * fromIntegral (n - 1))
+    in sorted !! idx
+
+-- | Population standard deviation.
+stdDev :: [Seconds] -> Seconds
+stdDev [] = 0
+stdDev xs =
+    let n  = fromIntegral (length xs)
+        m  = sum xs / n
+        sq x = let d = x - m in d * d
+    in sqrt (sum (map sq xs) / n)
 
 waitForProgressStart :: Session ()
 waitForProgressStart = void $ do
@@ -688,8 +764,9 @@ runBench runSess Bench{..} = handleAny (\e -> print e >> return badRun)
 
       liftIO $ output $ "Running " <> name <> " benchmark"
       (runSetup, ()) <- duration $ benchSetup docs
-      let loop' (Just timeForFirstResponse) !userWaits !delayedWork 0 = return $ Just (userWaits, delayedWork, timeForFirstResponse)
-          loop' timeForFirstResponse !userWaits !delayedWork n = do
+      -- Accumulate per-sample (t, td) in reverse order; we reverse once at the end.
+      let loop' (Just timeForFirstResponse) !samplesAcc 0 = return $ Just (reverse samplesAcc, timeForFirstResponse)
+          loop' timeForFirstResponse !samplesAcc n = do
             (t, res) <- duration $ experiment docs
             if not res
               then return Nothing
@@ -697,12 +774,16 @@ runBench runSess Bench{..} = handleAny (\e -> print e >> return badRun)
                 output (showDuration t)
                 -- Wait for the delayed actions to finish
                 td <- waitForBuildQueue
-                loop' (timeForFirstResponse <|> Just (t,td)) (userWaits+t) (delayedWork+td) (n -1)
+                loop' (timeForFirstResponse <|> Just (t,td)) ((t,td):samplesAcc) (n -1)
           loop = loop' Nothing
 
-      (runExperiment, result) <- duration $ loop 0 0 samples
+      (runExperiment, result) <- duration $ loop [] samples
       let success = isJust result
-          (userWaits, delayedWork, (firstResponse, firstResponseDelayed)) = fromMaybe (0,0,(0,0)) result
+          (perSample, (firstResponse, firstResponseDelayed)) = fromMaybe ([], (0,0)) result
+          userWaitsSamples = map fst perSample
+          delayedWorkSamples = map snd perSample
+          userWaits = sum userWaitsSamples
+          delayedWork = sum delayedWorkSamples
 
       rulesTotal <- length <$> getStoredKeys
       rulesBuilt <- either (const 0) length <$> getBuildKeysBuilt
diff --git a/shake-bench/shake-bench.cabal b/shake-bench/shake-bench.cabal
index 3d2a7d3174..e5bee2fc42 100644
--- a/shake-bench/shake-bench.cabal
+++ b/shake-bench/shake-bench.cabal
@@ -22,6 +22,7 @@ library
         aeson,
         base == 4.*,
         bytestring,
+        containers,
         directory,
         extra >= 1.7.2,
         filepath,
diff --git a/shake-bench/src/Development/Benchmark/Rules.hs b/shake-bench/src/Development/Benchmark/Rules.hs
index 2ddcb97cc5..779306e84e 100644
--- a/shake-bench/src/Development/Benchmark/Rules.hs
+++ b/shake-bench/src/Development/Benchmark/Rules.hs
@@ -72,9 +72,12 @@ import           Data.Aeson.Lens           (AsJSON (_JSON), _Object, _String)
 import           Data.ByteString.Lazy      (ByteString)
 import           Data.Char                 (isDigit)
 import           Data.List                 (find, intercalate, isInfixOf,
-                                            isSuffixOf, stripPrefix, transpose)
+                                            isPrefixOf, isSuffixOf, sortBy,
+                                            stripPrefix, transpose)
 import           Data.List.Extra           (lower, splitOn)
+import qualified Data.Map.Strict           as Map
 import           Data.Maybe                (fromMaybe)
+import           Data.Ord                  (Down (..), comparing)
 import           Data.String               (fromString)
 import           Data.Text                 (Text)
 import qualified Data.Text                 as T
@@ -327,29 +330,175 @@ benchRules build MkBenchRules{..} = do
             NoProfiling -> writeFile outHp dummyHp
             _           -> return ()
 
-        -- extend csv output with allocation data
+        -- Write a co-located top-N heap summary derived from the .hp file.
+        -- Untracked side-effect; regenerated whenever the .hp is.
+        liftIO $ do
+            hp <- readFile outHp
+            writeFile (outHp <.> "csv") (summarizeHpProfile 20 hp)
+
+        -- extend csv output with allocation data + GC/pause/productivity
         csvContents <- liftIO $ lines <$> readFile outcsv
         let header = head csvContents
             results = tail csvContents
-            header' = header <> ", maxResidency, allocatedBytes"
+            extraHeaders =
+                [ "maxResidency"
+                , "allocatedBytes"
+                , "maxSlop"
+                , "mutTimeS"
+                , "gcTimeS"
+                , "maxGcPauseS"
+                , "productivityPct"
+                , "majorGCs"
+                , "minorGCs"
+                ]
+            header' = header <> ", " <> intercalate ", " extraHeaders
         results' <- forM results $ \row -> do
-            (maxResidency, allocations) <- liftIO
-                    (parseMaxResidencyAndAllocations <$> readFile outGc)
-            return $ printf "%s, %s, %s" row (showMB maxResidency) (showMB allocations)
+            stats <- liftIO (parseRTSStats <$> readFile outGc)
+            return $ intercalate ", "
+                [ row
+                , showMB (rtsMaxResidency stats)
+                , showMB (rtsAllocatedBytes stats)
+                , showMB (rtsMaxSlop stats)
+                , showSeconds (rtsMutTime stats)
+                , showSeconds (rtsGcTime stats)
+                , showSeconds (rtsMaxGcPause stats)
+                , showDouble (rtsProductivityPct stats)
+                , showInt (rtsMajorGCs stats)
+                , showInt (rtsMinorGCs stats)
+                ]
         let csvContents' = header' : results'
         writeFileLines outcsv csvContents'
     where
         showMB :: Int -> String
-        showMB x = show (x `div` 2^(20::Int)) <> "MB"
+        showMB x | x < 0 = "0MB"
+                 | otherwise = show (x `div` 2^(20::Int)) <> "MB"
+        showSeconds :: Double -> String
+        showSeconds x | x < 0 = "0"
+                      | otherwise = printf "%.4f" x
+        showDouble :: Double -> String
+        showDouble x | x < 0 = "0"
+                     | otherwise = printf "%.2f" x
+        showInt :: Int -> String
+        showInt x | x < 0 = "0"
+                  | otherwise = show x
+
+-- | Summarize a GHC heap profile (.hp) file into a top-N cost-centre CSV.
+-- For each cost-centre name we report the maximum retained bytes across
+-- all samples, plus the final-sample retained bytes (useful for spotting
+-- leaks that grow without bound).
+summarizeHpProfile :: Int -> String -> String
+summarizeHpProfile topN raw =
+    let ls = lines raw
+        samples = collectSamples ls
+        accumOne acc (cc, bs) = Map.insertWith combine cc (bs, bs) acc
+          where combine (newMax, newLast) (oldMax, _) = (max newMax oldMax, newLast)
+        -- Aggregate across all samples: track running max and last value per CC.
+        agg = foldl (\acc s -> foldl accumOne acc s) Map.empty samples
+        sorted = sortBy (comparing (Down . maxOf)) (Map.toList agg)
+        maxOf (_, (m, _)) = m
+        header = "costCentre,maxRetainedBytes,finalRetainedBytes"
+        body = [ cc <> "," <> show m <> "," <> show l
+               | (cc, (m, l)) <- take topN sorted
+               ]
+    in unlines (header : body)
+  where
+    -- | Group consecutive non-marker lines between BEGIN_SAMPLE/END_SAMPLE pairs.
+    collectSamples :: [String] -> [[(String, Integer)]]
+    collectSamples xs = case dropWhile (not . isBeginSample) xs of
+        [] -> []
+        (_:rest) ->
+            let (sample, rest') = break isEndSample rest
+                parsed = [ p | l <- sample, Just p <- [parseSampleLine l] ]
+            in parsed : case rest' of
+                []    -> []
+                (_:r) -> collectSamples r
+    isBeginSample = ("BEGIN_SAMPLE" `isPrefixOf`) . dropWhile (== ' ')
+    isEndSample   = ("END_SAMPLE"   `isPrefixOf`) . dropWhile (== ' ')
+
+-- | Parse one cost-centre line from a heap profile sample, e.g.
+-- @"GHC.Conc.Sync.CAF" 5678@ or @PINNED 1234@. The cost-centre name may
+-- contain spaces (when quoted), so we treat everything before the trailing
+-- number as the name.
+parseSampleLine :: String -> Maybe (String, Integer)
+parseSampleLine l = case reverse (words l) of
+    (numStr:rest) -> do
+        n <- readMaybe numStr
+        let name = unwords (reverse rest)
+        if null name then Nothing else Just (name, n)
+    _ -> Nothing
+
+-- | Stats extracted from `+RTS -s` (or -S) summary output.
+data RTSStats = RTSStats
+    { rtsMaxResidency    :: !Int
+    , rtsAllocatedBytes  :: !Int
+    , rtsMaxSlop         :: !Int
+    , rtsMutTime         :: !Double  -- ^ MUT time (seconds)
+    , rtsGcTime          :: !Double  -- ^ GC time (seconds)
+    , rtsMaxGcPause      :: !Double  -- ^ Max GC pause across all generations (seconds)
+    , rtsProductivityPct :: !Double  -- ^ Productivity (% of total user time)
+    , rtsMajorGCs        :: !Int     -- ^ Gen 1 collections
+    , rtsMinorGCs        :: !Int     -- ^ Gen 0 collections
+    }
 
--- Parse the max residency and allocations in RTS -s output
-parseMaxResidencyAndAllocations :: String -> (Int, Int)
-parseMaxResidencyAndAllocations input =
-    (f "maximum residency", f "bytes allocated in the heap")
+-- | Parse the summary block at the tail of `+RTS -s`/`-S` output.
+-- Each field is best-effort; missing fields return -1.
+parseRTSStats :: String -> RTSStats
+parseRTSStats input = RTSStats
+    { rtsMaxResidency    = digitsFirstWord "maximum residency"
+    , rtsAllocatedBytes  = digitsFirstWord "bytes allocated in the heap"
+    , rtsMaxSlop         = digitsFirstWord "maximum slop"
+    , rtsMutTime         = secondsAt 2 "MUT     time"
+    , rtsGcTime          = secondsAt 2 "GC      time"
+    , rtsMaxGcPause      = parseMaxPause
+    , rtsProductivityPct = parseProductivity
+    , rtsMajorGCs        = parseGenColls "Gen  1"
+    , rtsMinorGCs        = parseGenColls "Gen  0"
+    }
   where
-    inps = reverse $ lines input
-    f label = case find (label `isInfixOf`) inps of
-        Just l  -> read $ filter isDigit $ head $ words l
+    ls = lines input
+    findLine label = find (label `isInfixOf`) ls
+    digitsFirstWord label = case findLine label of
+        Just l  -> readIntDef $ filter isDigit $ head (words l ++ [""])
+        Nothing -> -1
+    readIntDef "" = -1
+    readIntDef s  = fromMaybe (-1) (readMaybe s)
+    readDoubleDef s = fromMaybe (-1) (readMaybe (takeWhile (\c -> isDigit c || c == '.' || c == '-') s))
+    -- Parse the time value (in seconds) at word index `i` (after dropping the label
+    -- words). For lines like "  MUT     time    0.012s  (  0.012s elapsed)".
+    secondsAt i label = case findLine label of
+        Just l ->
+            let ws = words l
+            in if length ws > i then readDoubleDef (ws !! i) else -1
+        Nothing -> -1
+    -- "Max pause" appears on each Gen line; take the worst.
+    parseMaxPause =
+        let candidates =
+                [ readDoubleDef w
+                | l <- ls
+                , "Gen " `isInfixOf` l
+                , let ws = words l
+                , not (null ws)
+                , w <- take 1 (reverse ws)
+                ]
+            valid = filter (>= 0) candidates
+        in if null valid then -1 else maximum valid
+    -- "Productivity  81.2% of total user, ..."
+    parseProductivity = case findLine "Productivity" of
+        Just l ->
+            let ws = words l
+            in case ws of
+                (_:n:_) -> readDoubleDef n
+                _       -> -1
+        Nothing -> -1
+    -- "  Gen  0         1 colls,     0 par   ..." → 1
+    parseGenColls genLabel = case findLine genLabel of
+        Just l ->
+            let ws = words l
+                -- Drop "Gen N" prefix, then first numeric word is colls count.
+                rest = drop 2 ws
+            in case rest of
+                (n:_) -> readIntDef (filter isDigit n)
+                _     -> -1
         Nothing -> -1
 
 

From 3ce6464d33850a472e9b5162059300a4335da18a Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Tue, 26 May 2026 22:52:19 -0600
Subject: [PATCH 02/11] ghcide: preserve pointer identity on early-cutoff match

defineEarlyCutoff' reuses the previously-cached Succeeded value when the
cutoff fingerprint matches the prior. Without this, every cutoff-matched
rerun stored a freshly allocated (but structurally equivalent) value,
and downstream rules whose deps had not changed continued to hold
pointers to the prior value. Multiple equivalent copies of large shared
values accumulated across rebuilds.

On a 10k-module MLM-XL project, this cut NodeKey_Module heap growth by
~3000x (241 KB/s -> 81 B/s) and overall heap leak by ~50% (1.9 MB/s ->
0.95 MB/s). Largest source of remaining leak is HPT/UniqDFM churn via
GhcSessionDeps; tracked separately.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ghcide/src/Development/IDE/Core/Shake.hs | 13 ++++++++++++-
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/ghcide/src/Development/IDE/Core/Shake.hs b/ghcide/src/Development/IDE/Core/Shake.hs
index 9334a13ad3..ab207b0a11 100644
--- a/ghcide/src/Development/IDE/Core/Shake.hs
+++ b/ghcide/src/Development/IDE/Core/Shake.hs
@@ -1288,7 +1288,7 @@ defineEarlyCutoff' doDiagnostics cmp key file mbOld mode action = do
                         pure (Nothing, ([ideErrorText file (T.pack $ show (key, file) ++ show e) | not $ isBadDependency e],Nothing))
 
                 ver <- estimateFileVersionUnsafely key mbRes file
-                (bs, res) <- case mbRes of
+                (bs, freshRes) <- case mbRes of
                     Nothing -> do
                         pure (toShakeValue ShakeStale mbBs, staleV)
                     Just v -> pure (maybe ShakeNoCutoff ShakeResult mbBs, Succeeded ver v)
@@ -1299,6 +1299,17 @@ defineEarlyCutoff' doDiagnostics cmp key file mbOld mode action = do
                         -- If we do not have a previous result
                         -- or we got ShakeNoCutoff we always return False.
                         _                                     -> False
+                -- Pointer-identity preservation on early-cutoff match.
+                -- When the cutoff fingerprint matches the prior, reuse the
+                -- previously cached value instead of the freshly computed
+                -- (but structurally equivalent) one. This is critical for
+                -- large shared values like ModuleGraph: downstream rules
+                -- that did not invalidate continue to share a pointer with
+                -- the cache, instead of accumulating multiple equivalent
+                -- copies across rebuilds.
+                let res = case (eq, staleV, freshRes) of
+                        (True, Stale _ _ oldV, Succeeded sver _) -> Succeeded sver oldV
+                        _                                        -> freshRes
                 return $ RunResult
                     (if eq then ChangedRecomputeSame else ChangedRecomputeDiff)
                     (encodeShakeValue bs)

From 1db55f257d47e88092a4aa57b1744b9d9ab45cec Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Tue, 26 May 2026 23:08:33 -0600
Subject: [PATCH 03/11] ghcide: add structural cutoff fingerprint to
 GhcSessionDeps
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

GhcSessionDeps previously used defineNoDiagnostics (no early-cutoff
fingerprint), so the prior pointer-identity preservation patch on
defineEarlyCutoff' could not engage for it. As a result, every re-run
of GhcSessionDeps allocated a fresh HscEnvEq even when the resulting
value was structurally equivalent — and downstream rules whose deps
had not changed kept pointing at the prior version, accumulating
generations of equivalent HscEnv/HPT.

Switch GhcSessionDeps to defineEarlyCutoff and derive a cutoff
fingerprint from the inputs that fully determine HscEnvEq identity:

  - the file's own source hash (covers pragma/dflags effects via
    msrHscEnv's initializePlugins step)
  - the file's import-level mod summary fingerprint
  - the trans-deps fingerprint (covers import-graph changes anywhere
    in the file's transitive cone)
  - the iface fingerprint of each direct dep (covers signature
    changes that would actually affect typecheck)

When all four match the prior run, the new defineEarlyCutoff' code
path reuses the cached HscEnvEq pointer — eliminating accumulation of
ModuleGraph-internal data structures (NodeKey_Module, GWIB, Set.Bin
all drop out of the retainer set entirely on MLM-XL) and cutting
overall heap at fixed wall-clock by ~36%.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ghcide/src/Development/IDE/Core/Rules.hs | 38 ++++++++++++++++++++----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/ghcide/src/Development/IDE/Core/Rules.hs b/ghcide/src/Development/IDE/Core/Rules.hs
index 24de344bfa..b04757a965 100644
--- a/ghcide/src/Development/IDE/Core/Rules.hs
+++ b/ghcide/src/Development/IDE/Core/Rules.hs
@@ -763,9 +763,9 @@ loadGhcSession recorder ghcSessionDepsConfig = do
         let cutoffHash = LBS.toStrict $ B.encode (hash (snd val))
         return (Just cutoffHash, val)
 
-    defineNoDiagnostics (cmapWithPrio LogShake recorder) $ \(GhcSessionDeps_ fullModSummary) file -> do
+    defineEarlyCutoff (cmapWithPrio LogShake recorder) $ RuleNoDiagnostics $ \(GhcSessionDeps_ fullModSummary) file -> do
         env <- use_ GhcSession file
-        ghcSessionDepsDefinition fullModSummary ghcSessionDepsConfig env file
+        ghcSessionDepsDefinitionCutoff fullModSummary ghcSessionDepsConfig env file
 
 newtype GhcSessionDepsConfig = GhcSessionDepsConfig
     { fullModuleGraph :: Bool
@@ -785,10 +785,25 @@ ghcSessionDepsDefinition
     :: -- | full mod summary
         Bool ->
         GhcSessionDepsConfig -> HscEnvEq -> NormalizedFilePath -> Action (Maybe HscEnvEq)
-ghcSessionDepsDefinition fullModSummary GhcSessionDepsConfig{..} hscEnvEq file = do
+ghcSessionDepsDefinition fullModSummary cfg hscEnvEq file =
+    snd <$> ghcSessionDepsDefinitionCutoff fullModSummary cfg hscEnvEq file
+
+-- | Like 'ghcSessionDepsDefinition' but also produces a cutoff fingerprint so
+-- the rule can short-circuit when none of its inputs (transitive deps,
+-- own mod summary, dep ifaces) have changed. Combined with the
+-- early-cutoff pointer-identity preservation in 'defineEarlyCutoff'',
+-- this stops generations of equivalent HscEnv\/HPT from accumulating in
+-- the per-file Shake cache.
+ghcSessionDepsDefinitionCutoff
+    :: Bool
+    -> GhcSessionDepsConfig
+    -> HscEnvEq
+    -> NormalizedFilePath
+    -> Action (Maybe BS.ByteString, Maybe HscEnvEq)
+ghcSessionDepsDefinitionCutoff fullModSummary GhcSessionDepsConfig{..} hscEnvEq file = do
     mbdeps <- mapM(fmap artifactFilePath . snd) <$> use_ GetLocatedImports file
     case mbdeps of
-        Nothing -> return Nothing
+        Nothing -> return (Nothing, Nothing)
         Just deps -> do
             when fullModuleGraph $ void $ use_ ReportImportCycles file
             msr <- if fullModSummary
@@ -801,6 +816,18 @@ ghcSessionDepsDefinition fullModSummary GhcSessionDepsConfig{..} hscEnvEq file =
                 env = msrHscEnv msr
             depSessions <- map hscEnv <$> uses_ (GhcSessionDeps_ fullModSummary) deps
             ifaces <- uses_ GetModIface deps
+            transDepsFp <- use_ GetModuleGraphTransDepsFingerprints file
+            -- Cutoff fingerprint: HscEnvEq value is determined by this file's
+            -- own source (which feeds msrHscEnv via pragmas\/dflags), this
+            -- file's import-level mod summary, the trans-deps structure, and
+            -- each dep's iface fingerprint. When all match the prior run, the
+            -- value is structurally equivalent and 'defineEarlyCutoff'' will
+            -- reuse the prior pointer.
+            let !cutoffFp = BS.concat $
+                    fingerprintToBS transDepsFp
+                    : fingerprintToBS (msrFingerprint msr)
+                    : fingerprintToBS (ms_hs_hash ms)
+                    : map hiFileFingerPrint ifaces
             -- Load .hs-boot before .hs: the HPT is keyed by module name, and
             -- GHC's addHomeModInfoToHpt overwrites, so the non-boot must be last.
             let inLoadOrder = sortOn (not . isBootHmi)
@@ -837,7 +864,8 @@ ghcSessionDepsDefinition fullModSummary GhcSessionDepsConfig{..} hscEnvEq file =
             -- session, while `ghcSessionDepsDefinition` will be called for each file we need
             -- to compile. `updateHscEnvEq` will refresh the HscEnv (session') and also
             -- generate a new Unique.
-            Just <$> liftIO (updateHscEnvEq hscEnvEq session')
+            session'' <- liftIO (updateHscEnvEq hscEnvEq session')
+            return (Just cutoffFp, Just session'')
 
 -- | Load a iface from disk, or generate it if there isn't one or it is out of date
 -- This rule also ensures that the `.hie` and `.o` (if needed) files are written out.

From 14d8d158143ad02d5eaad52661316b611f431966 Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Wed, 27 May 2026 14:01:29 -0600
Subject: [PATCH 04/11] Revert "ghcide: add structural cutoff fingerprint to
 GhcSessionDeps"

This reverts commit 1db55f257d47e88092a4aa57b1744b9d9ab45cec.
---
 ghcide/src/Development/IDE/Core/Rules.hs | 38 ++++--------------------
 1 file changed, 5 insertions(+), 33 deletions(-)

diff --git a/ghcide/src/Development/IDE/Core/Rules.hs b/ghcide/src/Development/IDE/Core/Rules.hs
index b04757a965..24de344bfa 100644
--- a/ghcide/src/Development/IDE/Core/Rules.hs
+++ b/ghcide/src/Development/IDE/Core/Rules.hs
@@ -763,9 +763,9 @@ loadGhcSession recorder ghcSessionDepsConfig = do
         let cutoffHash = LBS.toStrict $ B.encode (hash (snd val))
         return (Just cutoffHash, val)
 
-    defineEarlyCutoff (cmapWithPrio LogShake recorder) $ RuleNoDiagnostics $ \(GhcSessionDeps_ fullModSummary) file -> do
+    defineNoDiagnostics (cmapWithPrio LogShake recorder) $ \(GhcSessionDeps_ fullModSummary) file -> do
         env <- use_ GhcSession file
-        ghcSessionDepsDefinitionCutoff fullModSummary ghcSessionDepsConfig env file
+        ghcSessionDepsDefinition fullModSummary ghcSessionDepsConfig env file
 
 newtype GhcSessionDepsConfig = GhcSessionDepsConfig
     { fullModuleGraph :: Bool
@@ -785,25 +785,10 @@ ghcSessionDepsDefinition
     :: -- | full mod summary
         Bool ->
         GhcSessionDepsConfig -> HscEnvEq -> NormalizedFilePath -> Action (Maybe HscEnvEq)
-ghcSessionDepsDefinition fullModSummary cfg hscEnvEq file =
-    snd <$> ghcSessionDepsDefinitionCutoff fullModSummary cfg hscEnvEq file
-
--- | Like 'ghcSessionDepsDefinition' but also produces a cutoff fingerprint so
--- the rule can short-circuit when none of its inputs (transitive deps,
--- own mod summary, dep ifaces) have changed. Combined with the
--- early-cutoff pointer-identity preservation in 'defineEarlyCutoff'',
--- this stops generations of equivalent HscEnv\/HPT from accumulating in
--- the per-file Shake cache.
-ghcSessionDepsDefinitionCutoff
-    :: Bool
-    -> GhcSessionDepsConfig
-    -> HscEnvEq
-    -> NormalizedFilePath
-    -> Action (Maybe BS.ByteString, Maybe HscEnvEq)
-ghcSessionDepsDefinitionCutoff fullModSummary GhcSessionDepsConfig{..} hscEnvEq file = do
+ghcSessionDepsDefinition fullModSummary GhcSessionDepsConfig{..} hscEnvEq file = do
     mbdeps <- mapM(fmap artifactFilePath . snd) <$> use_ GetLocatedImports file
     case mbdeps of
-        Nothing -> return (Nothing, Nothing)
+        Nothing -> return Nothing
         Just deps -> do
             when fullModuleGraph $ void $ use_ ReportImportCycles file
             msr <- if fullModSummary
@@ -816,18 +801,6 @@ ghcSessionDepsDefinitionCutoff fullModSummary GhcSessionDepsConfig{..} hscEnvEq
                 env = msrHscEnv msr
             depSessions <- map hscEnv <$> uses_ (GhcSessionDeps_ fullModSummary) deps
             ifaces <- uses_ GetModIface deps
-            transDepsFp <- use_ GetModuleGraphTransDepsFingerprints file
-            -- Cutoff fingerprint: HscEnvEq value is determined by this file's
-            -- own source (which feeds msrHscEnv via pragmas\/dflags), this
-            -- file's import-level mod summary, the trans-deps structure, and
-            -- each dep's iface fingerprint. When all match the prior run, the
-            -- value is structurally equivalent and 'defineEarlyCutoff'' will
-            -- reuse the prior pointer.
-            let !cutoffFp = BS.concat $
-                    fingerprintToBS transDepsFp
-                    : fingerprintToBS (msrFingerprint msr)
-                    : fingerprintToBS (ms_hs_hash ms)
-                    : map hiFileFingerPrint ifaces
             -- Load .hs-boot before .hs: the HPT is keyed by module name, and
             -- GHC's addHomeModInfoToHpt overwrites, so the non-boot must be last.
             let inLoadOrder = sortOn (not . isBootHmi)
@@ -864,8 +837,7 @@ ghcSessionDepsDefinitionCutoff fullModSummary GhcSessionDepsConfig{..} hscEnvEq
             -- session, while `ghcSessionDepsDefinition` will be called for each file we need
             -- to compile. `updateHscEnvEq` will refresh the HscEnv (session') and also
             -- generate a new Unique.
-            session'' <- liftIO (updateHscEnvEq hscEnvEq session')
-            return (Just cutoffFp, Just session'')
+            Just <$> liftIO (updateHscEnvEq hscEnvEq session')
 
 -- | Load a iface from disk, or generate it if there isn't one or it is out of date
 -- This rule also ensures that the `.hie` and `.o` (if needed) files are written out.

From 2add1d4964164b5d02ca2b0fac28864b1f928460 Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Wed, 27 May 2026 14:01:51 -0600
Subject: [PATCH 05/11] bench: avoid restricted (!!) in RTS stats parser

Hlint flagged ws !! i in secondsAt. Use safe drop/pattern-match
instead.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 shake-bench/src/Development/Benchmark/Rules.hs | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/shake-bench/src/Development/Benchmark/Rules.hs b/shake-bench/src/Development/Benchmark/Rules.hs
index 779306e84e..62779bb212 100644
--- a/shake-bench/src/Development/Benchmark/Rules.hs
+++ b/shake-bench/src/Development/Benchmark/Rules.hs
@@ -466,9 +466,9 @@ parseRTSStats input = RTSStats
     -- Parse the time value (in seconds) at word index `i` (after dropping the label
     -- words). For lines like "  MUT     time    0.012s  (  0.012s elapsed)".
     secondsAt i label = case findLine label of
-        Just l ->
-            let ws = words l
-            in if length ws > i then readDoubleDef (ws !! i) else -1
+        Just l -> case drop i (words l) of
+            (w:_) -> readDoubleDef w
+            _     -> -1
         Nothing -> -1
     -- "Max pause" appears on each Gen line; take the worst.
     parseMaxPause =

From 0d425dacd7a4498c957ca148ed610abae63778ca Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Wed, 27 May 2026 14:22:35 -0600
Subject: [PATCH 06/11] bench: tidy up infrastructure changes

- Restore samples: 50 in config.yaml (was temporarily 10 for iteration)
- Drop config_overnight.yaml (RTS-flag exploration artifact, not used by CI)
- Trim verbose comments in Main.hs smoke target and MultiLayerModules.sh

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 bench/Main.hs               |   7 +--
 bench/MultiLayerModules.sh  |   6 +-
 bench/config.yaml           |   5 +-
 bench/config_overnight.yaml | 107 ------------------------------------
 4 files changed, 4 insertions(+), 121 deletions(-)
 delete mode 100644 bench/config_overnight.yaml

diff --git a/bench/Main.hs b/bench/Main.hs
index 9fb4235a7c..df80af851a 100644
--- a/bench/Main.hs
+++ b/bench/Main.hs
@@ -168,12 +168,7 @@ createBuildSystem config = do
   whenJust (profileInterval configStatic) $ \i -> do
     phonyRules "profiled-" binaryName (CheapHeapProfiling i) build (examples configStatic)
 
-  -- Fast smoke target: one small example, a few cheap experiments, only HEAD.
-  -- Intended for tight local iteration while improving perf/memory.
-  -- Sample count is still controlled by `samples:` in config.yaml — drop it
-  -- to ~10 for the fastest turnaround.
-  -- DummyLevel0M01NoTH is a single empty module so `searchSymbol` returns
-  -- Nothing for `identifierP`; pick experiments that don't require it.
+  -- Fast smoke target for local iteration: smallest example, cheapest experiments, HEAD only.
   let smokeExample = "DummyLevel0M01NoTH"
       smokeExperiments = ["edit", "edit-header", "documentSymbols after edit"]
       smokeConfigs = [ confName | ConfigurationDescriptor{..} <- configurations configStatic ]
diff --git a/bench/MultiLayerModules.sh b/bench/MultiLayerModules.sh
index 146fc9cb1d..6dc4ba1c5a 100755
--- a/bench/MultiLayerModules.sh
+++ b/bench/MultiLayerModules.sh
@@ -2,11 +2,7 @@
 # Generate $DEPTH layers of modules with $WIDTH modules on each layer.
 # Every module on layer N imports all the modules on layer N-1.
 # MultiLayerModules.hs imports all the modules from the last layer.
-#
-# Flags (any order):
-#   --th             enable TemplateHaskell pragma on the top module
-#   --depth N        override DEPTH (default 15)
-#   --width N        override WIDTH (default 40)
+# Flags (any order): --th, --depth N, --width N.
 DEPTH=15
 WIDTH=40
 TH=
diff --git a/bench/config.yaml b/bench/config.yaml
index 6616ad75b1..1e6f811225 100644
--- a/bench/config.yaml
+++ b/bench/config.yaml
@@ -1,7 +1,6 @@
 # The number of samples to run per experiment.
-# At least 100 is recommended in order to observe space leaks.
-# Temporarily lowered to 10 for fast turnaround during initial analysis.
-samples: 10
+# At least 100 is recommended in order to observe space leaks
+samples: 50
 
 buildTool: cabal
 
diff --git a/bench/config_overnight.yaml b/bench/config_overnight.yaml
deleted file mode 100644
index 7933a6be1e..0000000000
--- a/bench/config_overnight.yaml
+++ /dev/null
@@ -1,107 +0,0 @@
-# Overnight Pass 1: GC flag A/B/C across a subset of experiments.
-# Same HLS binary in all cases; only the +RTS args supplied via extra-args differ.
-# Outputs land in bench-results/unprofiled/<example>/HEAD/All/.
-
-samples: 50
-
-buildTool: cabal
-outputFolder: bench-results
-
-# Heap profiling disabled for this pass — we want clean timing.
-# profileInterval: 1
-
-parallelism: 1
-
-examples:
-  # ---------- cabal-3.16.1.0 variants (~235 transitive modules) ----------
-  - name: cabal-default
-    package: Cabal
-    version: 3.16.1.0
-    modules:
-        - src/Distribution/Simple.hs
-        - src/Distribution/Types/ComponentLocalBuildInfo.hs
-    extra-args: []
-
-  - name: cabal-A64m-F1
-    package: Cabal
-    version: 3.16.1.0
-    modules:
-        - src/Distribution/Simple.hs
-        - src/Distribution/Types/ComponentLocalBuildInfo.hs
-    extra-args: ["+RTS", "-A64m", "-F1", "-RTS"]
-
-  - name: cabal-A128m-F11
-    package: Cabal
-    version: 3.16.1.0
-    modules:
-        - src/Distribution/Simple.hs
-        - src/Distribution/Types/ComponentLocalBuildInfo.hs
-    extra-args: ["+RTS", "-A128m", "-F1.1", "-RTS"]
-
-  - name: cabal-chunked
-    package: Cabal
-    version: 3.16.1.0
-    modules:
-        - src/Distribution/Simple.hs
-        - src/Distribution/Types/ComponentLocalBuildInfo.hs
-    extra-args: ["+RTS", "-A32m", "-n4m", "-F1.1", "-RTS"]
-
-  # ---------- MultiLayerModulesNoTH (600 modules) ----------
-  - name: mlm-default
-    path: bench/MultiLayerModules.sh
-    script: True
-    script-args: []
-    modules:
-        - MultiLayerModules.hs
-        - DummyLevel0M01.hs
-        - DummyLevel1M01.hs
-    extra-args: []
-
-  - name: mlm-A64m-F1
-    path: bench/MultiLayerModules.sh
-    script: True
-    script-args: []
-    modules:
-        - MultiLayerModules.hs
-        - DummyLevel0M01.hs
-        - DummyLevel1M01.hs
-    extra-args: ["+RTS", "-A64m", "-F1", "-RTS"]
-
-# Experiment subset — picked to maximise signal in the slow paths.
-experiments:
-    - "edit"
-    - "semanticTokens after typing burst"
-    - "hover after typing burst"
-    - "completions after typing burst"
-    - "memory pressure"
-    - "eval execute multi-line code lens"
-
-versions:
-- HEAD
-
-configurations:
-- All:
-  - alternateNumberFormat
-  - callHierarchy
-  - changeTypeSignature
-  - class
-  - codeRange
-  - eval
-  - explicitFixity
-  - fourmolu
-  - gadt
-  - ghcide-code-actions-bindings
-  - ghcide-code-actions-fill-holes
-  - ghcide-code-actions-imports-exports
-  - ghcide-code-actions-type-signatures
-  - ghcide-completions
-  - ghcide-type-lenses
-  - hlint
-  - importLens
-  - moduleName
-  - ormolu
-  - pragmas
-  - qualifyImportedNames
-  - rename
-  - stylish-haskell
-  - semanticTokens

From 887f445b7628bec77d7599beb2df456d2871ab16 Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Thu, 28 May 2026 08:26:48 -0600
Subject: [PATCH 07/11] ci: re-trigger after env failure

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>

From dc31d1ec1d51d9ba92a8733bb9042ed26cde4ad0 Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Thu, 28 May 2026 08:35:45 -0600
Subject: [PATCH 08/11] ci: drop --unset from ghcup gc to keep active GHC
 installed

After haskell-actions/setup@v2.10.3 finishes, the active GHC is in an
"unset" state. With --unset, `ghcup gc` then deletes the active GHC's
binary, and `cabal build` fails with Cabal-7620 (ghc not found). All
Ubuntu test/flags jobs have been failing this way for new PRs.

Drop the --unset flag. The other gc flags (--share-dir --hls-no-ghc
--cache --tmpdirs) still free the disk space the gc step exists for.

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 .github/actions/setup-build/action.yml | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/.github/actions/setup-build/action.yml b/.github/actions/setup-build/action.yml
index 6612f07ad8..89b057f479 100644
--- a/.github/actions/setup-build/action.yml
+++ b/.github/actions/setup-build/action.yml
@@ -140,5 +140,8 @@ runs:
     # installed besides the ones we explicitly want.
     - name: "Remove ghcup caches"
       if: runner.os == 'Linux'
-      run: ghcup gc --ghc-old --share-dir --hls-no-ghc --cache --tmpdirs --unset
+      # NB: do not pass --unset here. haskell-actions/setup leaves the active
+      # GHC in an "unset" state, and --unset would then delete its binary,
+      # breaking the subsequent `cabal build`.
+      run: ghcup gc --ghc-old --share-dir --hls-no-ghc --cache --tmpdirs
       shell: bash

From 6c6b8892410557344e9120fccc2bb19bf44eb666 Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Thu, 28 May 2026 09:49:10 -0600
Subject: [PATCH 09/11] ghcide: scope cutoff value-reuse to GetModuleGraph

The unconditional pointer-identity preservation in defineEarlyCutoff'
broke rules whose action body has value-dependent side effects. In
particular, GetLinkable registers the freshly built linkable and unloads
the prior; returning the cached value to downstream then left
consumers (e.g. the eval plugin's :info command) pointing at bytecode
the action had just unloaded.

Make value reuse opt-in via a new flag on defineEarlyCutoff', and a
parallel wrapper defineEarlyCutOffNoFileReuseValue. Apply only to
GetModuleGraph, which is pure and accounts for nearly all of the heap
savings (NodeKey_Module / GWIB / Set.Bin retainers).

Co-Authored-By: Claude Opus 4.7 (1M context) <noreply@anthropic.com>
---
 ghcide/src/Development/IDE/Core/Rules.hs |  2 +-
 ghcide/src/Development/IDE/Core/Shake.hs | 57 +++++++++++++++++-------
 2 files changed, 43 insertions(+), 16 deletions(-)

diff --git a/ghcide/src/Development/IDE/Core/Rules.hs b/ghcide/src/Development/IDE/Core/Rules.hs
index 24de344bfa..fc43dcef63 100644
--- a/ghcide/src/Development/IDE/Core/Rules.hs
+++ b/ghcide/src/Development/IDE/Core/Rules.hs
@@ -639,7 +639,7 @@ getFileHashRule recorder =
         return (Just (fingerprintToBS fileHash), ([], Just fileHash))
 
 getModuleGraphRule :: Recorder (WithPriority Log) -> Rules ()
-getModuleGraphRule recorder = defineEarlyCutOffNoFile (cmapWithPrio LogShake recorder) $ \GetModuleGraph -> do
+getModuleGraphRule recorder = defineEarlyCutOffNoFileReuseValue (cmapWithPrio LogShake recorder) $ \GetModuleGraph -> do
   fs <- toKnownFiles <$> useNoFile_ GetKnownTargets
   dependencyInfoForFiles (HashSet.toList fs)
 
diff --git a/ghcide/src/Development/IDE/Core/Shake.hs b/ghcide/src/Development/IDE/Core/Shake.hs
index ab207b0a11..aa58dc649e 100644
--- a/ghcide/src/Development/IDE/Core/Shake.hs
+++ b/ghcide/src/Development/IDE/Core/Shake.hs
@@ -41,7 +41,7 @@ module Development.IDE.Core.Shake(
     RuleBody(..),
     define, defineNoDiagnostics,
     defineEarlyCutoff,
-    defineNoFile, defineEarlyCutOffNoFile,
+    defineNoFile, defineEarlyCutOffNoFile, defineEarlyCutOffNoFileReuseValue,
     getDiagnostics,
     mRunLspT, mRunLspTCallback,
     getHiddenDiagnostics,
@@ -1212,26 +1212,26 @@ defineEarlyCutoff recorder (Rule op) = addRule $ \(Q (key, file)) (old :: Maybe
     let diagnostics ver diags = do
             traceDiagnostics diags
             updateFileDiagnostics recorder file ver (newKey key) extras diags
-    defineEarlyCutoff' diagnostics (==) key file old mode $ const $ op key file
+    defineEarlyCutoff' diagnostics (==) False key file old mode $ const $ op key file
 defineEarlyCutoff recorder (RuleNoDiagnostics op) = addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode -> otTracedAction key file mode traceA $ \traceDiagnostics -> do
     let diagnostics _ver diags = do
             traceDiagnostics diags
             mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleNoDiagHasDiag) diags
-    defineEarlyCutoff' diagnostics (==) key file old mode $ const $ second (mempty,) <$> op key file
+    defineEarlyCutoff' diagnostics (==) False key file old mode $ const $ second (mempty,) <$> op key file
 defineEarlyCutoff recorder RuleWithCustomNewnessCheck{..} =
     addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode ->
         otTracedAction key file mode traceA $ \ traceDiagnostics -> do
             let diagnostics _ver diags = do
                     traceDiagnostics diags
                     mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleCustomNewnessHasDiag) diags
-            defineEarlyCutoff' diagnostics newnessCheck key file old mode $
+            defineEarlyCutoff' diagnostics newnessCheck False key file old mode $
                 const $ second (mempty,) <$> build key file
 defineEarlyCutoff recorder (RuleWithOldValue op) = addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode -> otTracedAction key file mode traceA $ \traceDiagnostics -> do
     extras <- getShakeExtras
     let diagnostics ver diags = do
             traceDiagnostics diags
             updateFileDiagnostics recorder file ver (newKey key) extras diags
-    defineEarlyCutoff' diagnostics (==) key file old mode $ op key file
+    defineEarlyCutoff' diagnostics (==) False key file old mode $ op key file
 
 defineNoFile :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action v) -> Rules ()
 defineNoFile recorder f = defineNoDiagnostics recorder $ \k file -> do
@@ -1243,18 +1243,40 @@ defineEarlyCutOffNoFile recorder f = defineEarlyCutoff recorder $ RuleNoDiagnost
     if file == emptyFilePath then do (hashString, res) <- f k; return (Just hashString, Just res) else
         fail $ "Rule " ++ show k ++ " should always be called with the empty string for a file"
 
+-- | Like 'defineEarlyCutOffNoFile', but reuses the previously cached value
+-- when the early-cutoff fingerprint matches the prior run. Preserves pointer
+-- identity across rebuilds — important for large shared values
+-- (e.g. ModuleGraph) so downstream rules whose deps did not change continue
+-- to share a pointer with the cache, rather than accumulating multiple
+-- structurally equivalent copies.
+--
+-- Only use for rules whose action has no side effects that depend on the
+-- freshly computed value (no global registration, no unloading the prior).
+defineEarlyCutOffNoFileReuseValue :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action (BS.ByteString, v)) -> Rules ()
+defineEarlyCutOffNoFileReuseValue recorder f =
+  addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode ->
+    otTracedAction key file mode traceA $ \traceDiagnostics -> do
+      let diagnostics _ver diags = do
+              traceDiagnostics diags
+              mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleNoDiagHasDiag) diags
+      defineEarlyCutoff' diagnostics (==) True key file old mode $ const $ do
+        if file == emptyFilePath
+          then do (hashString, res) <- f key; return (Just hashString, (mempty, Just res))
+          else fail $ "Rule " ++ show key ++ " should always be called with the empty string for a file"
+
 defineEarlyCutoff'
     :: forall k v. IdeRule k v
     => (Maybe Int32 -> [FileDiagnostic] -> Action ()) -- ^ update diagnostics
     -- | compare current and previous for freshness
     -> (BS.ByteString -> BS.ByteString -> Bool)
+    -> Bool -- ^ reuse cached value on cutoff match (see Note below)
     -> k
     -> NormalizedFilePath
     -> Maybe BS.ByteString
     -> RunMode
     -> (Value v -> Action (Maybe BS.ByteString, IdeResult v))
     -> Action (RunResult (A (RuleResult k)))
-defineEarlyCutoff' doDiagnostics cmp key file mbOld mode action = do
+defineEarlyCutoff' doDiagnostics cmp reuseValueOnMatch key file mbOld mode action = do
     ShakeExtras{state, progress, dirtyKeys} <- getShakeExtras
     options <- getIdeOptions
     let trans g x =  withRunInIO $ \run -> g (run x)
@@ -1299,17 +1321,22 @@ defineEarlyCutoff' doDiagnostics cmp key file mbOld mode action = do
                         -- If we do not have a previous result
                         -- or we got ShakeNoCutoff we always return False.
                         _                                     -> False
-                -- Pointer-identity preservation on early-cutoff match.
+                -- Pointer-identity preservation on early-cutoff match (opt-in).
                 -- When the cutoff fingerprint matches the prior, reuse the
                 -- previously cached value instead of the freshly computed
-                -- (but structurally equivalent) one. This is critical for
-                -- large shared values like ModuleGraph: downstream rules
-                -- that did not invalidate continue to share a pointer with
-                -- the cache, instead of accumulating multiple equivalent
-                -- copies across rebuilds.
-                let res = case (eq, staleV, freshRes) of
-                        (True, Stale _ _ oldV, Succeeded sver _) -> Succeeded sver oldV
-                        _                                        -> freshRes
+                -- (but structurally equivalent) one. This avoids accumulating
+                -- multiple equivalent copies of large shared values like
+                -- ModuleGraph across rebuilds.
+                --
+                -- Only safe for rules whose action body has no
+                -- value-dependent side effects (e.g. registering the fresh
+                -- result in a global table and unloading the stale one).
+                -- For such rules, returning the cached value would leave
+                -- downstream pointing at a representation that the action
+                -- just invalidated. Opt in by passing reuseValueOnMatch=True.
+                let res = case (reuseValueOnMatch, eq, staleV, freshRes) of
+                        (True, True, Stale _ _ oldV, Succeeded sver _) -> Succeeded sver oldV
+                        _                                              -> freshRes
                 return $ RunResult
                     (if eq then ChangedRecomputeSame else ChangedRecomputeDiff)
                     (encodeShakeValue bs)

From 86f71c932231868b0c8363de564b64eae4ec8697 Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Thu, 28 May 2026 13:55:51 -0600
Subject: [PATCH 10/11] review fixups: cutoff reuse hardening + bench tooling
 fixes

Code-review follow-ups on the GetModuleGraph value-reuse change and the
benchmark tooling it ships with.

ghcide:
- defineEarlyCutoff' only reuses a clean prior value (Stale Nothing); a
  deletion-marked Stale (Just _) is no longer resurrected as Succeeded.
- Replace the hand-inlined addRule copy in defineEarlyCutOffNoFileReuseValue
  with a RuleNoDiagnosticsReuseValue constructor + shared addNoDiagnosticsRule
  helper, and factor the empty-path guard into noFileBody.

bench/shake-bench:
- run_pass2_costcentre.sh: drop --lsp-config (it made ghcide-bench read empty
  stdin from /dev/null and error out, masked by `|| echo`).
- MultiLayerModules.sh: require a value for --depth/--width (${2:?...}) instead
  of looping forever when it is missing.
- summarizeHpProfile: foldl' + forced max (no thunk buildup); skip the .hp.csv
  summary under NoProfiling instead of rewriting the dummy each build.
- parseRTSStats: whitespace-normalise label matching (robust to GHC -s spacing).
- Collapse showMB/showSeconds/showDouble/showInt to a single max-0 clamp.
- quantileSorted: sort each sample list once per row instead of 6x.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 bench/MultiLayerModules.sh                    |  4 +-
 bench/run_pass2_costcentre.sh                 |  1 -
 ghcide-bench/src/Experiments.hs               | 31 +++++-----
 ghcide/src/Development/IDE/Core/Shake.hs      | 62 +++++++++++++------
 .../src/Development/Benchmark/Rules.hs        | 45 +++++++-------
 5 files changed, 85 insertions(+), 58 deletions(-)

diff --git a/bench/MultiLayerModules.sh b/bench/MultiLayerModules.sh
index 6dc4ba1c5a..55407f600e 100755
--- a/bench/MultiLayerModules.sh
+++ b/bench/MultiLayerModules.sh
@@ -13,11 +13,11 @@ while [[ $# -gt 0 ]]; do
       shift
       ;;
     --depth)
-      DEPTH=$2
+      DEPTH=${2:?"--depth requires a value"}
       shift 2
       ;;
     --width)
-      WIDTH=$2
+      WIDTH=${2:?"--width requires a value"}
       shift 2
       ;;
     *)
diff --git a/bench/run_pass2_costcentre.sh b/bench/run_pass2_costcentre.sh
index ff2cdf0750..6357065426 100755
--- a/bench/run_pass2_costcentre.sh
+++ b/bench/run_pass2_costcentre.sh
@@ -44,7 +44,6 @@ run_experiment() {
         --csv="$outdir/$exp_slug.csv" \
         --ghcide="$(pwd)/$HLS" \
         --select "$experiment" \
-        --lsp-config \
         "${example_args[@]}" \
         --ghcide-options=+RTS \
         --ghcide-options=-p \
diff --git a/ghcide-bench/src/Experiments.hs b/ghcide-bench/src/Experiments.hs
index 4e673e6993..6516eab37e 100644
--- a/ghcide-bench/src/Experiments.hs
+++ b/ghcide-bench/src/Experiments.hs
@@ -599,17 +599,19 @@ runBenchmarksFun dir allBenchmarks = do
             show rulesTotal,
             show edgesTotal,
             show rebuildsTotal,
-            showMs (quantile 0.50 userWaitsSamples),
-            showMs (quantile 0.95 userWaitsSamples),
-            showMs (quantile 0.99 userWaitsSamples),
+            showMs (quantileSorted 0.50 sortedUserWaits),
+            showMs (quantileSorted 0.95 sortedUserWaits),
+            showMs (quantileSorted 0.99 sortedUserWaits),
             showMs (stdDev   userWaitsSamples),
-            showMs (quantile 0.50 delayedWorkSamples),
-            showMs (quantile 0.95 delayedWorkSamples),
-            showMs (quantile 0.99 delayedWorkSamples)
+            showMs (quantileSorted 0.50 sortedDelayedWork),
+            showMs (quantileSorted 0.95 sortedDelayedWork),
+            showMs (quantileSorted 0.99 sortedDelayedWork)
           ]
           | (Bench {name, samples}, BenchRun {..}) <- results,
             let runSetup' = if runSetup < 0.01 then 0 else runSetup
                 modules = fromIntegral $ length $ exampleModules $ example ?config
+                sortedUserWaits   = sort userWaitsSamples
+                sortedDelayedWork = sort delayedWorkSamples
         ]
       csv = unlines $ map (intercalate ", ") (headers : rows)
   writeFile (outputCSV ?config) csv
@@ -696,14 +698,15 @@ data BenchRun = BenchRun
 badRun :: BenchRun
 badRun = BenchRun 0 0 0 0 0 [] [] 0 0 0 0 0 0 0 0 False
 
--- | Approximate quantile of a list of samples. Uses the nearest-rank
--- method; good enough for benchmark reporting at n>=20.
-quantile :: Double -> [Seconds] -> Seconds
-quantile _ [] = 0
-quantile q xs =
-    let sorted = sort xs
-        n      = length sorted
-        idx    = max 0 $ min (n - 1) $ floor (q * fromIntegral (n - 1))
+-- | Approximate quantile (nearest-rank) of an /already-sorted/ list of
+-- samples. Good enough for benchmark reporting at n>=20. Sorting is the
+-- caller's responsibility so that several quantiles over the same samples
+-- can share a single sort.
+quantileSorted :: Double -> [Seconds] -> Seconds
+quantileSorted _ [] = 0
+quantileSorted q sorted =
+    let n   = length sorted
+        idx = min (n - 1) $ floor (q * fromIntegral (n - 1))
     in sorted !! idx
 
 -- | Population standard deviation.
diff --git a/ghcide/src/Development/IDE/Core/Shake.hs b/ghcide/src/Development/IDE/Core/Shake.hs
index aa58dc649e..a3ad6ab895 100644
--- a/ghcide/src/Development/IDE/Core/Shake.hs
+++ b/ghcide/src/Development/IDE/Core/Shake.hs
@@ -1195,6 +1195,9 @@ useWithoutDependency key file =
 data RuleBody k v
   = Rule (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, IdeResult v))
   | RuleNoDiagnostics (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v))
+  -- | Like 'RuleNoDiagnostics', but reuses the previously cached value on an
+  -- early-cutoff match (see 'defineEarlyCutOffNoFileReuseValue').
+  | RuleNoDiagnosticsReuseValue (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v))
   | RuleWithCustomNewnessCheck
     { newnessCheck :: BS.ByteString -> BS.ByteString -> Bool
     , build :: k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v)
@@ -1213,11 +1216,10 @@ defineEarlyCutoff recorder (Rule op) = addRule $ \(Q (key, file)) (old :: Maybe
             traceDiagnostics diags
             updateFileDiagnostics recorder file ver (newKey key) extras diags
     defineEarlyCutoff' diagnostics (==) False key file old mode $ const $ op key file
-defineEarlyCutoff recorder (RuleNoDiagnostics op) = addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode -> otTracedAction key file mode traceA $ \traceDiagnostics -> do
-    let diagnostics _ver diags = do
-            traceDiagnostics diags
-            mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleNoDiagHasDiag) diags
-    defineEarlyCutoff' diagnostics (==) False key file old mode $ const $ second (mempty,) <$> op key file
+defineEarlyCutoff recorder (RuleNoDiagnostics op) =
+    addNoDiagnosticsRule recorder False op
+defineEarlyCutoff recorder (RuleNoDiagnosticsReuseValue op) =
+    addNoDiagnosticsRule recorder True op
 defineEarlyCutoff recorder RuleWithCustomNewnessCheck{..} =
     addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode ->
         otTracedAction key file mode traceA $ \ traceDiagnostics -> do
@@ -1233,15 +1235,40 @@ defineEarlyCutoff recorder (RuleWithOldValue op) = addRule $ \(Q (key, file)) (o
             updateFileDiagnostics recorder file ver (newKey key) extras diags
     defineEarlyCutoff' diagnostics (==) False key file old mode $ op key file
 
+-- | Shared implementation for the no-diagnostics rule variants. The 'Bool'
+-- selects @defineEarlyCutoff'@'s value-reuse behaviour on an early-cutoff match.
+addNoDiagnosticsRule
+    :: forall k v. IdeRule k v
+    => Recorder (WithPriority Log)
+    -> Bool
+    -> (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v))
+    -> Rules ()
+addNoDiagnosticsRule recorder reuseValueOnMatch op =
+    addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode ->
+        otTracedAction key file mode traceA $ \traceDiagnostics -> do
+            let diagnostics _ver diags = do
+                    traceDiagnostics diags
+                    mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleNoDiagHasDiag) diags
+            defineEarlyCutoff' diagnostics (==) reuseValueOnMatch key file old mode $
+                const $ second (mempty,) <$> op key file
+
 defineNoFile :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action v) -> Rules ()
 defineNoFile recorder f = defineNoDiagnostics recorder $ \k file -> do
     if file == emptyFilePath then do res <- f k; return (Just res) else
         fail $ "Rule " ++ show k ++ " should always be called with the empty string for a file"
 
+-- | Body shared by the no-file early-cutoff rule variants: assert the empty
+-- file path and wrap the (fingerprint, value) result.
+noFileBody
+    :: Show k
+    => (k -> Action (BS.ByteString, v))
+    -> k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v)
+noFileBody f k file
+    | file == emptyFilePath = do (hashString, res) <- f k; return (Just hashString, Just res)
+    | otherwise = fail $ "Rule " ++ show k ++ " should always be called with the empty string for a file"
+
 defineEarlyCutOffNoFile :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action (BS.ByteString, v)) -> Rules ()
-defineEarlyCutOffNoFile recorder f = defineEarlyCutoff recorder $ RuleNoDiagnostics $ \k file -> do
-    if file == emptyFilePath then do (hashString, res) <- f k; return (Just hashString, Just res) else
-        fail $ "Rule " ++ show k ++ " should always be called with the empty string for a file"
+defineEarlyCutOffNoFile recorder f = defineEarlyCutoff recorder $ RuleNoDiagnostics $ noFileBody f
 
 -- | Like 'defineEarlyCutOffNoFile', but reuses the previously cached value
 -- when the early-cutoff fingerprint matches the prior run. Preserves pointer
@@ -1253,16 +1280,7 @@ defineEarlyCutOffNoFile recorder f = defineEarlyCutoff recorder $ RuleNoDiagnost
 -- Only use for rules whose action has no side effects that depend on the
 -- freshly computed value (no global registration, no unloading the prior).
 defineEarlyCutOffNoFileReuseValue :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action (BS.ByteString, v)) -> Rules ()
-defineEarlyCutOffNoFileReuseValue recorder f =
-  addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode ->
-    otTracedAction key file mode traceA $ \traceDiagnostics -> do
-      let diagnostics _ver diags = do
-              traceDiagnostics diags
-              mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleNoDiagHasDiag) diags
-      defineEarlyCutoff' diagnostics (==) True key file old mode $ const $ do
-        if file == emptyFilePath
-          then do (hashString, res) <- f key; return (Just hashString, (mempty, Just res))
-          else fail $ "Rule " ++ show key ++ " should always be called with the empty string for a file"
+defineEarlyCutOffNoFileReuseValue recorder f = defineEarlyCutoff recorder $ RuleNoDiagnosticsReuseValue $ noFileBody f
 
 defineEarlyCutoff'
     :: forall k v. IdeRule k v
@@ -1334,9 +1352,13 @@ defineEarlyCutoff' doDiagnostics cmp reuseValueOnMatch key file mbOld mode actio
                 -- For such rules, returning the cached value would leave
                 -- downstream pointing at a representation that the action
                 -- just invalidated. Opt in by passing reuseValueOnMatch=True.
+                -- Reuse only a clean prior value (Stale Nothing): a
+                -- deletion-marked Stale (Just _) may have been
+                -- garbage-collected/invalidated, so fall through to the fresh
+                -- result rather than resurrecting it.
                 let res = case (reuseValueOnMatch, eq, staleV, freshRes) of
-                        (True, True, Stale _ _ oldV, Succeeded sver _) -> Succeeded sver oldV
-                        _                                              -> freshRes
+                        (True, True, Stale Nothing _ oldV, Succeeded sver _) -> Succeeded sver oldV
+                        _                                                    -> freshRes
                 return $ RunResult
                     (if eq then ChangedRecomputeSame else ChangedRecomputeDiff)
                     (encodeShakeValue bs)
diff --git a/shake-bench/src/Development/Benchmark/Rules.hs b/shake-bench/src/Development/Benchmark/Rules.hs
index 62779bb212..e7daa8a554 100644
--- a/shake-bench/src/Development/Benchmark/Rules.hs
+++ b/shake-bench/src/Development/Benchmark/Rules.hs
@@ -71,9 +71,9 @@ import           Data.Aeson                (FromJSON (..), ToJSON (..),
 import           Data.Aeson.Lens           (AsJSON (_JSON), _Object, _String)
 import           Data.ByteString.Lazy      (ByteString)
 import           Data.Char                 (isDigit)
-import           Data.List                 (find, intercalate, isInfixOf,
-                                            isPrefixOf, isSuffixOf, sortBy,
-                                            stripPrefix, transpose)
+import           Data.List                 (find, foldl', intercalate,
+                                            isInfixOf, isPrefixOf, isSuffixOf,
+                                            sortBy, stripPrefix, transpose)
 import           Data.List.Extra           (lower, splitOn)
 import qualified Data.Map.Strict           as Map
 import           Data.Maybe                (fromMaybe)
@@ -328,13 +328,12 @@ benchRules build MkBenchRules{..} = do
               BenchProject {..}
         liftIO $ case prof of
             NoProfiling -> writeFile outHp dummyHp
-            _           -> return ()
-
-        -- Write a co-located top-N heap summary derived from the .hp file.
-        -- Untracked side-effect; regenerated whenever the .hp is.
-        liftIO $ do
-            hp <- readFile outHp
-            writeFile (outHp <.> "csv") (summarizeHpProfile 20 hp)
+            _ -> do
+                -- Write a co-located top-N heap summary derived from the .hp
+                -- file. Untracked side-effect; regenerated whenever the .hp
+                -- is. Skipped under NoProfiling, where outHp is only a dummy.
+                hp <- readFile outHp
+                writeFile (outHp <.> "csv") (summarizeHpProfile 20 hp)
 
         -- extend csv output with allocation data + GC/pause/productivity
         csvContents <- liftIO $ lines <$> readFile outcsv
@@ -369,18 +368,16 @@ benchRules build MkBenchRules{..} = do
         let csvContents' = header' : results'
         writeFileLines outcsv csvContents'
     where
+        -- A negative value is parseRTSStats's "field missing" sentinel; clamp
+        -- to zero so the column stays numeric for downstream CSV parsing.
         showMB :: Int -> String
-        showMB x | x < 0 = "0MB"
-                 | otherwise = show (x `div` 2^(20::Int)) <> "MB"
+        showMB x = show (max 0 x `div` 2^(20::Int)) <> "MB"
         showSeconds :: Double -> String
-        showSeconds x | x < 0 = "0"
-                      | otherwise = printf "%.4f" x
+        showSeconds = printf "%.4f" . max 0
         showDouble :: Double -> String
-        showDouble x | x < 0 = "0"
-                     | otherwise = printf "%.2f" x
+        showDouble = printf "%.2f" . max 0
         showInt :: Int -> String
-        showInt x | x < 0 = "0"
-                  | otherwise = show x
+        showInt = show . max (0 :: Int)
 
 -- | Summarize a GHC heap profile (.hp) file into a top-N cost-centre CSV.
 -- For each cost-centre name we report the maximum retained bytes across
@@ -391,9 +388,11 @@ summarizeHpProfile topN raw =
     let ls = lines raw
         samples = collectSamples ls
         accumOne acc (cc, bs) = Map.insertWith combine cc (bs, bs) acc
-          where combine (newMax, newLast) (oldMax, _) = (max newMax oldMax, newLast)
+          where combine (newMax, newLast) (oldMax, _) =
+                    let m = max newMax oldMax in m `seq` (m, newLast)
         -- Aggregate across all samples: track running max and last value per CC.
-        agg = foldl (\acc s -> foldl accumOne acc s) Map.empty samples
+        -- Strict fold + forced max to avoid a thunk chain over the samples.
+        agg = foldl' (\acc s -> foldl' accumOne acc s) Map.empty samples
         sorted = sortBy (comparing (Down . maxOf)) (Map.toList agg)
         maxOf (_, (m, _)) = m
         header = "costCentre,maxRetainedBytes,finalRetainedBytes"
@@ -456,7 +455,11 @@ parseRTSStats input = RTSStats
     }
   where
     ls = lines input
-    findLine label = find (label `isInfixOf`) ls
+    -- Match summary labels tolerant of GHC's inter-column spacing, which
+    -- varies across versions (e.g. "MUT     time" vs "MUT  time"); both the
+    -- label and each line are whitespace-normalised before comparison.
+    findLine label = let needle = unwords (words label)
+                     in find ((needle `isInfixOf`) . unwords . words) ls
     digitsFirstWord label = case findLine label of
         Just l  -> readIntDef $ filter isDigit $ head (words l ++ [""])
         Nothing -> -1

From 756d9654dbfc7e68fbf4700b59e7decf5618e63e Mon Sep 17 00:00:00 2001
From: parsonsmatt <mattp@mercury.com>
Date: Thu, 28 May 2026 17:50:25 -0600
Subject: [PATCH 11/11] ghcide: correct comment on Stale Nothing reuse guard
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The comment justified skipping `Stale (Just _)` values by calling them
deletion/GC markers. That is inaccurate: `Stale`'s first field is a
`Maybe PositionDelta`, written only by the persistent-rule fallback in
lastValueIO (a value loaded from disk, not produced by the rule's action
this run). State the real reason — such a value is not a trustworthy
prior to reuse — so future readers don't misread the Value lifecycle.

Comment-only; no behavior change.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 ghcide/src/Development/IDE/Core/Shake.hs | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ghcide/src/Development/IDE/Core/Shake.hs b/ghcide/src/Development/IDE/Core/Shake.hs
index a3ad6ab895..77401cc0aa 100644
--- a/ghcide/src/Development/IDE/Core/Shake.hs
+++ b/ghcide/src/Development/IDE/Core/Shake.hs
@@ -1352,10 +1352,14 @@ defineEarlyCutoff' doDiagnostics cmp reuseValueOnMatch key file mbOld mode actio
                 -- For such rules, returning the cached value would leave
                 -- downstream pointing at a representation that the action
                 -- just invalidated. Opt in by passing reuseValueOnMatch=True.
-                -- Reuse only a clean prior value (Stale Nothing): a
-                -- deletion-marked Stale (Just _) may have been
-                -- garbage-collected/invalidated, so fall through to the fresh
-                -- result rather than resurrecting it.
+                -- Reuse only a clean prior value (Stale Nothing _): that is
+                -- the shape a value successfully computed by this rule takes
+                -- (see the Just Succeeded branch above). A Stale (Just _)
+                -- instead carries a PositionDelta and is only ever written by
+                -- the persistent-rule fallback (lastValueIO), i.e. a value
+                -- loaded from disk rather than produced by this rule's action
+                -- this run, so it is not a trustworthy prior to reuse; fall
+                -- through to the fresh result in that case.
                 let res = case (reuseValueOnMatch, eq, staleV, freshRes) of
                         (True, True, Stale Nothing _ oldV, Succeeded sver _) -> Succeeded sver oldV
                         _                                                    -> freshRes