diff --git a/.github/actions/setup-build/action.yml b/.github/actions/setup-build/action.yml
index 6612f07ad8..89b057f479 100644
--- a/.github/actions/setup-build/action.yml
+++ b/.github/actions/setup-build/action.yml
@@ -140,5 +140,8 @@ runs:
     # installed besides the ones we explicitly want.
     - name: "Remove ghcup caches"
       if: runner.os == 'Linux'
-      run: ghcup gc --ghc-old --share-dir --hls-no-ghc --cache --tmpdirs --unset
+      # NB: do not pass --unset here. haskell-actions/setup leaves the active
+      # GHC in an "unset" state, and --unset would then delete its binary,
+      # breaking the subsequent `cabal build`.
+      run: ghcup gc --ghc-old --share-dir --hls-no-ghc --cache --tmpdirs
       shell: bash
diff --git a/bench/Main.hs b/bench/Main.hs
index d1ac603b18..df80af851a 100644
--- a/bench/Main.hs
+++ b/bench/Main.hs
@@ -64,7 +64,7 @@ import           Development.Shake           (Action,
                                               actionBracket, addOracle,
                                               askOracle, command, command_,
                                               getDirectoryFiles, liftIO, need,
-                                              newCache, shakeArgsWith,
+                                              newCache, phony, shakeArgsWith,
                                               shakeOptions, versioned, want)
 import           Development.Shake.Classes
 import           Experiments.Types           (Example (exampleName),
@@ -168,6 +168,17 @@ createBuildSystem config = do
   whenJust (profileInterval configStatic) $ \i -> do
     phonyRules "profiled-" binaryName (CheapHeapProfiling i) build (examples configStatic)
 
+  -- Fast smoke target for local iteration: smallest example, cheapest experiments, HEAD only.
+  let smokeExample = "DummyLevel0M01NoTH"
+      smokeExperiments = ["edit", "edit-header", "documentSymbols after edit"]
+      smokeConfigs = [ confName | ConfigurationDescriptor{..} <- configurations configStatic ]
+  phony "smoke" $ need
+    [ build </> "unprofiled" </> smokeExample </> "HEAD" </> conf
+            </> escaped (escapeExperiment (Unescaped experiment)) <.> "csv"
+    | conf <- smokeConfigs
+    , experiment <- smokeExperiments
+    ]
+
   return configStatic
 
 disableAllPluginsBut :: (PluginId -> Bool) -> Plugin.Config
diff --git a/bench/MultiLayerModules.sh b/bench/MultiLayerModules.sh
index 38d85ce9ed..55407f600e 100755
--- a/bench/MultiLayerModules.sh
+++ b/bench/MultiLayerModules.sh
@@ -1,9 +1,30 @@
 #!/usr/bin/env bash
-# Generate $DEPTH layers of modules with $WIDTH modules on each layer
-# Every module on layer N imports all the modules on layer N-1
-# MultiLayerModules.hs imports all the modules from the last layer
+# Generate $DEPTH layers of modules with $WIDTH modules on each layer.
+# Every module on layer N imports all the modules on layer N-1.
+# MultiLayerModules.hs imports all the modules from the last layer.
+# Flags (any order): --th, --depth N, --width N.
 DEPTH=15
 WIDTH=40
+TH=
+while [[ $# -gt 0 ]]; do
+  case "$1" in
+    --th)
+      TH=1
+      shift
+      ;;
+    --depth)
+      DEPTH=${2:?"--depth requires a value"}
+      shift 2
+      ;;
+    --width)
+      WIDTH=${2:?"--width requires a value"}
+      shift 2
+      ;;
+    *)
+      shift
+      ;;
+  esac
+done
 cat >hie.yaml << EOF
 cradle:
   direct:
@@ -22,13 +43,11 @@ for l in $(seq 1 $DEPTH); do
     done
   done
 done
-case "$1" in
-  '--th')
-    echo "{-# LANGUAGE TemplateHaskell #-}" > MultiLayerModules.hs
-    ;;
-esac
+if [[ -n "$TH" ]]; then
+  echo "{-# LANGUAGE TemplateHaskell #-}" > MultiLayerModules.hs
+fi
 echo "module MultiLayerModules where" >> MultiLayerModules.hs
-  echo "          - MultiLayerModules.hs" >> hie.yaml;
+echo "          - MultiLayerModules.hs" >> hie.yaml;
 for j in $(seq -w 1 $WIDTH); do
   echo "import DummyLevel${DEPTH}M$j" >> MultiLayerModules.hs;
 done
diff --git a/bench/config.yaml b/bench/config.yaml
index b1218976ce..1e6f811225 100644
--- a/bench/config.yaml
+++ b/bench/config.yaml
@@ -77,6 +77,23 @@ examples:
     modules:
         - DummyLevel1M01.hs
 
+  # Large synthetic project: 200 layers × 50 modules/layer = 10000 modules,
+  # ~500k import edges (each module imports every module of the prior layer).
+  # This is closer in scale to a real production Haskell codebase than the
+  # default MultiLayerModules. Expect this to be slow: initial cradle load
+  # alone takes many minutes, and each experiment runs `samples:` rounds.
+  # Recommended: drop `samples:` to ~10 and run via single-experiment targets
+  # (e.g. `bench-results/unprofiled/MultiLayerModulesXL/HEAD/All/edit.csv`)
+  # rather than the full matrix.
+  - name: MultiLayerModulesXL
+    path: bench/MultiLayerModules.sh
+    script: True
+    script-args: ["--depth", "200", "--width", "50"]
+    modules:
+        - MultiLayerModules.hs
+        - DummyLevel0M01.hs
+        - DummyLevel100M01.hs
+
   # Small but heavily multi-component example
   # Disabled as it is far to slow. hie-bios >0.7.2 should help
   # - name: HLS
@@ -114,6 +131,7 @@ experiments:
     - "hole fit suggestions"
     - "eval execute single-line code lens"
     - "eval execute multi-line code lens"
+    - "memory pressure"
 
 # An ordered list of versions to analyze
 versions:
diff --git a/bench/run_pass2_costcentre.sh b/bench/run_pass2_costcentre.sh
new file mode 100755
index 0000000000..6357065426
--- /dev/null
+++ b/bench/run_pass2_costcentre.sh
@@ -0,0 +1,87 @@
+#!/usr/bin/env bash
+# Pass 2: cost-centre profiled benchmark.
+#
+# Requires bench-results/binaries/HEAD-prof/haskell-language-server to exist
+# (built with --enable-profiling --profiling-detail=late).
+#
+# For each (example, experiment) pair, invoke ghcide-bench directly with
+# +RTS -p -hc -i1 -po<stem> -RTS so we get:
+#   <stem>.prof       — time + allocation by cost-centre
+#   <stem>.hp         — heap residency by cost-centre over time
+#   <stem>.hp.csv     — top-N cost-centres (produced by parseHpProfile, but
+#                       NOTE: only the bench harness invokes that. For Pass 2
+#                       we'd need to run summarizeHpProfile separately. See
+#                       end of script.)
+set -euo pipefail
+
+HLS=bench-results/binaries/HEAD-prof/haskell-language-server
+OUTDIR=bench-results/prof-cc
+SAMPLES=20
+
+if [[ ! -x "$HLS" ]]; then
+    echo "ERROR: profiled HLS not found at $HLS" >&2
+    exit 1
+fi
+
+run_experiment() {
+    local example_name=$1
+    local experiment=$2
+    shift 2
+    local example_args=("$@")
+
+    local exp_slug=${experiment// /_}
+    local outdir="$OUTDIR/$example_name/$exp_slug"
+    mkdir -p "$outdir"
+    local stem="$(pwd)/$outdir/$exp_slug"
+
+    echo
+    echo "=== $example_name :: $experiment ==="
+
+    cabal exec -- ghcide-bench \
+        --timeout=600 \
+        --no-clean \
+        --samples="$SAMPLES" \
+        --csv="$outdir/$exp_slug.csv" \
+        --ghcide="$(pwd)/$HLS" \
+        --select "$experiment" \
+        "${example_args[@]}" \
+        --ghcide-options=+RTS \
+        --ghcide-options=-p \
+        --ghcide-options=-hc \
+        --ghcide-options=-i1 \
+        --ghcide-options=-po"$stem" \
+        --ghcide-options=-l \
+        --ghcide-options=-ol"$stem.eventlog" \
+        --ghcide-options=-S"$stem.gcStats.log" \
+        --ghcide-options=-RTS \
+        < /dev/null || echo "  FAILED — continuing"
+}
+
+# ---------- Example: cabal-3.16.1.0 ----------
+CABAL_ARGS=(
+    --example-package-name Cabal
+    --example-package-version 3.16.1.0
+    --example-name cabal
+    --example-module=src/Distribution/Simple.hs
+    --example-module=src/Distribution/Types/ComponentLocalBuildInfo.hs
+)
+for exp in "memory pressure" "completions after typing burst" "hover after typing burst" "edit" "semanticTokens after typing burst"; do
+    run_experiment cabal "$exp" "${CABAL_ARGS[@]}"
+done
+
+# ---------- Example: MultiLayerModulesNoTH ----------
+MLM_ARGS=(
+    --example-script "$(pwd)/bench/MultiLayerModules.sh"
+    --example-name MultiLayerModulesNoTH
+    --example-module=MultiLayerModules.hs
+    --example-module=DummyLevel0M01.hs
+    --example-module=DummyLevel1M01.hs
+)
+for exp in "memory pressure" "edit" "hover after typing burst"; do
+    run_experiment MultiLayerModulesNoTH "$exp" "${MLM_ARGS[@]}"
+done
+
+echo
+echo "=== Pass 2 complete ==="
+echo "Profile artefacts under $OUTDIR/"
+echo "Inspect .prof files for time+alloc attribution: head -50 $OUTDIR/cabal/memory_pressure/memory_pressure.prof"
diff --git a/ghcide-bench/src/Experiments.hs b/ghcide-bench/src/Experiments.hs
index 8e70483559..6516eab37e 100644
--- a/ghcide-bench/src/Experiments.hs
+++ b/ghcide-bench/src/Experiments.hs
@@ -352,6 +352,45 @@ experiments =
               )
         ),
       ---------------------------------------------------------------------------------------
+      ---------------------------------------------------------------------------------------
+      -- Memory-pressure experiment: hammers the server with a variety of
+      -- requests across every open document, interleaved with edits, to force
+      -- the IDE to retain caches/results across many rules. Each sample is one
+      -- "round" so the resulting peak/post-GC heap (via RTS -s) reflects the
+      -- worst case under sustained load. The default sample count makes this
+      -- much longer than other experiments — override with --samples when you
+      -- only need a quick smoke check.
+      bench "memory pressure" $ \docs -> do
+        -- Round 1: typing burst across all docs.
+        applyTypingBurst docs
+        _ <- waitForBuildQueue
+        -- Round 2: exercise hover + getDefinition + completions on every
+        -- doc that has an identifier position. These populate per-rule
+        -- caches across the whole project.
+        forM_ docs $ \DocumentPositions{..} -> do
+          forM_ identifierP $ \p -> do
+            _ <- getHover doc p
+            _ <- getDefinitions doc p
+            _ <- getCompletions doc p
+            return ()
+        _ <- waitForBuildQueue
+        -- Round 3: code actions across every doc — these tend to pull in
+        -- additional rules (suggestions, refactor previews, etc.).
+        forM_ docs $ \DocumentPositions{..} -> do
+          forM_ identifierP $ \p ->
+            void $ getCodeActions doc (Range p p)
+        _ <- waitForBuildQueue
+        -- Round 4: document symbols on every doc — populates the symbol cache.
+        forM_ docs $ \DocumentPositions{..} ->
+          void $ getDocumentSymbols doc
+        _ <- waitForBuildQueue
+        -- Round 5: another typing burst, to ensure the caches above survive
+        -- invalidation rather than getting dropped — this is what catches
+        -- "we recompute everything from scratch on every edit" regressions.
+        applyTypingBurst docs
+        _ <- waitForBuildQueue
+        return True,
+      ---------------------------------------------------------------------------------------
       benchWithSetup
         "eval execute multi-line code lens"
         ( mapM_ $ \DocumentPositions{..} -> do
@@ -533,6 +572,13 @@ runBenchmarksFun dir allBenchmarks = do
         , "rulesTotal"
         , "ruleEdges"
         , "ghcRebuilds"
+        , "userP50"
+        , "userP95"
+        , "userP99"
+        , "userStdDev"
+        , "delayedP50"
+        , "delayedP95"
+        , "delayedP99"
         ]
       rows =
         [ [ name,
@@ -552,11 +598,20 @@ runBenchmarksFun dir allBenchmarks = do
             show rulesVisited,
             show rulesTotal,
             show edgesTotal,
-            show rebuildsTotal
+            show rebuildsTotal,
+            showMs (quantileSorted 0.50 sortedUserWaits),
+            showMs (quantileSorted 0.95 sortedUserWaits),
+            showMs (quantileSorted 0.99 sortedUserWaits),
+            showMs (stdDev   userWaitsSamples),
+            showMs (quantileSorted 0.50 sortedDelayedWork),
+            showMs (quantileSorted 0.95 sortedDelayedWork),
+            showMs (quantileSorted 0.99 sortedDelayedWork)
           ]
           | (Bench {name, samples}, BenchRun {..}) <- results,
             let runSetup' = if runSetup < 0.01 then 0 else runSetup
                 modules = fromIntegral $ length $ exampleModules $ example ?config
+                sortedUserWaits   = sort userWaitsSamples
+                sortedDelayedWork = sort delayedWorkSamples
         ]
       csv = unlines $ map (intercalate ", ") (headers : rows)
   writeFile (outputCSV ?config) csv
@@ -625,6 +680,10 @@ data BenchRun = BenchRun
     runExperiment        :: !Seconds,
     userWaits            :: !Seconds,
     delayedWork          :: !Seconds,
+    -- | Per-sample user-time samples (length == samples on success).
+    userWaitsSamples     :: ![Seconds],
+    -- | Per-sample delayed-work samples (length == samples on success).
+    delayedWorkSamples   :: ![Seconds],
     firstResponse        :: !Seconds,
     firstResponseDelayed :: !Seconds,
     rulesBuilt           :: !Int,
@@ -637,7 +696,27 @@ data BenchRun = BenchRun
   }
 
 badRun :: BenchRun
-badRun = BenchRun 0 0 0 0 0 0 0 0 0 0 0 0 0 False
+badRun = BenchRun 0 0 0 0 0 [] [] 0 0 0 0 0 0 0 0 False
+
+-- | Approximate quantile (nearest-rank) of an /already-sorted/ list of
+-- samples. Good enough for benchmark reporting at n>=20. Sorting is the
+-- caller's responsibility so that several quantiles over the same samples
+-- can share a single sort.
+quantileSorted :: Double -> [Seconds] -> Seconds
+quantileSorted _ [] = 0
+quantileSorted q sorted =
+    let n   = length sorted
+        idx = min (n - 1) $ floor (q * fromIntegral (n - 1))
+    in sorted !! idx
+
+-- | Population standard deviation.
+stdDev :: [Seconds] -> Seconds
+stdDev [] = 0
+stdDev xs =
+    let n  = fromIntegral (length xs)
+        m  = sum xs / n
+        sq x = let d = x - m in d * d
+    in sqrt (sum (map sq xs) / n)
 
 waitForProgressStart :: Session ()
 waitForProgressStart = void $ do
@@ -688,8 +767,9 @@ runBench runSess Bench{..} = handleAny (\e -> print e >> return badRun)
 
       liftIO $ output $ "Running " <> name <> " benchmark"
       (runSetup, ()) <- duration $ benchSetup docs
-      let loop' (Just timeForFirstResponse) !userWaits !delayedWork 0 = return $ Just (userWaits, delayedWork, timeForFirstResponse)
-          loop' timeForFirstResponse !userWaits !delayedWork n = do
+      -- Accumulate per-sample (t, td) in reverse order; we reverse once at the end.
+      let loop' (Just timeForFirstResponse) !samplesAcc 0 = return $ Just (reverse samplesAcc, timeForFirstResponse)
+          loop' timeForFirstResponse !samplesAcc n = do
             (t, res) <- duration $ experiment docs
             if not res
               then return Nothing
@@ -697,12 +777,16 @@ runBench runSess Bench{..} = handleAny (\e -> print e >> return badRun)
                 output (showDuration t)
                 -- Wait for the delayed actions to finish
                 td <- waitForBuildQueue
-                loop' (timeForFirstResponse <|> Just (t,td)) (userWaits+t) (delayedWork+td) (n -1)
+                loop' (timeForFirstResponse <|> Just (t,td)) ((t,td):samplesAcc) (n -1)
           loop = loop' Nothing
 
-      (runExperiment, result) <- duration $ loop 0 0 samples
+      (runExperiment, result) <- duration $ loop [] samples
       let success = isJust result
-          (userWaits, delayedWork, (firstResponse, firstResponseDelayed)) = fromMaybe (0,0,(0,0)) result
+          (perSample, (firstResponse, firstResponseDelayed)) = fromMaybe ([], (0,0)) result
+          userWaitsSamples = map fst perSample
+          delayedWorkSamples = map snd perSample
+          userWaits = sum userWaitsSamples
+          delayedWork = sum delayedWorkSamples
 
       rulesTotal <- length <$> getStoredKeys
       rulesBuilt <- either (const 0) length <$> getBuildKeysBuilt
diff --git a/ghcide/src/Development/IDE/Core/Rules.hs b/ghcide/src/Development/IDE/Core/Rules.hs
index 24de344bfa..fc43dcef63 100644
--- a/ghcide/src/Development/IDE/Core/Rules.hs
+++ b/ghcide/src/Development/IDE/Core/Rules.hs
@@ -639,7 +639,7 @@ getFileHashRule recorder =
         return (Just (fingerprintToBS fileHash), ([], Just fileHash))
 
 getModuleGraphRule :: Recorder (WithPriority Log) -> Rules ()
-getModuleGraphRule recorder = defineEarlyCutOffNoFile (cmapWithPrio LogShake recorder) $ \GetModuleGraph -> do
+getModuleGraphRule recorder = defineEarlyCutOffNoFileReuseValue (cmapWithPrio LogShake recorder) $ \GetModuleGraph -> do
   fs <- toKnownFiles <$> useNoFile_ GetKnownTargets
   dependencyInfoForFiles (HashSet.toList fs)
 
diff --git a/ghcide/src/Development/IDE/Core/Shake.hs b/ghcide/src/Development/IDE/Core/Shake.hs
index 9334a13ad3..77401cc0aa 100644
--- a/ghcide/src/Development/IDE/Core/Shake.hs
+++ b/ghcide/src/Development/IDE/Core/Shake.hs
@@ -41,7 +41,7 @@ module Development.IDE.Core.Shake(
     RuleBody(..),
     define, defineNoDiagnostics,
     defineEarlyCutoff,
-    defineNoFile, defineEarlyCutOffNoFile,
+    defineNoFile, defineEarlyCutOffNoFile, defineEarlyCutOffNoFileReuseValue,
     getDiagnostics,
     mRunLspT, mRunLspTCallback,
     getHiddenDiagnostics,
@@ -1195,6 +1195,9 @@ useWithoutDependency key file =
 data RuleBody k v
   = Rule (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, IdeResult v))
   | RuleNoDiagnostics (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v))
+  -- | Like 'RuleNoDiagnostics', but reuses the previously cached value on an
+  -- early-cutoff match (see 'defineEarlyCutOffNoFileReuseValue').
+  | RuleNoDiagnosticsReuseValue (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v))
   | RuleWithCustomNewnessCheck
     { newnessCheck :: BS.ByteString -> BS.ByteString -> Bool
     , build :: k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v)
@@ -1212,49 +1215,86 @@ defineEarlyCutoff recorder (Rule op) = addRule $ \(Q (key, file)) (old :: Maybe
     let diagnostics ver diags = do
             traceDiagnostics diags
             updateFileDiagnostics recorder file ver (newKey key) extras diags
-    defineEarlyCutoff' diagnostics (==) key file old mode $ const $ op key file
-defineEarlyCutoff recorder (RuleNoDiagnostics op) = addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode -> otTracedAction key file mode traceA $ \traceDiagnostics -> do
-    let diagnostics _ver diags = do
-            traceDiagnostics diags
-            mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleNoDiagHasDiag) diags
-    defineEarlyCutoff' diagnostics (==) key file old mode $ const $ second (mempty,) <$> op key file
+    defineEarlyCutoff' diagnostics (==) False key file old mode $ const $ op key file
+defineEarlyCutoff recorder (RuleNoDiagnostics op) =
+    addNoDiagnosticsRule recorder False op
+defineEarlyCutoff recorder (RuleNoDiagnosticsReuseValue op) =
+    addNoDiagnosticsRule recorder True op
 defineEarlyCutoff recorder RuleWithCustomNewnessCheck{..} =
     addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode ->
         otTracedAction key file mode traceA $ \ traceDiagnostics -> do
             let diagnostics _ver diags = do
                     traceDiagnostics diags
                     mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleCustomNewnessHasDiag) diags
-            defineEarlyCutoff' diagnostics newnessCheck key file old mode $
+            defineEarlyCutoff' diagnostics newnessCheck False key file old mode $
                 const $ second (mempty,) <$> build key file
 defineEarlyCutoff recorder (RuleWithOldValue op) = addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode -> otTracedAction key file mode traceA $ \traceDiagnostics -> do
     extras <- getShakeExtras
     let diagnostics ver diags = do
             traceDiagnostics diags
             updateFileDiagnostics recorder file ver (newKey key) extras diags
-    defineEarlyCutoff' diagnostics (==) key file old mode $ op key file
+    defineEarlyCutoff' diagnostics (==) False key file old mode $ op key file
+
+-- | Shared implementation for the no-diagnostics rule variants. The 'Bool'
+-- selects @defineEarlyCutoff'@'s value-reuse behaviour on an early-cutoff match.
+addNoDiagnosticsRule
+    :: forall k v. IdeRule k v
+    => Recorder (WithPriority Log)
+    -> Bool
+    -> (k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v))
+    -> Rules ()
+addNoDiagnosticsRule recorder reuseValueOnMatch op =
+    addRule $ \(Q (key, file)) (old :: Maybe BS.ByteString) mode ->
+        otTracedAction key file mode traceA $ \traceDiagnostics -> do
+            let diagnostics _ver diags = do
+                    traceDiagnostics diags
+                    mapM_ (logWith recorder Warning . LogDefineEarlyCutoffRuleNoDiagHasDiag) diags
+            defineEarlyCutoff' diagnostics (==) reuseValueOnMatch key file old mode $
+                const $ second (mempty,) <$> op key file
 
 defineNoFile :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action v) -> Rules ()
 defineNoFile recorder f = defineNoDiagnostics recorder $ \k file -> do
     if file == emptyFilePath then do res <- f k; return (Just res) else
         fail $ "Rule " ++ show k ++ " should always be called with the empty string for a file"
 
+-- | Body shared by the no-file early-cutoff rule variants: assert the empty
+-- file path and wrap the (fingerprint, value) result.
+noFileBody
+    :: Show k
+    => (k -> Action (BS.ByteString, v))
+    -> k -> NormalizedFilePath -> Action (Maybe BS.ByteString, Maybe v)
+noFileBody f k file
+    | file == emptyFilePath = do (hashString, res) <- f k; return (Just hashString, Just res)
+    | otherwise = fail $ "Rule " ++ show k ++ " should always be called with the empty string for a file"
+
 defineEarlyCutOffNoFile :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action (BS.ByteString, v)) -> Rules ()
-defineEarlyCutOffNoFile recorder f = defineEarlyCutoff recorder $ RuleNoDiagnostics $ \k file -> do
-    if file == emptyFilePath then do (hashString, res) <- f k; return (Just hashString, Just res) else
-        fail $ "Rule " ++ show k ++ " should always be called with the empty string for a file"
+defineEarlyCutOffNoFile recorder f = defineEarlyCutoff recorder $ RuleNoDiagnostics $ noFileBody f
+
+-- | Like 'defineEarlyCutOffNoFile', but reuses the previously cached value
+-- when the early-cutoff fingerprint matches the prior run. Preserves pointer
+-- identity across rebuilds — important for large shared values
+-- (e.g. ModuleGraph) so downstream rules whose deps did not change continue
+-- to share a pointer with the cache, rather than accumulating multiple
+-- structurally equivalent copies.
+--
+-- Only use for rules whose action has no side effects that depend on the
+-- freshly computed value (no global registration, no unloading the prior).
+defineEarlyCutOffNoFileReuseValue :: IdeRule k v => Recorder (WithPriority Log) -> (k -> Action (BS.ByteString, v)) -> Rules ()
+defineEarlyCutOffNoFileReuseValue recorder f = defineEarlyCutoff recorder $ RuleNoDiagnosticsReuseValue $ noFileBody f
 
 defineEarlyCutoff'
     :: forall k v. IdeRule k v
     => (Maybe Int32 -> [FileDiagnostic] -> Action ()) -- ^ update diagnostics
     -- | compare current and previous for freshness
     -> (BS.ByteString -> BS.ByteString -> Bool)
+    -> Bool -- ^ reuse cached value on cutoff match (see Note below)
     -> k
     -> NormalizedFilePath
     -> Maybe BS.ByteString
     -> RunMode
     -> (Value v -> Action (Maybe BS.ByteString, IdeResult v))
     -> Action (RunResult (A (RuleResult k)))
-defineEarlyCutoff' doDiagnostics cmp key file mbOld mode action = do
+defineEarlyCutoff' doDiagnostics cmp reuseValueOnMatch key file mbOld mode action = do
     ShakeExtras{state, progress, dirtyKeys} <- getShakeExtras
     options <- getIdeOptions
     let trans g x =  withRunInIO $ \run -> g (run x)
@@ -1288,7 +1328,7 @@ defineEarlyCutoff' doDiagnostics cmp key file mbOld mode action = do
                         pure (Nothing, ([ideErrorText file (T.pack $ show (key, file) ++ show e) | not $ isBadDependency e],Nothing))
 
                 ver <- estimateFileVersionUnsafely key mbRes file
-                (bs, res) <- case mbRes of
+                (bs, freshRes) <- case mbRes of
                     Nothing -> do
                         pure (toShakeValue ShakeStale mbBs, staleV)
                     Just v -> pure (maybe ShakeNoCutoff ShakeResult mbBs, Succeeded ver v)
@@ -1299,6 +1339,30 @@ defineEarlyCutoff' doDiagnostics cmp key file mbOld mode action = do
                         -- If we do not have a previous result
                         -- or we got ShakeNoCutoff we always return False.
                         _                                     -> False
+                -- Pointer-identity preservation on early-cutoff match (opt-in).
+                -- When the cutoff fingerprint matches the prior, reuse the
+                -- previously cached value instead of the freshly computed
+                -- (but structurally equivalent) one. This avoids accumulating
+                -- multiple equivalent copies of large shared values like
+                -- ModuleGraph across rebuilds.
+                --
+                -- Only safe for rules whose action body has no
+                -- value-dependent side effects (e.g. registering the fresh
+                -- result in a global table and unloading the stale one).
+                -- For such rules, returning the cached value would leave
+                -- downstream pointing at a representation that the action
+                -- just invalidated. Opt in by passing reuseValueOnMatch=True.
+                -- Reuse only a clean prior value (Stale Nothing _): that is
+                -- the shape a value successfully computed by this rule takes
+                -- (see the Just Succeeded branch above). A Stale (Just _)
+                -- instead carries a PositionDelta and is only ever written by
+                -- the persistent-rule fallback (lastValueIO), i.e. a value
+                -- loaded from disk rather than produced by this rule's action
+                -- this run, so it is not a trustworthy prior to reuse; fall
+                -- through to the fresh result in that case.
+                let res = case (reuseValueOnMatch, eq, staleV, freshRes) of
+                        (True, True, Stale Nothing _ oldV, Succeeded sver _) -> Succeeded sver oldV
+                        _                                                    -> freshRes
                 return $ RunResult
                     (if eq then ChangedRecomputeSame else ChangedRecomputeDiff)
                     (encodeShakeValue bs)
diff --git a/shake-bench/shake-bench.cabal b/shake-bench/shake-bench.cabal
index 3d2a7d3174..e5bee2fc42 100644
--- a/shake-bench/shake-bench.cabal
+++ b/shake-bench/shake-bench.cabal
@@ -22,6 +22,7 @@ library
         aeson,
         base == 4.*,
         bytestring,
+        containers,
         directory,
         extra >= 1.7.2,
         filepath,
diff --git a/shake-bench/src/Development/Benchmark/Rules.hs b/shake-bench/src/Development/Benchmark/Rules.hs
index 2ddcb97cc5..e7daa8a554 100644
--- a/shake-bench/src/Development/Benchmark/Rules.hs
+++ b/shake-bench/src/Development/Benchmark/Rules.hs
@@ -71,10 +71,13 @@ import           Data.Aeson                (FromJSON (..), ToJSON (..),
 import           Data.Aeson.Lens           (AsJSON (_JSON), _Object, _String)
 import           Data.ByteString.Lazy      (ByteString)
 import           Data.Char                 (isDigit)
-import           Data.List                 (find, intercalate, isInfixOf,
-                                            isSuffixOf, stripPrefix, transpose)
+import           Data.List                 (find, foldl', intercalate,
+                                            isInfixOf, isPrefixOf, isSuffixOf,
+                                            sortBy, stripPrefix, transpose)
 import           Data.List.Extra           (lower, splitOn)
+import qualified Data.Map.Strict           as Map
 import           Data.Maybe                (fromMaybe)
+import           Data.Ord                  (Down (..), comparing)
 import           Data.String               (fromString)
 import           Data.Text                 (Text)
 import qualified Data.Text                 as T
@@ -325,31 +328,180 @@ benchRules build MkBenchRules{..} = do
               BenchProject {..}
         liftIO $ case prof of
             NoProfiling -> writeFile outHp dummyHp
-            _           -> return ()
-
-        -- extend csv output with allocation data
+            _ -> do
+                -- Write a co-located top-N heap summary derived from the .hp
+                -- file. Untracked side-effect; regenerated whenever the .hp
+                -- is. Skipped under NoProfiling, where outHp is only a dummy.
+                hp <- readFile outHp
+                writeFile (outHp <.> "csv") (summarizeHpProfile 20 hp)
+
+        -- extend csv output with allocation data + GC/pause/productivity
         csvContents <- liftIO $ lines <$> readFile outcsv
         let header = head csvContents
             results = tail csvContents
-            header' = header <> ", maxResidency, allocatedBytes"
+            extraHeaders =
+                [ "maxResidency"
+                , "allocatedBytes"
+                , "maxSlop"
+                , "mutTimeS"
+                , "gcTimeS"
+                , "maxGcPauseS"
+                , "productivityPct"
+                , "majorGCs"
+                , "minorGCs"
+                ]
+            header' = header <> ", " <> intercalate ", " extraHeaders
         results' <- forM results $ \row -> do
-            (maxResidency, allocations) <- liftIO
-                    (parseMaxResidencyAndAllocations <$> readFile outGc)
-            return $ printf "%s, %s, %s" row (showMB maxResidency) (showMB allocations)
+            stats <- liftIO (parseRTSStats <$> readFile outGc)
+            return $ intercalate ", "
+                [ row
+                , showMB (rtsMaxResidency stats)
+                , showMB (rtsAllocatedBytes stats)
+                , showMB (rtsMaxSlop stats)
+                , showSeconds (rtsMutTime stats)
+                , showSeconds (rtsGcTime stats)
+                , showSeconds (rtsMaxGcPause stats)
+                , showDouble (rtsProductivityPct stats)
+                , showInt (rtsMajorGCs stats)
+                , showInt (rtsMinorGCs stats)
+                ]
         let csvContents' = header' : results'
         writeFileLines outcsv csvContents'
     where
+        -- A negative value is parseRTSStats's "field missing" sentinel; clamp
+        -- to zero so the column stays numeric for downstream CSV parsing.
         showMB :: Int -> String
-        showMB x = show (x `div` 2^(20::Int)) <> "MB"
+        showMB x = show (max 0 x `div` 2^(20::Int)) <> "MB"
+        showSeconds :: Double -> String
+        showSeconds = printf "%.4f" . max 0
+        showDouble :: Double -> String
+        showDouble = printf "%.2f" . max 0
+        showInt :: Int -> String
+        showInt = show . max (0 :: Int)
+
+-- | Summarize a GHC heap profile (.hp) file into a top-N cost-centre CSV.
+-- For each cost-centre name we report the maximum retained bytes across
+-- all samples, plus the final-sample retained bytes (useful for spotting
+-- leaks that grow without bound).
+summarizeHpProfile :: Int -> String -> String
+summarizeHpProfile topN raw =
+    let ls = lines raw
+        samples = collectSamples ls
+        accumOne acc (cc, bs) = Map.insertWith combine cc (bs, bs) acc
+          where combine (newMax, newLast) (oldMax, _) =
+                    let m = max newMax oldMax in m `seq` (m, newLast)
+        -- Aggregate across all samples: track running max and last value per CC.
+        -- Strict fold + forced max to avoid a thunk chain over the samples.
+        agg = foldl' (\acc s -> foldl' accumOne acc s) Map.empty samples
+        sorted = sortBy (comparing (Down . maxOf)) (Map.toList agg)
+        maxOf (_, (m, _)) = m
+        header = "costCentre,maxRetainedBytes,finalRetainedBytes"
+        body = [ cc <> "," <> show m <> "," <> show l
+               | (cc, (m, l)) <- take topN sorted
+               ]
+    in unlines (header : body)
+  where
+    -- | Group consecutive non-marker lines between BEGIN_SAMPLE/END_SAMPLE pairs.
+    collectSamples :: [String] -> [[(String, Integer)]]
+    collectSamples xs = case dropWhile (not . isBeginSample) xs of
+        [] -> []
+        (_:rest) ->
+            let (sample, rest') = break isEndSample rest
+                parsed = [ p | l <- sample, Just p <- [parseSampleLine l] ]
+            in parsed : case rest' of
+                []    -> []
+                (_:r) -> collectSamples r
+    isBeginSample = ("BEGIN_SAMPLE" `isPrefixOf`) . dropWhile (== ' ')
+    isEndSample   = ("END_SAMPLE"   `isPrefixOf`) . dropWhile (== ' ')
+
+-- | Parse one cost-centre line from a heap profile sample, e.g.
+-- @"GHC.Conc.Sync.CAF" 5678@ or @PINNED 1234@. The cost-centre name may
+-- contain spaces (when quoted), so we treat everything before the trailing
+-- number as the name.
+parseSampleLine :: String -> Maybe (String, Integer)
+parseSampleLine l = case reverse (words l) of
+    (numStr:rest) -> do
+        n <- readMaybe numStr
+        let name = unwords (reverse rest)
+        if null name then Nothing else Just (name, n)
+    _ -> Nothing
+
+-- | Stats extracted from `+RTS -s` (or -S) summary output.
+data RTSStats = RTSStats
+    { rtsMaxResidency    :: !Int
+    , rtsAllocatedBytes  :: !Int
+    , rtsMaxSlop         :: !Int
+    , rtsMutTime         :: !Double  -- ^ MUT time (seconds)
+    , rtsGcTime          :: !Double  -- ^ GC time (seconds)
+    , rtsMaxGcPause      :: !Double  -- ^ Max GC pause across all generations (seconds)
+    , rtsProductivityPct :: !Double  -- ^ Productivity (% of total user time)
+    , rtsMajorGCs        :: !Int     -- ^ Gen 1 collections
+    , rtsMinorGCs        :: !Int     -- ^ Gen 0 collections
+    }
 
--- Parse the max residency and allocations in RTS -s output
-parseMaxResidencyAndAllocations :: String -> (Int, Int)
-parseMaxResidencyAndAllocations input =
-    (f "maximum residency", f "bytes allocated in the heap")
+-- | Parse the summary block at the tail of `+RTS -s`/`-S` output.
+-- Each field is best-effort; missing fields return -1.
+parseRTSStats :: String -> RTSStats
+parseRTSStats input = RTSStats
+    { rtsMaxResidency    = digitsFirstWord "maximum residency"
+    , rtsAllocatedBytes  = digitsFirstWord "bytes allocated in the heap"
+    , rtsMaxSlop         = digitsFirstWord "maximum slop"
+    , rtsMutTime         = secondsAt 2 "MUT     time"
+    , rtsGcTime          = secondsAt 2 "GC      time"
+    , rtsMaxGcPause      = parseMaxPause
+    , rtsProductivityPct = parseProductivity
+    , rtsMajorGCs        = parseGenColls "Gen  1"
+    , rtsMinorGCs        = parseGenColls "Gen  0"
+    }
   where
-    inps = reverse $ lines input
-    f label = case find (label `isInfixOf`) inps of
-        Just l  -> read $ filter isDigit $ head $ words l
+    ls = lines input
+    -- Match summary labels tolerant of GHC's inter-column spacing, which
+    -- varies across versions (e.g. "MUT     time" vs "MUT  time"); both the
+    -- label and each line are whitespace-normalised before comparison.
+    findLine label = let needle = unwords (words label)
+                     in find ((needle `isInfixOf`) . unwords . words) ls
+    digitsFirstWord label = case findLine label of
+        Just l  -> readIntDef $ filter isDigit $ head (words l ++ [""])
+        Nothing -> -1
+    readIntDef "" = -1
+    readIntDef s  = fromMaybe (-1) (readMaybe s)
+    readDoubleDef s = fromMaybe (-1) (readMaybe (takeWhile (\c -> isDigit c || c == '.' || c == '-') s))
+    -- Parse the time value (in seconds) at word index `i` (after dropping the label
+    -- words). For lines like "  MUT     time    0.012s  (  0.012s elapsed)".
+    secondsAt i label = case findLine label of
+        Just l -> case drop i (words l) of
+            (w:_) -> readDoubleDef w
+            _     -> -1
+        Nothing -> -1
+    -- "Max pause" appears on each Gen line; take the worst.
+    parseMaxPause =
+        let candidates =
+                [ readDoubleDef w
+                | l <- ls
+                , "Gen " `isInfixOf` l
+                , let ws = words l
+                , not (null ws)
+                , w <- take 1 (reverse ws)
+                ]
+            valid = filter (>= 0) candidates
+        in if null valid then -1 else maximum valid
+    -- "Productivity  81.2% of total user, ..."
+    parseProductivity = case findLine "Productivity" of
+        Just l ->
+            let ws = words l
+            in case ws of
+                (_:n:_) -> readDoubleDef n
+                _       -> -1
+        Nothing -> -1
+    -- "  Gen  0         1 colls,     0 par   ..." → 1
+    parseGenColls genLabel = case findLine genLabel of
+        Just l ->
+            let ws = words l
+                -- Drop "Gen N" prefix, then first numeric word is colls count.
+                rest = drop 2 ws
+            in case rest of
+                (n:_) -> readIntDef (filter isDigit n)
+                _     -> -1
         Nothing -> -1