the-tuning-machine · Gklajer · Mar 28, 2026 · Mar 22, 2026 · Mar 26, 2026 · Mar 26, 2026
diff --git a/deliverables/report.tex b/deliverables/report.tex
@@ -42,7 +42,7 @@ \subsection{Direct Comparison}
 
 \begin{figure}[h]
     \centering
-    \includegraphics[width=0.5\textwidth]{figures/direct.png}
+    \includegraphics[draft,width=0.5\textwidth]{figures/direct.png}
     \caption{Direct comparison: mean fitting loss with min--mean shaded region (1000 trials per rank).}
     \label{fig:direct}
 \end{figure}
@@ -53,7 +53,7 @@ \subsection{Baseline Comparison}
 
 \begin{figure}[h]
     \centering
-    \includegraphics[width=0.5\textwidth]{figures/baseline.png}
+    \includegraphics[draft,width=0.5\textwidth]{figures/baseline.png}
     \caption{Baseline comparison: both methods approximate a full Transformer. Shaded region shows min--mean spread.}
     \label{fig:baseline}
 \end{figure}
@@ -64,7 +64,7 @@ \subsection{Ablation: Stiefel vs.\ Euclidean Geometry}
 
 \begin{figure}[h]
     \centering
-    \includegraphics[width=0.5\textwidth]{figures/ablation.png}
+    \includegraphics[draft,width=0.5\textwidth]{figures/ablation.png}
     \caption{Ablation: Euclidean 3-factor vs.\ StelLA (same $USV^\top$, different optimizer).}
     \label{fig:ablation}
 \end{figure}
@@ -75,7 +75,7 @@ \subsection{Computational Efficiency}
 
 \begin{figure}[h]
     \centering
-    \includegraphics[width=0.7\textwidth]{figures/benchmark_gpu_memory_phases.png}
+    \includegraphics[draft,width=0.7\textwidth]{figures/benchmark_gpu_memory_phases.png}
     \caption{GPU memory usage per training phase (averaged over steps) for a full Transformer, LoRA, and StelLA at ranks $r \in \{8, 32, 64, 128\}$.}
     \label{fig:mem_phases}
 \end{figure}
@@ -84,13 +84,13 @@ \subsection{Computational Efficiency}
     \centering
     \begin{minipage}{0.48\textwidth}
         \centering
-        \includegraphics[width=\textwidth]{figures/benchmark_peak_memory.png}
+        \includegraphics[draft,width=\textwidth]{figures/benchmark_peak_memory.png}
         \caption{Peak GPU memory (MB).}
         \label{fig:peak_mem}
     \end{minipage}\hfill
     \begin{minipage}{0.48\textwidth}
         \centering
-        \includegraphics[width=\textwidth]{figures/benchmark_throughput.png}
+        \includegraphics[draft,width=\textwidth]{figures/benchmark_throughput.png}
         \caption{Training throughput (samples/s).}
         \label{fig:throughput}
     \end{minipage}
@@ -104,13 +104,32 @@ \subsection{Computational Efficiency}
 
 \begin{figure}[h]
     \centering
-    \includegraphics[width=0.7\textwidth]{figures/benchmark_time_breakdown.png}
+    \includegraphics[draft,width=0.7\textwidth]{figures/benchmark_time_breakdown.png}
     \caption{Per-step time breakdown (forward / backward / optimizer) in milliseconds.}
     \label{fig:time_breakdown}
 \end{figure}
 
 \paragraph{Time breakdown.} Figure~\ref{fig:time_breakdown} decomposes the per-step wall time into forward, backward, and optimizer phases. Forward and backward times are nearly identical between LoRA and StelLA at each rank, confirming that the Stiefel constraint does not alter the computational graph of these phases. The overhead is entirely concentrated in the optimizer step: at $r{=}128$, StelLA's optimizer phase takes ${\approx}140$\,ms vs.\ ${\approx}10$\,ms for LoRA, accounting for the throughput gap observed above. This is expected, as the polar retraction requires an SVD of an $r \times r$ matrix at each adapted layer per step. At low ranks this cost is negligible, but it scales cubically with $r$, making it the dominant bottleneck at $r{=}128$. This throughput penalty is the main practical cost of the Stiefel constraint and should be weighed against the expressivity and memory gains reported above.
 
+\subsection{Single-Layer Memory Accounting}
+
+\begin{figure}[h]
+    \centering
+    \includegraphics[width=0.95\textwidth]{figures/single_layer_memory_mosaic_widget_din_4096_dout_4096_r_128_b_128.png}
+    \caption{Single-layer memory comparison for a $4096 \times 4096$ linear layer and rank-$128$ adapters at batch size $128$. The top row compares the measured Mosaic peak breakdown against the analytical tensor-accounting estimate for the dense baseline, LoRA, and StelLA. The bottom row shows the measured Mosaic categories over one profiled training step, with the peak marked by a dashed vertical line.}
+    \label{fig:single_layer_memory}
+\end{figure}
+
+Figure~\ref{fig:single_layer_memory} provides a more controlled view of the memory behavior behind the end-to-end benchmark. We isolate a single trainable linear layer and compare three variants: full dense training, LoRA, and StelLA. The analytical bars are obtained from explicit tensor accounting: static memory is the resident parameter storage, activation memory is estimated from the forward intermediates, backward memory is approximated by gradient-bearing tensors, and optimizer memory is the Adam state required by the trainable parameters. For the dense baseline this yields a total estimate of $264.0$\,MiB, while LoRA and StelLA are both around $88$\,MiB because only the low-rank adapter parameters are trainable.
+
+Analytically, the dense baseline pays full parameter, gradient, and Adam-state cost (in bytes, with 4-byte elements),
+\[
+m_{\mathrm{dense}} \sim 4d_{\mathrm{out}}d_{\mathrm{in}} + 4d_{\mathrm{out}}d_{\mathrm{in}} + 8d_{\mathrm{out}}d_{\mathrm{in}},
+\]
+while LoRA reduces the number of trainable parameters to $r(d_{\mathrm{in}} + d_{\mathrm{out}})$ and StelLA to $r(d_{\mathrm{in}} + d_{\mathrm{out}}) + r^2$, with activation terms of order $4b(d_{\mathrm{in}} + r + d_{\mathrm{out}})$ bytes for batch size $b$. This gives $264.0$\,MiB for dense training and about $88$\,MiB for both LoRA and StelLA.
+
+The measured bars are taken from Mosaic after 5 warmup steps and 5 active steps. LoRA reaches $106.2$\,MiB, while StelLA rises to $124.8$\,MiB because its optimizer introduces extra transient workspace; both exceed the ${\approx}88$\,MiB analytical estimate. The analytical view is therefore a compact tensor-accounting model, while Mosaic captures the true runtime peak, including temporary buffers and \texttt{Unknown} allocator activity.
+
 \section{Limitations and Discussion}
 
 Several caveats apply to our findings. Our models are single-layer Transformers with $d{=}8$, far from the multi-layer, high-dimensional architectures (LLaMA-7B, ViT-Large) on which StelLA reports its strongest gains; the observed trends may not transfer directly to larger scales. The expressivity framework measures functional capacity rather than downstream task accuracy, and uses only 5 optimization steps per trial---too few for the geometric constraint to fully manifest its advantage, as evidenced by the narrow ablation gap. Finally, LoRA's advantage at $r{=}1$ in the baseline setting suggests the Stiefel constraint may be overly restrictive when the rank budget is very small.

diff --git a/pyproject.toml b/pyproject.toml
@@ -23,6 +23,10 @@ dependencies = [
 [tool.uv.sources]
 stella = { git = "https://github.com/SonyResearch/stella" }
 
+# Mosaic's latest upstream commit is currently broken for this workflow, so we pin
+# to a previous commit.
+mosaic = { git = "https://github.com/facebookresearch/mosaic.git", rev = "27a16228ae22222cf2b834d31afbf64f2dff66e0" }
+
 [build-system]  # https://docs.astral.sh/uv/concepts/build-backend/
 requires = ["uv_build>=0.9.18,<0.11.0"]
 build-backend = "uv_build"
@@ -56,6 +60,24 @@ dev = [
   "typeguard (>=4.4.1)",
 ]
 
+mosaic = [
+  # Keep the runtime deps here because the pinned git package is not pulling
+  # them into this project environment reliably on its own.
+  "altair>=5.5.0",
+  "click>=8.3.0",
+  "mcp>=1.0.0",
+  "mosaic",
+  "omegaconf>=2.3.0",
+  "pandas>=2.3.3",
+  "tabulate>=0.9.0",
+]
+
+profiling = [
+  { include-group = "mosaic" },
+  "tensorboard>=2.20.0",
+  "torch-tb-profiler>=0.4.3",
+]
+
 [tool.codespell]  # https://github.com/codespell-project/codespell
 builtin = "en-GB_to_en-US,clear,code,rare"
 check-filenames = true