From d0accfc8b13516b3763b8c5df1e8cfc8eacfe87a Mon Sep 17 00:00:00 2001
From: Purdze <r.s.sutton@hotmail.co.uk>
Date: Tue, 23 Jun 2026 22:35:49 +0100
Subject: [PATCH 1/4] Fix chunk-load frame spikes and improve streaming
 throughput

---
 Cargo.lock                                |  39 +-
 pomme-client/Cargo.toml                   |   4 +
 pomme-client/src/app/core.rs              |  11 +
 pomme-client/src/app/phases/connecting.rs |   6 +-
 pomme-client/src/app/phases/in_game.rs    |  59 ++-
 pomme-client/src/benchmark.rs             |  73 +++-
 pomme-client/src/main.rs                  |  17 +-
 pomme-client/src/renderer/chunk/buffer.rs | 492 +++++++++++++---------
 pomme-client/src/renderer/chunk/mesher.rs | 131 +++++-
 pomme-client/src/renderer/mod.rs          |   9 +-
 10 files changed, 595 insertions(+), 246 deletions(-)

diff --git a/Cargo.lock b/Cargo.lock
index 7dbe63f9..4364cdb2 100644
--- a/Cargo.lock
+++ b/Cargo.lock
@@ -247,7 +247,7 @@ dependencies = [
  "objc2-foundation 0.3.2",
  "parking_lot",
  "percent-encoding",
- "windows-sys 0.59.0",
+ "windows-sys 0.60.2",
  "x11rb",
 ]
 
@@ -1899,7 +1899,7 @@ dependencies = [
  "libc",
  "option-ext",
  "redox_users",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -2276,7 +2276,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb"
 dependencies = [
  "libc",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -3834,6 +3834,15 @@ version = "0.2.16"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981"
 
+[[package]]
+name = "libmimalloc-sys"
+version = "0.1.49"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "6a45a52f43e1c16f667ccfe4dd8c85b7f7c204fd5e3bf46c5b0db9a5c3c0b8e9"
+dependencies = [
+ "cc",
+]
+
 [[package]]
 name = "libredox"
 version = "0.1.16"
@@ -3996,6 +4005,15 @@ dependencies = [
  "autocfg",
 ]
 
+[[package]]
+name = "mimalloc"
+version = "0.1.52"
+source = "registry+https://github.com/rust-lang/crates.io-index"
+checksum = "2d4139bb28d14ad1facf21d5eb8825051b326e172d216b39f6d31df53cc97862"
+dependencies = [
+ "libmimalloc-sys",
+]
+
 [[package]]
 name = "mime"
 version = "0.3.17"
@@ -4161,7 +4179,7 @@ version = "0.50.3"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -4811,7 +4829,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "7d8fae84b431384b68627d0f9b3b1245fcf9f46f6c0e3dc902e9dce64edd1967"
 dependencies = [
  "libc",
- "windows-sys 0.45.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -5113,6 +5131,7 @@ dependencies = [
  "gilrs",
  "glam",
  "image",
+ "mimalloc",
  "open",
  "parking_lot",
  "png 0.17.16",
@@ -5428,7 +5447,7 @@ dependencies = [
  "once_cell",
  "socket2",
  "tracing",
- "windows-sys 0.59.0",
+ "windows-sys 0.60.2",
 ]
 
 [[package]]
@@ -5904,7 +5923,7 @@ dependencies = [
  "errno",
  "libc",
  "linux-raw-sys 0.12.1",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -5961,7 +5980,7 @@ dependencies = [
  "security-framework 3.7.0",
  "security-framework-sys",
  "webpki-root-certs",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -7446,7 +7465,7 @@ dependencies = [
  "getrandom 0.4.2",
  "once_cell",
  "rustix 1.1.4",
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
@@ -8585,7 +8604,7 @@ version = "0.1.11"
 source = "registry+https://github.com/rust-lang/crates.io-index"
 checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22"
 dependencies = [
- "windows-sys 0.59.0",
+ "windows-sys 0.61.2",
 ]
 
 [[package]]
diff --git a/pomme-client/Cargo.toml b/pomme-client/Cargo.toml
index c17596c5..c46d2f64 100644
--- a/pomme-client/Cargo.toml
+++ b/pomme-client/Cargo.toml
@@ -23,6 +23,10 @@ tracing = { workspace = true }
 tracing-subscriber = { workspace = true, features = ["env-filter"] }
 
 tracing-appender = "0.2"
+# Per-thread-heap allocator: the chunk-mesh worker pool churns vertex/index Vecs
+# across threads, and the default system heap's global lock serializes that and
+# stalls the main thread. Mirrors vanilla (JVM TLABs + LWJGL's jemalloc).
+mimalloc = "0.1"
 pomme-gpu-allocator = { path = "../pomme-gpu-allocator" }
 winit = { version = "0.30", features = ["rwh_06"] }
 raw-window-handle = "0.6"
diff --git a/pomme-client/src/app/core.rs b/pomme-client/src/app/core.rs
index 1bca4786..5413e3f2 100644
--- a/pomme-client/src/app/core.rs
+++ b/pomme-client/src/app/core.rs
@@ -313,6 +313,9 @@ impl AppCore {
     ) -> Option<String> {
         let rx = &connection.event_rx;
 
+        // Phase timers for the chunk-load benchmark's worst-frame breakdown.
+        let t_net = std::time::Instant::now();
+
         let mut chunks_to_mesh = Vec::new();
         // Block edits go on the priority lane so they apply instantly even while
         // chunks stream in, instead of starving behind the load backlog.
@@ -986,8 +989,16 @@ impl AppCore {
         // then enqueue everything that needs meshing — visible-first, with hidden
         // columns backfilled at a bounded rate so the world still completes.
         let loads_happened = !chunks_to_mesh.is_empty();
+        let ms = |t: std::time::Instant| t.elapsed().as_secs_f32() * 1000.0;
+        game.last_update_phases.net_decode_ms = ms(t_net);
+
+        let t_vis = std::time::Instant::now();
         game.update_visibility(renderer, player_chunk, loads_happened);
+        game.last_update_phases.visibility_ms = ms(t_vis);
+
+        let t_rescan = std::time::Instant::now();
         game.rescan_mesh_jobs(player_chunk);
+        game.last_update_phases.rescan_ms = ms(t_rescan);
 
         disconnect_reason
     }
diff --git a/pomme-client/src/app/phases/connecting.rs b/pomme-client/src/app/phases/connecting.rs
index 0c92ac63..b5805825 100644
--- a/pomme-client/src/app/phases/connecting.rs
+++ b/pomme-client/src/app/phases/connecting.rs
@@ -37,8 +37,10 @@ pub fn update_connecting(
     if matches!(connect_phase, ConnectionPhase::Loading) {
         game.mesh_dispatcher
             .set_camera_position(*game.player.position);
-        for mesh in game.mesh_dispatcher.drain_results() {
-            gfx.renderer.upload_chunk_mesh(&mesh);
+        let ready_meshes: Vec<_> = game.mesh_dispatcher.drain_results().collect();
+        gfx.renderer.upload_chunk_meshes(&ready_meshes);
+        for mesh in ready_meshes {
+            game.mesh_dispatcher.recycle(mesh);
         }
 
         let ready = game.position_set && (game.dead || gfx.renderer.loaded_chunk_count() > 0);
diff --git a/pomme-client/src/app/phases/in_game.rs b/pomme-client/src/app/phases/in_game.rs
index 5b74eda3..02f590b9 100644
--- a/pomme-client/src/app/phases/in_game.rs
+++ b/pomme-client/src/app/phases/in_game.rs
@@ -91,6 +91,9 @@ pub struct GameState {
     /// In-flight/finished upload of the chunk-load result, while its overlay is
     /// shown.
     pub chunk_load_upload: Option<UploadHandle>,
+    /// Last frame's `update_game` CPU phase timings, for the chunk-load
+    /// benchmark's worst-frame breakdown.
+    pub last_update_phases: crate::benchmark::UpdatePhases,
     /// Monotonic content generation per column, bumped on every edit (and chunk
     /// load). This is the dirty marker: a column needs (re)meshing whenever its
     /// `content_gen` outruns what was last enqueued, regardless of visibility,
@@ -199,6 +202,7 @@ impl GameState {
             chunk_load_result: None,
             chunk_load_abort: false,
             chunk_load_upload: None,
+            last_update_phases: crate::benchmark::UpdatePhases::default(),
             content_gen: HashMap::new(),
             meshed: HashMap::new(),
             vis_mask: HashMap::new(),
@@ -596,6 +600,11 @@ pub fn update_game(
     connection: &ConnectionHandle,
     game: &mut GameState,
 ) -> GameUpdateResult {
+    // Snapshot last frame's phase timings before this frame overwrites them: they
+    // align with `raw_dt`, which measures the previous frame's full duration.
+    let frame_start = std::time::Instant::now();
+    let prev_phases = game.last_update_phases;
+
     // Position the audio listener at the player's head and push current
     // volumes before draining sound packets this frame.
     let listener_pos = game.player.eye_pos();
@@ -611,7 +620,13 @@ pub fn update_game(
         return GameUpdateResult::Disconnected { reason };
     }
 
-    for mesh in game.mesh_dispatcher.drain_results() {
+    // Collect the frame's ready meshes, apply their CPU-side bookkeeping, then
+    // upload them in one coalesced GPU transfer (one fence wait, not one per
+    // mesh) to avoid the streaming stutter from per-mesh `queue.wait_idle`.
+    let drain_start = std::time::Instant::now();
+    let results: Vec<_> = game.mesh_dispatcher.drain_results().collect();
+    let mut batch = Vec::with_capacity(results.len());
+    for mut mesh in results {
         // Drop a mesh built from an out-of-date snapshot. Edits (priority lane,
         // single section) are keyed per section so editing one section never
         // drops a sibling's in-flight result; bulk loads keep the column key.
@@ -623,6 +638,7 @@ pub fn update_game(
             mesh.content_gen < game.content_gen.get(&mesh.pos).copied().unwrap_or(0)
         };
         if stale {
+            game.mesh_dispatcher.recycle(mesh);
             continue;
         }
         if let Some(t) = &mesh.timing {
@@ -637,24 +653,34 @@ pub fn update_game(
                 ms(t.enqueued_at.elapsed()),
             );
         }
-        let dropped = gfx.renderer.upload_chunk_mesh(&mesh);
+        // Visibility updates are independent of the GPU upload; apply them now so
+        // the mesh can move into the upload batch.
         let pos = mesh.pos;
-        // Sections dropped on pool exhaustion were retired from the buffer; clear
-        // their meshed bit so the next rescan re-enqueues them.
-        if !dropped.is_empty()
-            && let Some(m) = game.meshed.get_mut(&pos)
-        {
-            for si in dropped {
-                m.mask &= !(1u32 << si);
-            }
-        }
-        for (si, vis) in mesh.visibility {
+        for (si, vis) in std::mem::take(&mut mesh.visibility) {
             let e = game.section_vis_epoch.entry((pos, si)).or_insert(0);
             if mesh.upload_epoch >= *e {
                 *e = mesh.upload_epoch;
                 game.section_vis.insert((pos, si), vis);
             }
         }
+        batch.push(mesh);
+    }
+    game.last_update_phases.mesh_drain_ms = drain_start.elapsed().as_secs_f32() * 1000.0;
+    let upload_start = std::time::Instant::now();
+    let dropped = gfx.renderer.upload_chunk_meshes(&batch);
+    game.last_update_phases.upload_ms = upload_start.elapsed().as_secs_f32() * 1000.0;
+    // Sections dropped on pool exhaustion were retired from the buffer; clear
+    // their meshed bit so the next rescan re-enqueues them.
+    for (pos, sections) in dropped {
+        if let Some(m) = game.meshed.get_mut(&pos) {
+            for si in sections {
+                m.mask &= !(1u32 << si);
+            }
+        }
+    }
+    // Return the uploaded meshes' buffers to the worker pool for reuse.
+    for mesh in batch {
+        game.mesh_dispatcher.recycle(mesh);
     }
 
     game.mesh_dispatcher
@@ -930,7 +956,12 @@ pub fn update_game(
 
     if let Some(mut bench) = game.chunk_load_bench.take() {
         let count = gfx.renderer.loaded_chunk_count();
-        match bench.update(count, raw_dt * 1000.0) {
+        match bench.update(
+            count,
+            raw_dt * 1000.0,
+            gfx.renderer.last_timings(),
+            prev_phases,
+        ) {
             ChunkLoadStep::Wait => {
                 game.chunk_load_bench = Some(bench);
             }
@@ -1382,6 +1413,8 @@ pub fn update_game(
     ) {
         tracing::error!("Render error: {e}");
     }
+    // Whole-frame wall time (incl. render), read next frame to align with `raw_dt`.
+    game.last_update_phases.update_ms = frame_start.elapsed().as_secs_f32() * 1000.0;
 
     if close_inventory {
         game.inventory_open = false;
diff --git a/pomme-client/src/benchmark.rs b/pomme-client/src/benchmark.rs
index d3a38643..f7cd9fc6 100644
--- a/pomme-client/src/benchmark.rs
+++ b/pomme-client/src/benchmark.rs
@@ -247,6 +247,40 @@ fn radius_from_chunk_count(count: u32) -> u32 {
     (((count as f32).sqrt() - 1.0) / 2.0).round().max(0.0) as u32
 }
 
+/// `update_game`'s CPU phase timings — the per-frame work not covered by the
+/// render timings. Set each frame and folded into [`FrameBreakdown`].
+/// `update_ms` is the whole-`update_game` wall time (including the render
+/// call); if it is far below `total_ms`, the hitch is outside `update_game`
+/// (framerate limiter / OS scheduling / inter-frame gap) rather than in any CPU
+/// phase.
+#[derive(Clone, Copy, Default, serde::Serialize)]
+pub struct UpdatePhases {
+    pub update_ms: f32, // start of `update_game` until just after the render call
+    pub net_decode_ms: f32, // core update, from its start to just before the visibility pass
+    pub visibility_ms: f32, // the `update_visibility` call
+    pub rescan_ms: f32, // the `rescan_mesh_jobs` call
+    pub mesh_drain_ms: f32, // draining mesh results: stale drops + visibility bookkeeping
+    pub upload_ms: f32, // the batched `upload_chunk_meshes` call
+}
+
+/// Phase split of a run's single worst frame, to localize a hitch. `total_ms`
+/// is the wall-clock frame (`raw_dt`); `render_ms` the `render_frame` portion
+/// (which includes `fence_ms`, the GPU-bound wait); the `update` phases cover
+/// the rest. All sub-timings reflect the same prior frame `raw_dt` measures, so
+/// the split lines up; any part of `total_ms` the sub-timings do not cover is
+/// spent outside `update_game` (limiter / OS scheduling / inter-frame gap).
+#[derive(Clone, Default, serde::Serialize)]
+pub struct FrameBreakdown {
+    pub total_ms: f32,
+    pub render_ms: f32, // copied from `RenderTimings::frame_ms`
+    pub fence_ms: f32,
+    pub acquire_ms: f32,
+    pub cull_ms: f32,
+    pub present_ms: f32,
+    #[serde(flatten)]
+    pub update: UpdatePhases, // serialized inline; its `update_ms` also spans the render call
+}
+
 /// One reset→load cycle's measurements.
 #[derive(Clone, serde::Serialize)]
 pub struct ChunkLoadRun {
@@ -256,6 +290,7 @@ pub struct ChunkLoadRun {
     pub time_to_first_secs: f32,
     pub avg_frame_ms: f32,
     pub worst_frame_ms: f32,
+    pub worst_frame_breakdown: FrameBreakdown,
 }
 
 #[derive(Clone, serde::Serialize)]
@@ -293,6 +328,9 @@ pub struct ChunkLoadResult {
     pub avg_frame_ms: f32,
     pub worst_frame_ms: f32,
     pub runs_detail: Vec<ChunkLoadRun>,
+    /// Phase split of the worst frame across the measured runs — what the spike
+    /// was actually spent on.
+    pub worst_frame_breakdown: FrameBreakdown,
     /// "debug" or "release" — see [`build_profile`].
     pub profile: String,
     pub measurement_note: String,
@@ -348,6 +386,8 @@ pub struct ChunkLoadBench {
     first_load_at: Option<Instant>,
     frame_ms_sum: f32,
     frame_ms_max: f32,
+    /// Phase split of the current run's worst frame so far.
+    worst_breakdown: FrameBreakdown,
     frame_samples: u32,
     /// How many reset→load cycles have finished (warmup + measured).
     runs_done: u32,
@@ -389,13 +429,20 @@ impl ChunkLoadBench {
             first_load_at: None,
             frame_ms_sum: 0.0,
             frame_ms_max: 0.0,
+            worst_breakdown: FrameBreakdown::default(),
             frame_samples: 0,
             runs_done: 0,
             completed: Vec::new(),
         }
     }
 
-    pub fn update(&mut self, loaded_count: u32, frame_ms: f32) -> ChunkLoadStep {
+    pub fn update(
+        &mut self,
+        loaded_count: u32,
+        frame_ms: f32,
+        timings: &RenderTimings,
+        phases: UpdatePhases,
+    ) -> ChunkLoadStep {
         match self.phase {
             ChunkPhase::Reset => {
                 // Wait for the unload to settle (count stops dropping) so the
@@ -421,7 +468,18 @@ impl ChunkLoadBench {
             }
             ChunkPhase::Load => {
                 self.frame_ms_sum += frame_ms;
-                self.frame_ms_max = self.frame_ms_max.max(frame_ms);
+                if frame_ms > self.frame_ms_max {
+                    self.frame_ms_max = frame_ms;
+                    self.worst_breakdown = FrameBreakdown {
+                        total_ms: frame_ms,
+                        render_ms: timings.frame_ms,
+                        fence_ms: timings.fence_ms,
+                        acquire_ms: timings.acquire_ms,
+                        cull_ms: timings.cull_ms,
+                        present_ms: timings.present_ms,
+                        update: phases,
+                    };
+                }
                 self.frame_samples += 1;
 
                 if loaded_count != self.last_count {
@@ -465,6 +523,7 @@ impl ChunkLoadBench {
                         time_to_first_secs,
                         avg_frame_ms,
                         worst_frame_ms: self.frame_ms_max,
+                        worst_frame_breakdown: self.worst_breakdown.clone(),
                     });
                     self.runs_done += 1;
 
@@ -482,6 +541,7 @@ impl ChunkLoadBench {
                     self.first_load_at = None;
                     self.frame_ms_sum = 0.0;
                     self.frame_ms_max = 0.0;
+                    self.worst_breakdown = FrameBreakdown::default();
                     self.frame_samples = 0;
                     ChunkLoadStep::Load(CHUNK_LOAD_MIN_RD)
                 } else {
@@ -524,6 +584,15 @@ impl ChunkLoadBench {
                 .iter()
                 .map(|r| r.worst_frame_ms)
                 .fold(0.0, f32::max),
+            worst_frame_breakdown: measured
+                .iter()
+                .max_by(|a, b| {
+                    a.worst_frame_ms
+                        .partial_cmp(&b.worst_frame_ms)
+                        .unwrap_or(std::cmp::Ordering::Equal)
+                })
+                .map(|r| r.worst_frame_breakdown.clone())
+                .unwrap_or_default(),
             runs_detail: measured.to_vec(),
             profile: build_profile().to_owned(),
             measurement_note: MEASUREMENT_NOTE.to_owned(),
diff --git a/pomme-client/src/main.rs b/pomme-client/src/main.rs
index 84e39bae..b55a3e09 100644
--- a/pomme-client/src/main.rs
+++ b/pomme-client/src/main.rs
@@ -1,3 +1,9 @@
+// Per-thread-heap allocator (see Cargo.toml), installed as the global allocator
+// for the whole binary: keeps the chunk-mesh worker pool's cross-thread Vec churn
+// from serializing on the system heap's global lock and stalling the main thread.
+#[global_allocator]
+static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc;
+
 mod app;
 mod args;
 mod assets;
@@ -96,7 +102,16 @@ fn main() {
     data_dirs.ensure_game_dir().ok();
     tracing::info!("Installation directory: {}", data_dirs.game_dir.display());
 
-    let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create tokio runtime"));
+    // A single connection needs only a few async workers; the default runtime
+    // spawns one per core and floods them decoding the chunk-load burst, starving
+    // the render/mesh threads. Cap it so those cores stay free.
+    let rt = Arc::new(
+        tokio::runtime::Builder::new_multi_thread()
+            .worker_threads(4)
+            .enable_all()
+            .build()
+            .expect("Failed to create tokio runtime"),
+    );
 
     let user = UserData::from_args(args.username, args.uuid, args.access_token);
 
diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs
index 4c7f10eb..90604885 100644
--- a/pomme-client/src/renderer/chunk/buffer.rs
+++ b/pomme-client/src/renderer/chunk/buffer.rs
@@ -180,6 +180,9 @@ pub struct ChunkBufferStore {
     staging_size: u64,
     transfer_pool: vk::CommandPool,
     transfer_cmd: vk::CommandBuffer,
+    /// Signals completion of a batched staging->device transfer. Reused (reset
+    /// before each submit) so a frame's uploads sync once instead of per-mesh.
+    transfer_fence: vk::Fence,
     use_staging: bool,
 
     /// Exact-size sub-allocators over the vertex and index pools (in elements).
@@ -269,7 +272,14 @@ impl ChunkBufferStore {
             (vb, va, ib, ia)
         };
 
-        let staging_size = BYTES_PER_BUCKET * 4;
+        // Discrete GPUs batch a frame's uploads through this buffer in one
+        // transfer, so size it to hold several columns and keep sub-flushes rare.
+        // The integrated path writes mapped memory directly and never touches it.
+        let staging_size = if use_staging {
+            BYTES_PER_BUCKET * 16
+        } else {
+            BYTES_PER_BUCKET * 4
+        };
         let (staging_buffer, staging_alloc) = util::create_host_buffer(
             device,
             allocator,
@@ -299,6 +309,10 @@ impl ChunkBufferStore {
         }
         .expect("failed to alloc transfer cmd");
 
+        let transfer_fence = device
+            .create_fence(&vk::FenceCreateInfo::default(), None)
+            .expect("failed to create transfer fence");
+
         tracing::info!(
             "Chunk buffers: {} (vertex={} MB, index={} MB, staging={} KB)",
             if use_staging {
@@ -314,7 +328,11 @@ impl ChunkBufferStore {
         let vtx_free = FreeList::new(total_buckets * BUCKET_VERTICES);
         let idx_free = FreeList::new(total_buckets * BUCKET_INDICES);
 
-        let max_meta = (total_buckets * 2) as usize;
+        // Per-section packing yields many more draws than buckets, so pre-size
+        // generously: growth (`ensure_meta_capacity`) needs a `device.wait_idle`
+        // to safely rewrite the descriptor sets, and we don't want that stall
+        // firing mid-stream. The remaining grow path stays as a rare safety net.
+        let max_meta = (total_buckets * 16).max(8192) as usize;
         let meta_size = (max_meta * size_of::<ChunkMeta>()) as u64;
         let indirect_size = (max_meta * size_of::<DrawCommand>()) as u64;
         let count_size = 4u64;
@@ -493,6 +511,7 @@ impl ChunkBufferStore {
             staging_size,
             transfer_pool,
             transfer_cmd,
+            transfer_fence,
             use_staging,
             vtx_free,
             idx_free,
@@ -526,17 +545,65 @@ impl ChunkBufferStore {
         self.last_draw_count
     }
 
-    /// Upload a mesh result, replacing the sections in `mesh.replaced`. Returns
-    /// the section indices that were dropped due to pool exhaustion (and so
-    /// need re-meshing); empty on success or for the permanent "too large"
-    /// skip.
-    pub fn upload(
+    /// Record `copy_v` (staging -> vertex buffer) and `copy_i` (staging -> index
+    /// buffer) into `transfer_cmd`, submit once on `queue`, and block on
+    /// `transfer_fence` until done; a no-op when both lists are empty. Panics if a
+    /// Vulkan call fails. Replaces the old per-mesh `queue.wait_idle` with one wait.
+    fn flush_transfer(
+        &mut self,
+        device: &vk::Device,
+        queue: vk::Queue,
+        copy_v: &[vk::BufferCopy],
+        copy_i: &[vk::BufferCopy],
+    ) {
+        if copy_v.is_empty() && copy_i.is_empty() {
+            return; // nothing staged: skip the submit and the fence wait
+        }
+        let begin = vk::CommandBufferBeginInfo {
+            flags: vk::CommandBufferUsageFlags::OneTimeSubmit,
+            ..Default::default()
+        };
+        self.transfer_cmd.begin(&begin).unwrap();
+        if !copy_v.is_empty() {
+            self.transfer_cmd
+                .copy_buffer(self.staging_buffer, self.vertex_buffer, copy_v);
+        }
+        if !copy_i.is_empty() {
+            self.transfer_cmd
+                .copy_buffer(self.staging_buffer, self.index_buffer, copy_i);
+        }
+        self.transfer_cmd.end().unwrap();
+        let submit = [vk::SubmitInfo {
+            command_buffer_count: 1,
+            command_buffers: &self.transfer_cmd.handle(),
+            ..Default::default()
+        }];
+        device.reset_fences(&[self.transfer_fence]).unwrap(); // fence is reused: clear it first
+        queue.submit(&submit, self.transfer_fence).unwrap(); // fence is handed to this submit
+        device
+            .wait_for_fences(&[self.transfer_fence], true, u64::MAX) // blocks until signaled
+            .unwrap();
+    }
+
+    /// Upload a batch of mesh results, each replacing the sections in its
+    /// `mesh.replaced` range. Staging copies for the whole batch are coalesced
+    /// into as few transfers as the staging buffer holds (one per overflow,
+    /// plus a final flush), each synchronized by a single fence wait — so a
+    /// streaming frame stalls once, not once per mesh. Returns, per mesh
+    /// that hit pool exhaustion, the section indices that were dropped and
+    /// need re-meshing.
+    pub fn upload_batch(
         &mut self,
         device: &vk::Device,
         allocator: &Arc<Mutex<Allocator>>,
         queue: vk::Queue,
-        mesh: &ChunkMeshData,
-    ) -> Vec<i32> {
+        meshes: &[ChunkMeshData],
+    ) -> Vec<(ChunkPos, Vec<i32>)> {
+        let mut needs_remesh: Vec<(ChunkPos, Vec<i32>)> = Vec::new();
+        if meshes.is_empty() {
+            return needs_remesh;
+        }
+
         // Tight AABB over a section's own vertices (better cull granularity than
         // the chunk-column bounds; also robust to LOD cubes that exceed 16 tall).
         fn section_aabb(verts: &[ChunkVertex]) -> ChunkAABB {
@@ -554,6 +621,18 @@ impl ChunkBufferStore {
             }
         }
 
+        // Sub-allocate an exact-size vertex + index slice for each non-empty
+        // section. Indices stay section-local and `vertex_offset` rebases the draw,
+        // so no packing or rebasing is needed — just one slice per section.
+        struct Plan<'a> {
+            section_index: i32,
+            verts: &'a [ChunkVertex],
+            indices: &'a [u32],
+            vtx_off: u32,
+            idx_off: u32,
+            aabb: ChunkAABB,
+        }
+
         // Retired slices only reclaim in `begin_frame`; if rendering is paused
         // while meshing continues (e.g. minimized window) the backlog grows
         // unbounded. Past a sane bound, force a GPU wait and reclaim it all.
@@ -565,155 +644,167 @@ impl ChunkBufferStore {
             }
         }
 
-        // The covered sections this job is authoritative for: reject any where a
-        // newer upload (higher epoch) already landed. See
-        // `ChunkMeshData::upload_epoch`.
-        let accepted: std::collections::HashSet<i32> = mesh
-            .replaced
-            .clone()
-            .filter(|si| {
-                let stored = self
+        let staging_half = self.staging_size as usize / 2;
+        // Copies accumulated for the current (not-yet-submitted) transfer, and the
+        // running write cursors into each half of the staging buffer.
+        let mut copy_v: Vec<vk::BufferCopy> = Vec::new();
+        let mut copy_i: Vec<vk::BufferCopy> = Vec::new();
+        let mut stg_v = 0usize;
+        let mut stg_i = 0usize;
+
+        for mesh in meshes {
+            // The covered sections this job is authoritative for: reject any where
+            // a newer upload (higher epoch) already landed. See
+            // `ChunkMeshData::upload_epoch`.
+            let accepted: std::collections::HashSet<i32> = mesh
+                .replaced
+                .clone()
+                .filter(|si| {
+                    let stored = self
+                        .chunks
+                        .get(&mesh.pos)
+                        .and_then(|c| c.sections.iter().find(|s| s.section_index == *si))
+                        .map(|s| s.epoch)
+                        .unwrap_or(0);
+                    mesh.upload_epoch >= stored
+                })
+                .collect();
+
+            // Retire the slices of every accepted covered section: the re-meshed
+            // ones are re-allocated below, the now-empty ones simply vanish.
+            // Remember which were present so a re-meshed section swaps instantly
+            // while a freshly revealed one still fades in. Rejected sections are
+            // left untouched.
+            let mut freed: Vec<(u32, u32, u32, u32)> = Vec::new();
+            let mut was_present: std::collections::HashSet<i32> = std::collections::HashSet::new();
+            if let Some(entry) = self.chunks.get_mut(&mesh.pos) {
+                entry.sections.retain(|s| {
+                    if accepted.contains(&s.section_index) {
+                        was_present.insert(s.section_index);
+                        freed.push((
+                            s.vertex_offset as u32,
+                            s.vtx_len,
+                            s.first_index,
+                            s.index_count,
+                        ));
+                        false
+                    } else {
+                        true
+                    }
+                });
+            }
+            self.retire_slices(freed.iter().copied());
+            // Sections were removed/replaced, so the draw list must be rebuilt even
+            // if this mesh is skipped below (otherwise it keeps drawing a retired,
+            // soon-reused slice).
+            self.meta_dirty = true;
+
+            let upload_secs: Vec<&SectionMesh> = mesh
+                .sections
+                .iter()
+                .filter(|s| accepted.contains(&s.section_index))
+                .collect();
+
+            if upload_secs.is_empty() {
+                // Every accepted section is now empty (freed above); drop the
+                // column if nothing remains.
+                if self
                     .chunks
                     .get(&mesh.pos)
-                    .and_then(|c| c.sections.iter().find(|s| s.section_index == *si))
-                    .map(|s| s.epoch)
-                    .unwrap_or(0);
-                mesh.upload_epoch >= stored
-            })
-            .collect();
-
-        // Retire the slices of every accepted covered section: the re-meshed ones
-        // are re-allocated below, the now-empty ones simply vanish. Remember which
-        // were present so a re-meshed section swaps instantly while a freshly
-        // revealed one still fades in. Rejected sections are left untouched.
-        let mut freed: Vec<(u32, u32, u32, u32)> = Vec::new();
-        let mut was_present: std::collections::HashSet<i32> = std::collections::HashSet::new();
-        if let Some(entry) = self.chunks.get_mut(&mesh.pos) {
-            entry.sections.retain(|s| {
-                if accepted.contains(&s.section_index) {
-                    was_present.insert(s.section_index);
-                    freed.push((
-                        s.vertex_offset as u32,
-                        s.vtx_len,
-                        s.first_index,
-                        s.index_count,
-                    ));
-                    false
-                } else {
-                    true
+                    .is_some_and(|c| c.sections.is_empty())
+                {
+                    self.chunks.remove(&mesh.pos);
                 }
-            });
-        }
-        self.retire_slices(freed.iter().copied());
-        // Sections were removed/replaced, so the draw list must be rebuilt even if
-        // an early return below skips the upload (otherwise it keeps drawing a
-        // retired, soon-reused slice).
-        self.meta_dirty = true;
-
-        let upload_secs: Vec<&SectionMesh> = mesh
-            .sections
-            .iter()
-            .filter(|s| accepted.contains(&s.section_index))
-            .collect();
-
-        if upload_secs.is_empty() {
-            // Every accepted section is now empty (freed above); drop the column
-            // if nothing remains.
-            if self
-                .chunks
-                .get(&mesh.pos)
-                .is_some_and(|c| c.sections.is_empty())
-            {
-                self.chunks.remove(&mesh.pos);
+                continue;
             }
-            return Vec::new();
-        }
 
-        let staging_half = self.staging_size as usize / 2;
-        if self.use_staging {
-            // Verts and indices share the staging buffer (two halves), copied in
-            // one transfer. A chunk too large for staging is skipped rather than
-            // overflowing the buffer (matches the prior column-sized limit). This
-            // is permanent, so it's not reported for retry.
-            let v_bytes: usize = upload_secs
-                .iter()
-                .map(|s| s.vertices.len() * VERTEX_SIZE as usize)
-                .sum();
-            let i_bytes: usize = upload_secs
-                .iter()
-                .map(|s| s.indices.len() * INDEX_SIZE as usize)
-                .sum();
-            if v_bytes > staging_half || i_bytes > staging_half {
-                tracing::warn!(
-                    "Chunk {:?} too large for staging ({} v / {} i bytes), skipping",
-                    mesh.pos,
-                    v_bytes,
-                    i_bytes,
-                );
-                return Vec::new();
+            if self.use_staging {
+                // Verts and indices share the staging buffer (two halves). A chunk
+                // too large for one half is skipped rather than overflowing the
+                // buffer. This is permanent, so it's not reported for retry.
+                let v_bytes: usize = upload_secs
+                    .iter()
+                    .map(|s| s.vertices.len() * VERTEX_SIZE as usize)
+                    .sum();
+                let i_bytes: usize = upload_secs
+                    .iter()
+                    .map(|s| s.indices.len() * INDEX_SIZE as usize)
+                    .sum();
+                if v_bytes > staging_half || i_bytes > staging_half {
+                    tracing::warn!(
+                        "Chunk {:?} too large for staging ({} v / {} i bytes), skipping",
+                        mesh.pos,
+                        v_bytes,
+                        i_bytes,
+                    );
+                    continue;
+                }
             }
-        }
 
-        // Sub-allocate an exact-size vertex + index slice for each non-empty
-        // section. Indices stay section-local and `vertex_offset` rebases the draw,
-        // so no packing or rebasing is needed — just one slice per section.
-        struct Plan<'a> {
-            section_index: i32,
-            verts: &'a [ChunkVertex],
-            indices: &'a [u32],
-            vtx_off: u32,
-            idx_off: u32,
-            aabb: ChunkAABB,
-        }
-
-        let mut plans: Vec<Plan> = Vec::with_capacity(upload_secs.len());
-        // (vtx_off, vtx_len, idx_off, idx_len) taken this call, for rollback if the
-        // pool runs out partway through a column.
-        let mut taken: Vec<(u32, u32, u32, u32)> = Vec::new();
-        // The accepted sections were retired above; on a pool-full rollback they
-        // need re-meshing, so report them for retry (rescan re-enqueues next frame).
-        let dropped: Vec<i32> = accepted.iter().copied().collect();
-        for sec in &upload_secs {
-            let vcount = sec.vertices.len() as u32;
-            let icount = sec.indices.len() as u32;
-            if vcount == 0 || icount == 0 {
+            let mut plans: Vec<Plan> = Vec::with_capacity(upload_secs.len());
+            // (vtx_off, vtx_len, idx_off, idx_len) taken for this mesh, for
+            // rollback if the pool runs out partway through a column.
+            let mut taken: Vec<(u32, u32, u32, u32)> = Vec::new();
+            let mut pool_full = false;
+            for sec in &upload_secs {
+                let vcount = sec.vertices.len() as u32;
+                let icount = sec.indices.len() as u32;
+                if vcount == 0 || icount == 0 {
+                    continue;
+                }
+                let Some(vtx_off) = self.vtx_free.alloc(vcount) else {
+                    self.free_slices(&taken);
+                    tracing::debug!("Vertex pool full, skipping {:?}", mesh.pos);
+                    pool_full = true;
+                    break;
+                };
+                let Some(idx_off) = self.idx_free.alloc(icount) else {
+                    self.vtx_free.free_region(vtx_off, vcount);
+                    self.free_slices(&taken);
+                    tracing::debug!("Index pool full, skipping {:?}", mesh.pos);
+                    pool_full = true;
+                    break;
+                };
+                taken.push((vtx_off, vcount, idx_off, icount));
+                plans.push(Plan {
+                    section_index: sec.section_index,
+                    verts: &sec.vertices,
+                    indices: &sec.indices,
+                    vtx_off,
+                    idx_off,
+                    aabb: section_aabb(&sec.vertices),
+                });
+            }
+            if pool_full {
+                // The accepted sections were retired above; report them so the
+                // next rescan re-enqueues them.
+                needs_remesh.push((mesh.pos, accepted.iter().copied().collect()));
+                continue;
+            }
+            if plans.is_empty() {
+                // Nothing to upload (all accepted sections were empty).
                 continue;
             }
-            let Some(vtx_off) = self.vtx_free.alloc(vcount) else {
-                self.free_slices(&taken);
-                tracing::debug!("Vertex pool full, skipping {:?}", mesh.pos);
-                return dropped;
-            };
-            let Some(idx_off) = self.idx_free.alloc(icount) else {
-                self.vtx_free.free_region(vtx_off, vcount);
-                self.free_slices(&taken);
-                tracing::debug!("Index pool full, skipping {:?}", mesh.pos);
-                return dropped;
-            };
-            taken.push((vtx_off, vcount, idx_off, icount));
-            plans.push(Plan {
-                section_index: sec.section_index,
-                verts: &sec.vertices,
-                indices: &sec.indices,
-                vtx_off,
-                idx_off,
-                aabb: section_aabb(&sec.vertices),
-            });
-        }
-
-        if plans.is_empty() {
-            // Nothing to upload (all accepted sections were empty) — not a
-            // capacity failure, so no retry.
-            return Vec::new();
-        }
 
-        if self.use_staging {
-            let mut copy_v: Vec<vk::BufferCopy> = Vec::new();
-            let mut copy_i: Vec<vk::BufferCopy> = Vec::new();
-            {
+            if self.use_staging {
+                let mv: usize = plans
+                    .iter()
+                    .map(|p| p.verts.len() * VERTEX_SIZE as usize)
+                    .sum();
+                let mi: usize = plans
+                    .iter()
+                    .map(|p| p.indices.len() * INDEX_SIZE as usize)
+                    .sum();
+                // This mesh alone fits a half (checked above), so a flush always
+                // makes room: submit the pending transfer and reset the cursors.
+                if stg_v + mv > staging_half || stg_i + mi > staging_half {
+                    self.flush_transfer(device, queue, &copy_v, &copy_i);
+                    copy_v.clear();
+                    copy_i.clear();
+                    stg_v = 0;
+                    stg_i = 0;
+                }
                 let buf = self.staging_alloc.mapped_slice_mut().unwrap();
-                let mut stg_v = 0usize;
-                let mut stg_i = 0usize;
                 for p in &plans {
                     let vb: &[u8] = bytemuck::cast_slice(p.verts);
                     buf[stg_v..stg_v + vb.len()].copy_from_slice(vb);
@@ -734,73 +825,61 @@ impl ChunkBufferStore {
                     });
                     stg_i += ib.len();
                 }
-            }
-
-            let begin = vk::CommandBufferBeginInfo {
-                flags: vk::CommandBufferUsageFlags::OneTimeSubmit,
-                ..Default::default()
-            };
-            self.transfer_cmd.begin(&begin).unwrap();
-            self.transfer_cmd
-                .copy_buffer(self.staging_buffer, self.vertex_buffer, &copy_v);
-            self.transfer_cmd
-                .copy_buffer(self.staging_buffer, self.index_buffer, &copy_i);
-            self.transfer_cmd.end().unwrap();
-            let submit = [vk::SubmitInfo {
-                command_buffer_count: 1,
-                command_buffers: &self.transfer_cmd.handle(),
-                ..Default::default()
-            }];
-            queue.submit(&submit, vk::Fence::null()).unwrap();
-            queue.wait_idle().unwrap();
-        } else {
-            {
-                let vbuf = self.vertex_alloc.mapped_slice_mut().unwrap();
-                for p in &plans {
-                    let vb: &[u8] = bytemuck::cast_slice(p.verts);
-                    let off = p.vtx_off as usize * VERTEX_SIZE as usize;
-                    vbuf[off..off + vb.len()].copy_from_slice(vb);
+            } else {
+                {
+                    let vbuf = self.vertex_alloc.mapped_slice_mut().unwrap();
+                    for p in &plans {
+                        let vb: &[u8] = bytemuck::cast_slice(p.verts);
+                        let off = p.vtx_off as usize * VERTEX_SIZE as usize;
+                        vbuf[off..off + vb.len()].copy_from_slice(vb);
+                    }
                 }
-            }
-            {
-                let ibuf = self.index_alloc.mapped_slice_mut().unwrap();
-                for p in &plans {
-                    let ib: &[u8] = bytemuck::cast_slice(p.indices);
-                    let off = p.idx_off as usize * INDEX_SIZE as usize;
-                    ibuf[off..off + ib.len()].copy_from_slice(ib);
+                {
+                    let ibuf = self.index_alloc.mapped_slice_mut().unwrap();
+                    for p in &plans {
+                        let ib: &[u8] = bytemuck::cast_slice(p.indices);
+                        let off = p.idx_off as usize * INDEX_SIZE as usize;
+                        ibuf[off..off + ib.len()].copy_from_slice(ib);
+                    }
                 }
             }
-        }
 
-        let now = std::time::Instant::now();
-        let new_sections = plans.iter().map(|p| SectionAlloc {
-            section_index: p.section_index,
-            aabb: p.aabb,
-            first_index: p.idx_off,
-            index_count: p.indices.len() as u32,
-            vertex_offset: p.vtx_off as i32,
-            vtx_len: p.verts.len() as u32,
-            // A re-meshed section swaps instantly; a freshly revealed one fades in.
-            uploaded_at: if was_present.contains(&p.section_index) {
-                now.checked_sub(std::time::Duration::from_secs(2))
-                    .unwrap_or(now)
-            } else {
-                now
-            },
-            epoch: mesh.upload_epoch,
-        });
+            let now = std::time::Instant::now();
+            let new_sections = plans.iter().map(|p| SectionAlloc {
+                section_index: p.section_index,
+                aabb: p.aabb,
+                first_index: p.idx_off,
+                index_count: p.indices.len() as u32,
+                vertex_offset: p.vtx_off as i32,
+                vtx_len: p.verts.len() as u32,
+                // A re-meshed section swaps instantly; a freshly revealed one fades in.
+                uploaded_at: if was_present.contains(&p.section_index) {
+                    now.checked_sub(std::time::Duration::from_secs(2))
+                        .unwrap_or(now)
+                } else {
+                    now
+                },
+                epoch: mesh.upload_epoch,
+            });
+
+            self.chunks
+                .entry(mesh.pos)
+                .or_insert_with(|| ChunkAlloc {
+                    sections: Vec::new(),
+                })
+                .sections
+                .extend(new_sections);
+        }
 
-        self.chunks
-            .entry(mesh.pos)
-            .or_insert_with(|| ChunkAlloc {
-                sections: Vec::new(),
-            })
-            .sections
-            .extend(new_sections);
+        // Flush the copies still pending from the last (or only) transfer.
+        if self.use_staging {
+            self.flush_transfer(device, queue, &copy_v, &copy_i);
+        }
 
         let total_sections: usize = self.chunks.values().map(|c| c.sections.len()).sum();
         self.ensure_meta_capacity(device, allocator, total_sections);
-        Vec::new()
+
+        needs_remesh
     }
 
     /// Grow the per-frame meta and indirect buffers so they can hold `needed`
@@ -1185,6 +1264,7 @@ impl ChunkBufferStore {
             .ok();
         drop(alloc);
 
+        device.destroy_fence(self.transfer_fence, None);
         device.destroy_command_pool(self.transfer_pool, None);
         device.destroy_pipeline(self.compute_pipeline, None);
         device.destroy_pipeline_layout(self.compute_layout, None);
diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs
index f0d400f4..11d23be6 100644
--- a/pomme-client/src/renderer/chunk/mesher.rs
+++ b/pomme-client/src/renderer/chunk/mesher.rs
@@ -168,7 +168,12 @@ fn tint_color(tint: Tint, grass: [f32; 3], foliage: [f32; 3], dry_foliage: [f32;
     }
 }
 
-const MAX_MESH_UPLOADS_PER_FRAME: usize = 16;
+const MAX_MESH_UPLOADS_PER_FRAME: usize = 32;
+
+/// Bound on un-drained bulk results: past this, workers block on send
+/// (back-pressure) rather than piling finished meshes — and their pooled
+/// buffers — into an unbounded queue, which would starve the buffer pool.
+const MAX_PENDING_RESULTS: usize = 256;
 
 pub struct Colormap {
     pixels: Vec<[u8; 3]>,
@@ -381,6 +386,60 @@ pub fn int_to_rgb(color: i32) -> [f32; 3] {
     [r, g, b]
 }
 
+/// Pre-allocation hints sized to a typical section so a fresh buffer fills
+/// without reallocating (indices run ~1.5x vertices: 6 per quad vs 4).
+const SECTION_VERTEX_HINT: usize = 2048;
+const SECTION_INDEX_HINT: usize = 3072;
+
+/// Recycles section vertex/index `Vec`s so workers reuse them instead of
+/// allocating/freeing through the OS each mesh (vanilla reuses its
+/// `ByteBufferBuilder`s the same way). Bounded: returns to a full pool are
+/// dropped, and takes from an empty pool allocate a fresh buffer.
+struct BufferPool {
+    vtx_tx: crossbeam_channel::Sender<Vec<ChunkVertex>>,
+    vtx_rx: crossbeam_channel::Receiver<Vec<ChunkVertex>>,
+    idx_tx: crossbeam_channel::Sender<Vec<u32>>,
+    idx_rx: crossbeam_channel::Receiver<Vec<u32>>,
+}
+
+impl BufferPool {
+    fn new(capacity: usize) -> Self {
+        let (vtx_tx, vtx_rx) = crossbeam_channel::bounded(capacity);
+        let (idx_tx, idx_rx) = crossbeam_channel::bounded(capacity);
+        Self {
+            vtx_tx,
+            vtx_rx,
+            idx_tx,
+            idx_rx,
+        }
+    }
+
+    // A fresh buffer is pre-sized so filling it doesn't realloc-grow; recycled
+    // buffers keep their capacity, so the pool self-tunes to real section sizes.
+    fn take_vertices(&self) -> Vec<ChunkVertex> {
+        self.vtx_rx
+            .try_recv()
+            .unwrap_or_else(|_| Vec::with_capacity(SECTION_VERTEX_HINT))
+    }
+
+    fn take_indices(&self) -> Vec<u32> {
+        self.idx_rx
+            .try_recv()
+            .unwrap_or_else(|_| Vec::with_capacity(SECTION_INDEX_HINT))
+    }
+
+    fn recycle(&self, mut vertices: Vec<ChunkVertex>, mut indices: Vec<u32>) {
+        if vertices.capacity() > 0 {
+            vertices.clear();
+            let _ = self.vtx_tx.try_send(vertices);
+        }
+        if indices.capacity() > 0 {
+            indices.clear();
+            let _ = self.idx_tx.try_send(indices);
+        }
+    }
+}
+
 pub struct MeshDispatcher {
     result_rx: crossbeam_channel::Receiver<ChunkMeshData>,
     result_tx: crossbeam_channel::Sender<ChunkMeshData>,
@@ -398,6 +457,7 @@ pub struct MeshDispatcher {
     foliage_colormap: Arc<Colormap>,
     dry_foliage_colormap: Arc<Colormap>,
     biome_climate: Arc<HashMap<u32, BiomeClimate>>,
+    pool: Arc<BufferPool>,
 }
 
 impl MeshDispatcher {
@@ -409,13 +469,17 @@ impl MeshDispatcher {
         dry_foliage_colormap: Colormap,
         biome_climate: Arc<HashMap<u32, BiomeClimate>>,
     ) -> Self {
-        let (result_tx, result_rx) = crossbeam_channel::unbounded();
+        // Bulk results are bounded for back-pressure; edits stay unbounded so a
+        // block edit never blocks a worker behind the load backlog.
+        let (result_tx, result_rx) = crossbeam_channel::bounded(MAX_PENDING_RESULTS);
         let (priority_tx, priority_rx) = crossbeam_channel::unbounded();
 
         let queue = Arc::new(MeshQueue::new());
-        // One worker per core minus one, leaving a core for the main/render thread.
+        // Half the cores, clamped to 2..=12: more saturated workers starve the
+        // main/render thread during a load burst (frame spikes), and pooling
+        // makes load network-bound so they wouldn't speed it up anyway.
         let worker_count = std::thread::available_parallelism()
-            .map(|n| n.get().saturating_sub(1).max(1))
+            .map(|n| (n.get() / 2).clamp(2, 12))
             .unwrap_or(1);
         let mut workers = Vec::with_capacity(worker_count);
         for _ in 0..worker_count {
@@ -423,7 +487,10 @@ impl MeshDispatcher {
             workers.push(
                 std::thread::Builder::new()
                     .name("chunk-mesher".into())
-                    .spawn(move || queue.run_worker())
+                    .spawn(move || {
+                        lower_current_thread_priority();
+                        queue.run_worker()
+                    })
                     .expect("spawn chunk-mesher thread"),
             );
         }
@@ -442,6 +509,15 @@ impl MeshDispatcher {
             foliage_colormap: Arc::new(foliage_colormap),
             dry_foliage_colormap: Arc::new(dry_foliage_colormap),
             biome_climate,
+            pool: Arc::new(BufferPool::new(1024)),
+        }
+    }
+
+    /// Return an uploaded (or stale) mesh's section buffers to the pool for
+    /// reuse.
+    pub fn recycle(&self, mesh: ChunkMeshData) {
+        for sec in mesh.sections {
+            self.pool.recycle(sec.vertices, sec.indices);
         }
     }
 
@@ -517,6 +593,7 @@ impl MeshDispatcher {
             min_y,
             height,
             tx,
+            pool: Arc::clone(&self.pool),
         });
     }
 
@@ -567,6 +644,7 @@ struct PendingJob {
     min_y: i32,
     height: u32,
     tx: crossbeam_channel::Sender<ChunkMeshData>,
+    pool: Arc<BufferPool>,
 }
 
 impl PendingJob {
@@ -593,6 +671,7 @@ impl PendingJob {
             &self.uv_map,
             self.lod,
             self.sections,
+            &self.pool,
         );
         mesh.content_gen = self.content_gen;
         mesh.upload_epoch = self.upload_epoch;
@@ -728,6 +807,26 @@ fn poll(state: &mut QueueState) -> Option<PendingJob> {
     best_initial.map(|(ii, _)| state.tasks.swap_remove(ii))
 }
 
+/// Run mesh workers below normal priority so the OS preempts them for the
+/// main/render thread during a load burst, while they still use idle cores.
+#[cfg(windows)]
+fn lower_current_thread_priority() {
+    const THREAD_PRIORITY_BELOW_NORMAL: i32 = -1;
+    #[link(name = "kernel32")]
+    unsafe extern "system" {
+        fn GetCurrentThread() -> isize;
+        fn SetThreadPriority(thread: isize, priority: i32) -> i32;
+    }
+    unsafe {
+        SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL);
+    }
+}
+
+#[cfg(not(windows))]
+fn lower_current_thread_priority() {
+    // TODO: lower priority on non-Windows (libc::nice / pthread_setschedparam).
+}
+
 struct ChunkStoreSnapshot {
     chunks: Vec<(
         ChunkPos,
@@ -1081,6 +1180,7 @@ fn mesh_chunk_snapshot(
     uv_map: &AtlasUVMap,
     lod: u32,
     sections_to_mesh: std::ops::Range<i32>,
+    pool: &BufferPool,
 ) -> ChunkMeshData {
     let mut logged_missing: std::collections::HashSet<String> = std::collections::HashSet::new();
 
@@ -1105,6 +1205,13 @@ fn mesh_chunk_snapshot(
             indices: Vec::new(),
         })
         .collect();
+    // In-range sections get recycled buffers (capacity retained from earlier
+    // meshes) so the worker fills them without going through the OS allocator.
+    for si in range.clone() {
+        let sec = &mut sections[si as usize];
+        sec.vertices = pool.take_vertices();
+        sec.indices = pool.take_indices();
+    }
 
     // The type map is a state->id map, so it only needs the meshed span (+1-block
     // border for face culling); states outside it are never queried.
@@ -1278,9 +1385,17 @@ fn mesh_chunk_snapshot(
         local_z += step;
     }
 
-    // Keep only non-empty sections (untouched out-of-range ones stay empty);
-    // empty indices within `range` are freed by the per-section upload.
-    sections.retain(|s| !s.vertices.is_empty() && !s.indices.is_empty());
+    // Keep only non-empty sections; recycle the buffers of in-range sections that
+    // ended up empty (rather than dropping their retained capacity).
+    let mut kept = Vec::with_capacity(sections.len());
+    for s in sections {
+        if s.vertices.is_empty() || s.indices.is_empty() {
+            pool.recycle(s.vertices, s.indices);
+        } else {
+            kept.push(s);
+        }
+    }
+    let sections = kept;
 
     ChunkMeshData {
         pos,
diff --git a/pomme-client/src/renderer/mod.rs b/pomme-client/src/renderer/mod.rs
index f2f3b4de..a941b612 100644
--- a/pomme-client/src/renderer/mod.rs
+++ b/pomme-client/src/renderer/mod.rs
@@ -857,14 +857,15 @@ impl Renderer {
             .wait_for_fences(&self.ctx.in_flight_fences, true, u64::MAX);
     }
 
-    /// Returns the section indices dropped due to pool exhaustion (need
+    /// Upload a batch of chunk meshes, coalescing staging transfers. Returns,
+    /// per mesh that hit pool exhaustion, the section indices dropped (need
     /// re-mesh); empty on success.
-    pub fn upload_chunk_mesh(&mut self, mesh: &ChunkMeshData) -> Vec<i32> {
-        self.chunk_buffers.upload(
+    pub fn upload_chunk_meshes(&mut self, meshes: &[ChunkMeshData]) -> Vec<(ChunkPos, Vec<i32>)> {
+        self.chunk_buffers.upload_batch(
             &self.ctx.device,
             &self.ctx.allocator,
             self.ctx.graphics_queue,
-            mesh,
+            meshes,
         )
     }
 

From c6c28f08283cd24eeb801e0f1e164fbbe4d165b7 Mon Sep 17 00:00:00 2001
From: Purdze <r.s.sutton@hotmail.co.uk>
Date: Wed, 24 Jun 2026 21:26:17 +0100
Subject: [PATCH 2/4] Optimize chunk rendering: cull caching, mesh queue,
 compact vertices

---
 pomme-client/src/app/phases/in_game.rs       |   1 -
 pomme-client/src/renderer/chunk/buffer.rs    | 249 ++++++++++++++++---
 pomme-client/src/renderer/chunk/mesher.rs    | 148 +++++++----
 pomme-client/src/renderer/pipelines/chunk.rs |   6 +-
 pomme-client/src/renderer/shaders/chunk.vert |  21 +-
 pomme-client/src/renderer/shaders/cull.comp  |   5 +-
 pomme-client/src/world/chunk.rs              |  13 +-
 7 files changed, 338 insertions(+), 105 deletions(-)

diff --git a/pomme-client/src/app/phases/in_game.rs b/pomme-client/src/app/phases/in_game.rs
index 02f590b9..c06e4d98 100644
--- a/pomme-client/src/app/phases/in_game.rs
+++ b/pomme-client/src/app/phases/in_game.rs
@@ -335,7 +335,6 @@ impl GameState {
             let rd = self
                 .chunk_store
                 .loaded_positions()
-                .iter()
                 .map(|p| {
                     (p.x - player_chunk.x)
                         .abs()
diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs
index 90604885..a85d565c 100644
--- a/pomme-client/src/renderer/chunk/buffer.rs
+++ b/pomme-client/src/renderer/chunk/buffer.rs
@@ -10,13 +10,22 @@ use crate::renderer::{MAX_FRAMES_IN_FLIGHT, shader, util};
 
 const BUCKET_VERTICES: u32 = 32768;
 const BUCKET_INDICES: u32 = 49152;
-const VERTEX_SIZE: u64 = size_of::<ChunkVertex>() as u64;
+const VERTEX_SIZE: u64 = size_of::<PackedVertex>() as u64;
+/// Section-local position quantization: local coords (block 0..16 plus model
+/// overhang) map into `[-POS_BIAS, POS_RANGE - POS_BIAS]` across a u16. Chosen
+/// so a 16-block shift is an exact integer number of u16 steps (16/24*65535 =
+/// 43690), so the same world position encodes identically in adjacent sections
+/// — no seams. Must match `chunk.vert`.
+const POS_RANGE: f32 = 24.0;
+const POS_BIAS: f32 = 4.0;
 const INDEX_SIZE: u64 = size_of::<u32>() as u64;
 const BYTES_PER_BUCKET: u64 =
     BUCKET_VERTICES as u64 * VERTEX_SIZE + BUCKET_INDICES as u64 * INDEX_SIZE;
 const MIN_BUCKETS: u32 = 128;
 const MAX_BUCKETS: u32 = 2048;
 const VRAM_BUDGET_FRACTION: f64 = 0.25;
+/// Fade-in duration for a freshly revealed section (ms).
+const FADE_DURATION_MS: f32 = 1000.0;
 
 /// First-fit free-list sub-allocator over a fixed element range, coalescing on
 /// free. Each section gets an exact-size vertex (and index) slice instead of
@@ -124,6 +133,111 @@ struct ChunkMeta {
     first_index: u32,
     vertex_offset: i32,
     visibility: u32,
+    /// Section world origin; bound as a per-instance vertex attribute so the
+    /// vertex shader rebases the quantized local position. `[3]` is padding.
+    origin: [f32; 4],
+}
+
+/// Compact GPU vertex (14 bytes): position quantized to section-local u16 (see
+/// `POS_RANGE`), rebased to world in `chunk.vert` via the per-instance origin.
+/// `light_tint` is `[u8; 4]` (not `u32`) so the struct packs to 14 bytes with
+/// no alignment padding; byte order matches the old `R8G8B8A8_UNORM` (light,
+/// r, g, b).
+#[repr(C)]
+#[derive(Copy, Clone, bytemuck::Pod, bytemuck::Zeroable)]
+struct PackedVertex {
+    pos: [u16; 3], // offset 0: quantized section-local x, y, z
+    uv: [u16; 2], // offset 6: copied from `ChunkVertex::tex_coords`
+    light_tint: [u8; 4], // offset 10: `ChunkVertex::light_tint` as little-endian bytes
+}
+
+fn quantize_coord(world: f32, origin: f32) -> u16 {
+    let unorm = ((world - origin + POS_BIAS) / POS_RANGE).clamp(0.0, 1.0); // clamp out-of-range
+    (unorm * 65535.0 + 0.5) as u16 // round to the nearest u16 step
+}
+
+fn pack_vertex(v: &ChunkVertex, origin: [f32; 3]) -> PackedVertex {
+    PackedVertex {
+        pos: [
+            quantize_coord(v.position[0], origin[0]),
+            quantize_coord(v.position[1], origin[1]),
+            quantize_coord(v.position[2], origin[2]),
+        ],
+        uv: v.tex_coords, // copied through unchanged
+        light_tint: v.light_tint.to_le_bytes(), // split into little-endian bytes
+    }
+}
+
+/// Pack `verts` (rebased against `origin`) into `dst` from byte `off`, `VERTEX_SIZE` bytes each.
+fn write_packed_verts(dst: &mut [u8], off: usize, verts: &[ChunkVertex], origin: [f32; 3]) {
+    let vsize = VERTEX_SIZE as usize; // slicing below panics if `dst` is too short
+    for (k, v) in verts.iter().enumerate() {
+        let o = off + k * vsize; // byte offset of vertex `k` in `dst`
+        dst[o..o + vsize].copy_from_slice(bytemuck::bytes_of(&pack_vertex(v, origin)));
+    }
+}
+
+/// Vertex input for the chunk pipeline: binding 0 is the packed per-vertex
+/// pool, binding 1 is the meta buffer read per-instance (origin + fade),
+/// indexed by the `first_instance` the cull shader writes.
+pub fn chunk_vertex_bindings() -> [vk::VertexInputBindingDescription; 2] {
+    [
+        vk::VertexInputBindingDescription {
+            binding: 0, // packed vertex pool, stepped per vertex
+            stride: size_of::<PackedVertex>() as u32,
+            input_rate: vk::VertexInputRate::Vertex,
+        },
+        vk::VertexInputBindingDescription {
+            binding: 1, // `ChunkMeta` array, stepped per instance
+            stride: size_of::<ChunkMeta>() as u32,
+            input_rate: vk::VertexInputRate::Instance,
+        },
+    ]
+}
+
+pub fn chunk_vertex_attributes() -> [vk::VertexInputAttributeDescription; 6] {
+    let origin_off = std::mem::offset_of!(ChunkMeta, origin) as u32;
+    let vis_off = std::mem::offset_of!(ChunkMeta, visibility) as u32;
+    [
+        // binding 0 — packed vertex
+        vk::VertexInputAttributeDescription {
+            location: 0, // `in_pos_xy`: `PackedVertex::pos[0..2]`
+            binding: 0,
+            format: vk::Format::R16G16Unorm,
+            offset: 0,
+        },
+        vk::VertexInputAttributeDescription {
+            location: 1, // `in_pos_z`: `PackedVertex::pos[2]`
+            binding: 0,
+            format: vk::Format::R16Unorm,
+            offset: 4,
+        },
+        vk::VertexInputAttributeDescription {
+            location: 2, // `tex_coords`: `PackedVertex::uv`
+            binding: 0,
+            format: vk::Format::R16G16Unorm,
+            offset: 6,
+        },
+        vk::VertexInputAttributeDescription {
+            location: 3, // `light_tint`: `PackedVertex::light_tint`
+            binding: 0,
+            format: vk::Format::R8G8B8A8Unorm,
+            offset: 10,
+        },
+        // binding 1 — per-instance meta (origin + fade)
+        vk::VertexInputAttributeDescription {
+            location: 4, // `in_origin`: xyz of `ChunkMeta::origin` (`[3]` padding unread)
+            binding: 1,
+            format: vk::Format::R32G32B32Sfloat,
+            offset: origin_off,
+        },
+        vk::VertexInputAttributeDescription {
+            location: 5, // `in_fade`: `ChunkMeta::visibility`, stored as f32 bits
+            binding: 1,
+            format: vk::Format::R32Sfloat,
+            offset: vis_off,
+        },
+    ]
 }
 
 #[repr(C)]
@@ -153,6 +267,9 @@ struct FrustumData {
 struct SectionAlloc {
     section_index: i32,
     aabb: ChunkAABB,
+    /// Section world origin (`chunk*16`, `min_y + si*16`), used to rebase the
+    /// quantized vertices and passed to the GPU via `ChunkMeta.origin`.
+    origin: [f32; 3],
     first_index: u32,
     index_count: u32,
     vertex_offset: i32,
@@ -195,6 +312,17 @@ pub struct ChunkBufferStore {
     chunk_visibility: HashMap<ChunkPos, u32>,
     cached_meta: Vec<ChunkMeta>,
     meta_dirty: bool,
+    /// End of the current fade-in window. While `now < fade_until` the
+    /// per-section fade values change each frame, so `cached_meta` must be
+    /// rebuilt; an O(1) check replacing the old all-sections scan.
+    fade_until: std::time::Instant,
+    /// Camera position at the last front-to-back sort; the sort (an early-Z
+    /// optimization) is only redone once the camera moves past a threshold.
+    last_sort_cam: [f32; 3],
+    /// Frame slots still needing the latest `cached_meta` uploaded. Set to
+    /// `MAX_FRAMES_IN_FLIGHT` whenever the draw list changes, decremented per
+    /// frame; at steady state the per-frame meta copy stops.
+    meta_upload_pending: u32,
 
     compute_pipeline: vk::Pipeline,
     compute_layout: vk::PipelineLayout,
@@ -352,7 +480,7 @@ impl ChunkBufferStore {
                 device,
                 allocator,
                 meta_size,
-                vk::BufferUsageFlags::StorageBuffer,
+                vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::VertexBuffer,
                 "chunk_meta",
             );
             meta_buffers.push(b);
@@ -519,6 +647,9 @@ impl ChunkBufferStore {
             chunk_visibility: HashMap::new(),
             cached_meta: Vec::new(),
             meta_dirty: true,
+            fade_until: std::time::Instant::now(),
+            last_sort_cam: [f32::MAX; 3],
+            meta_upload_pending: 0,
             compute_pipeline,
             compute_layout,
             compute_desc_layout,
@@ -631,6 +762,7 @@ impl ChunkBufferStore {
             vtx_off: u32,
             idx_off: u32,
             aabb: ChunkAABB,
+            origin: [f32; 3],
         }
 
         // Retired slices only reclaim in `begin_frame`; if rendering is paused
@@ -773,6 +905,11 @@ impl ChunkBufferStore {
                     vtx_off,
                     idx_off,
                     aabb: section_aabb(&sec.vertices),
+                    origin: [
+                        (mesh.pos.x * 16) as f32,
+                        (mesh.min_y + sec.section_index * 16) as f32,
+                        (mesh.pos.z * 16) as f32,
+                    ],
                 });
             }
             if pool_full {
@@ -806,14 +943,14 @@ impl ChunkBufferStore {
                 }
                 let buf = self.staging_alloc.mapped_slice_mut().unwrap();
                 for p in &plans {
-                    let vb: &[u8] = bytemuck::cast_slice(p.verts);
-                    buf[stg_v..stg_v + vb.len()].copy_from_slice(vb);
+                    write_packed_verts(buf, stg_v, p.verts, p.origin);
+                    let vbytes = p.verts.len() * VERTEX_SIZE as usize;
                     copy_v.push(vk::BufferCopy {
                         src_offset: stg_v as u64,
                         dst_offset: p.vtx_off as u64 * VERTEX_SIZE,
-                        size: vb.len() as u64,
+                        size: vbytes as u64,
                     });
-                    stg_v += vb.len();
+                    stg_v += vbytes;
 
                     let ib: &[u8] = bytemuck::cast_slice(p.indices);
                     let off = staging_half + stg_i;
@@ -829,9 +966,8 @@ impl ChunkBufferStore {
                 {
                     let vbuf = self.vertex_alloc.mapped_slice_mut().unwrap();
                     for p in &plans {
-                        let vb: &[u8] = bytemuck::cast_slice(p.verts);
-                        let off = p.vtx_off as usize * VERTEX_SIZE as usize;
-                        vbuf[off..off + vb.len()].copy_from_slice(vb);
+                        let base = p.vtx_off as usize * VERTEX_SIZE as usize;
+                        write_packed_verts(vbuf, base, p.verts, p.origin);
                     }
                 }
                 {
@@ -848,6 +984,7 @@ impl ChunkBufferStore {
             let new_sections = plans.iter().map(|p| SectionAlloc {
                 section_index: p.section_index,
                 aabb: p.aabb,
+                origin: p.origin,
                 first_index: p.idx_off,
                 index_count: p.indices.len() as u32,
                 vertex_offset: p.vtx_off as i32,
@@ -862,6 +999,16 @@ impl ChunkBufferStore {
                 epoch: mesh.upload_epoch,
             });
 
+            // Freshly revealed sections fade in, so extend the fade window the
+            // cull's O(1) check reads; re-meshed-only uploads swap instantly.
+            if plans
+                .iter()
+                .any(|p| !was_present.contains(&p.section_index))
+            {
+                let dur = std::time::Duration::from_secs_f32(FADE_DURATION_MS / 1000.0);
+                self.fade_until = self.fade_until.max(now + dur);
+            }
+
             self.chunks
                 .entry(mesh.pos)
                 .or_insert_with(|| ChunkAlloc {
@@ -926,7 +1073,7 @@ impl ChunkBufferStore {
                 device,
                 allocator,
                 meta_size,
-                vk::BufferUsageFlags::StorageBuffer,
+                vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::VertexBuffer,
                 "chunk_meta",
             );
             self.meta_buffers[i] = b;
@@ -1055,15 +1202,18 @@ impl ChunkBufferStore {
         }
 
         let now = std::time::Instant::now();
-        const FADE_DURATION_MS: f32 = 1000.0;
         const NEARBY_DIST_SQ: f32 = 768.0;
+        // Re-sort only once the camera moves ~8 blocks; front-to-back order is an
+        // early-Z optimization, so finer staleness is harmless.
+        const SORT_RECAM_SQ: f32 = 64.0;
 
-        let any_fading = self.fade_enabled
-            && self.chunks.values().flat_map(|a| &a.sections).any(|s| {
-                now.duration_since(s.uploaded_at).as_secs_f32() * 1000.0 < FADE_DURATION_MS
-            });
+        // A fade in flight changes per-section visibility every frame, so the draw
+        // list must rebuild; otherwise it only changes on edits/loads/visibility
+        // (`meta_dirty`). The fade check is O(1) against `fade_until`.
+        let any_fading = self.fade_enabled && now < self.fade_until;
+        let content_changed = self.meta_dirty || any_fading;
 
-        if self.meta_dirty || any_fading {
+        if content_changed {
             self.cached_meta.clear();
             for (pos, alloc) in self.chunks.iter() {
                 // Near columns never fade; otherwise each section fades on its own
@@ -1093,36 +1243,55 @@ impl ChunkBufferStore {
                         first_index: sec.first_index,
                         vertex_offset: sec.vertex_offset,
                         visibility: vis.to_bits(),
+                        origin: [sec.origin[0], sec.origin[1], sec.origin[2], 0.0],
                     });
                 }
             }
             self.meta_dirty = false;
         }
 
-        self.cached_meta.sort_unstable_by(|a, b| {
-            let center_a = [
-                (a.aabb_min[0] + a.aabb_max[0]) * 0.5 - camera_pos[0],
-                (a.aabb_min[1] + a.aabb_max[1]) * 0.5 - camera_pos[1],
-                (a.aabb_min[2] + a.aabb_max[2]) * 0.5 - camera_pos[2],
-            ];
-            let center_b = [
-                (b.aabb_min[0] + b.aabb_max[0]) * 0.5 - camera_pos[0],
-                (b.aabb_min[1] + b.aabb_max[1]) * 0.5 - camera_pos[1],
-                (b.aabb_min[2] + b.aabb_max[2]) * 0.5 - camera_pos[2],
-            ];
-            let dist_a =
-                center_a[0] * center_a[0] + center_a[1] * center_a[1] + center_a[2] * center_a[2];
-            let dist_b =
-                center_b[0] * center_b[0] + center_b[1] * center_b[1] + center_b[2] * center_b[2];
-            dist_a
-                .partial_cmp(&dist_b)
-                .unwrap_or(std::cmp::Ordering::Equal)
-        });
+        let cam_moved = {
+            let dx = camera_pos[0] - self.last_sort_cam[0];
+            let dy = camera_pos[1] - self.last_sort_cam[1];
+            let dz = camera_pos[2] - self.last_sort_cam[2];
+            dx * dx + dy * dy + dz * dz > SORT_RECAM_SQ
+        };
+        if content_changed || cam_moved {
+            self.cached_meta.sort_unstable_by(|a, b| {
+                let center_a = [
+                    (a.aabb_min[0] + a.aabb_max[0]) * 0.5 - camera_pos[0],
+                    (a.aabb_min[1] + a.aabb_max[1]) * 0.5 - camera_pos[1],
+                    (a.aabb_min[2] + a.aabb_max[2]) * 0.5 - camera_pos[2],
+                ];
+                let center_b = [
+                    (b.aabb_min[0] + b.aabb_max[0]) * 0.5 - camera_pos[0],
+                    (b.aabb_min[1] + b.aabb_max[1]) * 0.5 - camera_pos[1],
+                    (b.aabb_min[2] + b.aabb_max[2]) * 0.5 - camera_pos[2],
+                ];
+                let dist_a = center_a[0] * center_a[0]
+                    + center_a[1] * center_a[1]
+                    + center_a[2] * center_a[2];
+                let dist_b = center_b[0] * center_b[0]
+                    + center_b[1] * center_b[1]
+                    + center_b[2] * center_b[2];
+                dist_a
+                    .partial_cmp(&dist_b)
+                    .unwrap_or(std::cmp::Ordering::Equal)
+            });
+            self.last_sort_cam = camera_pos;
+            // Draw list reordered: every frame slot's meta buffer needs the refresh.
+            self.meta_upload_pending = MAX_FRAMES_IN_FLIGHT as u32;
+        }
 
         let count = self.cached_meta.len() as u32;
-        let meta_bytes = bytemuck::cast_slice(&self.cached_meta);
-        self.meta_allocs[frame].mapped_slice_mut().unwrap()[..meta_bytes.len()]
-            .copy_from_slice(meta_bytes);
+        // Each frame slot has its own meta buffer; copy only into slots that
+        // haven't yet seen the current draw list. Steady state stops copying.
+        if self.meta_upload_pending > 0 {
+            let meta_bytes = bytemuck::cast_slice(&self.cached_meta);
+            self.meta_allocs[frame].mapped_slice_mut().unwrap()[..meta_bytes.len()]
+                .copy_from_slice(meta_bytes);
+            self.meta_upload_pending -= 1;
+        }
 
         let frustum_data = FrustumData {
             planes: *frustum,
@@ -1191,7 +1360,9 @@ impl ChunkBufferStore {
             .map(|c| c.sections.len() as u32)
             .sum::<u32>();
 
-        cmd.bind_vertex_buffers(0, &[self.vertex_buffer], &[0]);
+        // Binding 0: packed vertex pool. Binding 1: the meta buffer, read per
+        // instance for the section origin + fade (indexed by `first_instance`).
+        cmd.bind_vertex_buffers(0, &[self.vertex_buffer, self.meta_buffers[frame]], &[0, 0]);
         cmd.bind_index_buffer(self.index_buffer, 0, vk::IndexType::Uint32);
         if cfg!(target_os = "macos") {
             cmd.draw_indexed_indirect(
diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs
index 11d23be6..5cf40d84 100644
--- a/pomme-client/src/renderer/chunk/mesher.rs
+++ b/pomme-client/src/renderer/chunk/mesher.rs
@@ -1,4 +1,4 @@
-use std::collections::HashMap;
+use std::collections::{BinaryHeap, HashMap};
 use std::sync::atomic::{AtomicBool, AtomicU64, Ordering};
 use std::sync::{Arc, Condvar, Mutex};
 
@@ -99,6 +99,9 @@ pub struct SectionMesh {
 
 pub struct ChunkMeshData {
     pub pos: ChunkPos,
+    /// World Y of section index 0, so the buffer can derive each section's
+    /// origin (`min_y + section_index * 16`) for vertex quantization.
+    pub min_y: i32,
     /// Non-empty meshed sections (each tagged with its `section_index`).
     pub sections: Vec<SectionMesh>,
     /// The section-index range this job (re)meshed. Upload replaces exactly
@@ -475,11 +478,11 @@ impl MeshDispatcher {
         let (priority_tx, priority_rx) = crossbeam_channel::unbounded();
 
         let queue = Arc::new(MeshQueue::new());
-        // Half the cores, capped: more saturated workers starve the main/render
-        // thread during a load burst (frame spikes), and pooling makes load
-        // network-bound so they wouldn't speed it up anyway.
+        // Half the cores, capped. Too many saturated workers starve the
+        // main/render thread during a load burst (frame spikes); the cap trades
+        // some load throughput for that.
         let worker_count = std::thread::available_parallelism()
-            .map(|n| (n.get() / 2).clamp(2, 12))
+            .map(|n| (n.get() / 2).clamp(2, 16))
             .unwrap_or(1);
         let mut workers = Vec::with_capacity(worker_count);
         for _ in 0..worker_count {
@@ -561,14 +564,14 @@ impl MeshDispatcher {
         let min_y = chunk_store.min_y();
         let height = chunk_store.height();
 
-        let light: std::collections::HashMap<(i32, i32), crate::world::chunk::ChunkLightData> =
+        let light: std::collections::HashMap<(i32, i32), Arc<crate::world::chunk::ChunkLightData>> =
             chunks_needed
                 .iter()
                 .filter_map(|p| {
                     chunk_store
                         .light_data
                         .get(&(p.x, p.z))
-                        .map(|ld| ((p.x, p.z), ld.clone()))
+                        .map(|ld| ((p.x, p.z), Arc::clone(ld)))
                 })
                 .collect();
 
@@ -634,7 +637,7 @@ struct PendingJob {
     enqueued_at: Option<std::time::Instant>,
     chunks_needed: [ChunkPos; 5],
     chunk_arcs: Vec<Option<Arc<parking_lot::RwLock<azalea_world::Chunk>>>>,
-    light: HashMap<(i32, i32), crate::world::chunk::ChunkLightData>,
+    light: HashMap<(i32, i32), Arc<crate::world::chunk::ChunkLightData>>,
     registry: Arc<BlockRegistry>,
     uv_map: Arc<AtlasUVMap>,
     grass_colormap: Arc<Colormap>,
@@ -686,12 +689,52 @@ impl PendingJob {
     }
 }
 
+/// Squared X/Z (column) distance from `cam` to a chunk's centre. Meshing order
+/// is purely horizontal distance; occlusion gates drawing, not meshing.
+fn column_dist_sq(pos: ChunkPos, cam: glam::DVec3) -> f64 {
+    let dx = (pos.x as f64 * 16.0 + 8.0) - cam.x; // +8.0: centre of the 16-block column
+    let dz = (pos.z as f64 * 16.0 + 8.0) - cam.z;
+    dx * dx + dz * dz
+}
+
+/// A queued bulk-load job keyed by its squared column distance for the load heap.
+struct LoadEntry {
+    dist: f64, // `column_dist_sq` of `job.pos` against the queue's `sort_cam`
+    job: PendingJob,
+}
+
+impl PartialEq for LoadEntry {
+    fn eq(&self, other: &Self) -> bool {
+        self.dist.total_cmp(&other.dist).is_eq() // same total order as `Ord` (NaN, ±0.0)
+    }
+}
+impl Eq for LoadEntry {}
+impl PartialOrd for LoadEntry {
+    fn partial_cmp(&self, other: &Self) -> Option<std::cmp::Ordering> {
+        Some(self.cmp(other))
+    }
+}
+impl Ord for LoadEntry {
+    // Reversed so `BinaryHeap` (a max-heap) pops the nearest (smallest dist).
+    fn cmp(&self, other: &Self) -> std::cmp::Ordering {
+        other.dist.total_cmp(&self.dist)
+    }
+}
+
 struct QueueState {
-    tasks: Vec<PendingJob>,
+    /// Edits, kept small (a handful in flight) so a linear scan + in-place
+    /// replace stays cheap.
+    recompiles: Vec<PendingJob>,
+    /// Bulk loads, a min-by-distance heap so dequeue is `O(log n)` under the
+    /// lock instead of an `O(n)` scan (the old contention point).
+    loads: BinaryHeap<LoadEntry>,
     // Consecutive edits served ahead of an initial load before one is forced, so
     // streaming never starves (vanilla SectionTaskDynamicQueue.MAX_RECOMPILE_QUOTA).
     recompile_quota: i32,
     camera: glam::DVec3,
+    /// Camera the load heap is keyed against; re-keyed only when the camera
+    /// crosses a bucket, so push/pop stay cheap between rebuilds.
+    sort_cam: glam::DVec3,
 }
 
 /// Re-orderable mesh queue, a port of vanilla `SectionTaskDynamicQueue`. The
@@ -707,9 +750,11 @@ impl MeshQueue {
     fn new() -> Self {
         Self {
             state: Mutex::new(QueueState {
-                tasks: Vec::new(),
+                recompiles: Vec::new(),
+                loads: BinaryHeap::new(),
                 recompile_quota: MAX_RECOMPILE_QUOTA,
                 camera: glam::DVec3::ZERO,
+                sort_cam: glam::DVec3::ZERO,
             }),
             available: Condvar::new(),
             closed: AtomicBool::new(false),
@@ -718,25 +763,44 @@ impl MeshQueue {
 
     fn push(&self, job: PendingJob) {
         let mut state = self.state.lock().unwrap();
-        // A re-edit of a still-queued section replaces the queued job in place
-        // instead of duplicating it. Bulk loads can't duplicate (`meshed` gates
-        // them), so only edits need this.
-        if job.is_recompile
-            && let Some(existing) = state
-                .tasks
+        if job.is_recompile {
+            // A re-edit of a still-queued section replaces the queued job in
+            // place instead of duplicating it. Bulk loads can't duplicate
+            // (`meshed` gates them), so only edits need this.
+            if let Some(existing) = state
+                .recompiles
                 .iter_mut()
-                .find(|t| t.is_recompile && t.pos == job.pos && t.sections == job.sections)
-        {
-            *existing = job;
+                .find(|t| t.pos == job.pos && t.sections == job.sections)
+            {
+                *existing = job;
+            } else {
+                state.recompiles.push(job);
+            }
         } else {
-            state.tasks.push(job);
+            let dist = column_dist_sq(job.pos, state.sort_cam);
+            state.loads.push(LoadEntry { dist, job });
         }
         drop(state);
         self.available.notify_one();
     }
 
     fn set_camera(&self, camera: glam::DVec3) {
-        self.state.lock().unwrap().camera = camera;
+        const BUCKET: f64 = 8.0;
+        let mut state = self.state.lock().unwrap();
+        state.camera = camera;
+        let crossed = (camera.x / BUCKET).floor() != (state.sort_cam.x / BUCKET).floor()
+            || (camera.z / BUCKET).floor() != (state.sort_cam.z / BUCKET).floor();
+        if crossed {
+            state.sort_cam = camera;
+            // Re-key the load heap to the new bucket (pop still gives the nearest).
+            state.loads = std::mem::take(&mut state.loads)
+                .into_iter()
+                .map(|e| LoadEntry {
+                    dist: column_dist_sq(e.job.pos, camera),
+                    job: e.job,
+                })
+                .collect();
+        }
     }
 
     fn close(&self) {
@@ -770,41 +834,28 @@ impl MeshQueue {
 /// over initial loads when the edit is closer, bounded by the recompile quota.
 /// Mirrors vanilla `SectionTaskDynamicQueue.poll`.
 fn poll(state: &mut QueueState) -> Option<PendingJob> {
-    let camera = state.camera;
-    let dist_sq = |task: &PendingJob| {
-        let dx = (task.pos.x as f64 * 16.0 + 8.0) - camera.x;
-        let dz = (task.pos.z as f64 * 16.0 + 8.0) - camera.z;
-        dx * dx + dz * dz
-    };
-
-    // Both lanes mesh nearest-first; edits (recompiles) are preferred over initial
-    // loads when closer, bounded by the recompile quota. Occlusion gates drawing,
-    // not meshing, so meshing order is purely distance-based.
-    let mut best_initial: Option<(usize, f64)> = None;
-    let mut best_recompile: Option<(usize, f64)> = None;
-    for (i, task) in state.tasks.iter().enumerate() {
-        let dist = dist_sq(task);
-        if task.is_recompile {
-            if best_recompile.is_none_or(|(_, d)| dist < d) {
-                best_recompile = Some((i, dist));
-            }
-        } else if best_initial.is_none_or(|(_, d)| dist < d) {
-            best_initial = Some((i, dist));
-        }
-    }
+    let cam = state.sort_cam;
+    // Nearest queued recompile (edits are few, so the linear scan is cheap).
+    let best_recompile = state
+        .recompiles
+        .iter()
+        .enumerate()
+        .map(|(i, t)| (i, column_dist_sq(t.pos, cam)))
+        .min_by(|a, b| a.1.total_cmp(&b.1));
+    let load_dist = state.loads.peek().map(|e| e.dist);
 
     if let Some((ri, rd)) = best_recompile {
-        let take_recompile = match best_initial {
+        let take_recompile = match load_dist {
             None => true,
-            Some((_, id)) => state.recompile_quota > 0 && rd < id,
+            Some(ld) => state.recompile_quota > 0 && rd < ld,
         };
         if take_recompile {
             state.recompile_quota -= 1;
-            return Some(state.tasks.swap_remove(ri));
+            return Some(state.recompiles.swap_remove(ri));
         }
     }
     state.recompile_quota = MAX_RECOMPILE_QUOTA;
-    best_initial.map(|(ii, _)| state.tasks.swap_remove(ii))
+    state.loads.pop().map(|e| e.job)
 }
 
 /// Run mesh workers below normal priority so the OS preempts them for the
@@ -832,7 +883,7 @@ struct ChunkStoreSnapshot {
         ChunkPos,
         Option<Arc<parking_lot::RwLock<azalea_world::Chunk>>>,
     )>,
-    light: std::collections::HashMap<(i32, i32), crate::world::chunk::ChunkLightData>,
+    light: std::collections::HashMap<(i32, i32), Arc<crate::world::chunk::ChunkLightData>>,
     grass_colormap: Arc<Colormap>,
     foliage_colormap: Arc<Colormap>,
     dry_foliage_colormap: Arc<Colormap>,
@@ -1399,6 +1450,7 @@ fn mesh_chunk_snapshot(
 
     ChunkMeshData {
         pos,
+        min_y,
         sections,
         replaced: range,
         content_gen: 0,
diff --git a/pomme-client/src/renderer/pipelines/chunk.rs b/pomme-client/src/renderer/pipelines/chunk.rs
index 4b3844c3..c909ff62 100644
--- a/pomme-client/src/renderer/pipelines/chunk.rs
+++ b/pomme-client/src/renderer/pipelines/chunk.rs
@@ -257,9 +257,9 @@ fn build_pipeline(
     stages: &[vk::PipelineShaderStageCreateInfo],
     color_blend: &vk::PipelineColorBlendStateCreateInfo,
 ) -> vk::Pipeline {
-    use crate::renderer::chunk::mesher::ChunkVertex;
-    let binding_descs = [ChunkVertex::binding_description()];
-    let attr_descs = ChunkVertex::attribute_descriptions();
+    use crate::renderer::chunk::buffer::{chunk_vertex_attributes, chunk_vertex_bindings};
+    let binding_descs = chunk_vertex_bindings();
+    let attr_descs = chunk_vertex_attributes();
     let vertex_input = vk::PipelineVertexInputStateCreateInfo {
         vertex_binding_description_count: binding_descs.len() as u32,
         vertex_binding_descriptions: binding_descs.as_ptr(),
diff --git a/pomme-client/src/renderer/shaders/chunk.vert b/pomme-client/src/renderer/shaders/chunk.vert
index 71879bd8..660a2f1a 100644
--- a/pomme-client/src/renderer/shaders/chunk.vert
+++ b/pomme-client/src/renderer/shaders/chunk.vert
@@ -8,9 +8,18 @@ layout(set = 0, binding = 0) uniform CameraUniform {
     vec4 fog_color;
 };
 
-layout(location = 0) in vec3 position;
-layout(location = 1) in vec2 tex_coords;
-layout(location = 2) in vec4 light_tint;
+// Position is quantized section-local (unorm); rebased to world via the
+// per-instance origin. Must match POS_RANGE / POS_BIAS in buffer.rs.
+const float POS_RANGE = 24.0;
+const float POS_BIAS = 4.0;
+
+layout(location = 0) in vec2 in_pos_xy;
+layout(location = 1) in float in_pos_z;
+layout(location = 2) in vec2 tex_coords;
+layout(location = 3) in vec4 light_tint;
+// Per-instance (from the meta buffer):
+layout(location = 4) in vec3 in_origin;
+layout(location = 5) in float in_fade;
 
 layout(location = 0) out vec2 v_tex_coords;
 layout(location = 1) out float v_light;
@@ -20,12 +29,14 @@ layout(location = 4) out vec3 v_fog_color;
 layout(location = 5) out float v_fog;
 
 void main() {
-    vec3 rel = position - camera_pos.xyz;
+    vec3 local = vec3(in_pos_xy, in_pos_z) * POS_RANGE - POS_BIAS;
+    vec3 world = in_origin + local;
+    vec3 rel = world - camera_pos.xyz;
     gl_Position = view_proj * vec4(rel, 1.0);
     v_tex_coords = tex_coords;
     v_light = light_tint.r;
     v_tint = light_tint.gba;
-    v_visibility = uintBitsToFloat(gl_InstanceIndex);
+    v_visibility = in_fade;
     v_fog_color = fog_color.rgb;
     v_fog = fog_factor(rel, camera_pos.w, fog_color.w);
 }
diff --git a/pomme-client/src/renderer/shaders/cull.comp b/pomme-client/src/renderer/shaders/cull.comp
index 1584433f..321bc0a5 100644
--- a/pomme-client/src/renderer/shaders/cull.comp
+++ b/pomme-client/src/renderer/shaders/cull.comp
@@ -9,6 +9,7 @@ struct ChunkMeta {
     uint first_index;
     int vertex_offset;
     uint visibility;
+    vec4 origin;
 };
 
 struct DrawCmd {
@@ -64,5 +65,7 @@ void main() {
     draws[slot].instance_count = 1u;
     draws[slot].first_index = m.first_index;
     draws[slot].vertex_offset = m.vertex_offset;
-    draws[slot].first_instance = m.visibility;
+    // first_instance routes the draw to its meta entry (origin + fade) which the
+    // vertex shader reads as a per-instance attribute.
+    draws[slot].first_instance = idx;
 }
diff --git a/pomme-client/src/world/chunk.rs b/pomme-client/src/world/chunk.rs
index 4cce73b7..bac76b06 100644
--- a/pomme-client/src/world/chunk.rs
+++ b/pomme-client/src/world/chunk.rs
@@ -76,7 +76,7 @@ impl ChunkLightData {
 pub struct ChunkStore {
     pub chunk_storage: ChunkStorage,
     pub partial_storage: PartialChunkStorage,
-    pub light_data: std::collections::HashMap<(i32, i32), ChunkLightData>,
+    pub light_data: std::collections::HashMap<(i32, i32), Arc<ChunkLightData>>,
     pub block_entities: std::collections::HashMap<BlockPos, StoredBlockEntity>,
 }
 
@@ -94,11 +94,8 @@ impl ChunkStore {
         }
     }
 
-    pub fn loaded_positions(&self) -> Vec<ChunkPos> {
-        self.light_data
-            .keys()
-            .map(|&(x, z)| ChunkPos::new(x, z))
-            .collect()
+    pub fn loaded_positions(&self) -> impl Iterator<Item = ChunkPos> + '_ {
+        self.light_data.keys().map(|&(x, z)| ChunkPos::new(x, z))
     }
 
     pub fn load_chunk(
@@ -151,11 +148,11 @@ impl ChunkStore {
 
         self.light_data.insert(
             (pos.x, pos.z),
-            ChunkLightData {
+            Arc::new(ChunkLightData {
                 sky_sections,
                 block_sections,
                 min_y: self.chunk_storage.min_y(),
-            },
+            }),
         );
     }
 

From 99e179fef6ab12505d2b06cbbac33d935a447265 Mon Sep 17 00:00:00 2001
From: Purdze <r.s.sutton@hotmail.co.uk>
Date: Fri, 26 Jun 2026 21:11:01 +0100
Subject: [PATCH 3/4] cleanup

---
 pomme-client/src/renderer/chunk/buffer.rs | 25 ++++++++++++++++-------
 pomme-client/src/renderer/chunk/mesher.rs |  6 +++---
 2 files changed, 21 insertions(+), 10 deletions(-)

diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs
index a85d565c..5cf808b0 100644
--- a/pomme-client/src/renderer/chunk/buffer.rs
+++ b/pomme-client/src/renderer/chunk/buffer.rs
@@ -26,6 +26,9 @@ const MAX_BUCKETS: u32 = 2048;
 const VRAM_BUDGET_FRACTION: f64 = 0.25;
 /// Fade-in duration for a freshly revealed section (ms).
 const FADE_DURATION_MS: f32 = 1000.0;
+/// Columns within this squared X/Z distance of the camera render opaque
+/// immediately and never fade in.
+const NEARBY_DIST_SQ: f32 = 768.0;
 
 /// First-fit free-list sub-allocator over a fixed element range, coalescing on
 /// free. Each section gets an exact-size vertex (and index) slice instead of
@@ -676,6 +679,14 @@ impl ChunkBufferStore {
         self.last_draw_count
     }
 
+    /// Whether `pos`'s column renders opaque immediately instead of fading in:
+    /// true when fading is disabled or its centre is within `NEARBY_DIST_SQ` of `cam` (X/Z).
+    fn column_nearby(&self, pos: ChunkPos, cam: [f32; 3]) -> bool {
+        let dx = pos.x as f32 * 16.0 + 8.0 - cam[0];
+        let dz = pos.z as f32 * 16.0 + 8.0 - cam[2];
+        !self.fade_enabled || dx * dx + dz * dz < NEARBY_DIST_SQ
+    }
+
     /// Submit the accumulated staging copies as a single transfer and block on
     /// a fence until it completes. One fence wait per call replaces the old
     /// per-mesh `queue.wait_idle`, so a frame's uploads synchronize once
@@ -1001,10 +1012,13 @@ impl ChunkBufferStore {
 
             // Freshly revealed sections fade in, so extend the fade window the
             // cull's O(1) check reads; re-meshed-only uploads swap instantly.
-            if plans
+            // Nearby columns never fade, so extending for them only forces
+            // redundant rebuilds — skip them. `last_sort_cam` is the camera the
+            // draw list is keyed to (unset => far, the safe default).
+            let revealed = plans
                 .iter()
-                .any(|p| !was_present.contains(&p.section_index))
-            {
+                .any(|p| !was_present.contains(&p.section_index));
+            if revealed && !self.column_nearby(mesh.pos, self.last_sort_cam) {
                 let dur = std::time::Duration::from_secs_f32(FADE_DURATION_MS / 1000.0);
                 self.fade_until = self.fade_until.max(now + dur);
             }
@@ -1202,7 +1216,6 @@ impl ChunkBufferStore {
         }
 
         let now = std::time::Instant::now();
-        const NEARBY_DIST_SQ: f32 = 768.0;
         // Re-sort only once the camera moves ~8 blocks; front-to-back order is an
         // early-Z optimization, so finer staleness is harmless.
         const SORT_RECAM_SQ: f32 = 64.0;
@@ -1218,9 +1231,7 @@ impl ChunkBufferStore {
             for (pos, alloc) in self.chunks.iter() {
                 // Near columns never fade; otherwise each section fades on its own
                 // timer (X/Z distance is per-column).
-                let dx = pos.x as f32 * 16.0 + 8.0 - camera_pos[0];
-                let dz = pos.z as f32 * 16.0 + 8.0 - camera_pos[2];
-                let nearby = !self.fade_enabled || dx * dx + dz * dz < NEARBY_DIST_SQ;
+                let nearby = self.column_nearby(*pos, camera_pos);
 
                 // CPU omission: the visibility graph's mask skips sections proven
                 // occluded, so they never reach the GPU cull (absent => all draw).
diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs
index 5cf40d84..2cf6abf4 100644
--- a/pomme-client/src/renderer/chunk/mesher.rs
+++ b/pomme-client/src/renderer/chunk/mesher.rs
@@ -472,8 +472,8 @@ impl MeshDispatcher {
         dry_foliage_colormap: Colormap,
         biome_climate: Arc<HashMap<u32, BiomeClimate>>,
     ) -> Self {
-        // Bulk results are bounded for back-pressure; edits stay unbounded so a
-        // block edit never blocks a worker behind the load backlog.
+        // Bulk results are bounded for back-pressure; edit results use the
+        // unbounded priority channel so they never queue behind the load backlog.
         let (result_tx, result_rx) = crossbeam_channel::bounded(MAX_PENDING_RESULTS);
         let (priority_tx, priority_rx) = crossbeam_channel::unbounded();
 
@@ -483,7 +483,7 @@ impl MeshDispatcher {
         // some load throughput for that.
         let worker_count = std::thread::available_parallelism()
             .map(|n| (n.get() / 2).clamp(2, 16))
-            .unwrap_or(1);
+            .unwrap_or(2);
         let mut workers = Vec::with_capacity(worker_count);
         for _ in 0..worker_count {
             let queue = Arc::clone(&queue);

From 3799bb949cac844d6c55dc757650875c33c4b9a1 Mon Sep 17 00:00:00 2001
From: Purdze <r.s.sutton@hotmail.co.uk>
Date: Sat, 27 Jun 2026 09:07:24 +0100
Subject: [PATCH 4/4] Split terrain into solid and cutout passes to restore
 early-Z

---
 pomme-client/build.rs                         |   1 +
 pomme-client/src/renderer/chunk/atlas.rs      |  17 +-
 pomme-client/src/renderer/chunk/buffer.rs     | 192 +++++++++++++---
 pomme-client/src/renderer/chunk/mesher.rs     | 210 ++++++++----------
 pomme-client/src/renderer/mod.rs              |   8 +-
 pomme-client/src/renderer/pipelines/chunk.rs  |  54 +++--
 .../src/renderer/shaders/chunk_solid.frag     |  36 +++
 pomme-client/src/renderer/shaders/cull.comp   |  38 +++-
 8 files changed, 383 insertions(+), 173 deletions(-)
 create mode 100644 pomme-client/src/renderer/shaders/chunk_solid.frag

diff --git a/pomme-client/build.rs b/pomme-client/build.rs
index f759ea11..f8df4d78 100644
--- a/pomme-client/build.rs
+++ b/pomme-client/build.rs
@@ -37,6 +37,7 @@ fn main() {
     let shaders = [
         ("chunk.vert", shaderc::ShaderKind::Vertex),
         ("chunk.frag", shaderc::ShaderKind::Fragment),
+        ("chunk_solid.frag", shaderc::ShaderKind::Fragment),
         ("cube.vert", shaderc::ShaderKind::Vertex),
         ("cube.frag", shaderc::ShaderKind::Fragment),
         ("panorama.vert", shaderc::ShaderKind::Vertex),
diff --git a/pomme-client/src/renderer/chunk/atlas.rs b/pomme-client/src/renderer/chunk/atlas.rs
index d336160f..b5a7bd24 100644
--- a/pomme-client/src/renderer/chunk/atlas.rs
+++ b/pomme-client/src/renderer/chunk/atlas.rs
@@ -14,6 +14,10 @@ pub struct AtlasRegion {
     pub v_min: f32,
     pub u_max: f32,
     pub v_max: f32,
+    /// Every level-0 texel is fully opaque (alpha 255), so quads using this
+    /// sprite can render in the no-discard solid pass (early-Z). Sprites with
+    /// any transparent texel are cutout and stay in the discard pass.
+    pub opaque: bool,
 }
 
 #[derive(Clone)]
@@ -149,7 +153,8 @@ impl TextureAtlas {
         for src in &sources {
             match placements.get(src.name.as_str()) {
                 Some(Some((cx, cy))) => {
-                    let region = pixel_region(*cx, *cy, src.w, src.h, atlas_size);
+                    let mut region = pixel_region(*cx, *cy, src.w, src.h, atlas_size);
+                    region.opaque = sprite_is_opaque(&src.data);
                     for py in 0..src.h {
                         for px in 0..src.w {
                             let s = ((py * src.w + px) * 4) as usize;
@@ -321,9 +326,19 @@ fn pixel_region(x: u32, y: u32, w: u32, h: u32, atlas_size: u32) -> AtlasRegion
         v_min: (y as f32 + INSET) / s,
         u_max: ((x + w) as f32 - INSET) / s,
         v_max: ((y + h) as f32 - INSET) / s,
+        // Filled in by the caller from the sprite's texels; the missing tile is a
+        // solid checker, so the geometric default is opaque.
+        opaque: true,
     }
 }
 
+/// Whether every level-0 texel of an RGBA sprite is fully opaque (alpha 255).
+/// Any texel with alpha below 255 routes the sprite to the cutout pass, so a
+/// hole never renders solid. Empty data passes vacuously; a partial tail is ignored.
+fn sprite_is_opaque(data: &[u8]) -> bool {
+    data.chunks_exact(4).all(|px| px[3] == 255)
+}
+
 type PackResult = (HashMap<String, Option<(u32, u32)>>, AtlasRegion);
 
 fn pack(sources: &[Source], atlas_size: u32) -> (PackResult, bool) {
diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs
index 5cf808b0..46f1a437 100644
--- a/pomme-client/src/renderer/chunk/buffer.rs
+++ b/pomme-client/src/renderer/chunk/buffer.rs
@@ -136,8 +136,10 @@ struct ChunkMeta {
     first_index: u32,
     vertex_offset: i32,
     visibility: u32,
-    /// Section world origin; bound as a per-instance vertex attribute so the
-    /// vertex shader rebases the quantized local position. `[3]` is padding.
+    /// `[0..3]`: section world origin, bound as a per-instance vertex attribute
+    /// so the vertex shader rebases the quantized local position (it reads only
+    /// xyz). `[3]`: the section's `solid_index_count` reinterpreted as float
+    /// bits, read by the cull shader to split the solid/cutout draws.
     origin: [f32; 4],
 }
 
@@ -275,6 +277,9 @@ struct SectionAlloc {
     origin: [f32; 3],
     first_index: u32,
     index_count: u32,
+    /// Leading indices belonging to the solid (no-discard) pass; the rest are
+    /// cutout. Passed to the GPU via `ChunkMeta.origin[3]`.
+    solid_index_count: u32,
     vertex_offset: i32,
     vtx_len: u32,
     uploaded_at: std::time::Instant,
@@ -335,10 +340,17 @@ pub struct ChunkBufferStore {
 
     meta_buffers: Vec<vk::Buffer>,
     meta_allocs: Vec<Allocation>,
+    // Solid (no-discard, early-Z) draw list, written by the cull shader.
     indirect_buffers: Vec<vk::Buffer>,
     indirect_allocs: Vec<Allocation>,
     count_buffers: Vec<vk::Buffer>,
     count_allocs: Vec<Allocation>,
+    // Cutout (discard) draw list. Same sections, the back of each section's
+    // index slice; drawn in a second pass after solid lays down depth.
+    indirect_cutout_buffers: Vec<vk::Buffer>,
+    indirect_cutout_allocs: Vec<Allocation>,
+    count_cutout_buffers: Vec<vk::Buffer>,
+    count_cutout_allocs: Vec<Allocation>,
     frustum_buffers: Vec<vk::Buffer>,
     frustum_allocs: Vec<Allocation>,
     fade_enabled: bool,
@@ -475,6 +487,10 @@ impl ChunkBufferStore {
         let mut indirect_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
         let mut count_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
         let mut count_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
+        let mut indirect_cutout_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
+        let mut indirect_cutout_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
+        let mut count_cutout_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
+        let mut count_cutout_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
         let mut frustum_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
         let mut frustum_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT);
 
@@ -509,6 +525,26 @@ impl ChunkBufferStore {
             count_buffers.push(b);
             count_allocs.push(a);
 
+            let (b, a) = util::create_host_buffer(
+                device,
+                allocator,
+                indirect_size,
+                vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::IndirectBuffer,
+                "indirect_cmds_cutout",
+            );
+            indirect_cutout_buffers.push(b);
+            indirect_cutout_allocs.push(a);
+
+            let (b, a) = util::create_host_buffer(
+                device,
+                allocator,
+                count_size,
+                vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::IndirectBuffer,
+                "draw_count_cutout",
+            );
+            count_cutout_buffers.push(b);
+            count_cutout_allocs.push(a);
+
             let (b, a) = util::create_host_buffer(
                 device,
                 allocator,
@@ -557,7 +593,8 @@ impl ChunkBufferStore {
         let pool_sizes = [
             vk::DescriptorPoolSize {
                 ty: vk::DescriptorType::StorageBuffer,
-                descriptor_count: 3 * MAX_FRAMES_IN_FLIGHT as u32,
+                // meta + solid indirect/count + cutout indirect/count = 5 per frame.
+                descriptor_count: 5 * MAX_FRAMES_IN_FLIGHT as u32,
             },
             vk::DescriptorPoolSize {
                 ty: vk::DescriptorType::UniformBuffer,
@@ -621,12 +658,37 @@ impl ChunkBufferStore {
                 count_size,
             );
 
+            let (indirect_c_info, mut indirect_c_write) = desc_write(
+                compute_sets[i],
+                4,
+                vk::DescriptorType::StorageBuffer,
+                indirect_cutout_buffers[i],
+                indirect_size,
+            );
+
+            let (count_c_info, mut count_c_write) = desc_write(
+                compute_sets[i],
+                5,
+                vk::DescriptorType::StorageBuffer,
+                count_cutout_buffers[i],
+                count_size,
+            );
+
             meta_write.buffer_info = meta_info.as_ptr();
             frustum_write.buffer_info = frustum_info.as_ptr();
             indirect_write.buffer_info = indirect_info.as_ptr();
             count_write.buffer_info = count_info.as_ptr();
-
-            let writes = [meta_write, frustum_write, indirect_write, count_write];
+            indirect_c_write.buffer_info = indirect_c_info.as_ptr();
+            count_c_write.buffer_info = count_c_info.as_ptr();
+
+            let writes = [
+                meta_write,
+                frustum_write,
+                indirect_write,
+                count_write,
+                indirect_c_write,
+                count_c_write,
+            ];
 
             device.update_descriptor_sets(&writes, &[]);
         }
@@ -664,6 +726,10 @@ impl ChunkBufferStore {
             indirect_allocs,
             count_buffers,
             count_allocs,
+            indirect_cutout_buffers,
+            indirect_cutout_allocs,
+            count_cutout_buffers,
+            count_cutout_allocs,
             frustum_buffers,
             frustum_allocs,
             fade_enabled: false,
@@ -772,6 +838,7 @@ impl ChunkBufferStore {
             indices: &'a [u32],
             vtx_off: u32,
             idx_off: u32,
+            solid_index_count: u32,
             aabb: ChunkAABB,
             origin: [f32; 3],
         }
@@ -915,6 +982,7 @@ impl ChunkBufferStore {
                     indices: &sec.indices,
                     vtx_off,
                     idx_off,
+                    solid_index_count: sec.solid_index_count,
                     aabb: section_aabb(&sec.vertices),
                     origin: [
                         (mesh.pos.x * 16) as f32,
@@ -998,6 +1066,7 @@ impl ChunkBufferStore {
                 origin: p.origin,
                 first_index: p.idx_off,
                 index_count: p.indices.len() as u32,
+                solid_index_count: p.solid_index_count,
                 vertex_offset: p.vtx_off as i32,
                 vtx_len: p.verts.len() as u32,
                 // A re-meshed section swaps instantly; a freshly revealed one fades in.
@@ -1077,6 +1146,13 @@ impl ChunkBufferStore {
                         std::mem::zeroed()
                     }))
                     .ok();
+                device.destroy_buffer(self.indirect_cutout_buffers[i], None);
+                alloc
+                    .free(std::mem::replace(
+                        &mut self.indirect_cutout_allocs[i],
+                        unsafe { std::mem::zeroed() },
+                    ))
+                    .ok();
             }
         }
 
@@ -1103,6 +1179,16 @@ impl ChunkBufferStore {
             self.indirect_buffers[i] = b;
             self.indirect_allocs[i] = a;
 
+            let (b, a) = util::create_host_buffer(
+                device,
+                allocator,
+                indirect_size,
+                vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::IndirectBuffer,
+                "indirect_cmds_cutout",
+            );
+            self.indirect_cutout_buffers[i] = b;
+            self.indirect_cutout_allocs[i] = a;
+
             let (meta_info, mut meta_write) = desc_write(
                 self.compute_sets[i],
                 0,
@@ -1117,9 +1203,17 @@ impl ChunkBufferStore {
                 self.indirect_buffers[i],
                 indirect_size,
             );
+            let (indirect_c_info, mut indirect_c_write) = desc_write(
+                self.compute_sets[i],
+                4,
+                vk::DescriptorType::StorageBuffer,
+                self.indirect_cutout_buffers[i],
+                indirect_size,
+            );
             meta_write.buffer_info = meta_info.as_ptr();
             indirect_write.buffer_info = indirect_info.as_ptr();
-            device.update_descriptor_sets(&[meta_write, indirect_write], &[]);
+            indirect_c_write.buffer_info = indirect_c_info.as_ptr();
+            device.update_descriptor_sets(&[meta_write, indirect_write, indirect_c_write], &[]);
         }
 
         self.max_meta = new_max;
@@ -1254,7 +1348,12 @@ impl ChunkBufferStore {
                         first_index: sec.first_index,
                         vertex_offset: sec.vertex_offset,
                         visibility: vis.to_bits(),
-                        origin: [sec.origin[0], sec.origin[1], sec.origin[2], 0.0],
+                        origin: [
+                            sec.origin[0],
+                            sec.origin[1],
+                            sec.origin[2],
+                            f32::from_bits(sec.solid_index_count),
+                        ],
                     });
                 }
             }
@@ -1314,22 +1413,28 @@ impl ChunkBufferStore {
             .copy_from_slice(frustum_bytes);
 
         // This frame slot's GPU work has completed (fence-waited at frame start),
-        // so the count buffer still holds its previous cull result; capture it for
-        // the debug overlay before clearing it for this dispatch.
+        // so the count buffers still hold their previous cull result; capture the
+        // total (solid + cutout draws) for the debug overlay before clearing them.
         {
-            let s = self.count_allocs[frame].mapped_slice_mut().unwrap();
-            self.last_draw_count = u32::from_ne_bytes([s[0], s[1], s[2], s[3]]);
+            let read_and_clear = |a: &mut Allocation| {
+                let s = a.mapped_slice_mut().unwrap();
+                let n = u32::from_ne_bytes([s[0], s[1], s[2], s[3]]);
+                s[..4].copy_from_slice(&0u32.to_ne_bytes());
+                n
+            };
+            self.last_draw_count = read_and_clear(&mut self.count_allocs[frame])
+                + read_and_clear(&mut self.count_cutout_allocs[frame]);
         }
-        self.count_allocs[frame].mapped_slice_mut().unwrap()[..4]
-            .copy_from_slice(&0u32.to_ne_bytes());
 
         // macOS draws the whole indirect buffer (no drawIndirectCount), so slots
         // the cull shader leaves unfilled must read as no-op draws, not stale data.
         #[cfg(target_os = "macos")]
-        self.indirect_allocs[frame]
-            .mapped_slice_mut()
-            .unwrap()
-            .fill(0);
+        for a in [
+            &mut self.indirect_allocs[frame],
+            &mut self.indirect_cutout_allocs[frame],
+        ] {
+            a.mapped_slice_mut().unwrap().fill(0);
+        }
 
         cmd.bind_pipeline(vk::PipelineBindPoint::Compute, self.compute_pipeline);
         cmd.bind_descriptor_sets(
@@ -1360,7 +1465,11 @@ impl ChunkBufferStore {
         }
     }
 
-    pub fn draw_indirect(&self, cmd: vk::CommandBuffer, frame: usize) {
+    /// Issue one render layer's indirect draws. `cutout` selects the discard
+    /// pass's draw list (drawn after `solid`, which lays down depth); the
+    /// caller binds the matching pipeline first. Both layers share the
+    /// vertex/index/meta buffers; each reads its own cull-written draw list.
+    pub fn draw_indirect(&self, cmd: vk::CommandBuffer, frame: usize, cutout: bool) {
         if self.chunks.is_empty() {
             return;
         }
@@ -1370,23 +1479,26 @@ impl ChunkBufferStore {
             .values()
             .map(|c| c.sections.len() as u32)
             .sum::<u32>();
+        let (indirect, count) = if cutout {
+            (
+                self.indirect_cutout_buffers[frame],
+                self.count_cutout_buffers[frame],
+            )
+        } else {
+            (self.indirect_buffers[frame], self.count_buffers[frame])
+        };
 
         // Binding 0: packed vertex pool. Binding 1: the meta buffer, read per
         // instance for the section origin + fade (indexed by `first_instance`).
         cmd.bind_vertex_buffers(0, &[self.vertex_buffer, self.meta_buffers[frame]], &[0, 0]);
         cmd.bind_index_buffer(self.index_buffer, 0, vk::IndexType::Uint32);
         if cfg!(target_os = "macos") {
-            cmd.draw_indexed_indirect(
-                self.indirect_buffers[frame],
-                0,
-                max_draws,
-                size_of::<DrawCommand>() as u32,
-            );
+            cmd.draw_indexed_indirect(indirect, 0, max_draws, size_of::<DrawCommand>() as u32);
         } else {
             cmd.draw_indexed_indirect_count(
-                self.indirect_buffers[frame],
+                indirect,
                 0,
-                self.count_buffers[frame],
+                count,
                 0,
                 max_draws,
                 size_of::<DrawCommand>() as u32,
@@ -1415,6 +1527,8 @@ impl ChunkBufferStore {
             device.destroy_buffer(self.meta_buffers[i], None);
             device.destroy_buffer(self.indirect_buffers[i], None);
             device.destroy_buffer(self.count_buffers[i], None);
+            device.destroy_buffer(self.indirect_cutout_buffers[i], None);
+            device.destroy_buffer(self.count_cutout_buffers[i], None);
             device.destroy_buffer(self.frustum_buffers[i], None);
 
             alloc
@@ -1432,6 +1546,18 @@ impl ChunkBufferStore {
                     std::mem::zeroed()
                 }))
                 .ok();
+            alloc
+                .free(std::mem::replace(
+                    &mut self.indirect_cutout_allocs[i],
+                    unsafe { std::mem::zeroed() },
+                ))
+                .ok();
+            alloc
+                .free(std::mem::replace(
+                    &mut self.count_cutout_allocs[i],
+                    unsafe { std::mem::zeroed() },
+                ))
+                .ok();
             alloc
                 .free(std::mem::replace(&mut self.frustum_allocs[i], unsafe {
                     std::mem::zeroed()
@@ -1485,6 +1611,20 @@ fn create_cull_desc_layout(device: &vk::Device) -> vk::DescriptorSetLayout {
             stage_flags: vk::ShaderStageFlags::Compute,
             ..Default::default()
         },
+        vk::DescriptorSetLayoutBinding {
+            binding: 4,
+            descriptor_type: vk::DescriptorType::StorageBuffer,
+            descriptor_count: 1,
+            stage_flags: vk::ShaderStageFlags::Compute,
+            ..Default::default()
+        },
+        vk::DescriptorSetLayoutBinding {
+            binding: 5,
+            descriptor_type: vk::DescriptorType::StorageBuffer,
+            descriptor_count: 1,
+            stage_flags: vk::ShaderStageFlags::Compute,
+            ..Default::default()
+        },
     ];
     let info = vk::DescriptorSetLayoutCreateInfo {
         binding_count: bindings.len() as u32,
diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs
index 2cf6abf4..4856397a 100644
--- a/pomme-client/src/renderer/chunk/mesher.rs
+++ b/pomme-client/src/renderer/chunk/mesher.rs
@@ -94,7 +94,35 @@ pub struct SectionMesh {
     /// per-section upload/replace.
     pub section_index: i32,
     pub vertices: Vec<ChunkVertex>,
+    /// Solid (opaque) indices first, then cutout indices. `solid_index_count`
+    /// splits the two so each renders in its own pass.
     pub indices: Vec<u32>,
+    /// Number of leading `indices` that belong to the solid (no-discard) pass;
+    /// the rest are cutout (discard) geometry.
+    pub solid_index_count: u32,
+}
+
+/// Per-section meshing accumulator: one shared vertex pool plus separate solid
+/// and cutout index lists (routed per quad by sprite opacity). Finalized into a
+/// [`SectionMesh`] with the two index lists concatenated solid-first.
+#[derive(Default)]
+struct MeshSink {
+    vertices: Vec<ChunkVertex>,
+    solid: Vec<u32>,
+    cutout: Vec<u32>,
+}
+
+impl MeshSink {
+    /// Index list a quad's triangles go in: `solid` when `opaque` is true (the
+    /// no-discard pass), otherwise `cutout` (the discard pass, which takes
+    /// cutout/translucent sprites).
+    fn indices_for(&mut self, opaque: bool) -> &mut Vec<u32> {
+        if opaque {
+            &mut self.solid
+        } else {
+            &mut self.cutout
+        }
+    }
 }
 
 pub struct ChunkMeshData {
@@ -1249,19 +1277,14 @@ fn mesh_chunk_snapshot(
     let by_start = min_y + range.start * 16;
     let by_end = min_y + range.end * 16;
 
-    let mut sections: Vec<SectionMesh> = (0..section_count)
-        .map(|i| SectionMesh {
-            section_index: i,
-            vertices: Vec::new(),
-            indices: Vec::new(),
-        })
-        .collect();
+    let mut sinks: Vec<MeshSink> = (0..section_count).map(|_| MeshSink::default()).collect();
     // In-range sections get recycled buffers (capacity retained from earlier
     // meshes) so the worker fills them without going through the OS allocator.
+    // The cutout list stays un-pooled (empty for the common all-solid section).
     for si in range.clone() {
-        let sec = &mut sections[si as usize];
-        sec.vertices = pool.take_vertices();
-        sec.indices = pool.take_indices();
+        let sink = &mut sinks[si as usize];
+        sink.vertices = pool.take_vertices();
+        sink.solid = pool.take_indices();
     }
 
     // The type map is a state->id map, so it only needs the meshed span (+1-block
@@ -1276,11 +1299,11 @@ fn mesh_chunk_snapshot(
     let mut visibility: Vec<(i32, VisibilitySet)> = Vec::new();
     if let Some(ref tm) = type_map {
         for si in range.clone() {
-            let sec = &mut sections[si as usize];
+            let sink = &mut sinks[si as usize];
             let section_y = min_y + si * 16;
             let vis = greedy_mesh_section(
-                &mut sec.vertices,
-                &mut sec.indices,
+                &mut sink.vertices,
+                &mut sink.solid,
                 snapshot,
                 registry,
                 tm,
@@ -1344,73 +1367,27 @@ fn mesh_chunk_snapshot(
                 // a non-16-aligned world height can't index past the last section.
                 let s =
                     (((by - min_y) / 16) as usize).min((section_count as usize).saturating_sub(1));
-                let sec = &mut sections[s];
+                let sink = &mut sinks[s];
 
                 if lod > 0 {
                     emit_lod_cube(
-                        &mut sec.vertices,
-                        &mut sec.indices,
-                        block_pos,
-                        state,
-                        snapshot,
-                        registry,
-                        uv_map,
-                        bx,
-                        by,
-                        bz,
-                        step,
+                        sink, block_pos, state, snapshot, registry, uv_map, bx, by, bz, step,
                     );
                 } else if let BlockKind::Water | BlockKind::Lava = kind {
                     emit_fluid(
-                        &mut sec.vertices,
-                        &mut sec.indices,
-                        block_pos,
-                        state,
-                        snapshot,
-                        registry,
-                        uv_map,
-                        bx,
-                        by,
-                        bz,
+                        sink, block_pos, state, snapshot, registry, uv_map, bx, by, bz,
                     );
                 } else if let Some(baked) = registry.get_baked_model(state) {
                     emit_baked_model(
-                        &mut sec.vertices,
-                        &mut sec.indices,
-                        block_pos,
-                        baked,
-                        snapshot,
-                        registry,
-                        uv_map,
-                        bx,
-                        by,
-                        bz,
+                        sink, block_pos, baked, snapshot, registry, uv_map, bx, by, bz,
                     );
                 } else if let Some(quads) = registry.get_multipart_quads(state) {
                     emit_multipart(
-                        &mut sec.vertices,
-                        &mut sec.indices,
-                        block_pos,
-                        &quads,
-                        snapshot,
-                        registry,
-                        uv_map,
-                        bx,
-                        by,
-                        bz,
+                        sink, block_pos, &quads, snapshot, registry, uv_map, bx, by, bz,
                     );
                 } else if let Some(textures) = registry.get_textures(state) {
                     emit_cube_faces(
-                        &mut sec.vertices,
-                        &mut sec.indices,
-                        block_pos,
-                        textures,
-                        snapshot,
-                        registry,
-                        uv_map,
-                        bx,
-                        by,
-                        bz,
+                        sink, block_pos, textures, snapshot, registry, uv_map, bx, by, bz,
                     );
                 } else {
                     let block: Box<dyn azalea_block::BlockTrait> = state.into();
@@ -1418,16 +1395,7 @@ fn mesh_chunk_snapshot(
                     if logged_missing.insert(id.clone()) {
                         tracing::warn!("Missing model: {id}");
                     }
-                    emit_missing_cube(
-                        &mut sec.vertices,
-                        &mut sec.indices,
-                        block_pos,
-                        snapshot,
-                        registry,
-                        bx,
-                        by,
-                        bz,
-                    );
+                    emit_missing_cube(sink, block_pos, snapshot, registry, bx, by, bz);
                 }
                 by += step;
             }
@@ -1436,17 +1404,24 @@ fn mesh_chunk_snapshot(
         local_z += step;
     }
 
-    // Keep only non-empty sections; recycle the buffers of in-range sections that
-    // ended up empty (rather than dropping their retained capacity).
-    let mut kept = Vec::with_capacity(sections.len());
-    for s in sections {
-        if s.vertices.is_empty() || s.indices.is_empty() {
-            pool.recycle(s.vertices, s.indices);
-        } else {
-            kept.push(s);
+    // Finalize each non-empty section: concatenate cutout indices after solid
+    // (recording the split), keep the result. Empty in-range sections recycle
+    // their buffers rather than dropping the retained capacity.
+    let mut sections = Vec::with_capacity(sinks.len());
+    for (i, mut sink) in sinks.into_iter().enumerate() {
+        if sink.solid.is_empty() && sink.cutout.is_empty() {
+            pool.recycle(sink.vertices, sink.solid);
+            continue;
         }
+        let solid_index_count = sink.solid.len() as u32;
+        sink.solid.extend_from_slice(&sink.cutout);
+        sections.push(SectionMesh {
+            section_index: i as i32,
+            vertices: sink.vertices,
+            indices: sink.solid,
+            solid_index_count,
+        });
     }
-    let sections = kept;
 
     ChunkMeshData {
         pos,
@@ -1462,8 +1437,7 @@ fn mesh_chunk_snapshot(
 
 #[allow(clippy::too_many_arguments)]
 fn emit_baked_model(
-    vertices: &mut Vec<ChunkVertex>,
-    indices: &mut Vec<u32>,
+    sink: &mut MeshSink,
     block_pos: [f32; 3],
     model: &BakedModel,
     snapshot: &ChunkStoreSnapshot,
@@ -1495,8 +1469,7 @@ fn emit_baked_model(
             [quad.shade_light; 4]
         };
         emit_face(
-            vertices,
-            indices,
+            sink,
             block_pos,
             &quad.positions,
             &quad.uvs,
@@ -1509,8 +1482,7 @@ fn emit_baked_model(
 
 #[allow(clippy::too_many_arguments)]
 fn emit_cube_faces(
-    vertices: &mut Vec<ChunkVertex>,
-    indices: &mut Vec<u32>,
+    sink: &mut MeshSink,
     block_pos: [f32; 3],
     textures: &crate::world::block::registry::FaceTextures,
     snapshot: &ChunkStoreSnapshot,
@@ -1549,8 +1521,7 @@ fn emit_cube_faces(
         let is_side = i >= 2;
         if let Some(overlay) = textures.side_overlay.as_deref().filter(|_| is_side) {
             emit_face(
-                vertices,
-                indices,
+                sink,
                 block_pos,
                 &positions,
                 &uvs,
@@ -1560,8 +1531,7 @@ fn emit_cube_faces(
             );
             let overlay_region = uv_map.get_region(overlay);
             emit_face(
-                vertices,
-                indices,
+                sink,
                 block_pos,
                 &positions,
                 &uvs,
@@ -1577,9 +1547,7 @@ fn emit_cube_faces(
             } else {
                 PACKED_WHITE_SHIFTED
             };
-            emit_face(
-                vertices, indices, block_pos, &positions, &uvs, lights, region, face_tint,
-            );
+            emit_face(sink, block_pos, &positions, &uvs, lights, region, face_tint);
         }
     }
 }
@@ -1656,8 +1624,7 @@ fn block_face_tex_tint(
 
 #[allow(clippy::too_many_arguments)]
 fn emit_fluid(
-    vertices: &mut Vec<ChunkVertex>,
-    indices: &mut Vec<u32>,
+    sink: &mut MeshSink,
     block_pos: [f32; 3],
     state: azalea_block::BlockState,
     snapshot: &ChunkStoreSnapshot,
@@ -1694,16 +1661,13 @@ fn emit_fluid(
             }
         }
 
-        emit_face(
-            vertices, indices, block_pos, &positions, &uvs, [light; 4], region, tint,
-        );
+        emit_face(sink, block_pos, &positions, &uvs, [light; 4], region, tint);
     }
 }
 
 #[allow(clippy::too_many_arguments)]
 fn emit_multipart(
-    vertices: &mut Vec<ChunkVertex>,
-    indices: &mut Vec<u32>,
+    sink: &mut MeshSink,
     block_pos: [f32; 3],
     quads: &[&crate::world::block::model::BakedQuad],
     snapshot: &ChunkStoreSnapshot,
@@ -1730,8 +1694,7 @@ fn emit_multipart(
             snapshot.dry_foliage_tint(bx, by, bz),
         );
         emit_face(
-            vertices,
-            indices,
+            sink,
             block_pos,
             &quad.positions,
             &quad.uvs,
@@ -1744,8 +1707,7 @@ fn emit_multipart(
 
 #[allow(clippy::too_many_arguments)]
 fn emit_lod_cube(
-    vertices: &mut Vec<ChunkVertex>,
-    indices: &mut Vec<u32>,
+    sink: &mut MeshSink,
     block_pos: [f32; 3],
     state: azalea_block::BlockState,
     snapshot: &ChunkStoreSnapshot,
@@ -1788,9 +1750,9 @@ fn emit_lod_cube(
         let (positions, uvs, light) = cube_face_geometry(*dir);
         let s = step as f32;
         let sy = if is_fluid { fluid_top } else { s };
-        let base = vertices.len() as u32;
+        let base = sink.vertices.len() as u32;
         for i in 0..4 {
-            vertices.push(ChunkVertex {
+            sink.vertices.push(ChunkVertex {
                 position: [
                     block_pos[0] + positions[i][0] * s,
                     block_pos[1] + positions[i][1] * sy,
@@ -1803,7 +1765,14 @@ fn emit_lod_cube(
                 light_tint: pack_light_tint(light, tint),
             });
         }
-        indices.extend_from_slice(&[base, base + 1, base + 2, base + 2, base + 3, base]);
+        sink.indices_for(region.opaque).extend_from_slice(&[
+            base,
+            base + 1,
+            base + 2,
+            base + 2,
+            base + 3,
+            base,
+        ]);
     }
 }
 
@@ -1811,8 +1780,7 @@ const MISSING_TINT: u32 = pack_tint_shifted([1.0, 0.0, 1.0]);
 
 #[allow(clippy::too_many_arguments)]
 fn emit_missing_cube(
-    vertices: &mut Vec<ChunkVertex>,
-    indices: &mut Vec<u32>,
+    sink: &mut MeshSink,
     block_pos: [f32; 3],
     snapshot: &ChunkStoreSnapshot,
     registry: &BlockRegistry,
@@ -1828,9 +1796,9 @@ fn emit_missing_cube(
         }
 
         let (positions, _, light) = cube_face_geometry(*dir);
-        let base = vertices.len() as u32;
+        let base = sink.vertices.len() as u32;
         for pos in &positions {
-            vertices.push(ChunkVertex {
+            sink.vertices.push(ChunkVertex {
                 position: [
                     block_pos[0] + pos[0],
                     block_pos[1] + pos[1],
@@ -1840,7 +1808,9 @@ fn emit_missing_cube(
                 light_tint: pack_light_tint(light, MISSING_TINT),
             });
         }
-        indices.extend_from_slice(&[base, base + 1, base + 2, base + 2, base + 3, base]);
+        // The missing tile is a solid checker, so the cube goes in the solid pass.
+        sink.solid
+            .extend_from_slice(&[base, base + 1, base + 2, base + 2, base + 3, base]);
     }
 }
 
@@ -1855,8 +1825,7 @@ pub(crate) const CUBE_FACE_DIRS: [Direction; 6] = [
 
 #[allow(clippy::too_many_arguments)]
 fn emit_face(
-    vertices: &mut Vec<ChunkVertex>,
-    indices: &mut Vec<u32>,
+    sink: &mut MeshSink,
     block_pos: [f32; 3],
     positions: &[[f32; 3]; 4],
     uvs: &[[f32; 2]; 4],
@@ -1864,12 +1833,12 @@ fn emit_face(
     region: AtlasRegion,
     tint: u32,
 ) {
-    let base = vertices.len() as u32;
+    let base = sink.vertices.len() as u32;
     let u_span = region.u_max - region.u_min;
     let v_span = region.v_max - region.v_min;
 
     for i in 0..4 {
-        vertices.push(ChunkVertex {
+        sink.vertices.push(ChunkVertex {
             position: [
                 block_pos[0] + positions[i][0],
                 block_pos[1] + positions[i][1],
@@ -1883,6 +1852,7 @@ fn emit_face(
         });
     }
 
+    let indices = sink.indices_for(region.opaque);
     if lights[0] + lights[2] > lights[1] + lights[3] {
         indices.extend_from_slice(&[base + 1, base + 2, base + 3, base + 3, base, base + 1]);
     } else {
diff --git a/pomme-client/src/renderer/mod.rs b/pomme-client/src/renderer/mod.rs
index a941b612..eb8a8c8a 100644
--- a/pomme-client/src/renderer/mod.rs
+++ b/pomme-client/src/renderer/mod.rs
@@ -1439,8 +1439,12 @@ impl Renderer {
                     .update_and_draw(&self.ctx.device, cmd, frame, &self.camera, sky);
 
                 let t_cull = std::time::Instant::now();
-                self.chunk_pipeline.bind(cmd, frame);
-                self.chunk_buffers.draw_indirect(cmd, frame);
+                // Solid (no discard) first so it lays down depth and early-Z lets
+                // the front-to-back order reject occluded fragments; cutout after.
+                self.chunk_pipeline.bind(cmd, frame, false);
+                self.chunk_buffers.draw_indirect(cmd, frame, false);
+                self.chunk_pipeline.bind(cmd, frame, true);
+                self.chunk_buffers.draw_indirect(cmd, frame, true);
                 let cull_ms = t_cull.elapsed().as_secs_f32() * 1000.0;
 
                 if let Some((block_pos, stage, state)) = destroy_info {
diff --git a/pomme-client/src/renderer/pipelines/chunk.rs b/pomme-client/src/renderer/pipelines/chunk.rs
index c909ff62..1320c504 100644
--- a/pomme-client/src/renderer/pipelines/chunk.rs
+++ b/pomme-client/src/renderer/pipelines/chunk.rs
@@ -9,7 +9,10 @@ use crate::renderer::chunk::atlas::TextureAtlas;
 use crate::renderer::{MAX_FRAMES_IN_FLIGHT, shader, util};
 
 pub struct ChunkPipeline {
-    pub pipeline: vk::Pipeline,
+    /// Opaque terrain: no discard, early-Z. Drawn first (front-to-back).
+    pub pipeline_solid: vk::Pipeline,
+    /// Cutout/translucent terrain: alpha-test discard. Drawn after solid.
+    pub pipeline_cutout: vk::Pipeline,
     pub pipeline_layout: vk::PipelineLayout,
     pub descriptor_set_layout_camera: vk::DescriptorSetLayout,
     pub descriptor_set_layout_atlas: vk::DescriptorSetLayout,
@@ -48,7 +51,8 @@ impl ChunkPipeline {
             .create_pipeline_layout(&layout_info, None)
             .expect("failed to create pipeline layout");
 
-        let pipeline = create_pipeline(device, render_pass, pipeline_layout);
+        let (pipeline_solid, pipeline_cutout) =
+            create_pipelines(device, render_pass, pipeline_layout);
 
         let pool_sizes = [
             vk::DescriptorPoolSize {
@@ -139,7 +143,8 @@ impl ChunkPipeline {
         device.update_descriptor_sets(&[atlas_write], &[]);
 
         Self {
-            pipeline,
+            pipeline_solid,
+            pipeline_cutout,
             pipeline_layout,
             descriptor_set_layout_camera: camera_layout,
             descriptor_set_layout_atlas: atlas_layout,
@@ -174,8 +179,13 @@ impl ChunkPipeline {
         device.update_descriptor_sets(&[write], &[]);
     }
 
-    pub fn bind(&self, cmd: vk::CommandBuffer, frame: usize) {
-        cmd.bind_pipeline(vk::PipelineBindPoint::Graphics, self.pipeline);
+    pub fn bind(&self, cmd: vk::CommandBuffer, frame: usize, cutout: bool) {
+        let pipeline = if cutout {
+            self.pipeline_cutout
+        } else {
+            self.pipeline_solid
+        };
+        cmd.bind_pipeline(vk::PipelineBindPoint::Graphics, pipeline);
         cmd.bind_descriptor_sets(
             vk::PipelineBindPoint::Graphics,
             self.pipeline_layout,
@@ -197,7 +207,8 @@ impl ChunkPipeline {
         }
         drop(alloc);
 
-        device.destroy_pipeline(self.pipeline, None);
+        device.destroy_pipeline(self.pipeline_solid, None);
+        device.destroy_pipeline(self.pipeline_cutout, None);
         device.destroy_pipeline_layout(self.pipeline_layout, None);
         device.destroy_descriptor_pool(self.descriptor_pool, None);
         device.destroy_descriptor_set_layout(self.descriptor_set_layout_camera, None);
@@ -217,20 +228,21 @@ fn shader_stage(
     }
 }
 
-fn create_pipeline(
+/// Builds the two chunk pipelines: `solid` (chunk_solid.frag, no discard,
+/// early-Z) and `cutout` (chunk.frag, alpha-test discard). Identical state
+/// otherwise; both share the vertex shader and layout.
+fn create_pipelines(
     device: &vk::Device,
     render_pass: vk::RenderPass,
     layout: vk::PipelineLayout,
-) -> vk::Pipeline {
+) -> (vk::Pipeline, vk::Pipeline) {
     let vert_module =
         shader::create_shader_module(device, shader::include_spirv!("chunk.vert.spv"));
-    let frag_module =
+    let solid_frag =
+        shader::create_shader_module(device, shader::include_spirv!("chunk_solid.frag.spv"));
+    let cutout_frag =
         shader::create_shader_module(device, shader::include_spirv!("chunk.frag.spv"));
 
-    let stages = [
-        shader_stage(vk::ShaderStageFlags::Vertex, vert_module),
-        shader_stage(vk::ShaderStageFlags::Fragment, frag_module),
-    ];
     let blend_attachment = [vk::PipelineColorBlendAttachmentState {
         blend_enable: vk::FALSE,
         color_write_mask: vk::ColorComponentFlags::RGBA,
@@ -242,10 +254,20 @@ fn create_pipeline(
         ..Default::default()
     };
 
-    let pipeline = build_pipeline(device, render_pass, layout, &stages, &color_blend);
+    let build = |frag| {
+        let stages = [
+            shader_stage(vk::ShaderStageFlags::Vertex, vert_module),
+            shader_stage(vk::ShaderStageFlags::Fragment, frag),
+        ];
+        build_pipeline(device, render_pass, layout, &stages, &color_blend)
+    };
+    let pipeline_solid = build(solid_frag);
+    let pipeline_cutout = build(cutout_frag);
+
     device.destroy_shader_module(vert_module, None);
-    device.destroy_shader_module(frag_module, None);
-    pipeline
+    device.destroy_shader_module(solid_frag, None);
+    device.destroy_shader_module(cutout_frag, None);
+    (pipeline_solid, pipeline_cutout)
 }
 
 /// Shared chunk pipeline state; callers supply the shader stages and
diff --git a/pomme-client/src/renderer/shaders/chunk_solid.frag b/pomme-client/src/renderer/shaders/chunk_solid.frag
new file mode 100644
index 00000000..250aa6f3
--- /dev/null
+++ b/pomme-client/src/renderer/shaders/chunk_solid.frag
@@ -0,0 +1,36 @@
+#version 450
+
+// Solid (opaque) terrain pass. Unlike chunk.frag this has no `discard`, so the
+// driver keeps early-Z: with the front-to-back draw order, fragments occluded by
+// nearer terrain are rejected before this shader runs. `early_fragment_tests`
+// makes that explicit. Only sprites with no transparent texels are routed here
+// (see AtlasRegion::opaque), so the alpha test chunk.frag does is unnecessary.
+layout(early_fragment_tests) in;
+
+#include "fog.glsl"
+
+layout(set = 1, binding = 0) uniform sampler2D atlas_texture;
+
+layout(location = 0) in vec2 v_tex_coords;
+layout(location = 1) in float v_light;
+layout(location = 2) in vec3 v_tint;
+layout(location = 3) flat in float v_visibility;
+layout(location = 4) in vec3 v_fog_color;
+layout(location = 5) in float v_fog;
+
+layout(location = 0) out vec4 out_color;
+
+void main() {
+    // Only RGB is sampled; this pass always writes alpha = 1.0.
+    vec3 texel = texture(atlas_texture, v_tex_coords).rgb;
+    // Tint and light are each raised to the power 2.2 before multiplying in.
+    vec3 shaded = texel * pow(v_tint, vec3(2.2)) * pow(v_light, 2.2);
+
+    // Below full visibility, blend from the fog colour toward the shaded colour.
+    if (v_visibility < 1.0) {
+        shaded = mix(v_fog_color, shaded, v_visibility);
+    }
+
+    // apply_fog is not defined in this file; presumably it comes from fog.glsl.
+    out_color = vec4(apply_fog(shaded, v_fog, v_fog_color), 1.0);
+}
diff --git a/pomme-client/src/renderer/shaders/cull.comp b/pomme-client/src/renderer/shaders/cull.comp
index 321bc0a5..2d311c7d 100644
--- a/pomme-client/src/renderer/shaders/cull.comp
+++ b/pomme-client/src/renderer/shaders/cull.comp
@@ -40,6 +40,32 @@ layout(set = 0, binding = 3) buffer CountBuf {
     uint draw_count;
 };
 
+layout(set = 0, binding = 4) writeonly buffer IndirectCutoutBuf {
+    DrawCmd cutout_draws[];
+};
+
+layout(set = 0, binding = 5) buffer CutoutCountBuf {
+    uint cutout_draw_count;
+};
+
+// Append one indexed draw for indices [first, first + count) of a section to
+// the cutout list when `cutout` is set, otherwise to the solid list. Meta entry
+// `idx` is passed as first_instance to supply per-instance attributes (origin + fade).
+void emit(uint count, uint first, int vertex_offset, uint idx, bool cutout) {
+    if (count == 0u) return;
+    DrawCmd cmd;
+    cmd.index_count = count;
+    cmd.instance_count = 1u;
+    cmd.first_index = first;
+    cmd.vertex_offset = vertex_offset;
+    cmd.first_instance = idx;
+    if (cutout) {
+        cutout_draws[atomicAdd(cutout_draw_count, 1u)] = cmd;
+        return;
+    }
+    draws[atomicAdd(draw_count, 1u)] = cmd;
+}
+
 void main() {
     uint idx = gl_GlobalInvocationID.x;
     if (idx >= chunk_count) return;
@@ -60,12 +86,8 @@ void main() {
         if (d < 0.0) return;
     }
 
-    uint slot = atomicAdd(draw_count, 1u);
-    draws[slot].index_count = m.index_count;
-    draws[slot].instance_count = 1u;
-    draws[slot].first_index = m.first_index;
-    draws[slot].vertex_offset = m.vertex_offset;
-    // first_instance routes the draw to its meta entry (origin + fade) which the
-    // vertex shader reads as a per-instance attribute.
-    draws[slot].first_instance = idx;
+    // origin.w packs the section's solid index count; indices split solid-first.
+    uint solid_count = floatBitsToUint(m.origin.w);
+    emit(solid_count, m.first_index, m.vertex_offset, idx, false);
+    emit(m.index_count - solid_count, m.first_index + solid_count, m.vertex_offset, idx, true);
 }