From d0accfc8b13516b3763b8c5df1e8cfc8eacfe87a Mon Sep 17 00:00:00 2001 From: Purdze Date: Tue, 23 Jun 2026 22:35:49 +0100 Subject: [PATCH 1/4] Fix chunk-load frame spikes and improve streaming throughput --- Cargo.lock | 39 +- pomme-client/Cargo.toml | 4 + pomme-client/src/app/core.rs | 11 + pomme-client/src/app/phases/connecting.rs | 6 +- pomme-client/src/app/phases/in_game.rs | 59 ++- pomme-client/src/benchmark.rs | 73 +++- pomme-client/src/main.rs | 17 +- pomme-client/src/renderer/chunk/buffer.rs | 492 +++++++++++++--------- pomme-client/src/renderer/chunk/mesher.rs | 131 +++++- pomme-client/src/renderer/mod.rs | 9 +- 10 files changed, 595 insertions(+), 246 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index 7dbe63f9..4364cdb2 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -247,7 +247,7 @@ dependencies = [ "objc2-foundation 0.3.2", "parking_lot", "percent-encoding", - "windows-sys 0.59.0", + "windows-sys 0.60.2", "x11rb", ] @@ -1899,7 +1899,7 @@ dependencies = [ "libc", "option-ext", "redox_users", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -2276,7 +2276,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "39cab71617ae0d63f51a36d69f866391735b51691dbda63cf6f96d042b63efeb" dependencies = [ "libc", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -3834,6 +3834,15 @@ version = "0.2.16" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b6d2cec3eae94f9f509c767b45932f1ada8350c4bdb85af2fcab4a3c14807981" +[[package]] +name = "libmimalloc-sys" +version = "0.1.49" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6a45a52f43e1c16f667ccfe4dd8c85b7f7c204fd5e3bf46c5b0db9a5c3c0b8e9" +dependencies = [ + "cc", +] + [[package]] name = "libredox" version = "0.1.16" @@ -3996,6 +4005,15 @@ dependencies = [ "autocfg", ] +[[package]] +name = "mimalloc" +version = "0.1.52" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"2d4139bb28d14ad1facf21d5eb8825051b326e172d216b39f6d31df53cc97862" +dependencies = [ + "libmimalloc-sys", +] + [[package]] name = "mime" version = "0.3.17" @@ -4161,7 +4179,7 @@ version = "0.50.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7957b9740744892f114936ab4a57b3f487491bbeafaf8083688b16841a4240e5" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -4811,7 +4829,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d8fae84b431384b68627d0f9b3b1245fcf9f46f6c0e3dc902e9dce64edd1967" dependencies = [ "libc", - "windows-sys 0.45.0", + "windows-sys 0.61.2", ] [[package]] @@ -5113,6 +5131,7 @@ dependencies = [ "gilrs", "glam", "image", + "mimalloc", "open", "parking_lot", "png 0.17.16", @@ -5428,7 +5447,7 @@ dependencies = [ "once_cell", "socket2", "tracing", - "windows-sys 0.59.0", + "windows-sys 0.60.2", ] [[package]] @@ -5904,7 +5923,7 @@ dependencies = [ "errno", "libc", "linux-raw-sys 0.12.1", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -5961,7 +5980,7 @@ dependencies = [ "security-framework 3.7.0", "security-framework-sys", "webpki-root-certs", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -7446,7 +7465,7 @@ dependencies = [ "getrandom 0.4.2", "once_cell", "rustix 1.1.4", - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] @@ -8585,7 +8604,7 @@ version = "0.1.11" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c2a7b1c03c876122aa43f3020e6c3c3ee5c05081c9a00739faf7503aeba10d22" dependencies = [ - "windows-sys 0.59.0", + "windows-sys 0.61.2", ] [[package]] diff --git a/pomme-client/Cargo.toml b/pomme-client/Cargo.toml index c17596c5..c46d2f64 100644 --- a/pomme-client/Cargo.toml +++ b/pomme-client/Cargo.toml @@ -23,6 +23,10 @@ tracing = { workspace = true } tracing-subscriber = { workspace = true, features = ["env-filter"] } tracing-appender = "0.2" +# Per-thread-heap allocator: the chunk-mesh 
worker pool churns vertex/index Vecs +# across threads, and the default system heap's global lock serializes that and +# stalls the main thread. Mirrors vanilla (JVM TLABs + LWJGL's jemalloc). +mimalloc = "0.1" pomme-gpu-allocator = { path = "../pomme-gpu-allocator" } winit = { version = "0.30", features = ["rwh_06"] } raw-window-handle = "0.6" diff --git a/pomme-client/src/app/core.rs b/pomme-client/src/app/core.rs index 1bca4786..5413e3f2 100644 --- a/pomme-client/src/app/core.rs +++ b/pomme-client/src/app/core.rs @@ -313,6 +313,9 @@ impl AppCore { ) -> Option { let rx = &connection.event_rx; + // Phase timers for the chunk-load benchmark's worst-frame breakdown. + let t_net = std::time::Instant::now(); + let mut chunks_to_mesh = Vec::new(); // Block edits go on the priority lane so they apply instantly even while // chunks stream in, instead of starving behind the load backlog. @@ -986,8 +989,16 @@ impl AppCore { // then enqueue everything that needs meshing — visible-first, with hidden // columns backfilled at a bounded rate so the world still completes. 
let loads_happened = !chunks_to_mesh.is_empty(); + let ms = |t: std::time::Instant| t.elapsed().as_secs_f32() * 1000.0; + game.last_update_phases.net_decode_ms = ms(t_net); + + let t_vis = std::time::Instant::now(); game.update_visibility(renderer, player_chunk, loads_happened); + game.last_update_phases.visibility_ms = ms(t_vis); + + let t_rescan = std::time::Instant::now(); game.rescan_mesh_jobs(player_chunk); + game.last_update_phases.rescan_ms = ms(t_rescan); disconnect_reason } diff --git a/pomme-client/src/app/phases/connecting.rs b/pomme-client/src/app/phases/connecting.rs index 0c92ac63..b5805825 100644 --- a/pomme-client/src/app/phases/connecting.rs +++ b/pomme-client/src/app/phases/connecting.rs @@ -37,8 +37,10 @@ pub fn update_connecting( if matches!(connect_phase, ConnectionPhase::Loading) { game.mesh_dispatcher .set_camera_position(*game.player.position); - for mesh in game.mesh_dispatcher.drain_results() { - gfx.renderer.upload_chunk_mesh(&mesh); + let ready_meshes: Vec<_> = game.mesh_dispatcher.drain_results().collect(); + gfx.renderer.upload_chunk_meshes(&ready_meshes); + for mesh in ready_meshes { + game.mesh_dispatcher.recycle(mesh); } let ready = game.position_set && (game.dead || gfx.renderer.loaded_chunk_count() > 0); diff --git a/pomme-client/src/app/phases/in_game.rs b/pomme-client/src/app/phases/in_game.rs index 5b74eda3..02f590b9 100644 --- a/pomme-client/src/app/phases/in_game.rs +++ b/pomme-client/src/app/phases/in_game.rs @@ -91,6 +91,9 @@ pub struct GameState { /// In-flight/finished upload of the chunk-load result, while its overlay is /// shown. pub chunk_load_upload: Option, + /// Last frame's `update_game` CPU phase timings, for the chunk-load + /// benchmark's worst-frame breakdown. + pub last_update_phases: crate::benchmark::UpdatePhases, /// Monotonic content generation per column, bumped on every edit (and chunk /// load). 
This is the dirty marker: a column needs (re)meshing whenever its /// `content_gen` outruns what was last enqueued, regardless of visibility, @@ -199,6 +202,7 @@ impl GameState { chunk_load_result: None, chunk_load_abort: false, chunk_load_upload: None, + last_update_phases: crate::benchmark::UpdatePhases::default(), content_gen: HashMap::new(), meshed: HashMap::new(), vis_mask: HashMap::new(), @@ -596,6 +600,11 @@ pub fn update_game( connection: &ConnectionHandle, game: &mut GameState, ) -> GameUpdateResult { + // Snapshot last frame's phase timings before this frame overwrites them: they + // align with `raw_dt`, which measures the previous frame's full duration. + let frame_start = std::time::Instant::now(); + let prev_phases = game.last_update_phases; + // Position the audio listener at the player's head and push current // volumes before draining sound packets this frame. let listener_pos = game.player.eye_pos(); @@ -611,7 +620,13 @@ pub fn update_game( return GameUpdateResult::Disconnected { reason }; } - for mesh in game.mesh_dispatcher.drain_results() { + // Collect the frame's ready meshes, apply their CPU-side bookkeeping, then + // upload them in one coalesced GPU transfer (one fence wait, not one per + // mesh) to avoid the streaming stutter from per-mesh `queue.wait_idle`. + let drain_start = std::time::Instant::now(); + let results: Vec<_> = game.mesh_dispatcher.drain_results().collect(); + let mut batch = Vec::with_capacity(results.len()); + for mut mesh in results { // Drop a mesh built from an out-of-date snapshot. Edits (priority lane, // single section) are keyed per section so editing one section never // drops a sibling's in-flight result; bulk loads keep the column key. 
@@ -623,6 +638,7 @@ pub fn update_game( mesh.content_gen < game.content_gen.get(&mesh.pos).copied().unwrap_or(0) }; if stale { + game.mesh_dispatcher.recycle(mesh); continue; } if let Some(t) = &mesh.timing { @@ -637,24 +653,34 @@ pub fn update_game( ms(t.enqueued_at.elapsed()), ); } - let dropped = gfx.renderer.upload_chunk_mesh(&mesh); + // Visibility updates are independent of the GPU upload; apply them now so + // the mesh can move into the upload batch. let pos = mesh.pos; - // Sections dropped on pool exhaustion were retired from the buffer; clear - // their meshed bit so the next rescan re-enqueues them. - if !dropped.is_empty() - && let Some(m) = game.meshed.get_mut(&pos) - { - for si in dropped { - m.mask &= !(1u32 << si); - } - } - for (si, vis) in mesh.visibility { + for (si, vis) in std::mem::take(&mut mesh.visibility) { let e = game.section_vis_epoch.entry((pos, si)).or_insert(0); if mesh.upload_epoch >= *e { *e = mesh.upload_epoch; game.section_vis.insert((pos, si), vis); } } + batch.push(mesh); + } + game.last_update_phases.mesh_drain_ms = drain_start.elapsed().as_secs_f32() * 1000.0; + let upload_start = std::time::Instant::now(); + let dropped = gfx.renderer.upload_chunk_meshes(&batch); + game.last_update_phases.upload_ms = upload_start.elapsed().as_secs_f32() * 1000.0; + // Sections dropped on pool exhaustion were retired from the buffer; clear + // their meshed bit so the next rescan re-enqueues them. + for (pos, sections) in dropped { + if let Some(m) = game.meshed.get_mut(&pos) { + for si in sections { + m.mask &= !(1u32 << si); + } + } + } + // Return the uploaded meshes' buffers to the worker pool for reuse. 
+ for mesh in batch { + game.mesh_dispatcher.recycle(mesh); } game.mesh_dispatcher @@ -930,7 +956,12 @@ pub fn update_game( if let Some(mut bench) = game.chunk_load_bench.take() { let count = gfx.renderer.loaded_chunk_count(); - match bench.update(count, raw_dt * 1000.0) { + match bench.update( + count, + raw_dt * 1000.0, + gfx.renderer.last_timings(), + prev_phases, + ) { ChunkLoadStep::Wait => { game.chunk_load_bench = Some(bench); } @@ -1382,6 +1413,8 @@ pub fn update_game( ) { tracing::error!("Render error: {e}"); } + // Whole-frame wall time (incl. render), read next frame to align with `raw_dt`. + game.last_update_phases.update_ms = frame_start.elapsed().as_secs_f32() * 1000.0; if close_inventory { game.inventory_open = false; diff --git a/pomme-client/src/benchmark.rs b/pomme-client/src/benchmark.rs index d3a38643..f7cd9fc6 100644 --- a/pomme-client/src/benchmark.rs +++ b/pomme-client/src/benchmark.rs @@ -247,6 +247,40 @@ fn radius_from_chunk_count(count: u32) -> u32 { (((count as f32).sqrt() - 1.0) / 2.0).round().max(0.0) as u32 } +/// `update_game`'s CPU phase timings — the per-frame work not covered by the +/// render timings. Set each frame and folded into [`FrameBreakdown`]. +/// `update_ms` is the whole-`update_game` wall time (including the render +/// call); if it is far below `total_ms`, the hitch is outside `update_game` +/// (framerate limiter / OS scheduling / inter-frame gap) rather than in any CPU +/// phase. +#[derive(Clone, Copy, Default, serde::Serialize)] +pub struct UpdatePhases { + pub update_ms: f32, + pub net_decode_ms: f32, + pub visibility_ms: f32, + pub rescan_ms: f32, + pub mesh_drain_ms: f32, + pub upload_ms: f32, +} + +/// Phase split of a run's single worst frame, to localize a hitch. `total_ms` +/// is the wall-clock frame (`raw_dt`); `render_ms` the `render_frame` portion +/// (which includes `fence_ms`, the GPU-bound wait); the `update` phases cover +/// the rest. 
All sub-timings reflect the same prior frame `raw_dt` measures, so +/// the split lines up; whatever `total_ms` exceeds the parts is time spent +/// outside `update_game` (limiter / OS scheduling / inter-frame gap). +#[derive(Clone, Default, serde::Serialize)] +pub struct FrameBreakdown { + pub total_ms: f32, + pub render_ms: f32, + pub fence_ms: f32, + pub acquire_ms: f32, + pub cull_ms: f32, + pub present_ms: f32, + #[serde(flatten)] + pub update: UpdatePhases, +} + /// One reset→load cycle's measurements. #[derive(Clone, serde::Serialize)] pub struct ChunkLoadRun { @@ -256,6 +290,7 @@ pub struct ChunkLoadRun { pub time_to_first_secs: f32, pub avg_frame_ms: f32, pub worst_frame_ms: f32, + pub worst_frame_breakdown: FrameBreakdown, } #[derive(Clone, serde::Serialize)] @@ -293,6 +328,9 @@ pub struct ChunkLoadResult { pub avg_frame_ms: f32, pub worst_frame_ms: f32, pub runs_detail: Vec, + /// Phase split of the worst frame across the measured runs — what the spike + /// was actually spent on. + pub worst_frame_breakdown: FrameBreakdown, /// "debug" or "release" — see [`build_profile`]. pub profile: String, pub measurement_note: String, @@ -348,6 +386,8 @@ pub struct ChunkLoadBench { first_load_at: Option, frame_ms_sum: f32, frame_ms_max: f32, + /// Phase split of the current run's worst frame so far. + worst_breakdown: FrameBreakdown, frame_samples: u32, /// How many reset→load cycles have finished (warmup + measured). 
runs_done: u32, @@ -389,13 +429,20 @@ impl ChunkLoadBench { first_load_at: None, frame_ms_sum: 0.0, frame_ms_max: 0.0, + worst_breakdown: FrameBreakdown::default(), frame_samples: 0, runs_done: 0, completed: Vec::new(), } } - pub fn update(&mut self, loaded_count: u32, frame_ms: f32) -> ChunkLoadStep { + pub fn update( + &mut self, + loaded_count: u32, + frame_ms: f32, + timings: &RenderTimings, + phases: UpdatePhases, + ) -> ChunkLoadStep { match self.phase { ChunkPhase::Reset => { // Wait for the unload to settle (count stops dropping) so the @@ -421,7 +468,18 @@ impl ChunkLoadBench { } ChunkPhase::Load => { self.frame_ms_sum += frame_ms; - self.frame_ms_max = self.frame_ms_max.max(frame_ms); + if frame_ms > self.frame_ms_max { + self.frame_ms_max = frame_ms; + self.worst_breakdown = FrameBreakdown { + total_ms: frame_ms, + render_ms: timings.frame_ms, + fence_ms: timings.fence_ms, + acquire_ms: timings.acquire_ms, + cull_ms: timings.cull_ms, + present_ms: timings.present_ms, + update: phases, + }; + } self.frame_samples += 1; if loaded_count != self.last_count { @@ -465,6 +523,7 @@ impl ChunkLoadBench { time_to_first_secs, avg_frame_ms, worst_frame_ms: self.frame_ms_max, + worst_frame_breakdown: self.worst_breakdown.clone(), }); self.runs_done += 1; @@ -482,6 +541,7 @@ impl ChunkLoadBench { self.first_load_at = None; self.frame_ms_sum = 0.0; self.frame_ms_max = 0.0; + self.worst_breakdown = FrameBreakdown::default(); self.frame_samples = 0; ChunkLoadStep::Load(CHUNK_LOAD_MIN_RD) } else { @@ -524,6 +584,15 @@ impl ChunkLoadBench { .iter() .map(|r| r.worst_frame_ms) .fold(0.0, f32::max), + worst_frame_breakdown: measured + .iter() + .max_by(|a, b| { + a.worst_frame_ms + .partial_cmp(&b.worst_frame_ms) + .unwrap_or(std::cmp::Ordering::Equal) + }) + .map(|r| r.worst_frame_breakdown.clone()) + .unwrap_or_default(), runs_detail: measured.to_vec(), profile: build_profile().to_owned(), measurement_note: MEASUREMENT_NOTE.to_owned(), diff --git a/pomme-client/src/main.rs 
b/pomme-client/src/main.rs index 84e39bae..b55a3e09 100644 --- a/pomme-client/src/main.rs +++ b/pomme-client/src/main.rs @@ -1,3 +1,9 @@ +// Per-thread-heap allocator (see Cargo.toml): keeps the chunk-mesh worker +// pool's cross-thread Vec churn from serializing on the system heap's global +// lock and stalling the main thread. +#[global_allocator] +static GLOBAL: mimalloc::MiMalloc = mimalloc::MiMalloc; + mod app; mod args; mod assets; @@ -96,7 +102,16 @@ fn main() { data_dirs.ensure_game_dir().ok(); tracing::info!("Installation directory: {}", data_dirs.game_dir.display()); - let rt = Arc::new(tokio::runtime::Runtime::new().expect("Failed to create tokio runtime")); + // A single connection needs only a few async workers; the default runtime + // spawns one per core and floods them decoding the chunk-load burst, starving + // the render/mesh threads. Cap it so those cores stay free. + let rt = Arc::new( + tokio::runtime::Builder::new_multi_thread() + .worker_threads(4) + .enable_all() + .build() + .expect("Failed to create tokio runtime"), + ); let user = UserData::from_args(args.username, args.uuid, args.access_token); diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs index 4c7f10eb..90604885 100644 --- a/pomme-client/src/renderer/chunk/buffer.rs +++ b/pomme-client/src/renderer/chunk/buffer.rs @@ -180,6 +180,9 @@ pub struct ChunkBufferStore { staging_size: u64, transfer_pool: vk::CommandPool, transfer_cmd: vk::CommandBuffer, + /// Signals completion of a batched staging->device transfer. Reused (reset + /// before each submit) so a frame's uploads sync once instead of per-mesh. + transfer_fence: vk::Fence, use_staging: bool, /// Exact-size sub-allocators over the vertex and index pools (in elements). 
@@ -269,7 +272,14 @@ impl ChunkBufferStore { (vb, va, ib, ia) }; - let staging_size = BYTES_PER_BUCKET * 4; + // Discrete GPUs batch a frame's uploads through this buffer in one + // transfer, so size it to hold several columns and keep sub-flushes rare. + // The integrated path writes mapped memory directly and never touches it. + let staging_size = if use_staging { + BYTES_PER_BUCKET * 16 + } else { + BYTES_PER_BUCKET * 4 + }; let (staging_buffer, staging_alloc) = util::create_host_buffer( device, allocator, @@ -299,6 +309,10 @@ impl ChunkBufferStore { } .expect("failed to alloc transfer cmd"); + let transfer_fence = device + .create_fence(&vk::FenceCreateInfo::default(), None) + .expect("failed to create transfer fence"); + tracing::info!( "Chunk buffers: {} (vertex={} MB, index={} MB, staging={} KB)", if use_staging { @@ -314,7 +328,11 @@ impl ChunkBufferStore { let vtx_free = FreeList::new(total_buckets * BUCKET_VERTICES); let idx_free = FreeList::new(total_buckets * BUCKET_INDICES); - let max_meta = (total_buckets * 2) as usize; + // Per-section packing yields many more draws than buckets, so pre-size + // generously: growth (`ensure_meta_capacity`) needs a `device.wait_idle` + // to safely rewrite the descriptor sets, and we don't want that stall + // firing mid-stream. The remaining grow path stays as a rare safety net. + let max_meta = (total_buckets * 16).max(8192) as usize; let meta_size = (max_meta * size_of::()) as u64; let indirect_size = (max_meta * size_of::()) as u64; let count_size = 4u64; @@ -493,6 +511,7 @@ impl ChunkBufferStore { staging_size, transfer_pool, transfer_cmd, + transfer_fence, use_staging, vtx_free, idx_free, @@ -526,17 +545,65 @@ impl ChunkBufferStore { self.last_draw_count } - /// Upload a mesh result, replacing the sections in `mesh.replaced`. Returns - /// the section indices that were dropped due to pool exhaustion (and so - /// need re-meshing); empty on success or for the permanent "too large" - /// skip. 
- pub fn upload( + /// Submit the accumulated staging copies as a single transfer and block on + /// a fence until it completes. One fence wait per call replaces the old + /// per-mesh `queue.wait_idle`, so a frame's uploads synchronize once + /// instead of once per mesh. + fn flush_transfer( + &mut self, + device: &vk::Device, + queue: vk::Queue, + copy_v: &[vk::BufferCopy], + copy_i: &[vk::BufferCopy], + ) { + if copy_v.is_empty() && copy_i.is_empty() { + return; + } + let begin = vk::CommandBufferBeginInfo { + flags: vk::CommandBufferUsageFlags::OneTimeSubmit, + ..Default::default() + }; + self.transfer_cmd.begin(&begin).unwrap(); + if !copy_v.is_empty() { + self.transfer_cmd + .copy_buffer(self.staging_buffer, self.vertex_buffer, copy_v); + } + if !copy_i.is_empty() { + self.transfer_cmd + .copy_buffer(self.staging_buffer, self.index_buffer, copy_i); + } + self.transfer_cmd.end().unwrap(); + let submit = [vk::SubmitInfo { + command_buffer_count: 1, + command_buffers: &self.transfer_cmd.handle(), + ..Default::default() + }]; + device.reset_fences(&[self.transfer_fence]).unwrap(); + queue.submit(&submit, self.transfer_fence).unwrap(); + device + .wait_for_fences(&[self.transfer_fence], true, u64::MAX) + .unwrap(); + } + + /// Upload a batch of mesh results, each replacing the sections in its + /// `mesh.replaced` range. Staging copies for the whole batch are coalesced + /// into as few transfers as the staging buffer holds (one per overflow, + /// plus a final flush), each synchronized by a single fence wait — so a + /// streaming frame stalls once, not once per mesh. Returns, per mesh + /// that hit pool exhaustion, the section indices that were dropped and + /// need re-meshing. 
+ pub fn upload_batch( &mut self, device: &vk::Device, allocator: &Arc>, queue: vk::Queue, - mesh: &ChunkMeshData, - ) -> Vec { + meshes: &[ChunkMeshData], + ) -> Vec<(ChunkPos, Vec)> { + let mut needs_remesh: Vec<(ChunkPos, Vec)> = Vec::new(); + if meshes.is_empty() { + return needs_remesh; + } + // Tight AABB over a section's own vertices (better cull granularity than // the chunk-column bounds; also robust to LOD cubes that exceed 16 tall). fn section_aabb(verts: &[ChunkVertex]) -> ChunkAABB { @@ -554,6 +621,18 @@ impl ChunkBufferStore { } } + // Sub-allocate an exact-size vertex + index slice for each non-empty + // section. Indices stay section-local and `vertex_offset` rebases the draw, + // so no packing or rebasing is needed — just one slice per section. + struct Plan<'a> { + section_index: i32, + verts: &'a [ChunkVertex], + indices: &'a [u32], + vtx_off: u32, + idx_off: u32, + aabb: ChunkAABB, + } + // Retired slices only reclaim in `begin_frame`; if rendering is paused // while meshing continues (e.g. minimized window) the backlog grows // unbounded. Past a sane bound, force a GPU wait and reclaim it all. @@ -565,155 +644,167 @@ impl ChunkBufferStore { } } - // The covered sections this job is authoritative for: reject any where a - // newer upload (higher epoch) already landed. See - // `ChunkMeshData::upload_epoch`. - let accepted: std::collections::HashSet = mesh - .replaced - .clone() - .filter(|si| { - let stored = self + let staging_half = self.staging_size as usize / 2; + // Copies accumulated for the current (not-yet-submitted) transfer, and the + // running write cursors into each half of the staging buffer. + let mut copy_v: Vec = Vec::new(); + let mut copy_i: Vec = Vec::new(); + let mut stg_v = 0usize; + let mut stg_i = 0usize; + + for mesh in meshes { + // The covered sections this job is authoritative for: reject any where + // a newer upload (higher epoch) already landed. See + // `ChunkMeshData::upload_epoch`. 
+ let accepted: std::collections::HashSet = mesh + .replaced + .clone() + .filter(|si| { + let stored = self + .chunks + .get(&mesh.pos) + .and_then(|c| c.sections.iter().find(|s| s.section_index == *si)) + .map(|s| s.epoch) + .unwrap_or(0); + mesh.upload_epoch >= stored + }) + .collect(); + + // Retire the slices of every accepted covered section: the re-meshed + // ones are re-allocated below, the now-empty ones simply vanish. + // Remember which were present so a re-meshed section swaps instantly + // while a freshly revealed one still fades in. Rejected sections are + // left untouched. + let mut freed: Vec<(u32, u32, u32, u32)> = Vec::new(); + let mut was_present: std::collections::HashSet = std::collections::HashSet::new(); + if let Some(entry) = self.chunks.get_mut(&mesh.pos) { + entry.sections.retain(|s| { + if accepted.contains(&s.section_index) { + was_present.insert(s.section_index); + freed.push(( + s.vertex_offset as u32, + s.vtx_len, + s.first_index, + s.index_count, + )); + false + } else { + true + } + }); + } + self.retire_slices(freed.iter().copied()); + // Sections were removed/replaced, so the draw list must be rebuilt even + // if this mesh is skipped below (otherwise it keeps drawing a retired, + // soon-reused slice). + self.meta_dirty = true; + + let upload_secs: Vec<&SectionMesh> = mesh + .sections + .iter() + .filter(|s| accepted.contains(&s.section_index)) + .collect(); + + if upload_secs.is_empty() { + // Every accepted section is now empty (freed above); drop the + // column if nothing remains. + if self .chunks .get(&mesh.pos) - .and_then(|c| c.sections.iter().find(|s| s.section_index == *si)) - .map(|s| s.epoch) - .unwrap_or(0); - mesh.upload_epoch >= stored - }) - .collect(); - - // Retire the slices of every accepted covered section: the re-meshed ones - // are re-allocated below, the now-empty ones simply vanish. 
Remember which - // were present so a re-meshed section swaps instantly while a freshly - // revealed one still fades in. Rejected sections are left untouched. - let mut freed: Vec<(u32, u32, u32, u32)> = Vec::new(); - let mut was_present: std::collections::HashSet = std::collections::HashSet::new(); - if let Some(entry) = self.chunks.get_mut(&mesh.pos) { - entry.sections.retain(|s| { - if accepted.contains(&s.section_index) { - was_present.insert(s.section_index); - freed.push(( - s.vertex_offset as u32, - s.vtx_len, - s.first_index, - s.index_count, - )); - false - } else { - true + .is_some_and(|c| c.sections.is_empty()) + { + self.chunks.remove(&mesh.pos); } - }); - } - self.retire_slices(freed.iter().copied()); - // Sections were removed/replaced, so the draw list must be rebuilt even if - // an early return below skips the upload (otherwise it keeps drawing a - // retired, soon-reused slice). - self.meta_dirty = true; - - let upload_secs: Vec<&SectionMesh> = mesh - .sections - .iter() - .filter(|s| accepted.contains(&s.section_index)) - .collect(); - - if upload_secs.is_empty() { - // Every accepted section is now empty (freed above); drop the column - // if nothing remains. - if self - .chunks - .get(&mesh.pos) - .is_some_and(|c| c.sections.is_empty()) - { - self.chunks.remove(&mesh.pos); + continue; } - return Vec::new(); - } - let staging_half = self.staging_size as usize / 2; - if self.use_staging { - // Verts and indices share the staging buffer (two halves), copied in - // one transfer. A chunk too large for staging is skipped rather than - // overflowing the buffer (matches the prior column-sized limit). This - // is permanent, so it's not reported for retry. 
- let v_bytes: usize = upload_secs - .iter() - .map(|s| s.vertices.len() * VERTEX_SIZE as usize) - .sum(); - let i_bytes: usize = upload_secs - .iter() - .map(|s| s.indices.len() * INDEX_SIZE as usize) - .sum(); - if v_bytes > staging_half || i_bytes > staging_half { - tracing::warn!( - "Chunk {:?} too large for staging ({} v / {} i bytes), skipping", - mesh.pos, - v_bytes, - i_bytes, - ); - return Vec::new(); + if self.use_staging { + // Verts and indices share the staging buffer (two halves). A chunk + // too large for one half is skipped rather than overflowing the + // buffer. This is permanent, so it's not reported for retry. + let v_bytes: usize = upload_secs + .iter() + .map(|s| s.vertices.len() * VERTEX_SIZE as usize) + .sum(); + let i_bytes: usize = upload_secs + .iter() + .map(|s| s.indices.len() * INDEX_SIZE as usize) + .sum(); + if v_bytes > staging_half || i_bytes > staging_half { + tracing::warn!( + "Chunk {:?} too large for staging ({} v / {} i bytes), skipping", + mesh.pos, + v_bytes, + i_bytes, + ); + continue; + } } - } - // Sub-allocate an exact-size vertex + index slice for each non-empty - // section. Indices stay section-local and `vertex_offset` rebases the draw, - // so no packing or rebasing is needed — just one slice per section. - struct Plan<'a> { - section_index: i32, - verts: &'a [ChunkVertex], - indices: &'a [u32], - vtx_off: u32, - idx_off: u32, - aabb: ChunkAABB, - } - - let mut plans: Vec = Vec::with_capacity(upload_secs.len()); - // (vtx_off, vtx_len, idx_off, idx_len) taken this call, for rollback if the - // pool runs out partway through a column. - let mut taken: Vec<(u32, u32, u32, u32)> = Vec::new(); - // The accepted sections were retired above; on a pool-full rollback they - // need re-meshing, so report them for retry (rescan re-enqueues next frame). 
- let dropped: Vec = accepted.iter().copied().collect(); - for sec in &upload_secs { - let vcount = sec.vertices.len() as u32; - let icount = sec.indices.len() as u32; - if vcount == 0 || icount == 0 { + let mut plans: Vec = Vec::with_capacity(upload_secs.len()); + // (vtx_off, vtx_len, idx_off, idx_len) taken for this mesh, for + // rollback if the pool runs out partway through a column. + let mut taken: Vec<(u32, u32, u32, u32)> = Vec::new(); + let mut pool_full = false; + for sec in &upload_secs { + let vcount = sec.vertices.len() as u32; + let icount = sec.indices.len() as u32; + if vcount == 0 || icount == 0 { + continue; + } + let Some(vtx_off) = self.vtx_free.alloc(vcount) else { + self.free_slices(&taken); + tracing::debug!("Vertex pool full, skipping {:?}", mesh.pos); + pool_full = true; + break; + }; + let Some(idx_off) = self.idx_free.alloc(icount) else { + self.vtx_free.free_region(vtx_off, vcount); + self.free_slices(&taken); + tracing::debug!("Index pool full, skipping {:?}", mesh.pos); + pool_full = true; + break; + }; + taken.push((vtx_off, vcount, idx_off, icount)); + plans.push(Plan { + section_index: sec.section_index, + verts: &sec.vertices, + indices: &sec.indices, + vtx_off, + idx_off, + aabb: section_aabb(&sec.vertices), + }); + } + if pool_full { + // The accepted sections were retired above; report them so the + // next rescan re-enqueues them. + needs_remesh.push((mesh.pos, accepted.iter().copied().collect())); + continue; + } + if plans.is_empty() { + // Nothing to upload (all accepted sections were empty). 
continue; } - let Some(vtx_off) = self.vtx_free.alloc(vcount) else { - self.free_slices(&taken); - tracing::debug!("Vertex pool full, skipping {:?}", mesh.pos); - return dropped; - }; - let Some(idx_off) = self.idx_free.alloc(icount) else { - self.vtx_free.free_region(vtx_off, vcount); - self.free_slices(&taken); - tracing::debug!("Index pool full, skipping {:?}", mesh.pos); - return dropped; - }; - taken.push((vtx_off, vcount, idx_off, icount)); - plans.push(Plan { - section_index: sec.section_index, - verts: &sec.vertices, - indices: &sec.indices, - vtx_off, - idx_off, - aabb: section_aabb(&sec.vertices), - }); - } - - if plans.is_empty() { - // Nothing to upload (all accepted sections were empty) — not a - // capacity failure, so no retry. - return Vec::new(); - } - if self.use_staging { - let mut copy_v: Vec = Vec::new(); - let mut copy_i: Vec = Vec::new(); - { + if self.use_staging { + let mv: usize = plans + .iter() + .map(|p| p.verts.len() * VERTEX_SIZE as usize) + .sum(); + let mi: usize = plans + .iter() + .map(|p| p.indices.len() * INDEX_SIZE as usize) + .sum(); + // This mesh alone fits a half (checked above), so a flush always + // makes room: submit the pending transfer and reset the cursors. 
+ if stg_v + mv > staging_half || stg_i + mi > staging_half { + self.flush_transfer(device, queue, &copy_v, &copy_i); + copy_v.clear(); + copy_i.clear(); + stg_v = 0; + stg_i = 0; + } let buf = self.staging_alloc.mapped_slice_mut().unwrap(); - let mut stg_v = 0usize; - let mut stg_i = 0usize; for p in &plans { let vb: &[u8] = bytemuck::cast_slice(p.verts); buf[stg_v..stg_v + vb.len()].copy_from_slice(vb); @@ -734,73 +825,61 @@ impl ChunkBufferStore { }); stg_i += ib.len(); } - } - - let begin = vk::CommandBufferBeginInfo { - flags: vk::CommandBufferUsageFlags::OneTimeSubmit, - ..Default::default() - }; - self.transfer_cmd.begin(&begin).unwrap(); - self.transfer_cmd - .copy_buffer(self.staging_buffer, self.vertex_buffer, &copy_v); - self.transfer_cmd - .copy_buffer(self.staging_buffer, self.index_buffer, &copy_i); - self.transfer_cmd.end().unwrap(); - let submit = [vk::SubmitInfo { - command_buffer_count: 1, - command_buffers: &self.transfer_cmd.handle(), - ..Default::default() - }]; - queue.submit(&submit, vk::Fence::null()).unwrap(); - queue.wait_idle().unwrap(); - } else { - { - let vbuf = self.vertex_alloc.mapped_slice_mut().unwrap(); - for p in &plans { - let vb: &[u8] = bytemuck::cast_slice(p.verts); - let off = p.vtx_off as usize * VERTEX_SIZE as usize; - vbuf[off..off + vb.len()].copy_from_slice(vb); + } else { + { + let vbuf = self.vertex_alloc.mapped_slice_mut().unwrap(); + for p in &plans { + let vb: &[u8] = bytemuck::cast_slice(p.verts); + let off = p.vtx_off as usize * VERTEX_SIZE as usize; + vbuf[off..off + vb.len()].copy_from_slice(vb); } } - } - { - let ibuf = self.index_alloc.mapped_slice_mut().unwrap(); - for p in &plans { - let ib: &[u8] = bytemuck::cast_slice(p.indices); - let off = p.idx_off as usize * INDEX_SIZE as usize; - ibuf[off..off + ib.len()].copy_from_slice(ib); + { + let ibuf = self.index_alloc.mapped_slice_mut().unwrap(); + for p in &plans { + let ib: &[u8] = bytemuck::cast_slice(p.indices); + let off = p.idx_off as usize * INDEX_SIZE as usize; + 
ibuf[off..off + ib.len()].copy_from_slice(ib); + } } } - } - let now = std::time::Instant::now(); - let new_sections = plans.iter().map(|p| SectionAlloc { - section_index: p.section_index, - aabb: p.aabb, - first_index: p.idx_off, - index_count: p.indices.len() as u32, - vertex_offset: p.vtx_off as i32, - vtx_len: p.verts.len() as u32, - // A re-meshed section swaps instantly; a freshly revealed one fades in. - uploaded_at: if was_present.contains(&p.section_index) { - now.checked_sub(std::time::Duration::from_secs(2)) - .unwrap_or(now) - } else { - now - }, - epoch: mesh.upload_epoch, - }); + let now = std::time::Instant::now(); + let new_sections = plans.iter().map(|p| SectionAlloc { + section_index: p.section_index, + aabb: p.aabb, + first_index: p.idx_off, + index_count: p.indices.len() as u32, + vertex_offset: p.vtx_off as i32, + vtx_len: p.verts.len() as u32, + // A re-meshed section swaps instantly; a freshly revealed one fades in. + uploaded_at: if was_present.contains(&p.section_index) { + now.checked_sub(std::time::Duration::from_secs(2)) + .unwrap_or(now) + } else { + now + }, + epoch: mesh.upload_epoch, + }); + + self.chunks + .entry(mesh.pos) + .or_insert_with(|| ChunkAlloc { + sections: Vec::new(), + }) + .sections + .extend(new_sections); + } - self.chunks - .entry(mesh.pos) - .or_insert_with(|| ChunkAlloc { - sections: Vec::new(), - }) - .sections - .extend(new_sections); + // Flush whatever remains accumulated from the last (or only) batch. 
+ if self.use_staging { + self.flush_transfer(device, queue, ©_v, ©_i); + } let total_sections: usize = self.chunks.values().map(|c| c.sections.len()).sum(); self.ensure_meta_capacity(device, allocator, total_sections); - Vec::new() + + needs_remesh } /// Grow the per-frame meta and indirect buffers so they can hold `needed` @@ -1185,6 +1264,7 @@ impl ChunkBufferStore { .ok(); drop(alloc); + device.destroy_fence(self.transfer_fence, None); device.destroy_command_pool(self.transfer_pool, None); device.destroy_pipeline(self.compute_pipeline, None); device.destroy_pipeline_layout(self.compute_layout, None); diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs index f0d400f4..11d23be6 100644 --- a/pomme-client/src/renderer/chunk/mesher.rs +++ b/pomme-client/src/renderer/chunk/mesher.rs @@ -168,7 +168,12 @@ fn tint_color(tint: Tint, grass: [f32; 3], foliage: [f32; 3], dry_foliage: [f32; } } -const MAX_MESH_UPLOADS_PER_FRAME: usize = 16; +const MAX_MESH_UPLOADS_PER_FRAME: usize = 32; + +/// Bound on un-drained bulk results: past this, workers block on send (back- +/// pressure) rather than piling finished meshes — and their pooled buffers — +/// into an unbounded queue, which would starve the buffer pool. +const MAX_PENDING_RESULTS: usize = 256; pub struct Colormap { pixels: Vec<[u8; 3]>, @@ -381,6 +386,60 @@ pub fn int_to_rgb(color: i32) -> [f32; 3] { [r, g, b] } +/// Pre-allocation hints sized to a typical section so a fresh buffer fills +/// without reallocating (indices run ~1.5x vertices: 6 per quad vs 4). +const SECTION_VERTEX_HINT: usize = 2048; +const SECTION_INDEX_HINT: usize = 3072; + +/// Recycles section vertex/index `Vec`s so workers reuse them instead of +/// allocating/freeing through the OS each mesh (vanilla reuses its +/// `ByteBufferBuilder`s the same way). Bounded: returns past capacity are +/// dropped, takes past it allocate. 
+struct BufferPool { + vtx_tx: crossbeam_channel::Sender>, + vtx_rx: crossbeam_channel::Receiver>, + idx_tx: crossbeam_channel::Sender>, + idx_rx: crossbeam_channel::Receiver>, +} + +impl BufferPool { + fn new(capacity: usize) -> Self { + let (vtx_tx, vtx_rx) = crossbeam_channel::bounded(capacity); + let (idx_tx, idx_rx) = crossbeam_channel::bounded(capacity); + Self { + vtx_tx, + vtx_rx, + idx_tx, + idx_rx, + } + } + + // A fresh buffer is pre-sized so filling it doesn't realloc-grow; recycled + // buffers keep their capacity, so the pool self-tunes to real section sizes. + fn take_vertices(&self) -> Vec { + self.vtx_rx + .try_recv() + .unwrap_or_else(|_| Vec::with_capacity(SECTION_VERTEX_HINT)) + } + + fn take_indices(&self) -> Vec { + self.idx_rx + .try_recv() + .unwrap_or_else(|_| Vec::with_capacity(SECTION_INDEX_HINT)) + } + + fn recycle(&self, mut vertices: Vec, mut indices: Vec) { + if vertices.capacity() > 0 { + vertices.clear(); + let _ = self.vtx_tx.try_send(vertices); + } + if indices.capacity() > 0 { + indices.clear(); + let _ = self.idx_tx.try_send(indices); + } + } +} + pub struct MeshDispatcher { result_rx: crossbeam_channel::Receiver, result_tx: crossbeam_channel::Sender, @@ -398,6 +457,7 @@ pub struct MeshDispatcher { foliage_colormap: Arc, dry_foliage_colormap: Arc, biome_climate: Arc>, + pool: Arc, } impl MeshDispatcher { @@ -409,13 +469,17 @@ impl MeshDispatcher { dry_foliage_colormap: Colormap, biome_climate: Arc>, ) -> Self { - let (result_tx, result_rx) = crossbeam_channel::unbounded(); + // Bulk results are bounded for back-pressure; edits stay unbounded so a + // block edit never blocks a worker behind the load backlog. + let (result_tx, result_rx) = crossbeam_channel::bounded(MAX_PENDING_RESULTS); let (priority_tx, priority_rx) = crossbeam_channel::unbounded(); let queue = Arc::new(MeshQueue::new()); - // One worker per core minus one, leaving a core for the main/render thread. 
+ // Half the cores, capped: more saturated workers starve the main/render + // thread during a load burst (frame spikes), and pooling makes load + // network-bound so they wouldn't speed it up anyway. let worker_count = std::thread::available_parallelism() - .map(|n| n.get().saturating_sub(1).max(1)) + .map(|n| (n.get() / 2).clamp(2, 12)) .unwrap_or(1); let mut workers = Vec::with_capacity(worker_count); for _ in 0..worker_count { @@ -423,7 +487,10 @@ impl MeshDispatcher { workers.push( std::thread::Builder::new() .name("chunk-mesher".into()) - .spawn(move || queue.run_worker()) + .spawn(move || { + lower_current_thread_priority(); + queue.run_worker() + }) .expect("spawn chunk-mesher thread"), ); } @@ -442,6 +509,15 @@ impl MeshDispatcher { foliage_colormap: Arc::new(foliage_colormap), dry_foliage_colormap: Arc::new(dry_foliage_colormap), biome_climate, + pool: Arc::new(BufferPool::new(1024)), + } + } + + /// Return an uploaded (or stale) mesh's section buffers to the pool for + /// reuse. + pub fn recycle(&self, mesh: ChunkMeshData) { + for sec in mesh.sections { + self.pool.recycle(sec.vertices, sec.indices); } } @@ -517,6 +593,7 @@ impl MeshDispatcher { min_y, height, tx, + pool: Arc::clone(&self.pool), }); } @@ -567,6 +644,7 @@ struct PendingJob { min_y: i32, height: u32, tx: crossbeam_channel::Sender, + pool: Arc, } impl PendingJob { @@ -593,6 +671,7 @@ impl PendingJob { &self.uv_map, self.lod, self.sections, + &self.pool, ); mesh.content_gen = self.content_gen; mesh.upload_epoch = self.upload_epoch; @@ -728,6 +807,26 @@ fn poll(state: &mut QueueState) -> Option { best_initial.map(|(ii, _)| state.tasks.swap_remove(ii)) } +/// Run mesh workers below normal priority so the OS preempts them for the +/// main/render thread during a load burst, while they still use idle cores. 
+#[cfg(windows)] +fn lower_current_thread_priority() { + const THREAD_PRIORITY_BELOW_NORMAL: i32 = -1; + #[link(name = "kernel32")] + unsafe extern "system" { + fn GetCurrentThread() -> isize; + fn SetThreadPriority(thread: isize, priority: i32) -> i32; + } + unsafe { + SetThreadPriority(GetCurrentThread(), THREAD_PRIORITY_BELOW_NORMAL); + } +} + +#[cfg(not(windows))] +fn lower_current_thread_priority() { + // TODO: lower priority on non-Windows (libc::nice / pthread_setschedparam). +} + struct ChunkStoreSnapshot { chunks: Vec<( ChunkPos, @@ -1081,6 +1180,7 @@ fn mesh_chunk_snapshot( uv_map: &AtlasUVMap, lod: u32, sections_to_mesh: std::ops::Range, + pool: &BufferPool, ) -> ChunkMeshData { let mut logged_missing: std::collections::HashSet = std::collections::HashSet::new(); @@ -1105,6 +1205,13 @@ fn mesh_chunk_snapshot( indices: Vec::new(), }) .collect(); + // In-range sections get recycled buffers (capacity retained from earlier + // meshes) so the worker fills them without going through the OS allocator. + for si in range.clone() { + let sec = &mut sections[si as usize]; + sec.vertices = pool.take_vertices(); + sec.indices = pool.take_indices(); + } // The type map is a state->id map, so it only needs the meshed span (+1-block // border for face culling); states outside it are never queried. @@ -1278,9 +1385,17 @@ fn mesh_chunk_snapshot( local_z += step; } - // Keep only non-empty sections (untouched out-of-range ones stay empty); - // empty indices within `range` are freed by the per-section upload. - sections.retain(|s| !s.vertices.is_empty() && !s.indices.is_empty()); + // Keep only non-empty sections; recycle the buffers of in-range sections that + // ended up empty (rather than dropping their retained capacity). 
+ let mut kept = Vec::with_capacity(sections.len()); + for s in sections { + if s.vertices.is_empty() || s.indices.is_empty() { + pool.recycle(s.vertices, s.indices); + } else { + kept.push(s); + } + } + let sections = kept; ChunkMeshData { pos, diff --git a/pomme-client/src/renderer/mod.rs b/pomme-client/src/renderer/mod.rs index f2f3b4de..a941b612 100644 --- a/pomme-client/src/renderer/mod.rs +++ b/pomme-client/src/renderer/mod.rs @@ -857,14 +857,15 @@ impl Renderer { .wait_for_fences(&self.ctx.in_flight_fences, true, u64::MAX); } - /// Returns the section indices dropped due to pool exhaustion (need + /// Upload a batch of chunk meshes in a single coalesced transfer. Returns, + /// per mesh that hit pool exhaustion, the section indices dropped (need /// re-mesh); empty on success. - pub fn upload_chunk_mesh(&mut self, mesh: &ChunkMeshData) -> Vec { - self.chunk_buffers.upload( + pub fn upload_chunk_meshes(&mut self, meshes: &[ChunkMeshData]) -> Vec<(ChunkPos, Vec)> { + self.chunk_buffers.upload_batch( &self.ctx.device, &self.ctx.allocator, self.ctx.graphics_queue, - mesh, + meshes, ) } From c6c28f08283cd24eeb801e0f1e164fbbe4d165b7 Mon Sep 17 00:00:00 2001 From: Purdze Date: Wed, 24 Jun 2026 21:26:17 +0100 Subject: [PATCH 2/4] Optimize chunk rendering: cull caching, mesh queue, compact vertices --- pomme-client/src/app/phases/in_game.rs | 1 - pomme-client/src/renderer/chunk/buffer.rs | 249 ++++++++++++++++--- pomme-client/src/renderer/chunk/mesher.rs | 148 +++++++---- pomme-client/src/renderer/pipelines/chunk.rs | 6 +- pomme-client/src/renderer/shaders/chunk.vert | 21 +- pomme-client/src/renderer/shaders/cull.comp | 5 +- pomme-client/src/world/chunk.rs | 13 +- 7 files changed, 338 insertions(+), 105 deletions(-) diff --git a/pomme-client/src/app/phases/in_game.rs b/pomme-client/src/app/phases/in_game.rs index 02f590b9..c06e4d98 100644 --- a/pomme-client/src/app/phases/in_game.rs +++ b/pomme-client/src/app/phases/in_game.rs @@ -335,7 +335,6 @@ impl GameState { let 
rd = self .chunk_store .loaded_positions() - .iter() .map(|p| { (p.x - player_chunk.x) .abs() diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs index 90604885..a85d565c 100644 --- a/pomme-client/src/renderer/chunk/buffer.rs +++ b/pomme-client/src/renderer/chunk/buffer.rs @@ -10,13 +10,22 @@ use crate::renderer::{MAX_FRAMES_IN_FLIGHT, shader, util}; const BUCKET_VERTICES: u32 = 32768; const BUCKET_INDICES: u32 = 49152; -const VERTEX_SIZE: u64 = size_of::() as u64; +const VERTEX_SIZE: u64 = size_of::() as u64; +/// Section-local position quantization: local coords (block 0..16 plus model +/// overhang) map into `[-POS_BIAS, POS_RANGE - POS_BIAS]` across a u16. Chosen +/// so a 16-block shift is an exact integer number of u16 steps (16/24*65535 = +/// 43690), so the same world position encodes identically in adjacent sections +/// — no seams. Must match `chunk.vert`. +const POS_RANGE: f32 = 24.0; +const POS_BIAS: f32 = 4.0; const INDEX_SIZE: u64 = size_of::() as u64; const BYTES_PER_BUCKET: u64 = BUCKET_VERTICES as u64 * VERTEX_SIZE + BUCKET_INDICES as u64 * INDEX_SIZE; const MIN_BUCKETS: u32 = 128; const MAX_BUCKETS: u32 = 2048; const VRAM_BUDGET_FRACTION: f64 = 0.25; +/// Fade-in duration for a freshly revealed section (ms). +const FADE_DURATION_MS: f32 = 1000.0; /// First-fit free-list sub-allocator over a fixed element range, coalescing on /// free. Each section gets an exact-size vertex (and index) slice instead of @@ -124,6 +133,111 @@ struct ChunkMeta { first_index: u32, vertex_offset: i32, visibility: u32, + /// Section world origin; bound as a per-instance vertex attribute so the + /// vertex shader rebases the quantized local position. `[3]` is padding. + origin: [f32; 4], +} + +/// Compact GPU vertex (14 bytes): position quantized to section-local u16 (see +/// `POS_RANGE`), rebased to world in `chunk.vert` via the per-instance origin. 
+/// `light_tint` is `[u8; 4]` (not `u32`) so the struct packs to 14 bytes with +/// no alignment padding; byte order matches the old `R8G8B8A8_UNORM` (light, +/// r,g,b). +#[repr(C)] +#[derive(Copy, Clone, bytemuck::Pod, bytemuck::Zeroable)] +struct PackedVertex { + pos: [u16; 3], + uv: [u16; 2], + light_tint: [u8; 4], +} + +fn quantize_coord(world: f32, origin: f32) -> u16 { + let unorm = ((world - origin + POS_BIAS) / POS_RANGE).clamp(0.0, 1.0); + (unorm * 65535.0 + 0.5) as u16 +} + +fn pack_vertex(v: &ChunkVertex, origin: [f32; 3]) -> PackedVertex { + PackedVertex { + pos: [ + quantize_coord(v.position[0], origin[0]), + quantize_coord(v.position[1], origin[1]), + quantize_coord(v.position[2], origin[2]), + ], + uv: v.tex_coords, + light_tint: v.light_tint.to_le_bytes(), + } +} + +/// Pack `verts` (rebased against `origin`) into `dst` starting at byte `off`. +fn write_packed_verts(dst: &mut [u8], off: usize, verts: &[ChunkVertex], origin: [f32; 3]) { + let vsize = VERTEX_SIZE as usize; + for (k, v) in verts.iter().enumerate() { + let o = off + k * vsize; + dst[o..o + vsize].copy_from_slice(bytemuck::bytes_of(&pack_vertex(v, origin))); + } +} + +/// Vertex input for the chunk pipeline: binding 0 is the packed per-vertex +/// pool, binding 1 is the meta buffer read per-instance (origin + fade), +/// indexed by the `first_instance` the cull shader writes. 
+pub fn chunk_vertex_bindings() -> [vk::VertexInputBindingDescription; 2] { + [ + vk::VertexInputBindingDescription { + binding: 0, + stride: size_of::() as u32, + input_rate: vk::VertexInputRate::Vertex, + }, + vk::VertexInputBindingDescription { + binding: 1, + stride: size_of::() as u32, + input_rate: vk::VertexInputRate::Instance, + }, + ] +} + +pub fn chunk_vertex_attributes() -> [vk::VertexInputAttributeDescription; 6] { + let origin_off = std::mem::offset_of!(ChunkMeta, origin) as u32; + let vis_off = std::mem::offset_of!(ChunkMeta, visibility) as u32; + [ + // binding 0 — packed vertex + vk::VertexInputAttributeDescription { + location: 0, + binding: 0, + format: vk::Format::R16G16Unorm, + offset: 0, + }, + vk::VertexInputAttributeDescription { + location: 1, + binding: 0, + format: vk::Format::R16Unorm, + offset: 4, + }, + vk::VertexInputAttributeDescription { + location: 2, + binding: 0, + format: vk::Format::R16G16Unorm, + offset: 6, + }, + vk::VertexInputAttributeDescription { + location: 3, + binding: 0, + format: vk::Format::R8G8B8A8Unorm, + offset: 10, + }, + // binding 1 — per-instance meta (origin + fade) + vk::VertexInputAttributeDescription { + location: 4, + binding: 1, + format: vk::Format::R32G32B32Sfloat, + offset: origin_off, + }, + vk::VertexInputAttributeDescription { + location: 5, + binding: 1, + format: vk::Format::R32Sfloat, + offset: vis_off, + }, + ] } #[repr(C)] @@ -153,6 +267,9 @@ struct FrustumData { struct SectionAlloc { section_index: i32, aabb: ChunkAABB, + /// Section world origin (`chunk*16`, `min_y + si*16`), used to rebase the + /// quantized vertices and passed to the GPU via `ChunkMeta.origin`. + origin: [f32; 3], first_index: u32, index_count: u32, vertex_offset: i32, @@ -195,6 +312,17 @@ pub struct ChunkBufferStore { chunk_visibility: HashMap, cached_meta: Vec, meta_dirty: bool, + /// End of the current fade-in window. 
While `now < fade_until` the + /// per-section fade values change each frame, so `cached_meta` must be + /// rebuilt; an O(1) check replacing the old all-sections scan. + fade_until: std::time::Instant, + /// Camera position at the last front-to-back sort; the sort (an early-Z + /// optimization) is only redone once the camera moves past a threshold. + last_sort_cam: [f32; 3], + /// Frame slots still needing the latest `cached_meta` uploaded. Set to + /// `MAX_FRAMES_IN_FLIGHT` whenever the draw list changes, decremented per + /// frame; at steady state the per-frame meta copy stops. + meta_upload_pending: u32, compute_pipeline: vk::Pipeline, compute_layout: vk::PipelineLayout, @@ -352,7 +480,7 @@ impl ChunkBufferStore { device, allocator, meta_size, - vk::BufferUsageFlags::StorageBuffer, + vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::VertexBuffer, "chunk_meta", ); meta_buffers.push(b); @@ -519,6 +647,9 @@ impl ChunkBufferStore { chunk_visibility: HashMap::new(), cached_meta: Vec::new(), meta_dirty: true, + fade_until: std::time::Instant::now(), + last_sort_cam: [f32::MAX; 3], + meta_upload_pending: 0, compute_pipeline, compute_layout, compute_desc_layout, @@ -631,6 +762,7 @@ impl ChunkBufferStore { vtx_off: u32, idx_off: u32, aabb: ChunkAABB, + origin: [f32; 3], } // Retired slices only reclaim in `begin_frame`; if rendering is paused @@ -773,6 +905,11 @@ impl ChunkBufferStore { vtx_off, idx_off, aabb: section_aabb(&sec.vertices), + origin: [ + (mesh.pos.x * 16) as f32, + (mesh.min_y + sec.section_index * 16) as f32, + (mesh.pos.z * 16) as f32, + ], }); } if pool_full { @@ -806,14 +943,14 @@ impl ChunkBufferStore { } let buf = self.staging_alloc.mapped_slice_mut().unwrap(); for p in &plans { - let vb: &[u8] = bytemuck::cast_slice(p.verts); - buf[stg_v..stg_v + vb.len()].copy_from_slice(vb); + write_packed_verts(buf, stg_v, p.verts, p.origin); + let vbytes = p.verts.len() * VERTEX_SIZE as usize; copy_v.push(vk::BufferCopy { src_offset: stg_v as u64, 
dst_offset: p.vtx_off as u64 * VERTEX_SIZE, - size: vb.len() as u64, + size: vbytes as u64, }); - stg_v += vb.len(); + stg_v += vbytes; let ib: &[u8] = bytemuck::cast_slice(p.indices); let off = staging_half + stg_i; @@ -829,9 +966,8 @@ impl ChunkBufferStore { { let vbuf = self.vertex_alloc.mapped_slice_mut().unwrap(); for p in &plans { - let vb: &[u8] = bytemuck::cast_slice(p.verts); - let off = p.vtx_off as usize * VERTEX_SIZE as usize; - vbuf[off..off + vb.len()].copy_from_slice(vb); + let base = p.vtx_off as usize * VERTEX_SIZE as usize; + write_packed_verts(vbuf, base, p.verts, p.origin); } } { @@ -848,6 +984,7 @@ impl ChunkBufferStore { let new_sections = plans.iter().map(|p| SectionAlloc { section_index: p.section_index, aabb: p.aabb, + origin: p.origin, first_index: p.idx_off, index_count: p.indices.len() as u32, vertex_offset: p.vtx_off as i32, @@ -862,6 +999,16 @@ impl ChunkBufferStore { epoch: mesh.upload_epoch, }); + // Freshly revealed sections fade in, so extend the fade window the + // cull's O(1) check reads; re-meshed-only uploads swap instantly. + if plans + .iter() + .any(|p| !was_present.contains(&p.section_index)) + { + let dur = std::time::Duration::from_secs_f32(FADE_DURATION_MS / 1000.0); + self.fade_until = self.fade_until.max(now + dur); + } + self.chunks .entry(mesh.pos) .or_insert_with(|| ChunkAlloc { @@ -926,7 +1073,7 @@ impl ChunkBufferStore { device, allocator, meta_size, - vk::BufferUsageFlags::StorageBuffer, + vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::VertexBuffer, "chunk_meta", ); self.meta_buffers[i] = b; @@ -1055,15 +1202,18 @@ impl ChunkBufferStore { } let now = std::time::Instant::now(); - const FADE_DURATION_MS: f32 = 1000.0; const NEARBY_DIST_SQ: f32 = 768.0; + // Re-sort only once the camera moves ~8 blocks; front-to-back order is an + // early-Z optimization, so finer staleness is harmless. 
+ const SORT_RECAM_SQ: f32 = 64.0; - let any_fading = self.fade_enabled - && self.chunks.values().flat_map(|a| &a.sections).any(|s| { - now.duration_since(s.uploaded_at).as_secs_f32() * 1000.0 < FADE_DURATION_MS - }); + // A fade in flight changes per-section visibility every frame, so the draw + // list must rebuild; otherwise it only changes on edits/loads/visibility + // (`meta_dirty`). The fade check is O(1) against `fade_until`. + let any_fading = self.fade_enabled && now < self.fade_until; + let content_changed = self.meta_dirty || any_fading; - if self.meta_dirty || any_fading { + if content_changed { self.cached_meta.clear(); for (pos, alloc) in self.chunks.iter() { // Near columns never fade; otherwise each section fades on its own @@ -1093,36 +1243,55 @@ impl ChunkBufferStore { first_index: sec.first_index, vertex_offset: sec.vertex_offset, visibility: vis.to_bits(), + origin: [sec.origin[0], sec.origin[1], sec.origin[2], 0.0], }); } } self.meta_dirty = false; } - self.cached_meta.sort_unstable_by(|a, b| { - let center_a = [ - (a.aabb_min[0] + a.aabb_max[0]) * 0.5 - camera_pos[0], - (a.aabb_min[1] + a.aabb_max[1]) * 0.5 - camera_pos[1], - (a.aabb_min[2] + a.aabb_max[2]) * 0.5 - camera_pos[2], - ]; - let center_b = [ - (b.aabb_min[0] + b.aabb_max[0]) * 0.5 - camera_pos[0], - (b.aabb_min[1] + b.aabb_max[1]) * 0.5 - camera_pos[1], - (b.aabb_min[2] + b.aabb_max[2]) * 0.5 - camera_pos[2], - ]; - let dist_a = - center_a[0] * center_a[0] + center_a[1] * center_a[1] + center_a[2] * center_a[2]; - let dist_b = - center_b[0] * center_b[0] + center_b[1] * center_b[1] + center_b[2] * center_b[2]; - dist_a - .partial_cmp(&dist_b) - .unwrap_or(std::cmp::Ordering::Equal) - }); + let cam_moved = { + let dx = camera_pos[0] - self.last_sort_cam[0]; + let dy = camera_pos[1] - self.last_sort_cam[1]; + let dz = camera_pos[2] - self.last_sort_cam[2]; + dx * dx + dy * dy + dz * dz > SORT_RECAM_SQ + }; + if content_changed || cam_moved { + self.cached_meta.sort_unstable_by(|a, 
b| { + let center_a = [ + (a.aabb_min[0] + a.aabb_max[0]) * 0.5 - camera_pos[0], + (a.aabb_min[1] + a.aabb_max[1]) * 0.5 - camera_pos[1], + (a.aabb_min[2] + a.aabb_max[2]) * 0.5 - camera_pos[2], + ]; + let center_b = [ + (b.aabb_min[0] + b.aabb_max[0]) * 0.5 - camera_pos[0], + (b.aabb_min[1] + b.aabb_max[1]) * 0.5 - camera_pos[1], + (b.aabb_min[2] + b.aabb_max[2]) * 0.5 - camera_pos[2], + ]; + let dist_a = center_a[0] * center_a[0] + + center_a[1] * center_a[1] + + center_a[2] * center_a[2]; + let dist_b = center_b[0] * center_b[0] + + center_b[1] * center_b[1] + + center_b[2] * center_b[2]; + dist_a + .partial_cmp(&dist_b) + .unwrap_or(std::cmp::Ordering::Equal) + }); + self.last_sort_cam = camera_pos; + // Draw list reordered: every frame slot's meta buffer needs the refresh. + self.meta_upload_pending = MAX_FRAMES_IN_FLIGHT as u32; + } let count = self.cached_meta.len() as u32; - let meta_bytes = bytemuck::cast_slice(&self.cached_meta); - self.meta_allocs[frame].mapped_slice_mut().unwrap()[..meta_bytes.len()] - .copy_from_slice(meta_bytes); + // Each frame slot has its own meta buffer; copy only into slots that + // haven't yet seen the current draw list. Steady state stops copying. + if self.meta_upload_pending > 0 { + let meta_bytes = bytemuck::cast_slice(&self.cached_meta); + self.meta_allocs[frame].mapped_slice_mut().unwrap()[..meta_bytes.len()] + .copy_from_slice(meta_bytes); + self.meta_upload_pending -= 1; + } let frustum_data = FrustumData { planes: *frustum, @@ -1191,7 +1360,9 @@ impl ChunkBufferStore { .map(|c| c.sections.len() as u32) .sum::(); - cmd.bind_vertex_buffers(0, &[self.vertex_buffer], &[0]); + // Binding 0: packed vertex pool. Binding 1: the meta buffer, read per + // instance for the section origin + fade (indexed by `first_instance`). 
+ cmd.bind_vertex_buffers(0, &[self.vertex_buffer, self.meta_buffers[frame]], &[0, 0]); cmd.bind_index_buffer(self.index_buffer, 0, vk::IndexType::Uint32); if cfg!(target_os = "macos") { cmd.draw_indexed_indirect( diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs index 11d23be6..5cf40d84 100644 --- a/pomme-client/src/renderer/chunk/mesher.rs +++ b/pomme-client/src/renderer/chunk/mesher.rs @@ -1,4 +1,4 @@ -use std::collections::HashMap; +use std::collections::{BinaryHeap, HashMap}; use std::sync::atomic::{AtomicBool, AtomicU64, Ordering}; use std::sync::{Arc, Condvar, Mutex}; @@ -99,6 +99,9 @@ pub struct SectionMesh { pub struct ChunkMeshData { pub pos: ChunkPos, + /// World Y of section index 0, so the buffer can derive each section's + /// origin (`min_y + section_index * 16`) for vertex quantization. + pub min_y: i32, /// Non-empty meshed sections (each tagged with its `section_index`). pub sections: Vec, /// The section-index range this job (re)meshed. Upload replaces exactly @@ -475,11 +478,11 @@ impl MeshDispatcher { let (priority_tx, priority_rx) = crossbeam_channel::unbounded(); let queue = Arc::new(MeshQueue::new()); - // Half the cores, capped: more saturated workers starve the main/render - // thread during a load burst (frame spikes), and pooling makes load - // network-bound so they wouldn't speed it up anyway. + // Half the cores, capped. Too many saturated workers starve the + // main/render thread during a load burst (frame spikes); the cap trades + // some load throughput for that. 
let worker_count = std::thread::available_parallelism() - .map(|n| (n.get() / 2).clamp(2, 12)) + .map(|n| (n.get() / 2).clamp(2, 16)) .unwrap_or(1); let mut workers = Vec::with_capacity(worker_count); for _ in 0..worker_count { @@ -561,14 +564,14 @@ impl MeshDispatcher { let min_y = chunk_store.min_y(); let height = chunk_store.height(); - let light: std::collections::HashMap<(i32, i32), crate::world::chunk::ChunkLightData> = + let light: std::collections::HashMap<(i32, i32), Arc> = chunks_needed .iter() .filter_map(|p| { chunk_store .light_data .get(&(p.x, p.z)) - .map(|ld| ((p.x, p.z), ld.clone())) + .map(|ld| ((p.x, p.z), Arc::clone(ld))) }) .collect(); @@ -634,7 +637,7 @@ struct PendingJob { enqueued_at: Option, chunks_needed: [ChunkPos; 5], chunk_arcs: Vec>>>, - light: HashMap<(i32, i32), crate::world::chunk::ChunkLightData>, + light: HashMap<(i32, i32), Arc>, registry: Arc, uv_map: Arc, grass_colormap: Arc, @@ -686,12 +689,52 @@ impl PendingJob { } } +/// X/Z (column) distance from `cam` to a chunk's centre. Meshing order is +/// purely horizontal distance; occlusion gates drawing, not meshing. +fn column_dist_sq(pos: ChunkPos, cam: glam::DVec3) -> f64 { + let dx = (pos.x as f64 * 16.0 + 8.0) - cam.x; + let dz = (pos.z as f64 * 16.0 + 8.0) - cam.z; + dx * dx + dz * dz +} + +/// A queued bulk-load job keyed by its column distance for the load heap. +struct LoadEntry { + dist: f64, + job: PendingJob, +} + +impl PartialEq for LoadEntry { + fn eq(&self, other: &Self) -> bool { + self.dist == other.dist + } +} +impl Eq for LoadEntry {} +impl PartialOrd for LoadEntry { + fn partial_cmp(&self, other: &Self) -> Option { + Some(self.cmp(other)) + } +} +impl Ord for LoadEntry { + // Reversed so `BinaryHeap` (a max-heap) pops the nearest (smallest dist). 
+ fn cmp(&self, other: &Self) -> std::cmp::Ordering { + other.dist.total_cmp(&self.dist) + } +} + struct QueueState { - tasks: Vec, + /// Edits, kept small (a handful in flight) so a linear scan + in-place + /// replace stays cheap. + recompiles: Vec, + /// Bulk loads, a min-by-distance heap so dequeue is `O(log n)` under the + /// lock instead of an `O(n)` scan (the old contention point). + loads: BinaryHeap, // Consecutive edits served ahead of an initial load before one is forced, so // streaming never starves (vanilla SectionTaskDynamicQueue.MAX_RECOMPILE_QUOTA). recompile_quota: i32, camera: glam::DVec3, + /// Camera the load heap is keyed against; re-keyed only when the camera + /// crosses a bucket, so push/pop stay cheap between rebuilds. + sort_cam: glam::DVec3, } /// Re-orderable mesh queue, a port of vanilla `SectionTaskDynamicQueue`. The @@ -707,9 +750,11 @@ impl MeshQueue { fn new() -> Self { Self { state: Mutex::new(QueueState { - tasks: Vec::new(), + recompiles: Vec::new(), + loads: BinaryHeap::new(), recompile_quota: MAX_RECOMPILE_QUOTA, camera: glam::DVec3::ZERO, + sort_cam: glam::DVec3::ZERO, }), available: Condvar::new(), closed: AtomicBool::new(false), @@ -718,25 +763,44 @@ impl MeshQueue { fn push(&self, job: PendingJob) { let mut state = self.state.lock().unwrap(); - // A re-edit of a still-queued section replaces the queued job in place - // instead of duplicating it. Bulk loads can't duplicate (`meshed` gates - // them), so only edits need this. - if job.is_recompile - && let Some(existing) = state - .tasks + if job.is_recompile { + // A re-edit of a still-queued section replaces the queued job in + // place instead of duplicating it. Bulk loads can't duplicate + // (`meshed` gates them), so only edits need this. 
+ if let Some(existing) = state + .recompiles .iter_mut() - .find(|t| t.is_recompile && t.pos == job.pos && t.sections == job.sections) - { - *existing = job; + .find(|t| t.pos == job.pos && t.sections == job.sections) + { + *existing = job; + } else { + state.recompiles.push(job); + } } else { - state.tasks.push(job); + let dist = column_dist_sq(job.pos, state.sort_cam); + state.loads.push(LoadEntry { dist, job }); } drop(state); self.available.notify_one(); } fn set_camera(&self, camera: glam::DVec3) { - self.state.lock().unwrap().camera = camera; + const BUCKET: f64 = 8.0; + let mut state = self.state.lock().unwrap(); + state.camera = camera; + let crossed = (camera.x / BUCKET).floor() != (state.sort_cam.x / BUCKET).floor() + || (camera.z / BUCKET).floor() != (state.sort_cam.z / BUCKET).floor(); + if crossed { + state.sort_cam = camera; + // Re-key the load heap to the new bucket (pop still gives the nearest). + state.loads = std::mem::take(&mut state.loads) + .into_iter() + .map(|e| LoadEntry { + dist: column_dist_sq(e.job.pos, camera), + job: e.job, + }) + .collect(); + } } fn close(&self) { @@ -770,41 +834,28 @@ impl MeshQueue { /// over initial loads when the edit is closer, bounded by the recompile quota. /// Mirrors vanilla `SectionTaskDynamicQueue.poll`. fn poll(state: &mut QueueState) -> Option { - let camera = state.camera; - let dist_sq = |task: &PendingJob| { - let dx = (task.pos.x as f64 * 16.0 + 8.0) - camera.x; - let dz = (task.pos.z as f64 * 16.0 + 8.0) - camera.z; - dx * dx + dz * dz - }; - - // Both lanes mesh nearest-first; edits (recompiles) are preferred over initial - // loads when closer, bounded by the recompile quota. Occlusion gates drawing, - // not meshing, so meshing order is purely distance-based. 
- let mut best_initial: Option<(usize, f64)> = None; - let mut best_recompile: Option<(usize, f64)> = None; - for (i, task) in state.tasks.iter().enumerate() { - let dist = dist_sq(task); - if task.is_recompile { - if best_recompile.is_none_or(|(_, d)| dist < d) { - best_recompile = Some((i, dist)); - } - } else if best_initial.is_none_or(|(_, d)| dist < d) { - best_initial = Some((i, dist)); - } - } + let cam = state.sort_cam; + // Nearest queued recompile (edits are few, so the linear scan is cheap). + let best_recompile = state + .recompiles + .iter() + .enumerate() + .map(|(i, t)| (i, column_dist_sq(t.pos, cam))) + .min_by(|a, b| a.1.total_cmp(&b.1)); + let load_dist = state.loads.peek().map(|e| e.dist); if let Some((ri, rd)) = best_recompile { - let take_recompile = match best_initial { + let take_recompile = match load_dist { None => true, - Some((_, id)) => state.recompile_quota > 0 && rd < id, + Some(ld) => state.recompile_quota > 0 && rd < ld, }; if take_recompile { state.recompile_quota -= 1; - return Some(state.tasks.swap_remove(ri)); + return Some(state.recompiles.swap_remove(ri)); } } state.recompile_quota = MAX_RECOMPILE_QUOTA; - best_initial.map(|(ii, _)| state.tasks.swap_remove(ii)) + state.loads.pop().map(|e| e.job) } /// Run mesh workers below normal priority so the OS preempts them for the @@ -832,7 +883,7 @@ struct ChunkStoreSnapshot { ChunkPos, Option>>, )>, - light: std::collections::HashMap<(i32, i32), crate::world::chunk::ChunkLightData>, + light: std::collections::HashMap<(i32, i32), Arc>, grass_colormap: Arc, foliage_colormap: Arc, dry_foliage_colormap: Arc, @@ -1399,6 +1450,7 @@ fn mesh_chunk_snapshot( ChunkMeshData { pos, + min_y, sections, replaced: range, content_gen: 0, diff --git a/pomme-client/src/renderer/pipelines/chunk.rs b/pomme-client/src/renderer/pipelines/chunk.rs index 4b3844c3..c909ff62 100644 --- a/pomme-client/src/renderer/pipelines/chunk.rs +++ b/pomme-client/src/renderer/pipelines/chunk.rs @@ -257,9 +257,9 @@ fn 
build_pipeline( stages: &[vk::PipelineShaderStageCreateInfo], color_blend: &vk::PipelineColorBlendStateCreateInfo, ) -> vk::Pipeline { - use crate::renderer::chunk::mesher::ChunkVertex; - let binding_descs = [ChunkVertex::binding_description()]; - let attr_descs = ChunkVertex::attribute_descriptions(); + use crate::renderer::chunk::buffer::{chunk_vertex_attributes, chunk_vertex_bindings}; + let binding_descs = chunk_vertex_bindings(); + let attr_descs = chunk_vertex_attributes(); let vertex_input = vk::PipelineVertexInputStateCreateInfo { vertex_binding_description_count: binding_descs.len() as u32, vertex_binding_descriptions: binding_descs.as_ptr(), diff --git a/pomme-client/src/renderer/shaders/chunk.vert b/pomme-client/src/renderer/shaders/chunk.vert index 71879bd8..660a2f1a 100644 --- a/pomme-client/src/renderer/shaders/chunk.vert +++ b/pomme-client/src/renderer/shaders/chunk.vert @@ -8,9 +8,18 @@ layout(set = 0, binding = 0) uniform CameraUniform { vec4 fog_color; }; -layout(location = 0) in vec3 position; -layout(location = 1) in vec2 tex_coords; -layout(location = 2) in vec4 light_tint; +// Position is quantized section-local (unorm); rebased to world via the +// per-instance origin. Must match POS_RANGE / POS_BIAS in buffer.rs. 
+const float POS_RANGE = 24.0; +const float POS_BIAS = 4.0; + +layout(location = 0) in vec2 in_pos_xy; +layout(location = 1) in float in_pos_z; +layout(location = 2) in vec2 tex_coords; +layout(location = 3) in vec4 light_tint; +// Per-instance (from the meta buffer): +layout(location = 4) in vec3 in_origin; +layout(location = 5) in float in_fade; layout(location = 0) out vec2 v_tex_coords; layout(location = 1) out float v_light; @@ -20,12 +29,14 @@ layout(location = 4) out vec3 v_fog_color; layout(location = 5) out float v_fog; void main() { - vec3 rel = position - camera_pos.xyz; + vec3 local = vec3(in_pos_xy, in_pos_z) * POS_RANGE - POS_BIAS; + vec3 world = in_origin + local; + vec3 rel = world - camera_pos.xyz; gl_Position = view_proj * vec4(rel, 1.0); v_tex_coords = tex_coords; v_light = light_tint.r; v_tint = light_tint.gba; - v_visibility = uintBitsToFloat(gl_InstanceIndex); + v_visibility = in_fade; v_fog_color = fog_color.rgb; v_fog = fog_factor(rel, camera_pos.w, fog_color.w); } diff --git a/pomme-client/src/renderer/shaders/cull.comp b/pomme-client/src/renderer/shaders/cull.comp index 1584433f..321bc0a5 100644 --- a/pomme-client/src/renderer/shaders/cull.comp +++ b/pomme-client/src/renderer/shaders/cull.comp @@ -9,6 +9,7 @@ struct ChunkMeta { uint first_index; int vertex_offset; uint visibility; + vec4 origin; }; struct DrawCmd { @@ -64,5 +65,7 @@ void main() { draws[slot].instance_count = 1u; draws[slot].first_index = m.first_index; draws[slot].vertex_offset = m.vertex_offset; - draws[slot].first_instance = m.visibility; + // first_instance routes the draw to its meta entry (origin + fade) which the + // vertex shader reads as a per-instance attribute. 
+ draws[slot].first_instance = idx; } diff --git a/pomme-client/src/world/chunk.rs b/pomme-client/src/world/chunk.rs index 4cce73b7..bac76b06 100644 --- a/pomme-client/src/world/chunk.rs +++ b/pomme-client/src/world/chunk.rs @@ -76,7 +76,7 @@ impl ChunkLightData { pub struct ChunkStore { pub chunk_storage: ChunkStorage, pub partial_storage: PartialChunkStorage, - pub light_data: std::collections::HashMap<(i32, i32), ChunkLightData>, + pub light_data: std::collections::HashMap<(i32, i32), Arc>, pub block_entities: std::collections::HashMap, } @@ -94,11 +94,8 @@ impl ChunkStore { } } - pub fn loaded_positions(&self) -> Vec { - self.light_data - .keys() - .map(|&(x, z)| ChunkPos::new(x, z)) - .collect() + pub fn loaded_positions(&self) -> impl Iterator + '_ { + self.light_data.keys().map(|&(x, z)| ChunkPos::new(x, z)) } pub fn load_chunk( @@ -151,11 +148,11 @@ impl ChunkStore { self.light_data.insert( (pos.x, pos.z), - ChunkLightData { + Arc::new(ChunkLightData { sky_sections, block_sections, min_y: self.chunk_storage.min_y(), - }, + }), ); } From 99e179fef6ab12505d2b06cbbac33d935a447265 Mon Sep 17 00:00:00 2001 From: Purdze Date: Fri, 26 Jun 2026 21:11:01 +0100 Subject: [PATCH 3/4] cleanup --- pomme-client/src/renderer/chunk/buffer.rs | 25 ++++++++++++++++------- pomme-client/src/renderer/chunk/mesher.rs | 6 +++--- 2 files changed, 21 insertions(+), 10 deletions(-) diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs index a85d565c..5cf808b0 100644 --- a/pomme-client/src/renderer/chunk/buffer.rs +++ b/pomme-client/src/renderer/chunk/buffer.rs @@ -26,6 +26,9 @@ const MAX_BUCKETS: u32 = 2048; const VRAM_BUDGET_FRACTION: f64 = 0.25; /// Fade-in duration for a freshly revealed section (ms). const FADE_DURATION_MS: f32 = 1000.0; +/// Columns within this squared X/Z distance of the camera render opaque +/// immediately and never fade in. 
+const NEARBY_DIST_SQ: f32 = 768.0; /// First-fit free-list sub-allocator over a fixed element range, coalescing on /// free. Each section gets an exact-size vertex (and index) slice instead of @@ -676,6 +679,14 @@ impl ChunkBufferStore { self.last_draw_count } + /// Whether `pos`'s column is near enough to `cam` to render opaque + /// immediately (a nearby column never fades in). + fn column_nearby(&self, pos: ChunkPos, cam: [f32; 3]) -> bool { + let dx = pos.x as f32 * 16.0 + 8.0 - cam[0]; + let dz = pos.z as f32 * 16.0 + 8.0 - cam[2]; + !self.fade_enabled || dx * dx + dz * dz < NEARBY_DIST_SQ + } + /// Submit the accumulated staging copies as a single transfer and block on /// a fence until it completes. One fence wait per call replaces the old /// per-mesh `queue.wait_idle`, so a frame's uploads synchronize once @@ -1001,10 +1012,13 @@ impl ChunkBufferStore { // Freshly revealed sections fade in, so extend the fade window the // cull's O(1) check reads; re-meshed-only uploads swap instantly. - if plans + // Nearby columns never fade, so extending for them only forces + // redundant rebuilds — skip them. `last_sort_cam` is the camera the + // draw list is keyed to (unset => far, the safe default). + let revealed = plans .iter() - .any(|p| !was_present.contains(&p.section_index)) - { + .any(|p| !was_present.contains(&p.section_index)); + if revealed && !self.column_nearby(mesh.pos, self.last_sort_cam) { let dur = std::time::Duration::from_secs_f32(FADE_DURATION_MS / 1000.0); self.fade_until = self.fade_until.max(now + dur); } @@ -1202,7 +1216,6 @@ impl ChunkBufferStore { } let now = std::time::Instant::now(); - const NEARBY_DIST_SQ: f32 = 768.0; // Re-sort only once the camera moves ~8 blocks; front-to-back order is an // early-Z optimization, so finer staleness is harmless. 
const SORT_RECAM_SQ: f32 = 64.0; @@ -1218,9 +1231,7 @@ impl ChunkBufferStore { for (pos, alloc) in self.chunks.iter() { // Near columns never fade; otherwise each section fades on its own // timer (X/Z distance is per-column). - let dx = pos.x as f32 * 16.0 + 8.0 - camera_pos[0]; - let dz = pos.z as f32 * 16.0 + 8.0 - camera_pos[2]; - let nearby = !self.fade_enabled || dx * dx + dz * dz < NEARBY_DIST_SQ; + let nearby = self.column_nearby(*pos, camera_pos); // CPU omission: the visibility graph's mask skips sections proven // occluded, so they never reach the GPU cull (absent => all draw). diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs index 5cf40d84..2cf6abf4 100644 --- a/pomme-client/src/renderer/chunk/mesher.rs +++ b/pomme-client/src/renderer/chunk/mesher.rs @@ -472,8 +472,8 @@ impl MeshDispatcher { dry_foliage_colormap: Colormap, biome_climate: Arc>, ) -> Self { - // Bulk results are bounded for back-pressure; edits stay unbounded so a - // block edit never blocks a worker behind the load backlog. + // Bulk results are bounded for back-pressure; edit results use the + // unbounded priority channel so they never queue behind the load backlog. let (result_tx, result_rx) = crossbeam_channel::bounded(MAX_PENDING_RESULTS); let (priority_tx, priority_rx) = crossbeam_channel::unbounded(); @@ -483,7 +483,7 @@ impl MeshDispatcher { // some load throughput for that. 
let worker_count = std::thread::available_parallelism() .map(|n| (n.get() / 2).clamp(2, 16)) - .unwrap_or(1); + .unwrap_or(2); let mut workers = Vec::with_capacity(worker_count); for _ in 0..worker_count { let queue = Arc::clone(&queue); From 3799bb949cac844d6c55dc757650875c33c4b9a1 Mon Sep 17 00:00:00 2001 From: Purdze Date: Sat, 27 Jun 2026 09:07:24 +0100 Subject: [PATCH 4/4] Split terrain into solid and cutout passes to restore early-Z --- pomme-client/build.rs | 1 + pomme-client/src/renderer/chunk/atlas.rs | 17 +- pomme-client/src/renderer/chunk/buffer.rs | 192 +++++++++++++--- pomme-client/src/renderer/chunk/mesher.rs | 210 ++++++++---------- pomme-client/src/renderer/mod.rs | 8 +- pomme-client/src/renderer/pipelines/chunk.rs | 54 +++-- .../src/renderer/shaders/chunk_solid.frag | 36 +++ pomme-client/src/renderer/shaders/cull.comp | 38 +++- 8 files changed, 383 insertions(+), 173 deletions(-) create mode 100644 pomme-client/src/renderer/shaders/chunk_solid.frag diff --git a/pomme-client/build.rs b/pomme-client/build.rs index f759ea11..f8df4d78 100644 --- a/pomme-client/build.rs +++ b/pomme-client/build.rs @@ -37,6 +37,7 @@ fn main() { let shaders = [ ("chunk.vert", shaderc::ShaderKind::Vertex), ("chunk.frag", shaderc::ShaderKind::Fragment), + ("chunk_solid.frag", shaderc::ShaderKind::Fragment), ("cube.vert", shaderc::ShaderKind::Vertex), ("cube.frag", shaderc::ShaderKind::Fragment), ("panorama.vert", shaderc::ShaderKind::Vertex), diff --git a/pomme-client/src/renderer/chunk/atlas.rs b/pomme-client/src/renderer/chunk/atlas.rs index d336160f..b5a7bd24 100644 --- a/pomme-client/src/renderer/chunk/atlas.rs +++ b/pomme-client/src/renderer/chunk/atlas.rs @@ -14,6 +14,10 @@ pub struct AtlasRegion { pub v_min: f32, pub u_max: f32, pub v_max: f32, + /// Every level-0 texel is fully opaque (alpha 255), so quads using this + /// sprite can render in the no-discard solid pass (early-Z). Sprites with + /// any transparent texel are cutout and stay in the discard pass. 
+ pub opaque: bool, } #[derive(Clone)] @@ -149,7 +153,8 @@ impl TextureAtlas { for src in &sources { match placements.get(src.name.as_str()) { Some(Some((cx, cy))) => { - let region = pixel_region(*cx, *cy, src.w, src.h, atlas_size); + let mut region = pixel_region(*cx, *cy, src.w, src.h, atlas_size); + region.opaque = sprite_is_opaque(&src.data); for py in 0..src.h { for px in 0..src.w { let s = ((py * src.w + px) * 4) as usize; @@ -321,9 +326,19 @@ fn pixel_region(x: u32, y: u32, w: u32, h: u32, atlas_size: u32) -> AtlasRegion v_min: (y as f32 + INSET) / s, u_max: ((x + w) as f32 - INSET) / s, v_max: ((y + h) as f32 - INSET) / s, + // Filled in by the caller from the sprite's texels; the missing tile is a + // solid checker, so the geometric default is opaque. + opaque: true, } } +/// Whether every level-0 texel of an RGBA sprite is fully opaque (alpha 255). +/// Conservative: any transparency (or unknown) routes the sprite to the cutout +/// pass, so a hole never renders solid. +fn sprite_is_opaque(data: &[u8]) -> bool { + data.chunks_exact(4).all(|px| px[3] == 255) +} + type PackResult = (HashMap>, AtlasRegion); fn pack(sources: &[Source], atlas_size: u32) -> (PackResult, bool) { diff --git a/pomme-client/src/renderer/chunk/buffer.rs b/pomme-client/src/renderer/chunk/buffer.rs index 5cf808b0..46f1a437 100644 --- a/pomme-client/src/renderer/chunk/buffer.rs +++ b/pomme-client/src/renderer/chunk/buffer.rs @@ -136,8 +136,10 @@ struct ChunkMeta { first_index: u32, vertex_offset: i32, visibility: u32, - /// Section world origin; bound as a per-instance vertex attribute so the - /// vertex shader rebases the quantized local position. `[3]` is padding. + /// `[0..3]`: section world origin, bound as a per-instance vertex attribute + /// so the vertex shader rebases the quantized local position (it reads only + /// xyz). `[3]`: the section's `solid_index_count` reinterpreted as float + /// bits, read by the cull shader to split the solid/cutout draws. 
origin: [f32; 4], } @@ -275,6 +277,9 @@ struct SectionAlloc { origin: [f32; 3], first_index: u32, index_count: u32, + /// Leading indices belonging to the solid (no-discard) pass; the rest are + /// cutout. Passed to the GPU via `ChunkMeta.origin[3]`. + solid_index_count: u32, vertex_offset: i32, vtx_len: u32, uploaded_at: std::time::Instant, @@ -335,10 +340,17 @@ pub struct ChunkBufferStore { meta_buffers: Vec, meta_allocs: Vec, + // Solid (no-discard, early-Z) draw list, written by the cull shader. indirect_buffers: Vec, indirect_allocs: Vec, count_buffers: Vec, count_allocs: Vec, + // Cutout (discard) draw list. Same sections, the back of each section's + // index slice; drawn in a second pass after solid lays down depth. + indirect_cutout_buffers: Vec, + indirect_cutout_allocs: Vec, + count_cutout_buffers: Vec, + count_cutout_allocs: Vec, frustum_buffers: Vec, frustum_allocs: Vec, fade_enabled: bool, @@ -475,6 +487,10 @@ impl ChunkBufferStore { let mut indirect_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); let mut count_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); let mut count_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); + let mut indirect_cutout_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); + let mut indirect_cutout_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); + let mut count_cutout_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); + let mut count_cutout_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); let mut frustum_buffers = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); let mut frustum_allocs = Vec::with_capacity(MAX_FRAMES_IN_FLIGHT); @@ -509,6 +525,26 @@ impl ChunkBufferStore { count_buffers.push(b); count_allocs.push(a); + let (b, a) = util::create_host_buffer( + device, + allocator, + indirect_size, + vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::IndirectBuffer, + "indirect_cmds_cutout", + ); + indirect_cutout_buffers.push(b); + indirect_cutout_allocs.push(a); + + let (b, a) = util::create_host_buffer( + 
device, + allocator, + count_size, + vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::IndirectBuffer, + "draw_count_cutout", + ); + count_cutout_buffers.push(b); + count_cutout_allocs.push(a); + let (b, a) = util::create_host_buffer( device, allocator, @@ -557,7 +593,8 @@ impl ChunkBufferStore { let pool_sizes = [ vk::DescriptorPoolSize { ty: vk::DescriptorType::StorageBuffer, - descriptor_count: 3 * MAX_FRAMES_IN_FLIGHT as u32, + // meta + solid indirect/count + cutout indirect/count = 5 per frame. + descriptor_count: 5 * MAX_FRAMES_IN_FLIGHT as u32, }, vk::DescriptorPoolSize { ty: vk::DescriptorType::UniformBuffer, @@ -621,12 +658,37 @@ impl ChunkBufferStore { count_size, ); + let (indirect_c_info, mut indirect_c_write) = desc_write( + compute_sets[i], + 4, + vk::DescriptorType::StorageBuffer, + indirect_cutout_buffers[i], + indirect_size, + ); + + let (count_c_info, mut count_c_write) = desc_write( + compute_sets[i], + 5, + vk::DescriptorType::StorageBuffer, + count_cutout_buffers[i], + count_size, + ); + meta_write.buffer_info = meta_info.as_ptr(); frustum_write.buffer_info = frustum_info.as_ptr(); indirect_write.buffer_info = indirect_info.as_ptr(); count_write.buffer_info = count_info.as_ptr(); - - let writes = [meta_write, frustum_write, indirect_write, count_write]; + indirect_c_write.buffer_info = indirect_c_info.as_ptr(); + count_c_write.buffer_info = count_c_info.as_ptr(); + + let writes = [ + meta_write, + frustum_write, + indirect_write, + count_write, + indirect_c_write, + count_c_write, + ]; device.update_descriptor_sets(&writes, &[]); } @@ -664,6 +726,10 @@ impl ChunkBufferStore { indirect_allocs, count_buffers, count_allocs, + indirect_cutout_buffers, + indirect_cutout_allocs, + count_cutout_buffers, + count_cutout_allocs, frustum_buffers, frustum_allocs, fade_enabled: false, @@ -772,6 +838,7 @@ impl ChunkBufferStore { indices: &'a [u32], vtx_off: u32, idx_off: u32, + solid_index_count: u32, aabb: ChunkAABB, origin: [f32; 3], } @@ -915,6 
+982,7 @@ impl ChunkBufferStore { indices: &sec.indices, vtx_off, idx_off, + solid_index_count: sec.solid_index_count, aabb: section_aabb(&sec.vertices), origin: [ (mesh.pos.x * 16) as f32, @@ -998,6 +1066,7 @@ impl ChunkBufferStore { origin: p.origin, first_index: p.idx_off, index_count: p.indices.len() as u32, + solid_index_count: p.solid_index_count, vertex_offset: p.vtx_off as i32, vtx_len: p.verts.len() as u32, // A re-meshed section swaps instantly; a freshly revealed one fades in. @@ -1077,6 +1146,13 @@ impl ChunkBufferStore { std::mem::zeroed() })) .ok(); + device.destroy_buffer(self.indirect_cutout_buffers[i], None); + alloc + .free(std::mem::replace( + &mut self.indirect_cutout_allocs[i], + unsafe { std::mem::zeroed() }, + )) + .ok(); } } @@ -1103,6 +1179,16 @@ impl ChunkBufferStore { self.indirect_buffers[i] = b; self.indirect_allocs[i] = a; + let (b, a) = util::create_host_buffer( + device, + allocator, + indirect_size, + vk::BufferUsageFlags::StorageBuffer | vk::BufferUsageFlags::IndirectBuffer, + "indirect_cmds_cutout", + ); + self.indirect_cutout_buffers[i] = b; + self.indirect_cutout_allocs[i] = a; + let (meta_info, mut meta_write) = desc_write( self.compute_sets[i], 0, @@ -1117,9 +1203,17 @@ impl ChunkBufferStore { self.indirect_buffers[i], indirect_size, ); + let (indirect_c_info, mut indirect_c_write) = desc_write( + self.compute_sets[i], + 4, + vk::DescriptorType::StorageBuffer, + self.indirect_cutout_buffers[i], + indirect_size, + ); meta_write.buffer_info = meta_info.as_ptr(); indirect_write.buffer_info = indirect_info.as_ptr(); - device.update_descriptor_sets(&[meta_write, indirect_write], &[]); + indirect_c_write.buffer_info = indirect_c_info.as_ptr(); + device.update_descriptor_sets(&[meta_write, indirect_write, indirect_c_write], &[]); } self.max_meta = new_max; @@ -1254,7 +1348,12 @@ impl ChunkBufferStore { first_index: sec.first_index, vertex_offset: sec.vertex_offset, visibility: vis.to_bits(), - origin: [sec.origin[0], sec.origin[1], 
sec.origin[2], 0.0], + origin: [ + sec.origin[0], + sec.origin[1], + sec.origin[2], + f32::from_bits(sec.solid_index_count), + ], }); } } @@ -1314,22 +1413,28 @@ impl ChunkBufferStore { .copy_from_slice(frustum_bytes); // This frame slot's GPU work has completed (fence-waited at frame start), - // so the count buffer still holds its previous cull result; capture it for - // the debug overlay before clearing it for this dispatch. + // so the count buffers still hold their previous cull result; capture the + // total (solid + cutout draws) for the debug overlay before clearing them. { - let s = self.count_allocs[frame].mapped_slice_mut().unwrap(); - self.last_draw_count = u32::from_ne_bytes([s[0], s[1], s[2], s[3]]); + let read_and_clear = |a: &mut Allocation| { + let s = a.mapped_slice_mut().unwrap(); + let n = u32::from_ne_bytes([s[0], s[1], s[2], s[3]]); + s[..4].copy_from_slice(&0u32.to_ne_bytes()); + n + }; + self.last_draw_count = read_and_clear(&mut self.count_allocs[frame]) + + read_and_clear(&mut self.count_cutout_allocs[frame]); } - self.count_allocs[frame].mapped_slice_mut().unwrap()[..4] - .copy_from_slice(&0u32.to_ne_bytes()); // macOS draws the whole indirect buffer (no drawIndirectCount), so slots // the cull shader leaves unfilled must read as no-op draws, not stale data. #[cfg(target_os = "macos")] - self.indirect_allocs[frame] - .mapped_slice_mut() - .unwrap() - .fill(0); + for a in [ + &mut self.indirect_allocs[frame], + &mut self.indirect_cutout_allocs[frame], + ] { + a.mapped_slice_mut().unwrap().fill(0); + } cmd.bind_pipeline(vk::PipelineBindPoint::Compute, self.compute_pipeline); cmd.bind_descriptor_sets( @@ -1360,7 +1465,11 @@ impl ChunkBufferStore { } } - pub fn draw_indirect(&self, cmd: vk::CommandBuffer, frame: usize) { + /// Issue one render layer's indirect draws. `cutout` selects the discard + /// pass's draw list (drawn after `solid`, which lays down depth); the + /// caller binds the matching pipeline first. 
Both layers share the + /// vertex/index/meta buffers and the cull-written draw lists. + pub fn draw_indirect(&self, cmd: vk::CommandBuffer, frame: usize, cutout: bool) { if self.chunks.is_empty() { return; } @@ -1370,23 +1479,26 @@ impl ChunkBufferStore { .values() .map(|c| c.sections.len() as u32) .sum::(); + let (indirect, count) = if cutout { + ( + self.indirect_cutout_buffers[frame], + self.count_cutout_buffers[frame], + ) + } else { + (self.indirect_buffers[frame], self.count_buffers[frame]) + }; // Binding 0: packed vertex pool. Binding 1: the meta buffer, read per // instance for the section origin + fade (indexed by `first_instance`). cmd.bind_vertex_buffers(0, &[self.vertex_buffer, self.meta_buffers[frame]], &[0, 0]); cmd.bind_index_buffer(self.index_buffer, 0, vk::IndexType::Uint32); if cfg!(target_os = "macos") { - cmd.draw_indexed_indirect( - self.indirect_buffers[frame], - 0, - max_draws, - size_of::() as u32, - ); + cmd.draw_indexed_indirect(indirect, 0, max_draws, size_of::() as u32); } else { cmd.draw_indexed_indirect_count( - self.indirect_buffers[frame], + indirect, 0, - self.count_buffers[frame], + count, 0, max_draws, size_of::() as u32, @@ -1415,6 +1527,8 @@ impl ChunkBufferStore { device.destroy_buffer(self.meta_buffers[i], None); device.destroy_buffer(self.indirect_buffers[i], None); device.destroy_buffer(self.count_buffers[i], None); + device.destroy_buffer(self.indirect_cutout_buffers[i], None); + device.destroy_buffer(self.count_cutout_buffers[i], None); device.destroy_buffer(self.frustum_buffers[i], None); alloc @@ -1432,6 +1546,18 @@ impl ChunkBufferStore { std::mem::zeroed() })) .ok(); + alloc + .free(std::mem::replace( + &mut self.indirect_cutout_allocs[i], + unsafe { std::mem::zeroed() }, + )) + .ok(); + alloc + .free(std::mem::replace( + &mut self.count_cutout_allocs[i], + unsafe { std::mem::zeroed() }, + )) + .ok(); alloc .free(std::mem::replace(&mut self.frustum_allocs[i], unsafe { std::mem::zeroed() @@ -1485,6 +1611,20 @@ fn 
create_cull_desc_layout(device: &vk::Device) -> vk::DescriptorSetLayout { stage_flags: vk::ShaderStageFlags::Compute, ..Default::default() }, + vk::DescriptorSetLayoutBinding { + binding: 4, + descriptor_type: vk::DescriptorType::StorageBuffer, + descriptor_count: 1, + stage_flags: vk::ShaderStageFlags::Compute, + ..Default::default() + }, + vk::DescriptorSetLayoutBinding { + binding: 5, + descriptor_type: vk::DescriptorType::StorageBuffer, + descriptor_count: 1, + stage_flags: vk::ShaderStageFlags::Compute, + ..Default::default() + }, ]; let info = vk::DescriptorSetLayoutCreateInfo { binding_count: bindings.len() as u32, diff --git a/pomme-client/src/renderer/chunk/mesher.rs b/pomme-client/src/renderer/chunk/mesher.rs index 2cf6abf4..4856397a 100644 --- a/pomme-client/src/renderer/chunk/mesher.rs +++ b/pomme-client/src/renderer/chunk/mesher.rs @@ -94,7 +94,35 @@ pub struct SectionMesh { /// per-section upload/replace. pub section_index: i32, pub vertices: Vec, + /// Solid (opaque) indices first, then cutout indices. `solid_index_count` + /// splits the two so each renders in its own pass. pub indices: Vec, + /// Number of leading `indices` that belong to the solid (no-discard) pass; + /// the rest are cutout (discard) geometry. + pub solid_index_count: u32, +} + +/// Per-section meshing accumulator: one shared vertex pool plus separate solid +/// and cutout index lists (routed per quad by sprite opacity). Finalized into a +/// [`SectionMesh`] with the two index lists concatenated solid-first. +#[derive(Default)] +struct MeshSink { + vertices: Vec, + solid: Vec, + cutout: Vec, +} + +impl MeshSink { + /// Index list a quad's triangles go in: solid sprites render in the + /// no-discard pass, everything else (cutout/translucent) in the discard + /// pass. 
+ fn indices_for(&mut self, opaque: bool) -> &mut Vec { + if opaque { + &mut self.solid + } else { + &mut self.cutout + } + } } pub struct ChunkMeshData { @@ -1249,19 +1277,14 @@ fn mesh_chunk_snapshot( let by_start = min_y + range.start * 16; let by_end = min_y + range.end * 16; - let mut sections: Vec = (0..section_count) - .map(|i| SectionMesh { - section_index: i, - vertices: Vec::new(), - indices: Vec::new(), - }) - .collect(); + let mut sinks: Vec = (0..section_count).map(|_| MeshSink::default()).collect(); // In-range sections get recycled buffers (capacity retained from earlier // meshes) so the worker fills them without going through the OS allocator. + // The cutout list stays un-pooled (empty for the common all-solid section). for si in range.clone() { - let sec = &mut sections[si as usize]; - sec.vertices = pool.take_vertices(); - sec.indices = pool.take_indices(); + let sink = &mut sinks[si as usize]; + sink.vertices = pool.take_vertices(); + sink.solid = pool.take_indices(); } // The type map is a state->id map, so it only needs the meshed span (+1-block @@ -1276,11 +1299,11 @@ fn mesh_chunk_snapshot( let mut visibility: Vec<(i32, VisibilitySet)> = Vec::new(); if let Some(ref tm) = type_map { for si in range.clone() { - let sec = &mut sections[si as usize]; + let sink = &mut sinks[si as usize]; let section_y = min_y + si * 16; let vis = greedy_mesh_section( - &mut sec.vertices, - &mut sec.indices, + &mut sink.vertices, + &mut sink.solid, snapshot, registry, tm, @@ -1344,73 +1367,27 @@ fn mesh_chunk_snapshot( // a non-16-aligned world height can't index past the last section. 
let s = (((by - min_y) / 16) as usize).min((section_count as usize).saturating_sub(1)); - let sec = &mut sections[s]; + let sink = &mut sinks[s]; if lod > 0 { emit_lod_cube( - &mut sec.vertices, - &mut sec.indices, - block_pos, - state, - snapshot, - registry, - uv_map, - bx, - by, - bz, - step, + sink, block_pos, state, snapshot, registry, uv_map, bx, by, bz, step, ); } else if let BlockKind::Water | BlockKind::Lava = kind { emit_fluid( - &mut sec.vertices, - &mut sec.indices, - block_pos, - state, - snapshot, - registry, - uv_map, - bx, - by, - bz, + sink, block_pos, state, snapshot, registry, uv_map, bx, by, bz, ); } else if let Some(baked) = registry.get_baked_model(state) { emit_baked_model( - &mut sec.vertices, - &mut sec.indices, - block_pos, - baked, - snapshot, - registry, - uv_map, - bx, - by, - bz, + sink, block_pos, baked, snapshot, registry, uv_map, bx, by, bz, ); } else if let Some(quads) = registry.get_multipart_quads(state) { emit_multipart( - &mut sec.vertices, - &mut sec.indices, - block_pos, - &quads, - snapshot, - registry, - uv_map, - bx, - by, - bz, + sink, block_pos, &quads, snapshot, registry, uv_map, bx, by, bz, ); } else if let Some(textures) = registry.get_textures(state) { emit_cube_faces( - &mut sec.vertices, - &mut sec.indices, - block_pos, - textures, - snapshot, - registry, - uv_map, - bx, - by, - bz, + sink, block_pos, textures, snapshot, registry, uv_map, bx, by, bz, ); } else { let block: Box = state.into(); @@ -1418,16 +1395,7 @@ fn mesh_chunk_snapshot( if logged_missing.insert(id.clone()) { tracing::warn!("Missing model: {id}"); } - emit_missing_cube( - &mut sec.vertices, - &mut sec.indices, - block_pos, - snapshot, - registry, - bx, - by, - bz, - ); + emit_missing_cube(sink, block_pos, snapshot, registry, bx, by, bz); } by += step; } @@ -1436,17 +1404,24 @@ fn mesh_chunk_snapshot( local_z += step; } - // Keep only non-empty sections; recycle the buffers of in-range sections that - // ended up empty (rather than dropping their 
retained capacity). - let mut kept = Vec::with_capacity(sections.len()); - for s in sections { - if s.vertices.is_empty() || s.indices.is_empty() { - pool.recycle(s.vertices, s.indices); - } else { - kept.push(s); + // Finalize each non-empty section: concatenate cutout indices after solid + // (recording the split), keep the result. Empty in-range sections recycle + // their buffers rather than dropping the retained capacity. + let mut sections = Vec::with_capacity(sinks.len()); + for (i, mut sink) in sinks.into_iter().enumerate() { + if sink.solid.is_empty() && sink.cutout.is_empty() { + pool.recycle(sink.vertices, sink.solid); + continue; } + let solid_index_count = sink.solid.len() as u32; + sink.solid.extend_from_slice(&sink.cutout); + sections.push(SectionMesh { + section_index: i as i32, + vertices: sink.vertices, + indices: sink.solid, + solid_index_count, + }); } - let sections = kept; ChunkMeshData { pos, @@ -1462,8 +1437,7 @@ fn mesh_chunk_snapshot( #[allow(clippy::too_many_arguments)] fn emit_baked_model( - vertices: &mut Vec, - indices: &mut Vec, + sink: &mut MeshSink, block_pos: [f32; 3], model: &BakedModel, snapshot: &ChunkStoreSnapshot, @@ -1495,8 +1469,7 @@ fn emit_baked_model( [quad.shade_light; 4] }; emit_face( - vertices, - indices, + sink, block_pos, &quad.positions, &quad.uvs, @@ -1509,8 +1482,7 @@ fn emit_baked_model( #[allow(clippy::too_many_arguments)] fn emit_cube_faces( - vertices: &mut Vec, - indices: &mut Vec, + sink: &mut MeshSink, block_pos: [f32; 3], textures: &crate::world::block::registry::FaceTextures, snapshot: &ChunkStoreSnapshot, @@ -1549,8 +1521,7 @@ fn emit_cube_faces( let is_side = i >= 2; if let Some(overlay) = textures.side_overlay.as_deref().filter(|_| is_side) { emit_face( - vertices, - indices, + sink, block_pos, &positions, &uvs, @@ -1560,8 +1531,7 @@ fn emit_cube_faces( ); let overlay_region = uv_map.get_region(overlay); emit_face( - vertices, - indices, + sink, block_pos, &positions, &uvs, @@ -1577,9 +1547,7 @@ fn 
emit_cube_faces( } else { PACKED_WHITE_SHIFTED }; - emit_face( - vertices, indices, block_pos, &positions, &uvs, lights, region, face_tint, - ); + emit_face(sink, block_pos, &positions, &uvs, lights, region, face_tint); } } } @@ -1656,8 +1624,7 @@ fn block_face_tex_tint( #[allow(clippy::too_many_arguments)] fn emit_fluid( - vertices: &mut Vec, - indices: &mut Vec, + sink: &mut MeshSink, block_pos: [f32; 3], state: azalea_block::BlockState, snapshot: &ChunkStoreSnapshot, @@ -1694,16 +1661,13 @@ fn emit_fluid( } } - emit_face( - vertices, indices, block_pos, &positions, &uvs, [light; 4], region, tint, - ); + emit_face(sink, block_pos, &positions, &uvs, [light; 4], region, tint); } } #[allow(clippy::too_many_arguments)] fn emit_multipart( - vertices: &mut Vec, - indices: &mut Vec, + sink: &mut MeshSink, block_pos: [f32; 3], quads: &[&crate::world::block::model::BakedQuad], snapshot: &ChunkStoreSnapshot, @@ -1730,8 +1694,7 @@ fn emit_multipart( snapshot.dry_foliage_tint(bx, by, bz), ); emit_face( - vertices, - indices, + sink, block_pos, &quad.positions, &quad.uvs, @@ -1744,8 +1707,7 @@ fn emit_multipart( #[allow(clippy::too_many_arguments)] fn emit_lod_cube( - vertices: &mut Vec, - indices: &mut Vec, + sink: &mut MeshSink, block_pos: [f32; 3], state: azalea_block::BlockState, snapshot: &ChunkStoreSnapshot, @@ -1788,9 +1750,9 @@ fn emit_lod_cube( let (positions, uvs, light) = cube_face_geometry(*dir); let s = step as f32; let sy = if is_fluid { fluid_top } else { s }; - let base = vertices.len() as u32; + let base = sink.vertices.len() as u32; for i in 0..4 { - vertices.push(ChunkVertex { + sink.vertices.push(ChunkVertex { position: [ block_pos[0] + positions[i][0] * s, block_pos[1] + positions[i][1] * sy, @@ -1803,7 +1765,14 @@ fn emit_lod_cube( light_tint: pack_light_tint(light, tint), }); } - indices.extend_from_slice(&[base, base + 1, base + 2, base + 2, base + 3, base]); + sink.indices_for(region.opaque).extend_from_slice(&[ + base, + base + 1, + base + 2, + base 
+ 2, + base + 3, + base, + ]); } } @@ -1811,8 +1780,7 @@ const MISSING_TINT: u32 = pack_tint_shifted([1.0, 0.0, 1.0]); #[allow(clippy::too_many_arguments)] fn emit_missing_cube( - vertices: &mut Vec, - indices: &mut Vec, + sink: &mut MeshSink, block_pos: [f32; 3], snapshot: &ChunkStoreSnapshot, registry: &BlockRegistry, @@ -1828,9 +1796,9 @@ fn emit_missing_cube( } let (positions, _, light) = cube_face_geometry(*dir); - let base = vertices.len() as u32; + let base = sink.vertices.len() as u32; for pos in &positions { - vertices.push(ChunkVertex { + sink.vertices.push(ChunkVertex { position: [ block_pos[0] + pos[0], block_pos[1] + pos[1], @@ -1840,7 +1808,9 @@ fn emit_missing_cube( light_tint: pack_light_tint(light, MISSING_TINT), }); } - indices.extend_from_slice(&[base, base + 1, base + 2, base + 2, base + 3, base]); + // The missing tile is a solid checker, so the cube goes in the solid pass. + sink.solid + .extend_from_slice(&[base, base + 1, base + 2, base + 2, base + 3, base]); } } @@ -1855,8 +1825,7 @@ pub(crate) const CUBE_FACE_DIRS: [Direction; 6] = [ #[allow(clippy::too_many_arguments)] fn emit_face( - vertices: &mut Vec, - indices: &mut Vec, + sink: &mut MeshSink, block_pos: [f32; 3], positions: &[[f32; 3]; 4], uvs: &[[f32; 2]; 4], @@ -1864,12 +1833,12 @@ fn emit_face( region: AtlasRegion, tint: u32, ) { - let base = vertices.len() as u32; + let base = sink.vertices.len() as u32; let u_span = region.u_max - region.u_min; let v_span = region.v_max - region.v_min; for i in 0..4 { - vertices.push(ChunkVertex { + sink.vertices.push(ChunkVertex { position: [ block_pos[0] + positions[i][0], block_pos[1] + positions[i][1], @@ -1883,6 +1852,7 @@ fn emit_face( }); } + let indices = sink.indices_for(region.opaque); if lights[0] + lights[2] > lights[1] + lights[3] { indices.extend_from_slice(&[base + 1, base + 2, base + 3, base + 3, base, base + 1]); } else { diff --git a/pomme-client/src/renderer/mod.rs b/pomme-client/src/renderer/mod.rs index a941b612..eb8a8c8a 
100644 --- a/pomme-client/src/renderer/mod.rs +++ b/pomme-client/src/renderer/mod.rs @@ -1439,8 +1439,12 @@ impl Renderer { .update_and_draw(&self.ctx.device, cmd, frame, &self.camera, sky); let t_cull = std::time::Instant::now(); - self.chunk_pipeline.bind(cmd, frame); - self.chunk_buffers.draw_indirect(cmd, frame); + // Solid (no discard) first so it lays down depth and early-Z lets + // the front-to-back order reject occluded fragments; cutout after. + self.chunk_pipeline.bind(cmd, frame, false); + self.chunk_buffers.draw_indirect(cmd, frame, false); + self.chunk_pipeline.bind(cmd, frame, true); + self.chunk_buffers.draw_indirect(cmd, frame, true); let cull_ms = t_cull.elapsed().as_secs_f32() * 1000.0; if let Some((block_pos, stage, state)) = destroy_info { diff --git a/pomme-client/src/renderer/pipelines/chunk.rs b/pomme-client/src/renderer/pipelines/chunk.rs index c909ff62..1320c504 100644 --- a/pomme-client/src/renderer/pipelines/chunk.rs +++ b/pomme-client/src/renderer/pipelines/chunk.rs @@ -9,7 +9,10 @@ use crate::renderer::chunk::atlas::TextureAtlas; use crate::renderer::{MAX_FRAMES_IN_FLIGHT, shader, util}; pub struct ChunkPipeline { - pub pipeline: vk::Pipeline, + /// Opaque terrain: no discard, early-Z. Drawn first (front-to-back). + pub pipeline_solid: vk::Pipeline, + /// Cutout/translucent terrain: alpha-test discard. Drawn after solid. 
+ pub pipeline_cutout: vk::Pipeline, pub pipeline_layout: vk::PipelineLayout, pub descriptor_set_layout_camera: vk::DescriptorSetLayout, pub descriptor_set_layout_atlas: vk::DescriptorSetLayout, @@ -48,7 +51,8 @@ impl ChunkPipeline { .create_pipeline_layout(&layout_info, None) .expect("failed to create pipeline layout"); - let pipeline = create_pipeline(device, render_pass, pipeline_layout); + let (pipeline_solid, pipeline_cutout) = + create_pipelines(device, render_pass, pipeline_layout); let pool_sizes = [ vk::DescriptorPoolSize { @@ -139,7 +143,8 @@ impl ChunkPipeline { device.update_descriptor_sets(&[atlas_write], &[]); Self { - pipeline, + pipeline_solid, + pipeline_cutout, pipeline_layout, descriptor_set_layout_camera: camera_layout, descriptor_set_layout_atlas: atlas_layout, @@ -174,8 +179,13 @@ impl ChunkPipeline { device.update_descriptor_sets(&[write], &[]); } - pub fn bind(&self, cmd: vk::CommandBuffer, frame: usize) { - cmd.bind_pipeline(vk::PipelineBindPoint::Graphics, self.pipeline); + pub fn bind(&self, cmd: vk::CommandBuffer, frame: usize, cutout: bool) { + let pipeline = if cutout { + self.pipeline_cutout + } else { + self.pipeline_solid + }; + cmd.bind_pipeline(vk::PipelineBindPoint::Graphics, pipeline); cmd.bind_descriptor_sets( vk::PipelineBindPoint::Graphics, self.pipeline_layout, @@ -197,7 +207,8 @@ impl ChunkPipeline { } drop(alloc); - device.destroy_pipeline(self.pipeline, None); + device.destroy_pipeline(self.pipeline_solid, None); + device.destroy_pipeline(self.pipeline_cutout, None); device.destroy_pipeline_layout(self.pipeline_layout, None); device.destroy_descriptor_pool(self.descriptor_pool, None); device.destroy_descriptor_set_layout(self.descriptor_set_layout_camera, None); @@ -217,20 +228,21 @@ fn shader_stage( } } -fn create_pipeline( +/// Builds the two chunk pipelines: `solid` (chunk_solid.frag, no discard, +/// early-Z) and `cutout` (chunk.frag, alpha-test discard). 
Identical state +/// otherwise; both share the vertex shader and layout. +fn create_pipelines( device: &vk::Device, render_pass: vk::RenderPass, layout: vk::PipelineLayout, -) -> vk::Pipeline { +) -> (vk::Pipeline, vk::Pipeline) { let vert_module = shader::create_shader_module(device, shader::include_spirv!("chunk.vert.spv")); - let frag_module = + let solid_frag = + shader::create_shader_module(device, shader::include_spirv!("chunk_solid.frag.spv")); + let cutout_frag = shader::create_shader_module(device, shader::include_spirv!("chunk.frag.spv")); - let stages = [ - shader_stage(vk::ShaderStageFlags::Vertex, vert_module), - shader_stage(vk::ShaderStageFlags::Fragment, frag_module), - ]; let blend_attachment = [vk::PipelineColorBlendAttachmentState { blend_enable: vk::FALSE, color_write_mask: vk::ColorComponentFlags::RGBA, @@ -242,10 +254,20 @@ fn create_pipeline( ..Default::default() }; - let pipeline = build_pipeline(device, render_pass, layout, &stages, &color_blend); + let build = |frag| { + let stages = [ + shader_stage(vk::ShaderStageFlags::Vertex, vert_module), + shader_stage(vk::ShaderStageFlags::Fragment, frag), + ]; + build_pipeline(device, render_pass, layout, &stages, &color_blend) + }; + let pipeline_solid = build(solid_frag); + let pipeline_cutout = build(cutout_frag); + device.destroy_shader_module(vert_module, None); - device.destroy_shader_module(frag_module, None); - pipeline + device.destroy_shader_module(solid_frag, None); + device.destroy_shader_module(cutout_frag, None); + (pipeline_solid, pipeline_cutout) } /// Shared chunk pipeline state; callers supply the shader stages and diff --git a/pomme-client/src/renderer/shaders/chunk_solid.frag b/pomme-client/src/renderer/shaders/chunk_solid.frag new file mode 100644 index 00000000..250aa6f3 --- /dev/null +++ b/pomme-client/src/renderer/shaders/chunk_solid.frag @@ -0,0 +1,36 @@ +#version 450 + +// Solid (opaque) terrain pass. 
Unlike chunk.frag this has no `discard`, so the +// driver keeps early-Z: with the front-to-back draw order, fragments occluded by +// nearer terrain are rejected before this shader runs. `early_fragment_tests` +// makes that explicit. Only sprites with no transparent texels are routed here +// (see AtlasRegion::opaque), so the alpha test chunk.frag does is unnecessary. +layout(early_fragment_tests) in; + +#include "fog.glsl" + +layout(set = 1, binding = 0) uniform sampler2D atlas_texture; + +layout(location = 0) in vec2 v_tex_coords; +layout(location = 1) in float v_light; +layout(location = 2) in vec3 v_tint; +layout(location = 3) flat in float v_visibility; +layout(location = 4) in vec3 v_fog_color; +layout(location = 5) in float v_fog; + +layout(location = 0) out vec4 out_color; + +void main() { + vec4 color = texture(atlas_texture, v_tex_coords); + vec3 linear_tint = pow(v_tint, vec3(2.2)); + float linear_light = pow(v_light, 2.2); + vec3 tinted = color.rgb * linear_tint * linear_light; + + if (v_visibility < 1.0) { + tinted = mix(v_fog_color, tinted, v_visibility); + } + + tinted = apply_fog(tinted, v_fog, v_fog_color); + + out_color = vec4(tinted, 1.0); +} diff --git a/pomme-client/src/renderer/shaders/cull.comp b/pomme-client/src/renderer/shaders/cull.comp index 321bc0a5..2d311c7d 100644 --- a/pomme-client/src/renderer/shaders/cull.comp +++ b/pomme-client/src/renderer/shaders/cull.comp @@ -40,6 +40,32 @@ layout(set = 0, binding = 3) buffer CountBuf { uint draw_count; }; +layout(set = 0, binding = 4) writeonly buffer IndirectCutoutBuf { + DrawCmd cutout_draws[]; +}; + +layout(set = 0, binding = 5) buffer CutoutCountBuf { + uint cutout_draw_count; +}; + +// Emit one indexed draw covering [first..first+count) for this section. The +// per-instance attributes (origin + fade) come from meta entry `idx`, routed via +// first_instance. 
+void emit(uint count, uint first, int vertex_offset, uint idx, bool cutout) { + if (count == 0u) return; + DrawCmd d; + d.index_count = count; + d.instance_count = 1u; + d.first_index = first; + d.vertex_offset = vertex_offset; + d.first_instance = idx; + if (cutout) { + cutout_draws[atomicAdd(cutout_draw_count, 1u)] = d; + } else { + draws[atomicAdd(draw_count, 1u)] = d; + } +} + void main() { uint idx = gl_GlobalInvocationID.x; if (idx >= chunk_count) return; @@ -60,12 +86,8 @@ void main() { if (d < 0.0) return; } - uint slot = atomicAdd(draw_count, 1u); - draws[slot].index_count = m.index_count; - draws[slot].instance_count = 1u; - draws[slot].first_index = m.first_index; - draws[slot].vertex_offset = m.vertex_offset; - // first_instance routes the draw to its meta entry (origin + fade) which the - // vertex shader reads as a per-instance attribute. - draws[slot].first_instance = idx; + // origin.w packs the section's solid index count; indices split solid-first. + uint solid_count = floatBitsToUint(m.origin.w); + emit(solid_count, m.first_index, m.vertex_offset, idx, false); + emit(m.index_count - solid_count, m.first_index + solid_count, m.vertex_offset, idx, true); }