diff --git a/.gitignore b/.gitignore
index bdb326f..83f036b 100644
--- a/.gitignore
+++ b/.gitignore
@@ -46,6 +46,10 @@ docs/package-lock.json
 # the .bin into docs/notebooks/data/xcat/ (default path, symlink-friendly).
 docs/notebooks/data/
 
+# Pluto cell-output caches written by PlutoSpace / pluto-collab — large
+# (tens of MB), regenerated on every run, never tracked.
+*.pluto-cache.toml
+
 # File generated by Pkg, the package manager, based on a corresponding Project.toml
 # It records a fixed state of all packages used by the project. As such, it should not be
 # committed for packages, but should be committed for applications that require a static
diff --git a/docs/assets/dual_kvp_vmi_projection_grid.png b/docs/assets/dual_kvp_vmi_projection_grid.png
index bdb8228..62cc65a 100644
Binary files a/docs/assets/dual_kvp_vmi_projection_grid.png and b/docs/assets/dual_kvp_vmi_projection_grid.png differ
diff --git a/docs/assets/dual_kvp_vmi_projection_monoplus.png b/docs/assets/dual_kvp_vmi_projection_monoplus.png
index d618b36..7cafdc6 100644
Binary files a/docs/assets/dual_kvp_vmi_projection_monoplus.png and b/docs/assets/dual_kvp_vmi_projection_monoplus.png differ
diff --git a/docs/assets/dual_kvp_vmi_water_noise_vs_energy.png b/docs/assets/dual_kvp_vmi_water_noise_vs_energy.png
index 69a9bb5..1b70eef 100644
Binary files a/docs/assets/dual_kvp_vmi_water_noise_vs_energy.png and b/docs/assets/dual_kvp_vmi_water_noise_vs_energy.png differ
diff --git a/docs/assets/pcct_vmi_projection_grid.png b/docs/assets/pcct_vmi_projection_grid.png
index 60a4fb6..c96932c 100644
Binary files a/docs/assets/pcct_vmi_projection_grid.png and b/docs/assets/pcct_vmi_projection_grid.png differ
diff --git a/docs/assets/pcct_vmi_projection_monoplus.png b/docs/assets/pcct_vmi_projection_monoplus.png
index c8efd6f..9b9e79d 100644
Binary files a/docs/assets/pcct_vmi_projection_monoplus.png and b/docs/assets/pcct_vmi_projection_monoplus.png differ
diff --git a/docs/assets/pcct_vmi_regression.png b/docs/assets/pcct_vmi_regression.png
index 5bd40a6..d24068c 100644
Binary files a/docs/assets/pcct_vmi_regression.png and b/docs/assets/pcct_vmi_regression.png differ
diff --git a/docs/assets/pcct_vmi_vs_theoretical.png b/docs/assets/pcct_vmi_vs_theoretical.png
index fc581aa..2ade936 100644
Binary files a/docs/assets/pcct_vmi_vs_theoretical.png and b/docs/assets/pcct_vmi_vs_theoretical.png differ
diff --git a/docs/assets/pcct_vmi_water_noise_vs_energy.png b/docs/assets/pcct_vmi_water_noise_vs_energy.png
index fd81119..1e385af 100644
Binary files a/docs/assets/pcct_vmi_water_noise_vs_energy.png and b/docs/assets/pcct_vmi_water_noise_vs_energy.png differ
diff --git a/docs/assets/pcct_vmi_water_roi_check.png b/docs/assets/pcct_vmi_water_roi_check.png
index e0fba4d..3f95b20 100644
Binary files a/docs/assets/pcct_vmi_water_roi_check.png and b/docs/assets/pcct_vmi_water_roi_check.png differ
diff --git a/docs/assets/qrm_thorax_vmi_noise_roi.png b/docs/assets/qrm_thorax_vmi_noise_roi.png
index d2b630b..4bafd90 100644
Binary files a/docs/assets/qrm_thorax_vmi_noise_roi.png and b/docs/assets/qrm_thorax_vmi_noise_roi.png differ
diff --git a/docs/assets/qrm_thorax_vmi_vs_theoretical.png b/docs/assets/qrm_thorax_vmi_vs_theoretical.png
index d85621e..dbe4122 100644
Binary files a/docs/assets/qrm_thorax_vmi_vs_theoretical.png and b/docs/assets/qrm_thorax_vmi_vs_theoretical.png differ
diff --git a/docs/assets/qrm_thorax_vmi_water_roi_check.png b/docs/assets/qrm_thorax_vmi_water_roi_check.png
index b1ef2b7..d01ccf3 100644
Binary files a/docs/assets/qrm_thorax_vmi_water_roi_check.png and b/docs/assets/qrm_thorax_vmi_water_roi_check.png differ
diff --git a/docs/assets/recon_compare_4panel.png b/docs/assets/recon_compare_4panel.png
index 5dfe96f..42b630b 100644
Binary files a/docs/assets/recon_compare_4panel.png and b/docs/assets/recon_compare_4panel.png differ
diff --git a/docs/assets/sinogram_standard.png b/docs/assets/sinogram_standard.png
index e6c1651..c49b028 100644
Binary files a/docs/assets/sinogram_standard.png and b/docs/assets/sinogram_standard.png differ
diff --git a/docs/assets/vmi_regression.png b/docs/assets/vmi_regression.png
index c36d036..e9d608c 100644
Binary files a/docs/assets/vmi_regression.png and b/docs/assets/vmi_regression.png differ
diff --git a/docs/assets/vmi_vs_theoretical.png b/docs/assets/vmi_vs_theoretical.png
index 71e3660..c782a5b 100644
Binary files a/docs/assets/vmi_vs_theoretical.png and b/docs/assets/vmi_vs_theoretical.png differ
diff --git a/docs/assets/vmi_water_roi_check.png b/docs/assets/vmi_water_roi_check.png
index a90ca81..26f4c15 100644
Binary files a/docs/assets/vmi_water_roi_check.png and b/docs/assets/vmi_water_roi_check.png differ
diff --git a/docs/assets/xcat_fbp_vs_hir.png b/docs/assets/xcat_fbp_vs_hir.png
index 1f836b4..681b89c 100644
Binary files a/docs/assets/xcat_fbp_vs_hir.png and b/docs/assets/xcat_fbp_vs_hir.png differ
diff --git a/docs/notebooks/01_five_struct_api.jl b/docs/notebooks/01_five_struct_api.jl
index cb1a8ed..72d3a50 100644
--- a/docs/notebooks/01_five_struct_api.jl
+++ b/docs/notebooks/01_five_struct_api.jl
@@ -1,5 +1,8 @@
 ### A Pluto.jl notebook ###
-# v0.19.0
+# v0.1.0
+
+using Markdown
+using InteractiveUtils
 
 # ╔═╡ 01000003-0000-4000-8000-000000000001
 begin
@@ -47,6 +50,7 @@ detect a GPU backend.
 import BasisSimulator as BS
 
 # ╔═╡ 01000003-0000-4000-8000-000000000003
+# ╠═╡ show_logs = false
 import CairoMakie as CM
 
 # ╔═╡ 01000004-0000-4000-8000-000000000001
diff --git a/docs/notebooks/02_xcat_custom_materials.jl b/docs/notebooks/02_xcat_custom_materials.jl
index 7097d53..0a901ab 100644
--- a/docs/notebooks/02_xcat_custom_materials.jl
+++ b/docs/notebooks/02_xcat_custom_materials.jl
@@ -1,5 +1,5 @@
 ### A Pluto.jl notebook ###
-# v0.20.24
+# v0.1.0
 
 using Markdown
 using InteractiveUtils
diff --git a/docs/notebooks/03_dual_kvp_switching_vmi.jl b/docs/notebooks/03_dual_kvp_switching_vmi.jl
index 68d5ace..7d8f78f 100644
--- a/docs/notebooks/03_dual_kvp_switching_vmi.jl
+++ b/docs/notebooks/03_dual_kvp_switching_vmi.jl
@@ -1,5 +1,8 @@
 ### A Pluto.jl notebook ###
-# v0.19.0
+# v0.1.0
+
+using Markdown
+using InteractiveUtils
 
 # ╔═╡ 05000001-0000-4000-8000-000000000001
 begin
@@ -307,7 +310,7 @@ subspace `U[:,2]` with a small separable Gaussian.  One knob:
 """
 
 # ╔═╡ 05000007-0000-4000-8000-000000000005
-SVD_SIGMA_PX = 1.0;   # Gaussian σ (px) for U[:,2]; 0 = passthrough (no denoising)
+SVD_SIGMA_PX = 0.0;   # Gaussian σ (px) for U[:,2]; 0 = passthrough (no denoising)
 
 # ╔═╡ 05000007-0000-4000-8000-000000000010
 # 2-channel joint projection-domain SVD denoise (BS.apply_sino_svd_denoise).
diff --git a/docs/notebooks/04_pcct_vmi.jl b/docs/notebooks/04_pcct_vmi.jl
index 72fad8c..a5efc54 100644
--- a/docs/notebooks/04_pcct_vmi.jl
+++ b/docs/notebooks/04_pcct_vmi.jl
@@ -1,5 +1,8 @@
 ### A Pluto.jl notebook ###
-# v0.19.0
+# v0.1.0
+
+using Markdown
+using InteractiveUtils
 
 # ╔═╡ 06000001-0000-4000-8000-000000000001
 begin
diff --git a/docs/notebooks/07_qrm_thorax_pure_material_vmi.jl b/docs/notebooks/07_qrm_thorax_pure_material_vmi.jl
index cbd9a29..c92194b 100644
--- a/docs/notebooks/07_qrm_thorax_pure_material_vmi.jl
+++ b/docs/notebooks/07_qrm_thorax_pure_material_vmi.jl
@@ -1,5 +1,8 @@
 ### A Pluto.jl notebook ###
-# v0.19.0
+# v0.1.0
+
+using Markdown
+using InteractiveUtils
 
 # ╔═╡ 07010003-0000-4000-8000-000000000001
 begin
diff --git a/src/BasisSimulator.jl b/src/BasisSimulator.jl
index a04a635..47cb35d 100644
--- a/src/BasisSimulator.jl
+++ b/src/BasisSimulator.jl
@@ -79,6 +79,11 @@ include("geometry/affine.jl")
 # Reference: CERN/TIGRE/Common/CUDA/Siddon_projection.cu
 include("projection/siddon.jl")
 
+# Distance-driven forward projection (CatSim/XCIST DD3, gather form)
+# Reference: De Man & Basu 2004; gecatsim DD3Proj_roi_notrans_mm.cpp
+# Drop-in replacement for Siddon (same signatures, cm units, Σμ·l output).
+include("projection/dd.jl")
+
 # =============================================================================
 # Detector (ALL detector effects + noise)
 # =============================================================================
diff --git a/src/api/driver.jl b/src/api/driver.jl
index 5c7a68b..c35db8b 100644
--- a/src/api/driver.jl
+++ b/src/api/driver.jl
@@ -1321,7 +1321,7 @@ function reconstruct!(
             # Forward project with subset geometry → subset_Ax_buf
             ax_view = view(ws.subset_Ax_buf, :, :, 1:n_sub)
             fill!(ax_view, zero(T))
-            siddon_forward_project!(
+            dd_forward_project!(
                 ax_view, ws.volume, geom_s;
                 ws_source_positions = ws.subset_geom_source_positions[s],
                 ws_detector_centers = ws.subset_geom_detector_centers[s],
diff --git a/src/correction/bhc_image_domain.jl b/src/correction/bhc_image_domain.jl
index 4fdb69b..f87cee1 100644
--- a/src/correction/bhc_image_domain.jl
+++ b/src/correction/bhc_image_domain.jl
@@ -107,7 +107,7 @@ function apply_bhc_image_domain(
     # Use geom's recon FOV (no volume_extent override) so this matches Step 3's
     # FDK back-projection grid exactly — same physical box on both halves of
     # the round-trip.
-    ξ_gpu = siddon_forward_project(high_atten_μ, geom)
+    ξ_gpu = dd_forward_project(high_atten_μ, geom)
 
     # Step 3: FDK-reconstruct the error sinogram → error image.
     error_image = fdk_reconstruct(ξ_gpu, geom, matrix_size)
diff --git a/src/correction/bhc_sinogram.jl b/src/correction/bhc_sinogram.jl
index 8abfb08..cb988b3 100644
--- a/src/correction/bhc_sinogram.jl
+++ b/src/correction/bhc_sinogram.jl
@@ -603,7 +603,7 @@ function apply_bhc_two_material(
         end
     end
 
-    p_b_gpu = siddon_forward_project(bone_μ_gpu, geom)
+    p_b_gpu = dd_forward_project(bone_μ_gpu, geom)
 
     p_s_gpu = similar(sino_water)
     copyto!(p_s_gpu, sino_water)
diff --git a/src/detector/photon_counting.jl b/src/detector/photon_counting.jl
index c2e6ec7..1207b6e 100644
--- a/src/detector/photon_counting.jl
+++ b/src/detector/photon_counting.jl
@@ -495,7 +495,7 @@ function pcct_forward_project(
 
             if has_src
                 copyto!(bt_sub, @view ws_source_spectral[:, :, ts:te])
-                siddon_fused_spectral_project!(
+                dd_fused_spectral_project!(
                     _pilot, _outputs_flat, Int32(n_bins), mask, _proj_geom,
                     μ_sub, W_sub, Val(TILE_K), Int32(1);
                     volume_extent=volume_extent,
@@ -503,7 +503,7 @@ function pcct_forward_project(
                     ws_detector_u=_ws_u, ws_detector_v=_ws_v,
                     ws_bowtie_spectral=bt_sub)
             else
-                siddon_fused_spectral_project!(
+                dd_fused_spectral_project!(
                     _pilot, _outputs_flat, Int32(n_bins), mask, _proj_geom,
                     μ_sub, W_sub, Val(TILE_K), Int32(1);
                     volume_extent=volume_extent,
@@ -636,7 +636,7 @@ function pcct_forward_project(
 
         # Forward project at this energy (native or binned resolution)
         fill!(sino_buf, zero(T))
-        siddon_forward_project!(sino_buf, μ_volume, proj_geom;
+        dd_forward_project!(sino_buf, μ_volume, proj_geom;
             ws_source_positions=_ws_src,
             ws_detector_centers=_ws_det,
             ws_detector_u=_ws_u,
diff --git a/src/projection/dd.jl b/src/projection/dd.jl
new file mode 100644
index 0000000..fe967d4
--- /dev/null
+++ b/src/projection/dd.jl
@@ -0,0 +1,564 @@
+# =============================================================================
+# Distance-Driven (DD) Forward Projection — AcceleratedKernels.jl
+# =============================================================================
+#
+# Julia / AcceleratedKernels.jl port of the **distance-driven** 3D forward
+# projector from CatSim / XCIST (gecatsim), specifically
+# `clib_build/src/DD3Proj_roi_notrans_mm.cpp` (the physical-mm, no-transpose
+# variant).  Upstream is GE Precision HealthCare, Apache-2.0 / XCIST license
+# (https://github.com/xcist/main).  Modifications relative to the original
+# C++ are Copyright (c) 2025 MolloiLab and contributors (MIT, see ../../LICENSE).
+#
+# Reference: De Man B, Basu S. "Distance-driven projection and backprojection
+# in three dimensions." Phys Med Biol. 2004;49(11):2463-2475.
+# doi:10.1088/0031-9155/49/11/024
+#
+# Why a second projector
+# ----------------------
+# `siddon.jl` traces each ray independently (ray-driven).  Distance-driven
+# instead maps image-voxel boundaries and detector-cell boundaries onto a
+# common iso-plane (perpendicular to the dominant ray axis) and integrates
+# their *overlap*.  This is the algorithm CatSim/XCIST uses and gives cleaner
+# adjoint / lower high-frequency artifact behaviour than Siddon.
+#
+# Drop-in contract
+# ----------------
+# The four public entry points mirror the Siddon signatures and semantics
+# EXACTLY, so every downstream consumer (BHC, IR, polychromatic EI,
+# photon-counting PCCT, FDK) works unchanged:
+#
+#   dd_forward_project! / dd_forward_project   ←→ siddon_forward_project! / …
+#   dd_fused_poly_project!                     ←→ siddon_fused_poly_project!
+#   dd_fused_spectral_project!                 ←→ siddon_fused_spectral_project!
+#
+# Output-stationary (gather) reformulation
+# ----------------------------------------
+# The upstream C is image-stationary *scatter* (loops image rows, accumulates
+# into a shared per-view detector buffer — a reduction).  We reformulate it
+# output-stationary: ONE thread per output sinogram cell gathers the
+# distance-driven overlap of its own back-projected footprint.  This is
+# faithful DD (only the floating-point summation order differs) and maps onto
+# `AK.foreachindex(sinogram)` exactly like Siddon — no atomics, no races,
+# portable to Metal / CUDA / ROCm / CPU.
+#
+# Per-view axis choice (DD3 `vertical`): integrate along whichever world axis
+# the source most aligns with (|s_y| ≥ |s_x| → integrate along y, else along x),
+# keeping the boundary→iso-plane mapping well conditioned.  Each image row's
+# contributing voxel range is bounded by inverse projection so per-cell work is
+# O(n_rows × footprint), comparable to a Siddon ray walk.
+#
+# The fused poly/spectral kernels reuse Siddon's `@generated` NTuple helpers
+# (`_fused_accum_energies`, `_fused_beer_lambert[_bt]`, `_tiled_*`) verbatim —
+# only the per-voxel weight changes from Siddon's chord `path_length` to DD's
+# overlap `ox·oz·norm`.
+#
+# Assumptions (match the DD3_mm variant)
+# --------------------------------------
+# - Flat detector, detector_v = ẑ (BasisSimulator's CTGeometry construction).
+# - Isotropic in-plane voxels (vox_x ≈ vox_y); the DD3_mm obliquity scaling
+#   uses a single in-plane voxel size.  Enforced with a warning.
+# - Volume centred at isocentre with physical extent `volume_extent` (else
+#   `geom.fov`), identical to Siddon.
+#
+# =============================================================================
+
+import AcceleratedKernels as AK
+
+export dd_forward_project!, dd_forward_project,
+    dd_fused_poly_project!, dd_fused_spectral_project!
+
+# NOTE (dd-raytracing branch): distance-driven is THE projector here.  The
+# internal pipeline (`_forward_project_poly!`, `pcct_forward_project`, BHC, IR,
+# IR-recon) calls the `dd_*` functions below directly, exactly as it used to
+# call `siddon_*` — same signatures, so no downstream/notebook code changed.
+# `siddon.jl` is retained only for reference/benchmarking; on merge it is
+# expected to be deleted and these become the sole projector.
+
+# -----------------------------------------------------------------------------
+# Overlap length of two 1-D intervals [a_lo, a_hi] ∩ [b_lo, b_hi].
+# -----------------------------------------------------------------------------
+@inline function _dd_overlap(a_lo::T, a_hi::T, b_lo::T, b_hi::T) where {T}
+    lo = max(a_lo, b_lo)
+    hi = min(a_hi, b_hi)
+    return hi > lo ? hi - lo : zero(T)
+end
+
+# -----------------------------------------------------------------------------
+# Contributing voxel-index range [i_start, i_end] along one axis: the voxels
+# whose projected boundaries straddle [lo, hi] on the iso-plane.  Inverse of
+#   pos(b) = s0 + (vmin + b*vstep - s0) * mag_fac   (boundary index b, 0-based)
+# Widened by ±1 for float safety; non-overlapping extras are filtered by the
+# `_dd_overlap > 0` test in the loop.
+# -----------------------------------------------------------------------------
+@inline function _dd_bounds(
+        lo::T, hi::T, s0::T, vmin::T, vstep::T, inv_mf::T, n::Int32,
+    ) where {T}
+    c_lo = ((lo - s0) * inv_mf - (vmin - s0)) / vstep
+    c_hi = ((hi - s0) * inv_mf - (vmin - s0)) / vstep
+    c_lo = clamp(c_lo, zero(T), T(n) + one(T))
+    c_hi = clamp(c_hi, zero(T), T(n) + one(T))
+    i_start = max(Int32(1), unsafe_trunc(Int32, floor(c_lo)))
+    i_end = min(n, unsafe_trunc(Int32, ceil(c_hi)) + Int32(1))
+    return (i_start, i_end)
+end
+
+# -----------------------------------------------------------------------------
+# Per-cell distance-driven geometry: maps the detector cell (col,row) onto the
+# iso-plane and returns the footprint + normalisation shared by every DD kernel.
+# Returns a flat tuple (compiler-inlined): the value-tuple plus a `valid` flag.
+# -----------------------------------------------------------------------------
+@inline function _dd_cell_setup(
+        col::Int32, row::Int32,
+        sx::T, sy::T, sz::T,
+        dcx::T, dcy::T, dcz::T, ux::T, uy::T, vvz::T,
+        mag::T, ps::T, prs::T, col_center::T, row_center::T,
+        nx::Int32, ny::Int32,
+        vmin_x::T, vmin_y::T, vx::T, vy::T,
+    ) where {T}
+
+    eps = T(1.0e-12)
+    half = T(0.5)
+
+    # detector COLUMN boundaries (transaxial), world (x,y) at detector mid-row
+    uL = (T(col) - half - col_center) * ps * mag
+    uU = (T(col) + half - col_center) * ps * mag
+    bxL = dcx + uL * ux;  byL = dcy + uL * uy
+    bxU = dcx + uU * ux;  byU = dcy + uU * uy
+
+    # integration axis = dominant source component
+    vertical = abs(sy) >= abs(sx)
+    if vertical
+        s_long = sy;  s_tran = sx
+        dtL = bxL;  dlL = byL;  dtU = bxU;  dlU = byU
+        n_t = nx;  v_t = vx;  vmin_t = vmin_x
+        n_long = ny;  v_long = vy;  vmin_long = vmin_y
+    else
+        s_long = sx;  s_tran = sy
+        dtL = byL;  dlL = bxL;  dtU = byU;  dlU = bxU
+        n_t = ny;  v_t = vy;  vmin_t = vmin_y
+        n_long = nx;  v_long = vx;  vmin_long = vmin_x
+    end
+
+    # project the two column boundaries onto the iso-plane (long = 0)
+    scaleL = s_long / (s_long - dlL)
+    scaleU = s_long / (s_long - dlU)
+    dXa = s_tran + (dtL - s_tran) * scaleL
+    dXb = s_tran + (dtU - s_tran) * scaleU
+    dXlo = min(dXa, dXb);  dXhi = max(dXa, dXb)
+    detXstep = dXhi - dXlo
+    deltaT = half * (dXa + dXb) - s_tran
+    scale_col = half * (scaleL + scaleU)
+
+    # detector ROW boundaries (axial), world z, projected to iso-plane
+    wL = (T(row) - half - row_center) * prs * mag
+    wU = (T(row) + half - row_center) * prs * mag
+    zdL = dcz + wL * vvz
+    zdU = dcz + wU * vvz
+    dZa = sz + scale_col * (zdL - sz)
+    dZb = sz + scale_col * (zdU - sz)
+    dZlo = min(dZa, dZb);  dZhi = max(dZa, dZb)
+    detZstep = dZhi - dZlo
+    deltaZ = half * (dZa + dZb) - sz
+
+    valid = detXstep > eps && detZstep > eps
+    invCos = sqrt(s_long * s_long + deltaT * deltaT + deltaZ * deltaZ) /
+        abs(s_long) * v_long
+    norm = valid ? invCos / (detXstep * detZstep) : zero(T)
+
+    return (valid, vertical, s_long, s_tran, dXlo, dXhi, dZlo, dZhi, norm,
+        n_t, v_t, vmin_t, n_long, v_long, vmin_long)
+end
+
+# -----------------------------------------------------------------------------
+# Single detector-cell mono line integral (Σμ·l), gather form.
+# -----------------------------------------------------------------------------
+@inline function _dd_trace_cell(
+        volume::AbstractArray{T, 3},
+        col::Int32, row::Int32,
+        sx::T, sy::T, sz::T,
+        dcx::T, dcy::T, dcz::T, ux::T, uy::T, vvz::T,
+        mag::T, ps::T, prs::T, col_center::T, row_center::T,
+        nx::Int32, ny::Int32, nz::Int32,
+        vmin_x::T, vmin_y::T, vmin_z::T, vx::T, vy::T, vz::T,
+    ) where {T}
+
+    (valid, vertical, s_long, s_tran, dXlo, dXhi, dZlo, dZhi, norm,
+        n_t, v_t, vmin_t, n_long, v_long, vmin_long) = _dd_cell_setup(
+        col, row, sx, sy, sz, dcx, dcy, dcz, ux, uy, vvz,
+        mag, ps, prs, col_center, row_center, nx, ny, vmin_x, vmin_y, vx, vy)
+
+    valid || return zero(T)
+
+    accum = zero(T)
+    il = Int32(1)
+    while il <= n_long
+        lp = vmin_long + (T(il) - T(0.5)) * v_long
+        mag_fac = s_long / (s_long - lp)
+        inv_mf = one(T) / mag_fac
+
+        (it_s, it_e) = _dd_bounds(dXlo, dXhi, s_tran, vmin_t, v_t, inv_mf, n_t)
+        (ip_s, ip_e) = _dd_bounds(dZlo, dZhi, sz, vmin_z, vz, inv_mf, nz)
+
+        it = it_s
+        while it <= it_e
+            t0 = s_tran + (vmin_t + T(it - Int32(1)) * v_t - s_tran) * mag_fac
+            t1 = s_tran + (vmin_t + T(it) * v_t - s_tran) * mag_fac
+            ox = _dd_overlap(dXlo, dXhi, min(t0, t1), max(t0, t1))
+            if ox > zero(T)
+                ixv = vertical ? it : il
+                iyv = vertical ? il : it
+                ip = ip_s
+                while ip <= ip_e
+                    z0 = sz + (vmin_z + T(ip - Int32(1)) * vz - sz) * mag_fac
+                    z1 = sz + (vmin_z + T(ip) * vz - sz) * mag_fac
+                    oz = _dd_overlap(dZlo, dZhi, min(z0, z1), max(z0, z1))
+                    if oz > zero(T)
+                        accum += ox * oz * volume[ixv, iyv, ip]
+                    end
+                    ip += Int32(1)
+                end
+            end
+            it += Int32(1)
+        end
+        il += Int32(1)
+    end
+
+    return norm * accum
+end
+
+# -----------------------------------------------------------------------------
+# Helper: build a typed geometry array on the same backend as `ref`.
+# -----------------------------------------------------------------------------
+@inline function _dd_geom_array(ws, ref, src, ::Type{T}) where {T}
+    ws !== nothing && return ws
+    a = similar(ref, T, size(src)...)
+    copyto!(a, T.(src))
+    return a
+end
+
+# =============================================================================
+# Mono: in-place + allocating — mirror siddon_forward_project! / …project
+# =============================================================================
+"""
+    dd_forward_project!(sinogram, volume, geom; volume_extent=nothing, ...) -> sinogram
+
+Distance-driven forward projection (CatSim/XCIST DD3, gather form).  Drop-in
+replacement for [`siddon_forward_project!`](@ref): same arguments, same
+`[n_cols, n_rows, n_angles]` sinogram, same cm units and Σμ·l output.
+"""
+function dd_forward_project!(
+        sinogram::AbstractArray{T, 3},
+        volume::AbstractArray{T, 3},
+        geom::CTGeometry;
+        ws_source_positions = nothing,
+        ws_detector_centers = nothing,
+        ws_detector_u = nothing,
+        ws_detector_v = nothing,
+        volume_extent::Union{Nothing, NTuple{3, Float64}} = nothing,
+    ) where {T <: AbstractFloat}
+
+    nx = Int32(size(volume, 1)); ny = Int32(size(volume, 2)); nz = Int32(size(volume, 3))
+    n_cols = Int32(size(sinogram, 1)); n_rows = Int32(size(sinogram, 2))
+
+    vol_bounds = volume_extent !== nothing ? volume_extent : geom.fov
+    vmin_x = T(-vol_bounds[1] / 2); vmin_y = T(-vol_bounds[2] / 2); vmin_z = T(-vol_bounds[3] / 2)
+    vx = T(vol_bounds[1]) / T(nx); vy = T(vol_bounds[2]) / T(ny); vz = T(vol_bounds[3]) / T(nz)
+    _dd_check_isotropy(vx, vy)
+
+    mag = T(geom.SDD / geom.SAD)
+    ps = T(geom.pixel_size); prs = T(geom.pixel_row_size)
+    col_center = (T(n_cols) + one(T)) / T(2)
+    row_center = (T(n_rows) + one(T)) / T(2)
+
+    sp = _dd_geom_array(ws_source_positions, volume, geom.source_positions, T)
+    dc = _dd_geom_array(ws_detector_centers, volume, geom.detector_centers, T)
+    du = _dd_geom_array(ws_detector_u, volume, geom.detector_u, T)
+    dv = _dd_geom_array(ws_detector_v, volume, geom.detector_v, T)
+
+    AK.foreachindex(sinogram) do idx
+        idx_0 = Int32(idx - 1)
+        col = (idx_0 % n_cols) + Int32(1)
+        idx_0 = idx_0 ÷ n_cols
+        row = (idx_0 % n_rows) + Int32(1)
+        angle = (idx_0 ÷ n_rows) + Int32(1)
+
+        sinogram[idx] = _dd_trace_cell(
+            volume, col, row,
+            sp[1, angle], sp[2, angle], sp[3, angle],
+            dc[1, angle], dc[2, angle], dc[3, angle], du[1, angle], du[2, angle], dv[3, angle],
+            mag, ps, prs, col_center, row_center,
+            nx, ny, nz, vmin_x, vmin_y, vmin_z, vx, vy, vz,
+        )
+    end
+    return sinogram
+end
+
+"""
+    dd_forward_project(volume, geom; volume_extent=nothing) -> sinogram
+
+Allocating distance-driven forward projection.  Drop-in for
+[`siddon_forward_project`](@ref).
+"""
+function dd_forward_project(
+        volume::AbstractArray{T, 3},
+        geom::CTGeometry;
+        volume_extent::Union{Nothing, NTuple{3, Float64}} = nothing,
+    ) where {T <: AbstractFloat}
+    sinogram = similar(volume, T, geom.n_cols, geom.n_rows, geom.n_angles)
+    fill!(sinogram, zero(T))
+    return dd_forward_project!(sinogram, volume, geom; volume_extent = volume_extent)
+end
+
+@inline function _dd_check_isotropy(vx::T, vy::T) where {T}
+    if abs(vx - vy) > T(1.0e-4) * max(vx, vy)
+        @warn "dd projector: in-plane voxels not isotropic (vx=$vx, vy=$vy); \
+               DD3 obliquity scaling assumes vox_x ≈ vox_y — results may be biased." maxlog = 1
+    end
+    return nothing
+end
+
+# =============================================================================
+# Fused polychromatic — mirror siddon_fused_poly_project!
+# =============================================================================
+"""
+    dd_fused_poly_project!(sinogram, mask, geom, μ_table_gpu, wη_gpu, Val(N_E); kwargs...)
+
+Distance-driven analog of [`siddon_fused_poly_project!`](@ref): one gather pass
+per detector cell through the material `mask`, accumulating line integrals for
+all `N_E` energy bins, then Beer-Lambert `-log(Σ_e wη[e]·exp(-L_e))`.  Same
+arguments, output, and bowtie-spectral handling as the Siddon version.
+"""
+function dd_fused_poly_project!(
+        sinogram::AbstractArray{T, 3},
+        mask::AbstractArray{<:Unsigned, 3},
+        geom::CTGeometry,
+        μ_table_gpu::AbstractArray{T, 2},
+        wη_gpu::AbstractArray{T, 1},
+        ::Val{N_E};
+        volume_extent::Union{Nothing, NTuple{3, Float64}} = nothing,
+        ws_source_positions = nothing,
+        ws_detector_centers = nothing,
+        ws_detector_u = nothing,
+        ws_detector_v = nothing,
+        ws_bowtie_spectral = nothing,
+    ) where {T <: AbstractFloat, N_E}
+
+    nx = Int32(size(mask, 1)); ny = Int32(size(mask, 2)); nz = Int32(size(mask, 3))
+    n_cols = Int32(size(sinogram, 1)); n_rows = Int32(size(sinogram, 2))
+
+    vol_bounds = volume_extent !== nothing ? volume_extent : geom.fov
+    vmin_x = T(-vol_bounds[1] / 2); vmin_y = T(-vol_bounds[2] / 2); vmin_z = T(-vol_bounds[3] / 2)
+    vx = T(vol_bounds[1]) / T(nx); vy = T(vol_bounds[2]) / T(ny); vz = T(vol_bounds[3]) / T(nz)
+    _dd_check_isotropy(vx, vy)
+
+    mag = T(geom.SDD / geom.SAD)
+    ps = T(geom.pixel_size); prs = T(geom.pixel_row_size)
+    col_center = (T(n_cols) + one(T)) / T(2)
+    row_center = (T(n_rows) + one(T)) / T(2)
+
+    sp = _dd_geom_array(ws_source_positions, sinogram, geom.source_positions, T)
+    dc = _dd_geom_array(ws_detector_centers, sinogram, geom.detector_centers, T)
+    du = _dd_geom_array(ws_detector_u, sinogram, geom.detector_u, T)
+    dv = _dd_geom_array(ws_detector_v, sinogram, geom.detector_v, T)
+
+    nc_nr = n_cols * n_rows
+    has_bowtie = ws_bowtie_spectral !== nothing
+    _bt = has_bowtie ? ws_bowtie_spectral : similar(μ_table_gpu, T, 1, 1, 1)
+
+    let mask = mask, μ_tbl = μ_table_gpu, wη = wη_gpu,
+            sp = sp, dc = dc, du = du, dv = dv, bt = _bt,
+            vmx = vmin_x, vmy = vmin_y, vmz = vmin_z, vsx = vx, vsy = vy, vsz = vz,
+            nx = nx, ny = ny, nz = nz, nc = n_cols, nr = n_rows,
+            mag = mag, ps = ps, prs = prs, cc = col_center, rc = row_center,
+            ncnr = nc_nr, hbt = has_bowtie
+
+        AK.foreachindex(sinogram) do idx
+            idx_0 = Int32(idx - 1)
+            col = (idx_0 % nc) + Int32(1)
+            idx_0 = idx_0 ÷ nc
+            row = (idx_0 % nr) + Int32(1)
+            angle = (idx_0 ÷ nr) + Int32(1)
+
+            sx = sp[1, angle]; sy = sp[2, angle]; sz = sp[3, angle]
+            dcx = dc[1, angle]; dcy = dc[2, angle]; dcz = dc[3, angle]
+            ux = du[1, angle]; uy = du[2, angle]; vvz = dv[3, angle]
+
+            (valid, vertical, s_long, s_tran, dXlo, dXhi, dZlo, dZhi, norm,
+                n_t, v_t, vmin_t, n_long, v_long, vmin_long) = _dd_cell_setup(
+                col, row, sx, sy, sz, dcx, dcy, dcz, ux, uy, vvz,
+                mag, ps, prs, cc, rc, nx, ny, vmx, vmy, vsx, vsy)
+
+            accums = ntuple(_ -> zero(T), Val(N_E))
+            if valid
+                il = Int32(1)
+                while il <= n_long
+                    lp = vmin_long + (T(il) - T(0.5)) * v_long
+                    mag_fac = s_long / (s_long - lp)
+                    inv_mf = one(T) / mag_fac
+                    (it_s, it_e) = _dd_bounds(dXlo, dXhi, s_tran, vmin_t, v_t, inv_mf, n_t)
+                    (ip_s, ip_e) = _dd_bounds(dZlo, dZhi, sz, vmz, vsz, inv_mf, nz)
+                    it = it_s
+                    while it <= it_e
+                        t0 = s_tran + (vmin_t + T(it - Int32(1)) * v_t - s_tran) * mag_fac
+                        t1 = s_tran + (vmin_t + T(it) * v_t - s_tran) * mag_fac
+                        ox = _dd_overlap(dXlo, dXhi, min(t0, t1), max(t0, t1))
+                        if ox > zero(T)
+                            ixv = vertical ? it : il
+                            iyv = vertical ? il : it
+                            ip = ip_s
+                            while ip <= ip_e
+                                z0 = sz + (vmz + T(ip - Int32(1)) * vsz - sz) * mag_fac
+                                z1 = sz + (vmz + T(ip) * vsz - sz) * mag_fac
+                                oz = _dd_overlap(dZlo, dZhi, min(z0, z1), max(z0, z1))
+                                if oz > zero(T)
+                                    mat = Int32(mask[ixv, iyv, ip]) + Int32(1)
+                                    accums = _fused_accum_energies(accums, μ_tbl, mat, ox * oz * norm)
+                                end
+                                ip += Int32(1)
+                            end
+                        end
+                        it += Int32(1)
+                    end
+                    il += Int32(1)
+                end
+            end
+
+            I_total = if hbt
+                bt_base = Int32(col) + (Int32(row) - Int32(1)) * nc
+                _fused_beer_lambert_bt(accums, wη, bt, bt_base, ncnr)
+            else
+                _fused_beer_lambert(accums, wη)
+            end
+            sinogram[idx] = -log(max(I_total, T(1.0e-10)))
+        end
+    end
+    return sinogram
+end
+
+# =============================================================================
+# Fused spectral (PCCT, tiled) — mirror siddon_fused_spectral_project!
+# =============================================================================
+"""
+    dd_fused_spectral_project!(pilot, outputs_flat, n_bins, mask, geom,
+        μ_table_gpu, W_gpu, Val(K), tile_start; kwargs...)
+
+Distance-driven analog of [`siddon_fused_spectral_project!`](@ref): one gather
+pass per detector cell accumulating `K` energy bins (tile starting at
+`tile_start`), then ADDS per-bin partial Beer-Lambert sums to `outputs_flat`.
+Same tiled-accumulation contract as the Siddon version (zero-init before the
+first tile; tiles accumulate).
+"""
+function dd_fused_spectral_project!(
+        pilot::AbstractArray{T, 3},
+        outputs_flat::AbstractArray{T, 1},
+        n_bins::Int32,
+        mask::AbstractArray{<:Unsigned, 3},
+        geom::CTGeometry,
+        μ_table_gpu::AbstractArray{T, 2},
+        W_gpu::AbstractArray{T, 2},
+        ::Val{K},
+        tile_start::Int32;
+        volume_extent::Union{Nothing, NTuple{3, Float64}} = nothing,
+        ws_source_positions = nothing,
+        ws_detector_centers = nothing,
+        ws_detector_u = nothing,
+        ws_detector_v = nothing,
+        ws_bowtie_spectral = nothing,
+    ) where {T <: AbstractFloat, K}
+
+    nx = Int32(size(mask, 1)); ny = Int32(size(mask, 2)); nz = Int32(size(mask, 3))
+    n_cols = Int32(size(pilot, 1)); n_rows = Int32(size(pilot, 2)); n_angles = Int32(size(pilot, 3))
+    n_elem = n_cols * n_rows * n_angles
+
+    vol_bounds = volume_extent !== nothing ? volume_extent : geom.fov
+    vmin_x = T(-vol_bounds[1] / 2); vmin_y = T(-vol_bounds[2] / 2); vmin_z = T(-vol_bounds[3] / 2)
+    vx = T(vol_bounds[1]) / T(nx); vy = T(vol_bounds[2]) / T(ny); vz = T(vol_bounds[3]) / T(nz)
+    _dd_check_isotropy(vx, vy)
+
+    mag = T(geom.SDD / geom.SAD)
+    ps = T(geom.pixel_size); prs = T(geom.pixel_row_size)
+    col_center = (T(n_cols) + one(T)) / T(2)
+    row_center = (T(n_rows) + one(T)) / T(2)
+
+    sp = _dd_geom_array(ws_source_positions, mask, geom.source_positions, T)
+    dc = _dd_geom_array(ws_detector_centers, mask, geom.detector_centers, T)
+    du = _dd_geom_array(ws_detector_u, mask, geom.detector_u, T)
+    dv = _dd_geom_array(ws_detector_v, mask, geom.detector_v, T)
+
+    nc_nr = n_cols * n_rows
+    has_src_spectral = ws_bowtie_spectral !== nothing
+    _bt = has_src_spectral ? ws_bowtie_spectral : similar(μ_table_gpu, T, 1, 1, 1)
+
+    let mask = mask, μ_tbl = μ_table_gpu, W = W_gpu, ts = tile_start,
+            sp = sp, dc = dc, du = du, dv = dv, bt = _bt,
+            vmx = vmin_x, vmy = vmin_y, vmz = vmin_z, vsx = vx, vsy = vy, vsz = vz,
+            nx = nx, ny = ny, nz = nz, nc = n_cols, nr = n_rows,
+            ne = n_elem, nb = n_bins, oflat = outputs_flat,
+            mag = mag, ps = ps, prs = prs, cc = col_center, rc = row_center,
+            ncnr = nc_nr, hbt = has_src_spectral
+
+        AK.foreachindex(pilot) do idx
+            idx_0 = Int32(idx - 1)
+            col = (idx_0 % nc) + Int32(1)
+            idx_0 = idx_0 ÷ nc
+            row = (idx_0 % nr) + Int32(1)
+            angle = (idx_0 ÷ nr) + Int32(1)
+
+            sx = sp[1, angle]; sy = sp[2, angle]; sz = sp[3, angle]
+            dcx = dc[1, angle]; dcy = dc[2, angle]; dcz = dc[3, angle]
+            ux = du[1, angle]; uy = du[2, angle]; vvz = dv[3, angle]
+
+            (valid, vertical, s_long, s_tran, dXlo, dXhi, dZlo, dZhi, norm,
+                n_t, v_t, vmin_t, n_long, v_long, vmin_long) = _dd_cell_setup(
+                col, row, sx, sy, sz, dcx, dcy, dcz, ux, uy, vvz,
+                mag, ps, prs, cc, rc, nx, ny, vmx, vmy, vsx, vsy)
+
+            accums = ntuple(_ -> zero(T), Val(K))
+            if valid
+                il = Int32(1)
+                while il <= n_long
+                    lp = vmin_long + (T(il) - T(0.5)) * v_long
+                    mag_fac = s_long / (s_long - lp)
+                    inv_mf = one(T) / mag_fac
+                    (it_s, it_e) = _dd_bounds(dXlo, dXhi, s_tran, vmin_t, v_t, inv_mf, n_t)
+                    (ip_s, ip_e) = _dd_bounds(dZlo, dZhi, sz, vmz, vsz, inv_mf, nz)
+                    it = it_s
+                    while it <= it_e
+                        t0 = s_tran + (vmin_t + T(it - Int32(1)) * v_t - s_tran) * mag_fac
+                        t1 = s_tran + (vmin_t + T(it) * v_t - s_tran) * mag_fac
+                        ox = _dd_overlap(dXlo, dXhi, min(t0, t1), max(t0, t1))
+                        if ox > zero(T)
+                            ixv = vertical ? it : il
+                            iyv = vertical ? il : it
+                            ip = ip_s
+                            while ip <= ip_e
+                                z0 = sz + (vmz + T(ip - Int32(1)) * vsz - sz) * mag_fac
+                                z1 = sz + (vmz + T(ip) * vsz - sz) * mag_fac
+                                oz = _dd_overlap(dZlo, dZhi, min(z0, z1), max(z0, z1))
+                                if oz > zero(T)
+                                    mat = Int32(mask[ixv, iyv, ip]) + Int32(1)
+                                    accums = _tiled_accum_energies(accums, μ_tbl, mat, ox * oz * norm, ts)
+                                end
+                                ip += Int32(1)
+                            end
+                        end
+                        it += Int32(1)
+                    end
+                    il += Int32(1)
+                end
+            end
+
+            if hbt
+                bt_base = Int32(col) + (Int32(row) - Int32(1)) * nc
+                for b in Int32(1):nb
+                    oflat[idx + (b - Int32(1)) * ne] += _tiled_beer_lambert_col_bt(accums, W, b, ts, bt, bt_base, ncnr)
+                end
+            else
+                for b in Int32(1):nb
+                    oflat[idx + (b - Int32(1)) * ne] += _tiled_beer_lambert_col(accums, W, b, ts)
+                end
+            end
+        end
+    end
+    return outputs_flat
+end
diff --git a/src/projection/polychromatic.jl b/src/projection/polychromatic.jl
index bf37c72..73c3db6 100644
--- a/src/projection/polychromatic.jl
+++ b/src/projection/polychromatic.jl
@@ -474,7 +474,7 @@ function _forward_project_poly!(
             _buf
         end
 
-        siddon_fused_poly_project!(
+        dd_fused_poly_project!(
             sinogram, mask, geom, ws_μ_table_gpu, wη_dev, Val(n_energies);
             volume_extent = volume_extent,
             ws_source_positions = ws_source_positions,
@@ -524,7 +524,7 @@ function _forward_project_poly!(
             end
 
             # Use proven-fast fused kernel with Val(16) — 95ms/tile on Metal
-            siddon_fused_poly_project!(
+            dd_fused_poly_project!(
                 sinogram, mask, geom,
                 μ_sub, wη_sub, Val(TILE_K);
                 volume_extent = volume_extent,
@@ -580,7 +580,7 @@ function _forward_project_poly!(
 
         # Forward project at this energy
         fill!(sino_mono, zero(T))
-        siddon_forward_project!(
+        dd_forward_project!(
             sino_mono, μ_volume, geom;
             ws_source_positions = ws_source_positions,
             ws_detector_centers = ws_detector_centers,
diff --git a/src/reconstruction/ir/utils.jl b/src/reconstruction/ir/utils.jl
index f2dd37e..3aa8fa5 100644
--- a/src/reconstruction/ir/utils.jl
+++ b/src/reconstruction/ir/utils.jl
@@ -183,7 +183,7 @@ function compute_projection_weights(
     ::Type{T}
 ) where T <: AbstractFloat
     ones_volume = ones(T, volume_size...)
-    ray_sums = siddon_forward_project(ones_volume, geom)
+    ray_sums = dd_forward_project(ones_volume, geom)
     eps = T(1e-8)
     AK.foreachindex(ray_sums) do idx
         val = ray_sums[idx]
diff --git a/test/projection.jl b/test/projection.jl
index b3036f7..26cd4e2 100644
--- a/test/projection.jl
+++ b/test/projection.jl
@@ -81,6 +81,122 @@ end
     @test maximum(abs.(sino_alloc .- sino_inplace)) < 1.0e-4
 end
 
+# -----------------------------------------------------------------------------
+# dd_forward_project — distance-driven projector (drop-in for Siddon).
+# Must agree with Siddon on line integrals (both discretise ∫μ·dl) and be a
+# linear operator on μ.  Small edge differences (DD footprint vs Siddon chord)
+# are expected; we bound the mean relative deviation on rays that hit the object.
+# -----------------------------------------------------------------------------
+@testset "dd_forward_project (distance-driven)" begin
+    # Larger toy grid so the object spans many detector rays (fair DD-vs-Siddon stats).
+    geom = _toy_proj_geom(n_cols = 64, n_rows = 8, n_angles = 8, fov_cm = 20.0)
+    nx = ny = 64
+    nz = 8
+
+    @testset "shape + zero volume → zero sinogram" begin
+        vol = zeros(Float32, nx, ny, nz)
+        sino = BS.dd_forward_project(vol, geom)
+        @test size(sino) == (geom.n_cols, geom.n_rows, geom.n_angles)
+        @test eltype(sino) == Float32
+        @test all(sino .== 0)
+    end
+
+    # Centred water cylinder, μ = 0.2 cm⁻¹.
+    vol = zeros(Float32, nx, ny, nz)
+    cx = (nx + 1) / 2
+    cy = (ny + 1) / 2
+    R = 0.6 * (nx / 2)
+    for k in 1:nz, j in 1:ny, i in 1:nx
+        if (i - cx)^2 + (j - cy)^2 <= R^2
+            vol[i, j, k] = 0.2f0
+        end
+    end
+
+    sino_siddon = BS.siddon_forward_project(vol, geom)
+    sino_dd = BS.dd_forward_project(vol, geom)
+
+    @testset "agrees with Siddon on object rays" begin
+        mask = sino_siddon .> 1.0f-4
+        relΔ = abs.(sino_dd[mask] .- sino_siddon[mask]) ./ sino_siddon[mask]
+        @test count(mask) > 1000                 # object actually illuminated
+        @test sum(relΔ) / length(relΔ) < 0.01    # mean relative deviation < 1%
+        @test maximum(relΔ) < 0.05               # worst-ray deviation < 5%
+    end
+
+    @testset "in-place == allocating" begin
+        sino_ip = zeros(Float32, geom.n_cols, geom.n_rows, geom.n_angles)
+        BS.dd_forward_project!(sino_ip, vol, geom)
+        @test maximum(abs.(sino_ip .- sino_dd)) < 1.0e-5
+    end
+
+    @testset "linearity in μ — 2× scale doubles the sinogram" begin
+        sino_2x = BS.dd_forward_project(2 .* vol, geom)
+        @test maximum(abs.(sino_2x .- 2 .* sino_dd)) < 1.0e-4
+    end
+end
+
+# -----------------------------------------------------------------------------
+# dd_fused_poly_project! / dd_fused_spectral_project! — must agree with the
+# Siddon fused kernels (same mask / μ-table / weights) within the DD-vs-Siddon
+# discretisation tolerance.  These are the kernels the EI (polychromatic) and
+# PCCT (photon-counting) pipelines call.
+# -----------------------------------------------------------------------------
+@testset "dd fused projectors vs Siddon fused" begin
+    geom = _toy_proj_geom(n_cols = 64, n_rows = 8, n_angles = 8, fov_cm = 20.0)
+    nx = ny = 64
+    nz = 8
+    N_E = 8
+
+    # material mask: 0 = air, 1 = water cylinder
+    mask = zeros(UInt16, nx, ny, nz)
+    for k in 1:nz, j in 1:ny, i in 1:nx
+        if (i - 32.5)^2 + (j - 32.5)^2 <= (0.6 * 32)^2
+            mask[i, j, k] = UInt16(1)
+        end
+    end
+    μ_table = zeros(Float32, 2, N_E)             # row 1 = air (0), row 2 = water
+    for e in 1:N_E
+        μ_table[2, e] = 0.30f0 - 0.12f0 * (e - 1) / (N_E - 1)
+    end
+    wη = fill(Float32(1 / N_E), N_E)
+
+    @testset "fused poly" begin
+        sino_s = zeros(Float32, geom.n_cols, geom.n_rows, geom.n_angles)
+        sino_d = zeros(Float32, geom.n_cols, geom.n_rows, geom.n_angles)
+        BS.siddon_fused_poly_project!(sino_s, mask, geom, μ_table, wη, Val(N_E))
+        BS.dd_fused_poly_project!(sino_d, mask, geom, μ_table, wη, Val(N_E))
+        m = sino_s .> 1.0f-4
+        relΔ = abs.(sino_d[m] .- sino_s[m]) ./ sino_s[m]
+        @test count(m) > 1000
+        @test sum(relΔ) / length(relΔ) < 0.01      # mean within 1%
+        @test maximum(relΔ) < 0.05
+    end
+
+    @testset "fused spectral (PCCT, single tile)" begin
+        n_bins = Int32(2)
+        K = N_E
+        W = zeros(Float32, N_E, 2)
+        for e in 1:N_E
+            W[e, 1] = wη[e] * (1.0f0 - (e - 1) / (N_E - 1))
+            W[e, 2] = wη[e] * ((e - 1) / (N_E - 1))
+        end
+        pilot = zeros(Float32, geom.n_cols, geom.n_rows, geom.n_angles)
+        ne = length(pilot)
+        out_s = zeros(Float32, ne * 2)
+        out_d = zeros(Float32, ne * 2)
+        BS.siddon_fused_spectral_project!(pilot, out_s, n_bins, mask, geom, μ_table, W, Val(K), Int32(1))
+        BS.dd_fused_spectral_project!(pilot, out_d, n_bins, mask, geom, μ_table, W, Val(K), Int32(1))
+        for b in 1:2
+            seg = ((b - 1) * ne + 1):(b * ne)
+            bs = out_s[seg]
+            bd = out_d[seg]
+            mm = bs .> 1.0f-6
+            relb = abs.(bd[mm] .- bs[mm]) ./ bs[mm]
+            @test sum(relb) / length(relb) < 0.02   # bin mean within 2%
+        end
+    end
+end
+
 # -----------------------------------------------------------------------------
 # create_μ_volume! — label → material → μ at energy mapping.
 # -----------------------------------------------------------------------------