From 36b49d94fc87e26d34a1b08a4e6723e79687eee2 Mon Sep 17 00:00:00 2001
From: viiccwen <viiccwen@gmail.com>
Date: Wed, 8 Apr 2026 10:47:28 +0000
Subject: [PATCH 1/6] feat(kernel): add angle encoding kernel (f32)

---
 qdp/qdp-kernels/src/angle.cu | 55 ++++++++++++++++++++++++++++++++++++
 1 file changed, 55 insertions(+)
diff --git a/qdp/qdp-kernels/src/angle.cu b/qdp/qdp-kernels/src/angle.cu
index a3fd5668f4..2a2d129689 100644
--- a/qdp/qdp-kernels/src/angle.cu
+++ b/qdp/qdp-kernels/src/angle.cu
@@ -42,6 +42,24 @@ __global__ void angle_encode_kernel(
     state[idx] = make_cuDoubleComplex(amplitude, 0.0);
 }
 
+__global__ void angle_encode_kernel_f32(
+    const float* __restrict__ angles,  // one angle per qubit; entries [0, num_qubits) are read
+    cuComplex* __restrict__ state,     // output amplitudes; entries [0, state_len) are written
+    size_t state_len,
+    unsigned int num_qubits
+) {
+    size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;  // widen before multiply
+    if (idx >= state_len) return;
+    // Product state: qubit `bit` contributes sin(angle) when its bit of idx is set, else cos(angle).
+    float amplitude = 1.0f;
+    for (unsigned int bit = 0; bit < num_qubits; ++bit) {
+        float angle = angles[bit];
+        amplitude *= ((idx >> bit) & 1U) ? sinf(angle) : cosf(angle);
+    }
+    // The encoded amplitude is purely real, so the imaginary part is written as zero.
+    state[idx] = make_cuComplex(amplitude, 0.0f);
+}
+
 __global__ void angle_encode_batch_kernel(
     const double* __restrict__ angles_batch,
     cuDoubleComplex* __restrict__ state_batch,
@@ -109,6 +127,43 @@ int launch_angle_encode(
     return (int)cudaGetLastError();
 }
 
+/// Launch angle encoding kernel for float32 input
+///
+/// # Arguments
+/// * angles_d - Device pointer to num_qubits float32 angles (one per qubit)
+/// * state_d - Device pointer to output state vector (cuComplex, state_len elements)
+/// * state_len - Target state vector size (2^num_qubits)
+/// * num_qubits - Number of qubits (angles length)
+/// * stream - CUDA stream for async execution (nullptr = default stream)
+///
+/// # Returns
+/// CUDA error code (0 = cudaSuccess); cudaErrorInvalidValue if state_len or num_qubits is 0
+int launch_angle_encode_f32(
+    const float* angles_d,
+    void* state_d,
+    size_t state_len,
+    unsigned int num_qubits,
+    cudaStream_t stream
+) {
+    if (state_len == 0 || num_qubits == 0) {
+        return cudaErrorInvalidValue;
+    }
+    // The state arrives as void* across the FFI boundary; the kernel writes cuComplex values.
+    cuComplex* state_complex_d = static_cast<cuComplex*>(state_d);
+
+    const int blockSize = DEFAULT_BLOCK_SIZE;
+    const int gridSize = (state_len + blockSize - 1) / blockSize;  // ceil(state_len / blockSize)
+
+    angle_encode_kernel_f32<<<gridSize, blockSize, 0, stream>>>(
+        angles_d,
+        state_complex_d,
+        state_len,
+        num_qubits
+    );
+    // No synchronization happens here, so this reports launch-time failures only.
+    return (int)cudaGetLastError();
+}
+
 /// Launch batch angle encoding kernel
 ///
 /// # Arguments

From 1f1d8d0aca8920f40f41982847da1a8c50597ef0 Mon Sep 17 00:00:00 2001
From: viiccwen <viiccwen@gmail.com>
Date: Wed, 8 Apr 2026 10:48:41 +0000
Subject: [PATCH 2/6] feat: add angle encoding from GPU pointer (f32)

---
 qdp/qdp-core/src/gpu/encodings/angle.rs | 59 +++++++++++++++++++++++++
 qdp/qdp-core/src/lib.rs                 | 57 ++++++++++++++++++++++++
 qdp/qdp-kernels/src/lib.rs              | 25 +++++++++++
 3 files changed, 141 insertions(+)

diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 24ed50a6dc..34de8e31f9 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -401,6 +401,65 @@ impl QuantumEncoder for AngleEncoder {
 }
 
 impl AngleEncoder {
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
+        device: &Arc<CudaDevice>,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if input_len == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Input data cannot be empty".into(),
+            ));
+        }
+        if input_len != num_qubits {
+            return Err(MahoutError::InvalidInput(format!(
+                "Angle encoding expects {} values (one per qubit), got {}",
+                num_qubits, input_len
+            )));
+        }
+
+        let state_len = 1 << num_qubits;
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            GpuStateVector::new(device, num_qubits, Precision::Float32)?
+        };
+        let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float32 buffer)".to_string(),
+            )
+        })?;
+        // Launch on `stream`; a non-zero return value is a CUDA error code from the launcher.
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_angle_encode_f32(
+                    input_d,
+                    state_ptr as *mut c_void,
+                    state_len,
+                    num_qubits as u32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Angle encoding kernel (f32) failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+        // Sync before returning; presumably blocks until work queued on `stream` has finished.
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+
+        Ok(state_vector)
+    }
+
     #[cfg(target_os = "linux")]
     fn encode_batch_async_pipeline(
         device: &Arc<CudaDevice>,
diff --git a/qdp/qdp-core/src/lib.rs b/qdp/qdp-core/src/lib.rs
index 0153f8719f..3db9accdc1 100644
--- a/qdp/qdp-core/src/lib.rs
+++ b/qdp/qdp-core/src/lib.rs
@@ -639,6 +639,63 @@ impl QdpEngine {
         Ok(state_vector.to_dlpack())
     }
 
+    /// Encode angle from existing GPU pointer (float32 input only).
+    ///
+    /// Zero-copy encoding from CUDA float32 tensors. Uses the default CUDA stream.
+    /// For stream interop use `encode_angle_from_gpu_ptr_f32_with_stream`.
+    ///
+    /// # Safety
+    /// The input pointer must:
+    /// - Point to valid GPU memory on the same device as the engine
+    /// - Contain at least `input_len` f32 elements
+    /// - Remain valid for the duration of this call
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_angle_from_gpu_ptr_f32(
+        &self,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+    ) -> Result<*mut DLManagedTensor> {
+        unsafe {
+            self.encode_angle_from_gpu_ptr_f32_with_stream(
+                input_d,
+                input_len,
+                num_qubits,
+                std::ptr::null_mut(), // null stream = default CUDA stream
+            )
+        }
+    }
+
+    /// Encode angle from existing GPU pointer (float32) on a specified CUDA stream.
+    ///
+    /// # Safety
+    /// In addition to the `encode_angle_from_gpu_ptr_f32` requirements, the stream pointer
+    /// must remain valid for the duration of this call.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_angle_from_gpu_ptr_f32_with_stream(
+        &self,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::EncodeAngleFromGpuPtrF32");
+        // Pointer validation runs first; the length checks happen inside the encoder.
+        validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
+
+        let state_vector = unsafe {
+            gpu::AngleEncoder::encode_from_gpu_ptr_f32_with_stream(
+                &self.device,
+                input_d,
+                input_len,
+                num_qubits,
+                stream,
+            )
+        }?; // the encoder always allocates a Float32 state
+        let state_vector = state_vector.to_precision(&self.device, self.precision)?;
+        Ok(state_vector.to_dlpack())
+    }
+
     /// Encode a batch from an existing GPU pointer (float32 input, amplitude encoding only).
     ///
     /// Zero-copy batch encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.
diff --git a/qdp/qdp-kernels/src/lib.rs b/qdp/qdp-kernels/src/lib.rs
index e5ad2b69fd..4649092863 100644
--- a/qdp/qdp-kernels/src/lib.rs
+++ b/qdp/qdp-kernels/src/lib.rs
@@ -227,6 +227,19 @@ unsafe extern "C" {
         stream: *mut c_void,
     ) -> i32;
 
+    /// Launch angle encoding kernel for float32 inputs.
+    /// Returns CUDA error code (0 = success)
+    ///
+    /// # Safety
+    /// Requires valid GPU pointers, must sync before freeing
+    pub fn launch_angle_encode_f32(
+        angles_d: *const f32, // device buffer holding `num_qubits` angles
+        state_d: *mut c_void, // device buffer of `state_len` single-precision complex values
+        state_len: usize,
+        num_qubits: u32,
+        stream: *mut c_void, // null selects the default CUDA stream
+    ) -> i32;
+
     /// Launch batch angle encoding kernel
     /// Returns CUDA error code (0 = success)
     ///
@@ -458,6 +471,18 @@ pub extern "C" fn launch_angle_encode(
     999
 }
 
+#[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
+#[unsafe(no_mangle)]
+pub extern "C" fn launch_angle_encode_f32(
+    _angles_d: *const f32,
+    _state_d: *mut c_void,
+    _state_len: usize,
+    _num_qubits: u32,
+    _stream: *mut c_void,
+) -> i32 {
+    999 // non-zero stub result: this build configuration has no CUDA kernel to launch
+}
+
 #[cfg(any(not(target_os = "linux"), qdp_no_cuda))]
 #[unsafe(no_mangle)]
 pub extern "C" fn launch_angle_encode_batch(

From e8d2596e2ac361da7382fa5dffa6ec83934b6c91 Mon Sep 17 00:00:00 2001
From: viiccwen <viiccwen@gmail.com>
Date: Wed, 8 Apr 2026 10:49:02 +0000
Subject: [PATCH 3/6] test: add unit tests for angle encoding

---
 qdp/qdp-core/tests/gpu_ptr_encoding.rs | 148 ++++++++++++++++++++++++
 qdp/qdp-kernels/tests/angle_encode.rs  | 149 +++++++++++++++++++++++++
 2 files changed, 297 insertions(+)
 create mode 100644 qdp/qdp-kernels/tests/angle_encode.rs

diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index 470402727d..ecb1ec6462 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -892,6 +892,154 @@ fn test_encode_from_gpu_ptr_f32_input_exceeds_state_len() {
     }
 }
 
+#[test] // Happy path on the default stream: 2 angles for 2 qubits must encode successfully.
+fn test_encode_angle_from_gpu_ptr_f32_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+            .expect("encode_angle_from_gpu_ptr_f32")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 4) };
+}
+
+#[test] // Same happy path, but launched on a stream forked from the device's default stream.
+fn test_encode_angle_from_gpu_ptr_f32_with_stream_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let stream = device.fork_default_stream().expect("fork_default_stream");
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_from_gpu_ptr_f32_with_stream(
+                *input_d.device_ptr() as *const f32,
+                input_d.len(),
+                2,
+                stream.stream as *mut c_void,
+            )
+            .expect("encode_angle_from_gpu_ptr_f32_with_stream")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 4) };
+}
+
+#[test] // f32 input through a Float64 engine: covers the precision conversion after encoding.
+fn test_encode_angle_from_gpu_ptr_f32_success_f64_engine() {
+    let Some(engine) = common::qdp_engine_with_precision(Precision::Float64) else {
+        println!("SKIP: No GPU");
+        return;
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+            .expect("encode_angle_from_gpu_ptr_f32 (Float64 engine)")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 4) };
+}
+
+#[test] // input_len == 0 must be rejected as InvalidInput even though the pointer is valid.
+fn test_encode_angle_from_gpu_ptr_f32_empty_input() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(ptr, 0, 1) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("empty") || msg.contains("null"));
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test] // A null device pointer must be rejected as InvalidInput with a message naming "null".
+fn test_encode_angle_from_gpu_ptr_f32_null_pointer() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(std::ptr::null(), 2, 2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("null")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test] // Two angles with num_qubits = 1 must fail the one-angle-per-qubit length check.
+fn test_encode_angle_from_gpu_ptr_f32_qubit_mismatch() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 1) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("expects 1 values") || msg.contains("got 2"));
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
 #[test]
 fn test_encode_batch_from_gpu_ptr_f32_success() {
     let engine = match engine_f32() {
diff --git a/qdp/qdp-kernels/tests/angle_encode.rs b/qdp/qdp-kernels/tests/angle_encode.rs
new file mode 100644
index 0000000000..f42b3cd50a
--- /dev/null
+++ b/qdp/qdp-kernels/tests/angle_encode.rs
@@ -0,0 +1,149 @@
+//
+// Licensed to the Apache Software Foundation (ASF) under one or more
+// contributor license agreements.  See the NOTICE file distributed with
+// this work for additional information regarding copyright ownership.
+// The ASF licenses this file to You under the Apache License, Version 2.0
+// (the "License"); you may not use this file except in compliance with
+// the License.  You may obtain a copy of the License at
+//
+//    http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Tests for angle encoding CUDA kernels.
+
+#![allow(unused_unsafe)]
+
+#[cfg(target_os = "linux")]
+use cudarc::driver::{CudaDevice, DevicePtr, DevicePtrMut};
+#[cfg(target_os = "linux")]
+use qdp_kernels::{CuComplex, launch_angle_encode_f32};
+
+const EPSILON_F32: f32 = 1e-5;
+
+#[test] // Angles (pi/2, 0): the whole amplitude must land on basis index 1.
+#[cfg(target_os = "linux")]
+fn test_angle_encode_basic_f32() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input = vec![std::f32::consts::FRAC_PI_2, 0.0_f32];
+    let state_len = 4usize;
+
+    let input_d = device.htod_copy(input).unwrap();
+    let mut state_d = device.alloc_zeros::<CuComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_angle_encode_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            state_len,
+            2,
+            std::ptr::null_mut(),
+        )
+    };
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+    let expected = [0.0_f32, 1.0_f32, 0.0_f32, 0.0_f32]; // idx 1 = sin(pi/2) * cos(0) = 1
+
+    for (idx, (actual, expected)) in state_h.iter().zip(expected.iter()).enumerate() {
+        assert!(
+            (actual.x - expected).abs() < EPSILON_F32,
+            "state[{idx}].x expected {expected}, got {}",
+            actual.x
+        );
+        assert!(
+            actual.y.abs() < EPSILON_F32,
+            "state[{idx}].y expected 0, got {}",
+            actual.y
+        );
+    }
+}
+
+#[test] // Compares every amplitude with the host-computed cos/sin product (qubit 0 = LSB).
+#[cfg(target_os = "linux")]
+fn test_angle_encode_matches_expected_product_state_f32() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let angles = vec![0.3_f32, 0.7_f32];
+    let state_len = 4usize;
+
+    let input_d = device.htod_copy(angles.clone()).unwrap();
+    let mut state_d = device.alloc_zeros::<CuComplex>(state_len).unwrap();
+
+    let result = unsafe {
+        launch_angle_encode_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            state_len,
+            2,
+            std::ptr::null_mut(),
+        )
+    };
+    assert_eq!(result, 0, "Kernel launch should succeed");
+
+    let state_h = device.dtoh_sync_copy(&state_d).unwrap();
+    let expected = [
+        angles[0].cos() * angles[1].cos(),
+        angles[0].sin() * angles[1].cos(),
+        angles[0].cos() * angles[1].sin(),
+        angles[0].sin() * angles[1].sin(),
+    ];
+
+    for (idx, (actual, expected)) in state_h.iter().zip(expected.iter()).enumerate() {
+        assert!(
+            (actual.x - expected).abs() < EPSILON_F32,
+            "state[{idx}].x expected {expected}, got {}",
+            actual.x
+        );
+        assert!(
+            actual.y.abs() < EPSILON_F32,
+            "state[{idx}].y expected 0, got {}",
+            actual.y
+        );
+    }
+}
+
+#[test] // num_qubits = 0 must make the launcher return a non-zero error code.
+#[cfg(target_os = "linux")]
+fn test_angle_encode_f32_rejects_zero_qubits() {
+    let device = match CudaDevice::new(0) {
+        Ok(d) => d,
+        Err(_) => {
+            println!("SKIP: No CUDA device available");
+            return;
+        }
+    };
+
+    let input = vec![0.0_f32];
+    let input_d = device.htod_copy(input).unwrap();
+    let mut state_d = device.alloc_zeros::<CuComplex>(1).unwrap();
+
+    let result = unsafe {
+        launch_angle_encode_f32(
+            *input_d.device_ptr() as *const f32,
+            *state_d.device_ptr_mut() as *mut std::ffi::c_void,
+            1,
+            0,
+            std::ptr::null_mut(),
+        )
+    };
+
+    assert_ne!(result, 0, "Zero-qubit launch should fail");
+}

From bddfbff6aeb20d8565cd06fc0804e40421e95760 Mon Sep 17 00:00:00 2001
From: viiccwen <viiccwen@gmail.com>
Date: Wed, 8 Apr 2026 14:23:55 +0000
Subject: [PATCH 4/6] fix: pre-commit error

---
 qdp/qdp-core/src/gpu/encodings/angle.rs | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 34de8e31f9..9bc81914b6 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -402,6 +402,15 @@ impl QuantumEncoder for AngleEncoder {
 
 impl AngleEncoder {
     #[cfg(target_os = "linux")]
+    /// Encodes `input_len` angle values from a device-resident `f32` buffer into a GPU state
+    /// vector, using the provided CUDA stream for all launched work.
+    ///
+    /// # Safety
+    /// The caller must ensure that `input_d` points to at least `input_len` contiguous `f32`
+    /// values in GPU-accessible memory and remains valid for the duration of this call.
+    /// The caller must also ensure that `stream` is either null or a valid CUDA stream handle
+    /// associated with `device`, and that no concurrent use of these raw pointers violates Rust's
+    /// aliasing or lifetime rules.
     pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_d: *const f32,

From 2c8ac872c3e2623c4ccb71adc163c4a18fc32479 Mon Sep 17 00:00:00 2001
From: viiccwen <viiccwen@gmail.com>
Date: Sun, 12 Apr 2026 05:45:07 +0000
Subject: [PATCH 5/6] fix: add validation for qubit count in AngleEncoder
 methods

---
 qdp/qdp-core/src/gpu/encodings/angle.rs | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/qdp/qdp-core/src/gpu/encodings/angle.rs b/qdp/qdp-core/src/gpu/encodings/angle.rs
index 9bc81914b6..974a7b6c7a 100644
--- a/qdp/qdp-core/src/gpu/encodings/angle.rs
+++ b/qdp/qdp-core/src/gpu/encodings/angle.rs
@@ -232,6 +232,7 @@ impl QuantumEncoder for AngleEncoder {
                 num_qubits, input_len
             )));
         }
+        validate_qubit_count(num_qubits)?;
         let state_len = 1 << num_qubits;
         let angles_d = input_d as *const f64;
         let state_vector = {
@@ -290,6 +291,7 @@ impl QuantumEncoder for AngleEncoder {
                 num_qubits, sample_size
             )));
         }
+        validate_qubit_count(num_qubits)?;
         let state_len = 1 << num_qubits;
         let input_batch_d = input_batch_d as *const f64;
         let angle_validation_buffer = {
@@ -401,7 +403,6 @@ impl QuantumEncoder for AngleEncoder {
 }
 
 impl AngleEncoder {
-    #[cfg(target_os = "linux")]
     /// Encodes `input_len` angle values from a device-resident `f32` buffer into a GPU state
     /// vector, using the provided CUDA stream for all launched work.
     ///
@@ -411,6 +412,7 @@ impl AngleEncoder {
     /// The caller must also ensure that `stream` is either null or a valid CUDA stream handle
     /// associated with `device`, and that no concurrent use of these raw pointers violates Rust's
     /// aliasing or lifetime rules.
+    #[cfg(target_os = "linux")]
     pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
         device: &Arc<CudaDevice>,
         input_d: *const f32,
@@ -430,6 +432,7 @@ impl AngleEncoder {
             )));
         }
 
+        validate_qubit_count(num_qubits)?;
         let state_len = 1 << num_qubits;
         let state_vector = {
             crate::profile_scope!("GPU::Alloc");

From 7cd9dbe536e428c1139b7062e2128aec5fe838e5 Mon Sep 17 00:00:00 2001
From: viiccwen <viiccwen@gmail.com>
Date: Sun, 12 Apr 2026 05:45:24 +0000
Subject: [PATCH 6/6] test: add additional unit tests for angle encoding error
 handling

---
 qdp/qdp-core/tests/gpu_ptr_encoding.rs | 62 ++++++++++++++++++++++++++
 1 file changed, 62 insertions(+)

diff --git a/qdp/qdp-core/tests/gpu_ptr_encoding.rs b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
index ecb1ec6462..80a054a693 100644
--- a/qdp/qdp-core/tests/gpu_ptr_encoding.rs
+++ b/qdp/qdp-core/tests/gpu_ptr_encoding.rs
@@ -1040,6 +1040,68 @@ fn test_encode_angle_from_gpu_ptr_f32_qubit_mismatch() {
     }
 }
 
+#[test] // 31 qubits must be rejected as InvalidInput ("exceeds practical limit").
+fn test_encode_angle_from_gpu_ptr_f32_too_many_qubits() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let input = vec![0.0_f32; 31];
+    let (_device, input_d) = match common::copy_f32_to_device(&input) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 31) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("exceeds practical limit"), "got: {msg}");
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test] // Same 31-qubit rejection, exercised through the explicit-stream entry point.
+fn test_encode_angle_from_gpu_ptr_f32_with_stream_too_many_qubits() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (device, input_d) = match common::copy_f32_to_device(&[0.0_f32; 31]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let stream = device.fork_default_stream().expect("fork_default_stream");
+    let result = unsafe {
+        engine.encode_angle_from_gpu_ptr_f32_with_stream(
+            *input_d.device_ptr() as *const f32,
+            input_d.len(),
+            31,
+            stream.stream as *mut c_void,
+        )
+    };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("exceeds practical limit"), "got: {msg}");
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
 #[test]
 fn test_encode_batch_from_gpu_ptr_f32_success() {
     let engine = match engine_f32() {