apache · viiccwen · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026 · Apr 8, 2026
@@ -232,6 +232,7 @@ impl QuantumEncoder for AngleEncoder {
                 num_qubits, input_len
             )));
         }
+        validate_qubit_count(num_qubits)?;
         let state_len = 1 << num_qubits;
         let angles_d = input_d as *const f64;
         let state_vector = {
@@ -290,6 +291,7 @@ impl QuantumEncoder for AngleEncoder {
                 num_qubits, sample_size
             )));
         }
+        validate_qubit_count(num_qubits)?;
         let state_len = 1 << num_qubits;
         let input_batch_d = input_batch_d as *const f64;
         let angle_validation_buffer = {
@@ -401,6 +403,75 @@ impl QuantumEncoder for AngleEncoder {
 }
 
 impl AngleEncoder {
+    /// Encodes `input_len` angle values from a device-resident `f32` buffer into a GPU state
+    /// vector, using the provided CUDA stream for all launched work.
+    ///
+    /// # Safety
+    /// The caller must ensure that `input_d` points to at least `input_len` contiguous `f32`
+    /// values in GPU-accessible memory and remains valid for the duration of this call.
+    /// The caller must also ensure that `stream` is either null or a valid CUDA stream handle
+    /// associated with `device`, and that no concurrent use of these raw pointers violates Rust's
+    /// aliasing or lifetime rules.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_from_gpu_ptr_f32_with_stream(
+        device: &Arc<CudaDevice>,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<GpuStateVector> {
+        if input_len == 0 {
+            return Err(MahoutError::InvalidInput(
+                "Input data cannot be empty".into(),
+            ));
+        }
+        if input_len != num_qubits {
+            return Err(MahoutError::InvalidInput(format!(
+                "Angle encoding expects {} values (one per qubit), got {}",
+                num_qubits, input_len
+            )));
+        }
+
+        validate_qubit_count(num_qubits)?;
+        let state_len = 1 << num_qubits;
+        let state_vector = {
+            crate::profile_scope!("GPU::Alloc");
+            GpuStateVector::new(device, num_qubits, Precision::Float32)?
+        };
+        let state_ptr = state_vector.ptr_f32().ok_or_else(|| {
+            MahoutError::InvalidInput(
+                "State vector precision mismatch (expected float32 buffer)".to_string(),
+            )
+        })?;
+
+        {
+            crate::profile_scope!("GPU::KernelLaunch");
+            let ret = unsafe {
+                qdp_kernels::launch_angle_encode_f32(
+                    input_d,
+                    state_ptr as *mut c_void,
+                    state_len,
+                    num_qubits as u32,
+                    stream,
+                )
+            };
+            if ret != 0 {
+                return Err(MahoutError::KernelLaunch(format!(
+                    "Angle encoding kernel (f32) failed with CUDA error code: {} ({})",
+                    ret,
+                    cuda_error_to_string(ret)
+                )));
+            }
+        }
+
+        {
+            crate::profile_scope!("GPU::Synchronize");
+            crate::gpu::cuda_sync::sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
+        }
+
+        Ok(state_vector)
+    }
+
     #[cfg(target_os = "linux")]
     fn encode_batch_async_pipeline(
         device: &Arc<CudaDevice>,

@@ -639,6 +639,63 @@ impl QdpEngine {
         Ok(state_vector.to_dlpack())
     }
 
+    /// Encode angle from existing GPU pointer (float32 input only).
+    ///
+    /// Zero-copy encoding from CUDA float32 tensors. Uses the default CUDA stream.
+    /// For stream interop use `encode_angle_from_gpu_ptr_f32_with_stream`.
+    ///
+    /// # Safety
+    /// The input pointer must:
+    /// - Point to valid GPU memory on the same device as the engine
+    /// - Contain at least `input_len` f32 elements
+    /// - Remain valid for the duration of this call
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_angle_from_gpu_ptr_f32(
+        &self,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+    ) -> Result<*mut DLManagedTensor> {
+        unsafe {
+            self.encode_angle_from_gpu_ptr_f32_with_stream(
+                input_d,
+                input_len,
+                num_qubits,
+                std::ptr::null_mut(),
+            )
+        }
+    }
+
+    /// Encode angle from existing GPU pointer (float32) on a specified CUDA stream.
+    ///
+    /// # Safety
+    /// In addition to the `encode_angle_from_gpu_ptr_f32` requirements, the stream pointer
+    /// must remain valid for the duration of this call.
+    #[cfg(target_os = "linux")]
+    pub unsafe fn encode_angle_from_gpu_ptr_f32_with_stream(
+        &self,
+        input_d: *const f32,
+        input_len: usize,
+        num_qubits: usize,
+        stream: *mut c_void,
+    ) -> Result<*mut DLManagedTensor> {
+        crate::profile_scope!("Mahout::EncodeAngleFromGpuPtrF32");
+
+        validate_cuda_input_ptr(&self.device, input_d as *const c_void)?;
+
+        let state_vector = unsafe {
+            gpu::AngleEncoder::encode_from_gpu_ptr_f32_with_stream(
+                &self.device,
+                input_d,
+                input_len,
+                num_qubits,
+                stream,
+            )
+        }?;
+        let state_vector = state_vector.to_precision(&self.device, self.precision)?;
+        Ok(state_vector.to_dlpack())
+    }
+
     /// Encode a batch from an existing GPU pointer (float32 input, amplitude encoding only).
     ///
     /// Zero-copy batch encoding from PyTorch CUDA float32 tensors. Uses the default CUDA stream.

@@ -892,6 +892,216 @@ fn test_encode_from_gpu_ptr_f32_input_exceeds_state_len() {
     }
 }
 
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+            .expect("encode_angle_from_gpu_ptr_f32")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 4) };
+}
+
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_with_stream_success() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let stream = device.fork_default_stream().expect("fork_default_stream");
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_from_gpu_ptr_f32_with_stream(
+                *input_d.device_ptr() as *const f32,
+                input_d.len(),
+                2,
+                stream.stream as *mut c_void,
+            )
+            .expect("encode_angle_from_gpu_ptr_f32_with_stream")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 4) };
+}
+
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_success_f64_engine() {
+    let Some(engine) = common::qdp_engine_with_precision(Precision::Float64) else {
+        println!("SKIP: No GPU");
+        return;
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let dlpack_ptr = unsafe {
+        engine
+            .encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 2)
+            .expect("encode_angle_from_gpu_ptr_f32 (Float64 engine)")
+    };
+    unsafe { common::assert_dlpack_shape_2d_and_delete(dlpack_ptr, 1, 4) };
+}
+
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_empty_input() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(ptr, 0, 1) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("empty") || msg.contains("null"));
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_null_pointer() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(std::ptr::null(), 2, 2) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => assert!(msg.contains("null")),
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_qubit_mismatch() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (_device, input_d) = match common::copy_f32_to_device(&[0.0, std::f32::consts::FRAC_PI_2]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 1) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("expects 1 values") || msg.contains("got 2"));
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_too_many_qubits() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let input = vec![0.0_f32; 31];
+    let (_device, input_d) = match common::copy_f32_to_device(&input) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let ptr = *input_d.device_ptr() as *const f32;
+    let result = unsafe { engine.encode_angle_from_gpu_ptr_f32(ptr, input_d.len(), 31) };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("exceeds practical limit"), "got: {msg}");
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
+#[test]
+fn test_encode_angle_from_gpu_ptr_f32_with_stream_too_many_qubits() {
+    let engine = match engine_f32() {
+        Some(e) => e,
+        None => {
+            println!("SKIP: No GPU");
+            return;
+        }
+    };
+    let (device, input_d) = match common::copy_f32_to_device(&[0.0_f32; 31]) {
+        Some(t) => t,
+        None => {
+            println!("SKIP: No CUDA device");
+            return;
+        }
+    };
+    let stream = device.fork_default_stream().expect("fork_default_stream");
+    let result = unsafe {
+        engine.encode_angle_from_gpu_ptr_f32_with_stream(
+            *input_d.device_ptr() as *const f32,
+            input_d.len(),
+            31,
+            stream.stream as *mut c_void,
+        )
+    };
+    assert!(result.is_err());
+    match &result.unwrap_err() {
+        MahoutError::InvalidInput(msg) => {
+            assert!(msg.contains("exceeds practical limit"), "got: {msg}");
+        }
+        e => panic!("Expected InvalidInput, got {:?}", e),
+    }
+}
+
 #[test]
 fn test_encode_batch_from_gpu_ptr_f32_success() {
     let engine = match engine_f32() {