Skip to content
7 changes: 5 additions & 2 deletions qdp/qdp-core/src/dlpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -252,13 +252,16 @@ impl GpuStateVector {
/// # Safety
/// Freed by DLPack deleter when PyTorch releases tensor.
/// Do not free manually.
#[allow(clippy::manual_is_multiple_of)]
pub fn to_dlpack(&self) -> *mut DLManagedTensor {
// Always return 2D tensor: Batch [num_samples, state_len], Single [1, state_len]
let (shape, strides) = if let Some(num_samples) = self.num_samples {
// Batch: [num_samples, state_len_per_sample]
debug_assert!(
num_samples > 0 && self.size_elements.is_multiple_of(num_samples),
"Batch state vector size must be divisible by num_samples"
num_samples > 0 && self.size_elements % num_samples == 0,
"Batch mismatch: {} elements cannot be evenly divided into {} samples",
self.size_elements,
num_samples
);
let state_len_per_sample = self.size_elements / num_samples;
let shape = vec![num_samples as i64, state_len_per_sample as i64];
Expand Down
226 changes: 226 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/amplitude.rs
Original file line number Diff line number Diff line change
Expand Up @@ -457,6 +457,232 @@ impl QuantumEncoder for AmplitudeEncoder {
Ok(batch_state_vector)
}

/// Encode multiple samples in a single GPU allocation and kernel launch for f32 inputs
#[cfg(target_os = "linux")]
fn encode_batch_f32(
&self,
device: &Arc<CudaDevice>,
batch_data: &[f32],
num_samples: usize,
sample_size: usize,
num_qubits: usize,
) -> Result<GpuStateVector> {
crate::profile_scope!("AmplitudeEncoder::encode_batch_f32");

let state_len = 1 << num_qubits;

if sample_size == 0 {
return Err(MahoutError::InvalidInput(
"sample_size cannot be zero".into(),
));
}
if sample_size > state_len {
return Err(MahoutError::InvalidInput(format!(
"sample_size {} exceeds state vector length {} (2^{} qubits)",
sample_size, state_len, num_qubits
)));
}
if batch_data.len() != num_samples * sample_size {
return Err(MahoutError::InvalidInput(format!(
"batch_data length mismatch (expected {} * {} = {}, got {})",
num_samples,
sample_size,
num_samples * sample_size,
batch_data.len()
)));
}

let batch_state_vector = {
crate::profile_scope!("GPU::AllocBatch_f32");
GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
};

// Upload input data to GPU
let input_batch_gpu = {
crate::profile_scope!("GPU::H2D_InputBatch_f32");
device.htod_sync_copy(batch_data).map_err(|e| {
MahoutError::MemoryAllocation(format!("Failed to upload batch input: {:?}", e))
})?
};

// Compute inverse norms on GPU using warp-reduced kernel
let inv_norms_gpu = {
crate::profile_scope!("GPU::BatchNormKernel_f32");
use cudarc::driver::DevicePtrMut;
let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
})?;

let ret = unsafe {
launch_l2_norm_batch_f32(
*input_batch_gpu.device_ptr() as *const f32,
num_samples,
sample_size,
*buffer.device_ptr_mut() as *mut f32,
std::ptr::null_mut(), // default stream
)
};

if ret != 0 {
return Err(MahoutError::KernelLaunch(format!(
"Norm reduction kernel failed: {} ({})",
ret,
cuda_error_to_string(ret)
)));
}
buffer
};

// Validate norms on host
{
crate::profile_scope!("GPU::NormValidation_f32");
let host_inv_norms = device
.dtoh_sync_copy(&inv_norms_gpu)
.map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;

if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
return Err(MahoutError::InvalidInput(
"One or more samples have zero or invalid norm".to_string(),
));
}
}

// Launch batch kernel
{
crate::profile_scope!("GPU::BatchKernelLaunch_f32");
use cudarc::driver::DevicePtr;
let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
MahoutError::InvalidInput(
"Batch state vector precision mismatch (expected float32 buffer)".to_string(),
)
})?;
let ret = unsafe {
launch_amplitude_encode_batch_f32(
*input_batch_gpu.device_ptr() as *const f32,
state_ptr as *mut c_void,
*inv_norms_gpu.device_ptr() as *const f32,
num_samples,
sample_size,
state_len,
std::ptr::null_mut(), // default stream
)
};

if ret != 0 {
return Err(MahoutError::KernelLaunch(format!(
"Batch kernel launch failed: {} ({})",
ret,
cuda_error_to_string(ret)
)));
}
}

{
crate::profile_scope!("GPU::Synchronize");
device
.synchronize()
.map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
}

Ok(batch_state_vector)
}

/// Encode a batch directly from an existing device buffer (zero-copy path).
///
/// # Safety
/// Caller must ensure `input_batch_d` points to valid GPU memory holding
/// at least `num_samples * sample_size` contiguous f32 values, and that
/// `stream` is either null (default stream) or a valid CUDA stream.
///
/// # Errors
/// Returns `InvalidInput` for a null pointer, zero/oversized samples, or
/// an oversized qubit count; `MemoryAllocation`, `KernelLaunch`, or
/// `Cuda` for GPU-side failures.
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    device: &Arc<CudaDevice>,
    input_batch_d: *const c_void,
    num_samples: usize,
    sample_size: usize,
    num_qubits: usize,
    stream: *mut c_void,
) -> Result<GpuStateVector> {
    // Guard the shift: `1 << num_qubits` overflows usize when
    // num_qubits >= usize::BITS.
    if num_qubits as u32 >= usize::BITS {
        return Err(MahoutError::InvalidInput(format!(
            "num_qubits {} too large: state length 2^{} does not fit in usize",
            num_qubits, num_qubits
        )));
    }
    let state_len = 1usize << num_qubits;
    // Reject a null device pointer before passing it to any kernel.
    if input_batch_d.is_null() {
        return Err(MahoutError::InvalidInput(
            "input_batch_d pointer cannot be null".into(),
        ));
    }
    if sample_size == 0 {
        return Err(MahoutError::InvalidInput(
            "Sample size cannot be zero".into(),
        ));
    }
    if sample_size > state_len {
        return Err(MahoutError::InvalidInput(format!(
            "Sample size {} exceeds state vector size {} (2^{} qubits)",
            sample_size, state_len, num_qubits
        )));
    }
    let input_batch_d = input_batch_d as *const f32;
    let batch_state_vector = {
        crate::profile_scope!("GPU::AllocBatch_f32");
        GpuStateVector::new_batch(device, num_samples, num_qubits, Precision::Float32)?
    };
    let inv_norms_gpu = {
        crate::profile_scope!("GPU::BatchNormKernel_f32");
        use cudarc::driver::DevicePtrMut;
        let mut buffer = device.alloc_zeros::<f32>(num_samples).map_err(|e| {
            MahoutError::MemoryAllocation(format!("Failed to allocate norm buffer: {:?}", e))
        })?;
        // SAFETY: caller guarantees input_batch_d covers
        // num_samples * sample_size f32 values (see # Safety above);
        // the output buffer holds num_samples f32 values.
        let ret = unsafe {
            launch_l2_norm_batch_f32(
                input_batch_d,
                num_samples,
                sample_size,
                *buffer.device_ptr_mut() as *mut f32,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Norm reduction kernel failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
        buffer
    };
    // Validate norms on host: zero/NaN/inf inverse norm means the sample
    // cannot be normalized.
    {
        crate::profile_scope!("GPU::NormValidation_f32");
        let host_inv_norms = device
            .dtoh_sync_copy(&inv_norms_gpu)
            .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
        if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
            return Err(MahoutError::InvalidInput(
                "One or more samples have zero or invalid norm".to_string(),
            ));
        }
    }
    {
        crate::profile_scope!("GPU::BatchKernelLaunch_f32");
        use cudarc::driver::DevicePtr;
        let state_ptr = batch_state_vector.ptr_f32().ok_or_else(|| {
            MahoutError::InvalidInput(
                "Batch state vector precision mismatch (expected float32 buffer)".to_string(),
            )
        })?;
        // SAFETY: device pointers come from live allocations or the
        // caller-guaranteed input buffer; sizes were validated above.
        let ret = unsafe {
            launch_amplitude_encode_batch_f32(
                input_batch_d,
                state_ptr as *mut c_void,
                *inv_norms_gpu.device_ptr() as *const f32,
                num_samples,
                sample_size,
                state_len,
                stream,
            )
        };
        if ret != 0 {
            return Err(MahoutError::KernelLaunch(format!(
                "Batch kernel launch failed with CUDA error code: {} ({})",
                ret,
                cuda_error_to_string(ret)
            )));
        }
    }
    {
        crate::profile_scope!("GPU::Synchronize");
        sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
    }
    Ok(batch_state_vector)
}

/// Stable identifier for this encoder, used in error messages and dispatch.
fn name(&self) -> &'static str {
    const ENCODER_NAME: &str = "amplitude";
    ENCODER_NAME
}
Expand Down
35 changes: 35 additions & 0 deletions qdp/qdp-core/src/gpu/encodings/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -134,6 +134,41 @@ pub trait QuantumEncoder: Send + Sync {
self.name()
)))
}

/// Encode multiple samples in a single GPU allocation and kernel launch using f32 inputs.
///
/// Default implementation: encoders without an f32 batch path report
/// `NotImplemented`, identifying themselves via [`QuantumEncoder::name`].
fn encode_batch_f32(
    &self,
    _device: &Arc<CudaDevice>,
    _batch_data: &[f32],
    _num_samples: usize,
    _sample_size: usize,
    _num_qubits: usize,
) -> Result<GpuStateVector> {
    let message = format!("encode_batch_f32 not implemented for {}", self.name());
    Err(MahoutError::NotImplemented(message))
}

/// Encode batch from existing GPU pointer (zero-copy) for f32 inputs.
///
/// Default implementation: encoders without a zero-copy f32 path report
/// `NotImplemented`, identifying themselves via [`QuantumEncoder::name`].
///
/// # Safety
/// Caller must ensure `input_batch_d` points to valid GPU memory (f32).
#[cfg(target_os = "linux")]
unsafe fn encode_batch_from_gpu_ptr_f32(
    &self,
    _device: &Arc<CudaDevice>,
    _input_batch_d: *const c_void,
    _num_samples: usize,
    _sample_size: usize,
    _num_qubits: usize,
    _stream: *mut c_void,
) -> Result<GpuStateVector> {
    let message = format!(
        "encode_batch_from_gpu_ptr_f32 not supported for {}",
        self.name()
    );
    Err(MahoutError::NotImplemented(message))
}
}

// Encoding implementations
Expand Down
Loading
Loading