apache
diff --git a/qdp/qdp-core/src/gpu/encodings/amplitude.rs b/qdp/qdp-core/src/gpu/encodings/amplitude.rs
Lines changed: 34 additions & 27 deletions
diff --git a/qdp/qdp-core/src/pipeline_runner.rs b/qdp/qdp-core/src/pipeline_runner.rs
Lines changed: 95 additions & 0 deletions
diff --git a/qdp/qdp-python/benchmark/encoding_benchmarks/README.md b/qdp/qdp-python/benchmark/encoding_benchmarks/README.md
Lines changed: 22 additions & 0 deletions
@@ -245,21 +245,9 @@ impl QuantumEncoder for AmplitudeEncoder {
             buffer
         };
 
-        // Validate norms on host to catch zero or NaN samples early
-        {
-            crate::profile_scope!("GPU::NormValidation");
-            let host_inv_norms = device
-                .dtoh_sync_copy(&inv_norms_gpu)
-                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
-
-            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
-                return Err(MahoutError::InvalidInput(
-                    "One or more samples have zero or invalid norm".to_string(),
-                ));
-            }
-        }
-
-        // Launch batch kernel
+        // Launch batch encode kernel — takes GPU norm buffer directly, no D2H needed yet.
+        // We defer the norm validation D2H copy until AFTER the encode kernel + sync so that
+        // the norm kernel → encode kernel sequence runs without an intermediate GPU-CPU roundtrip.
         {
             crate::profile_scope!("GPU::BatchKernelLaunch");
             let state_ptr = batch_state_vector.ptr_f64().ok_or_else(|| {
@@ -288,14 +276,30 @@ impl QuantumEncoder for AmplitudeEncoder {
             }
         }
 
-        // Synchronize
+        // Synchronize — all GPU work (norm + encode) complete after this point.
         {
             crate::profile_scope!("GPU::Synchronize");
             device
                 .synchronize()
                 .map_err(|e| MahoutError::Cuda(format!("Sync failed: {:?}", e)))?;
         }
 
+        // Validate norms on host AFTER sync: D2H copy no longer blocks the encode kernel.
+        // This preserves error detection for zero/NaN samples without adding a mid-pipeline
+        // GPU-CPU roundtrip between the norm and encode kernels.
+        {
+            crate::profile_scope!("GPU::NormValidation");
+            let host_inv_norms = device
+                .dtoh_sync_copy(&inv_norms_gpu)
+                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
+
+            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+                return Err(MahoutError::InvalidInput(
+                    "One or more samples have zero or invalid norm".to_string(),
+                ));
+            }
+        }
+
         Ok(batch_state_vector)
     }
 
@@ -412,17 +416,8 @@ impl QuantumEncoder for AmplitudeEncoder {
             }
             buffer
         };
-        {
-            crate::profile_scope!("GPU::NormValidation");
-            let host_inv_norms = device
-                .dtoh_sync_copy(&inv_norms_gpu)
-                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
-            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
-                return Err(MahoutError::InvalidInput(
-                    "One or more samples have zero or invalid norm".to_string(),
-                ));
-            }
-        }
+        // Launch encode kernel before D2H norm validation: GPU norm buffer is passed directly,
+        // so the encode kernel can run immediately after the norm kernel without a CPU roundtrip.
         {
             crate::profile_scope!("GPU::BatchKernelLaunch");
             use cudarc::driver::DevicePtr;
@@ -450,10 +445,22 @@ impl QuantumEncoder for AmplitudeEncoder {
                 )));
             }
         }
+        // Synchronize first; then validate norms on host (D2H after all GPU work is done).
         {
             crate::profile_scope!("GPU::Synchronize");
             sync_cuda_stream(stream, "CUDA stream synchronize failed")?;
         }
+        {
+            crate::profile_scope!("GPU::NormValidation");
+            let host_inv_norms = device
+                .dtoh_sync_copy(&inv_norms_gpu)
+                .map_err(|e| MahoutError::Cuda(format!("Failed to copy norms to host: {:?}", e)))?;
+            if host_inv_norms.iter().any(|v| !v.is_finite() || *v == 0.0) {
+                return Err(MahoutError::InvalidInput(
+                    "One or more samples have zero or invalid norm".to_string(),
+                ));
+            }
+        }
         Ok(batch_state_vector)
     }
 
 
@@ -246,6 +246,47 @@ impl PipelineIterator {
         })
     }
 
+    /// Create a pipeline iterator over a flat in-memory array (e.g. from Python numpy).
+    ///
+    /// The iterator takes ownership of `data`, so the full encode loop runs in Rust
+    /// (take_batch + encode_batch).
+    ///
+    /// # Errors
+    ///
+    /// Returns [`MahoutError::InvalidInput`] when `sample_size` differs from the vector length
+    /// derived from `config.num_qubits` and `config.encoding_method`, or when `data.len()` is
+    /// not exactly `num_samples * sample_size` (including when that product overflows `usize`).
+    pub fn new_from_array(
+        engine: QdpEngine,
+        data: Vec<f64>,
+        num_samples: usize,
+        sample_size: usize,
+        config: PipelineConfig,
+        batch_limit: usize,
+    ) -> Result<Self> {
+        let vector_len = vector_len(config.num_qubits, &config.encoding_method);
+        if sample_size != vector_len {
+            return Err(MahoutError::InvalidInput(format!(
+                "Array sample_size {} does not match vector_len {} for num_qubits={}, encoding={}",
+                sample_size, vector_len, config.num_qubits, config.encoding_method
+            )));
+        }
+        // checked_mul: an overflowing product cannot describe a real Vec length, so report it
+        // as a mismatch instead of panicking (debug) or wrapping into a false match (release).
+        if num_samples.checked_mul(sample_size) != Some(data.len()) {
+            return Err(MahoutError::InvalidInput(format!(
+                "Array length {} is not num_samples ({}) * sample_size ({})",
+                data.len(),
+                num_samples,
+                sample_size
+            )));
+        }
+        // The cursor and the yielded-batch counter both start at zero: nothing consumed yet.
+        let source = DataSource::InMemory {
+            data,
+            cursor: 0,
+            num_samples,
+            sample_size,
+            batches_yielded: 0,
+            batch_limit,
+        };
+        Ok(Self {
+            engine,
+            config,
+            source,
+            vector_len,
+        })
+    }
+
     /// Create a pipeline iterator from a Parquet file using streaming read (Phase 2b).
     /// Only `.parquet` is supported; reduces memory for large files by reading in chunks.
     /// Validates sample_size == vector_len after the first chunk.
@@ -411,7 +452,61 @@ impl PipelineIterator {
     }
 
     /// Returns the next batch as a DLPack pointer; `Ok(None)` when exhausted.
+    /// For InMemory source, passes a slice reference to encode_batch (no per-batch copy).
     pub fn next_batch(&mut self) -> Result<Option<*mut DLManagedTensor>> {
+        // InMemory: update cursor, then encode from &data[start..end] to avoid to_vec().
+        let in_memory_range: Option<(usize, usize, usize, usize)> = match &mut self.source {
+            DataSource::InMemory {
+                data,
+                cursor,
+                sample_size,
+                batches_yielded,
+                batch_limit,
+                ..
+            } => {
+                if *batches_yielded >= *batch_limit {
+                    None
+                } else {
+                    let remaining = (data.len() - *cursor) / *sample_size;
+                    if remaining == 0 {
+                        None
+                    } else {
+                        let batch_n = remaining.min(self.config.batch_size);
+                        let start = *cursor;
+                        let end = start + batch_n * *sample_size;
+                        *cursor = end;
+                        *batches_yielded += 1;
+                        Some((
+                            start,
+                            batch_n,
+                            *sample_size,
+                            self.config.num_qubits as usize,
+                        ))
+                    }
+                }
+            }
+            _ => None,
+        };
+
+        if let Some((start, batch_n, sample_size, num_qubits)) = in_memory_range {
+            let slice = match &self.source {
+                DataSource::InMemory { data, .. } => {
+                    let len = batch_n * sample_size;
+                    &data[start..start + len]
+                }
+                _ => unreachable!(),
+            };
+            let ptr = self.engine.encode_batch(
+                slice,
+                batch_n,
+                sample_size,
+                num_qubits,
+                &self.config.encoding_method,
+            )?;
+            return Ok(Some(ptr));
+        }
+
+        // Synthetic / Streaming: take_batch_from_source (may copy) then encode.
         let Some((batch_data, batch_n, sample_size, num_qubits)) = self.take_batch_from_source()?
         else {
             return Ok(None);
 
@@ -75,3 +75,25 @@ To see the full list of options and defaults, append `--help`:
 uv run python benchmark/encoding_benchmarks/pennylane_baseline/iris_amplitude.py --help
 uv run python benchmark/encoding_benchmarks/qdp_pipeline/iris_amplitude.py --help
 ```
+
+## Credit Card Fraud amplitude baseline (PennyLane)
+
+Minimal, reproducible steps (run from `qdp/qdp-python`):
+
+1. **Download dataset (once)** — Kaggle `creditcard.csv` mirror:
+
+   ```bash
+   mkdir -p benchmark/encoding_benchmarks/pennylane_baseline/data
+   curl -L -o benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \
+     https://raw.githubusercontent.com/nsethi31/Kaggle-Data-Credit-Card-Fraud-Detection/master/creditcard.csv
+   ```
+
+2. **Run the PennyLane baseline** — StandardScaler → PCA(16) → L2 norm → 4-qubit amplitude VQC:
+
+   ```bash
+   uv run python benchmark/encoding_benchmarks/pennylane_baseline/creditcardfraud_amplitude.py \
+     --data-file benchmark/encoding_benchmarks/pennylane_baseline/data/creditcard.csv \
+     --max-samples 300000 --iters 200 --batch-size 512 --trials 1
+   ```
+
+This prints compile time, train time / throughput, and task metrics (AUPRC, F1, precision, recall) on the test set.