Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ repos:
hooks:
- id: ty
name: ty check
entry: uv run ty check .
entry: bash -c 'if [ -n "${VIRTUAL_ENV:-}" ]; then python -m ty check .; elif [ -x .venv/bin/python ]; then .venv/bin/python -m ty check .; else uv run ty check .; fi'
language: system
pass_filenames: false
types: [python]
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ python_functions = "test_*"
addopts = ["-v", "--tb=short", "-rs"]
markers = [
"gpu: marks tests as requiring GPU and _qdp extension (auto-skipped if unavailable)",
"rocm: marks tests that require ROCm + Triton AMD backend",
"slow: marks tests as slow running",
]

Expand Down
8 changes: 6 additions & 2 deletions qdp/qdp-core/src/dlpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#[cfg(target_os = "linux")]
use crate::error::cuda_error_to_string;
use crate::error::{MahoutError, Result};
use crate::gpu::memory::{BufferStorage, GpuStateVector, Precision};
use crate::gpu::memory::{BufferStorage, GpuDeviceType, GpuStateVector, Precision};
use std::os::raw::{c_int, c_void};
use std::sync::Arc;

Expand Down Expand Up @@ -113,6 +113,7 @@ pub unsafe fn synchronize_stream(_stream: *mut c_void) -> Result<()> {
/// Subset of DLPack's `DLDeviceType` codes; discriminants must match the
/// DLPack ABI exactly so consumers (e.g. `torch.from_dlpack`) interpret the
/// tensor's device correctly.
pub enum DLDeviceType {
/// Host (CPU) memory — DLPack value 1.
kDLCPU = 1,
/// NVIDIA CUDA device memory — DLPack value 2.
kDLCUDA = 2,
/// AMD ROCm/HIP device memory — DLPack value 10.
kDLROCM = 10,
// Other types omitted
}

Expand Down Expand Up @@ -291,7 +292,10 @@ impl GpuStateVector {
let tensor = DLTensor {
data: self.ptr_void(),
device: DLDevice {
device_type: DLDeviceType::kDLCUDA,
device_type: match self.device_type {
GpuDeviceType::Cuda => DLDeviceType::kDLCUDA,
GpuDeviceType::Rocm => DLDeviceType::kDLROCM,
},
device_id: self.device_id as c_int,
},
ndim,
Expand Down
13 changes: 13 additions & 0 deletions qdp/qdp-core/src/gpu/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ pub enum Precision {
Float64,
}

/// Backend GPU device type for DLPack metadata.
///
/// Carried on `GpuStateVector` so DLPack export can report the correct
/// `DLDeviceType` for the memory backing the state vector.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GpuDeviceType {
/// NVIDIA CUDA backend (exported as DLPack `kDLCUDA`).
Cuda,
/// AMD ROCm/HIP backend (exported as DLPack `kDLROCM`).
Rocm,
}

#[cfg(target_os = "linux")]
use crate::gpu::cuda_ffi::{cudaFreeHost, cudaHostAlloc, cudaMemGetInfo};

Expand Down Expand Up @@ -212,6 +219,8 @@ pub struct GpuStateVector {
pub(crate) num_samples: Option<usize>,
/// CUDA device ordinal
pub device_id: usize,
/// GPU backend type used for DLPack device metadata.
pub device_type: GpuDeviceType,
}

// Safety: CudaSlice and Arc are both Send + Sync
Expand Down Expand Up @@ -290,6 +299,7 @@ impl GpuStateVector {
size_elements: _size_elements,
num_samples: None,
device_id: _device.ordinal(),
device_type: GpuDeviceType::Cuda,
})
}

Expand Down Expand Up @@ -401,6 +411,7 @@ impl GpuStateVector {
size_elements: total_elements,
num_samples: Some(num_samples),
device_id: _device.ordinal(),
device_type: GpuDeviceType::Cuda,
})
}

Expand Down Expand Up @@ -487,6 +498,7 @@ impl GpuStateVector {
size_elements: self.size_elements,
num_samples: self.num_samples,
device_id: device.ordinal(),
device_type: self.device_type,
})
}

Expand Down Expand Up @@ -562,6 +574,7 @@ impl GpuStateVector {
size_elements: self.size_elements,
num_samples: self.num_samples, // Preserve batch information
device_id: device.ordinal(),
device_type: self.device_type,
})
}

Expand Down
50 changes: 49 additions & 1 deletion qdp/qdp-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,25 @@ GPU-accelerated quantum state encoding for [Apache Mahout Qumat](https://github.
pip install qumat[qdp]
```

Requires CUDA-capable GPU.
Requires one of:
- NVIDIA GPU (CUDA path via `QdpEngine`)
- AMD GPU with ROCm (AMD path via `AmdQdpEngine`, with Triton-based AMD support via `TritonAmdKernel`)

Recommended environment setup:

```bash
python -m venv .venv
source .venv/bin/activate

# Install the GPU runtime for your platform first:
# - NVIDIA users: CUDA-compatible torch / triton
# - AMD users: ROCm-compatible torch / triton

uv sync --active --project qdp/qdp-python --group dev
```

Use `--active` so `uv` reuses the environment that already has the correct GPU
runtime stack.

## Usage

Expand All @@ -27,6 +45,32 @@ tensor = torch.from_dlpack(qtensor)
print(tensor) # Complex tensor on CUDA
```

### AMD ROCm Usage

```python
import qumat.qdp as qdp
import torch
from qumat_qdp import TritonAmdKernel, create_encoder_engine

# AMD ROCm engine path (ROCm builds of PyTorch expose AMD GPUs via the "cuda" device string)
engine = qdp.AmdQdpEngine(device_id=0, precision="float32")
qtensor = engine.encode(torch.randn(8, 4, device="cuda"), 2, "amplitude")
tensor = torch.from_dlpack(qtensor)
print(tensor.device, tensor.dtype) # cuda:0, complex64

# Triton-backed AMD kernel path
kernel = TritonAmdKernel(device_id=0, precision="float32")
qt = kernel.encode(torch.randn(64, 1024, device="cuda"), 10, "amplitude")
state = torch.from_dlpack(qt)

# Or let the router select the AMD Triton path automatically when available
engine_auto = create_encoder_engine(backend="auto", device_id=0, precision="float32")
qt = engine_auto.encode(torch.randn(8, 4, device="cuda"), 2, "amplitude")
state = torch.from_dlpack(qt)
```

See `qdp/qdp-python/TRITON_AMD_BACKEND.md` for Triton AMD setup and validation details.

## Encoding Methods

| Method | Description |
Expand All @@ -36,6 +80,10 @@ print(tensor) # Complex tensor on CUDA
| `basis` | Encode integer as computational basis state |
| `iqp` | IQP-style encoding with entanglement |

Backend support boundary:
- CUDA (`QdpEngine`): `amplitude`, `angle`, `basis`, `iqp`
- AMD Triton path (`TritonAmdKernel` / `backend="triton_amd"`): `amplitude`, `angle`, `basis` (no `iqp` yet)

## Input Sources

```python
Expand Down
88 changes: 88 additions & 0 deletions qdp/qdp-python/TRITON_AMD_BACKEND.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Triton AMD Backend (`triton_amd`)

This document describes how to use the QDP Triton backend on AMD GPUs via ROCm.

## Prerequisites

- AMD GPU supported by ROCm
- ROCm driver/runtime installed
- PyTorch ROCm build (`torch.version.hip` is not `None`)
- Triton installed with HIP support

## Install (project environment)

```bash
uv sync --project qdp/qdp-python --group benchmark --active
```

This installs the benchmark group, including `triton`.

## Runtime capability checks

Use:

```python
from qumat_qdp import is_triton_amd_available
print(is_triton_amd_available())
```

The check validates:
- ROCm runtime is visible through PyTorch
- Triton is importable
- Triton active backend is HIP (when query is available)

## Usage

### Direct Triton backend

```python
import torch
from qumat_qdp import TritonAmdKernel

engine = TritonAmdKernel(device_id=0, precision="float32")
x = torch.randn(64, 1024, device="cuda", dtype=torch.float32)  # ROCm torch maps AMD GPUs to "cuda"
qt = engine.encode(x, num_qubits=10, encoding_method="amplitude")
state = torch.from_dlpack(qt)
```

Supported methods:
- `amplitude`
- `angle`
- `basis`

Not supported in `triton_amd`:
- `iqp` (currently CUDA backend only)

### Unified Engine Routing (recommended)

```python
from qumat_qdp import create_encoder_engine
import torch

engine = create_encoder_engine(backend="auto", device_id=0, precision="float32")
qt = engine.encode([[1.0, 0.0, 0.0, 0.0]], num_qubits=2, encoding_method="amplitude")
state = torch.from_dlpack(qt)
# auto order: triton_amd -> cuda
```

All routed backends return a unified DLPack-compatible object (`QuantumTensor` from `qumat_qdp.backend`).

## Correctness tests

Run Triton backend tests:

```bash
uv run --project qdp/qdp-python pytest qdp/qdp-python/tests/test_triton_amd_backend.py -q
uv run --project qdp/qdp-python pytest -m rocm qdp/qdp-python/tests -q
```

Tests include:
- parity against Torch reference outputs (amplitude/angle/basis)
- optional parity against CUDA backend reference (when NVIDIA CUDA path is present)

## Baseline benchmark

```bash
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_triton_amd.py \
--qubits 12 --batch-size 64 --batches 200 --encoding-method amplitude
```
2 changes: 2 additions & 0 deletions qdp/qdp-python/benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ scripts:
- `benchmark_throughput.py`: DataLoader-style throughput benchmark
that measures vectors/sec across Mahout, PennyLane, and Qiskit.
- `benchmark_latency.py`: Data-to-State latency benchmark (CPU RAM -> GPU VRAM).
- `benchmark_triton_amd.py`: Triton-on-ROCm baseline throughput/latency benchmark for AMD GPUs.

## Quick Start

Expand All @@ -32,6 +33,7 @@ To run individual benchmarks after setup:
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_e2e.py
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_latency.py
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_throughput.py
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_triton_amd.py --qubits 12 --batch-size 64 --batches 200
```

This keeps all benchmark dependencies in the unified repo root venv (`mahout/.venv`).
Expand Down
99 changes: 99 additions & 0 deletions qdp/qdp-python/benchmark/benchmark_triton_amd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Baseline benchmark for Triton AMD backend (ROCm)."""

from __future__ import annotations

import argparse

import torch
from qumat_qdp import TritonAmdKernel, is_triton_amd_available


def _build_input(method: str, batch_size: int, qubits: int) -> torch.Tensor:
    """Create a random CUDA input tensor shaped for the given encoding method.

    Args:
        method: Encoding method; one of ``"basis"``, ``"angle"``, ``"amplitude"``.
        batch_size: Number of vectors in the batch.
        qubits: Qubit count; amplitude inputs carry ``2**qubits`` features.

    Returns:
        A tensor on the ``"cuda"`` device (ROCm PyTorch also uses this device
        string for AMD GPUs) suitable for ``TritonAmdKernel.encode``.

    Raises:
        ValueError: If ``method`` is not a supported encoding method.
    """
    if method == "basis":
        # Basis encoding consumes integer state indices in [0, 2**qubits).
        return torch.randint(
            low=0,
            high=1 << qubits,
            size=(batch_size,),
            device="cuda",
            dtype=torch.int64,
        )
    if method == "angle":
        # Angle encoding takes one rotation angle per qubit.
        return torch.randn(batch_size, qubits, device="cuda", dtype=torch.float32)
    if method == "amplitude":
        # Amplitude encoding takes a full 2**qubits-dimensional feature vector.
        return torch.randn(batch_size, 1 << qubits, device="cuda", dtype=torch.float32)
    # Previously any unknown method silently fell through to the amplitude
    # shape; fail loudly instead (argparse choices keep valid callers safe).
    raise ValueError(f"Unsupported encoding method: {method!r}")


def main() -> int:
    """Run the Triton AMD baseline benchmark and print a summary report.

    Returns:
        Process exit code (0 on success); exits early via ``SystemExit``
        when the triton_amd backend is unavailable.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark Triton AMD backend throughput/latency."
    )
    parser.add_argument("--qubits", type=int, default=12)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--batches", type=int, default=200)
    parser.add_argument(
        "--encoding-method",
        type=str,
        default="amplitude",
        choices=["amplitude", "angle", "basis"],
    )
    parser.add_argument(
        "--precision", type=str, default="float32", choices=["float32", "float64"]
    )
    args = parser.parse_args()

    # Fail fast when ROCm/Triton is not usable, instead of erroring mid-run.
    if not is_triton_amd_available():
        raise SystemExit(
            "triton_amd backend is unavailable (requires ROCm + Triton HIP target)."
        )

    engine = TritonAmdKernel(device_id=0, precision=args.precision)
    data = _build_input(args.encoding_method, args.batch_size, args.qubits)

    # Warmup: amortize kernel compilation / autotuning before timing starts.
    for _ in range(10):
        _ = engine.encode(data, args.qubits, args.encoding_method)
    torch.cuda.synchronize()

    # Use device-side events so GPU execution time is measured, not just
    # host-side dispatch of asynchronous kernels.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    processed = 0
    start.record()
    for _ in range(args.batches):
        _ = engine.encode(data, args.qubits, args.encoding_method)
        processed += args.batch_size
    end.record()
    # elapsed_time is only valid after both events have completed on the
    # device, so synchronize before reading it; result is in milliseconds.
    torch.cuda.synchronize()
    dt = start.elapsed_time(end) / 1000.0

    # Guard against a zero-length measurement window (degenerate arguments).
    throughput = processed / dt if dt > 0 else 0.0
    latency_ms_per_vector = (dt / processed) * 1000 if processed else 0.0

    print("TRITON AMD BASELINE")
    print(f"- Encoding: {args.encoding_method}")
    print(f"- Qubits: {args.qubits}")
    print(f"- Batch size: {args.batch_size}")
    print(f"- Batches: {args.batches}")
    print(f"- Duration: {dt:.4f} s")
    print(f"- Throughput: {throughput:.1f} vectors/sec")
    print(f"- Latency: {latency_ms_per_vector:.6f} ms/vector")
    return 0


if __name__ == "__main__":
raise SystemExit(main())
Loading
Loading