Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@ repos:
hooks:
- id: ty
name: ty check
entry: uv run ty check .
entry: bash -c 'if [ -n "${VIRTUAL_ENV:-}" ]; then python -m ty check .; elif [ -x .venv/bin/python ]; then .venv/bin/python -m ty check .; else uv run ty check .; fi'
language: system
pass_filenames: false
types: [python]
Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,7 @@ python_functions = "test_*"
addopts = ["-v", "--tb=short", "-rs"]
markers = [
"gpu: marks tests as requiring GPU and _qdp extension (auto-skipped if unavailable)",
"rocm: marks tests that require ROCm + Triton AMD backend",
"slow: marks tests as slow running",
]

Expand Down
8 changes: 6 additions & 2 deletions qdp/qdp-core/src/dlpack.rs
Original file line number Diff line number Diff line change
Expand Up @@ -19,7 +19,7 @@
#[cfg(target_os = "linux")]
use crate::error::cuda_error_to_string;
use crate::error::{MahoutError, Result};
use crate::gpu::memory::{BufferStorage, GpuStateVector, Precision};
use crate::gpu::memory::{BufferStorage, GpuDeviceType, GpuStateVector, Precision};
use std::os::raw::{c_int, c_void};
use std::sync::Arc;

Expand Down Expand Up @@ -113,6 +113,7 @@ pub unsafe fn synchronize_stream(_stream: *mut c_void) -> Result<()> {
/// Subset of DLPack's `DLDeviceType` codes; discriminants must match the
/// DLPack ABI exactly so consumers (e.g. `torch.from_dlpack`) interpret the
/// tensor's device correctly.
pub enum DLDeviceType {
/// Host (CPU) memory — DLPack value 1.
kDLCPU = 1,
/// NVIDIA CUDA device memory — DLPack value 2.
kDLCUDA = 2,
/// AMD ROCm/HIP device memory — DLPack value 10.
kDLROCM = 10,
// Other types omitted
}

Expand Down Expand Up @@ -291,7 +292,10 @@ impl GpuStateVector {
let tensor = DLTensor {
data: self.ptr_void(),
device: DLDevice {
device_type: DLDeviceType::kDLCUDA,
device_type: match self.device_type {
GpuDeviceType::Cuda => DLDeviceType::kDLCUDA,
GpuDeviceType::Rocm => DLDeviceType::kDLROCM,
},
device_id: self.device_id as c_int,
},
ndim,
Expand Down
13 changes: 13 additions & 0 deletions qdp/qdp-core/src/gpu/memory.rs
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,13 @@ pub enum Precision {
Float64,
}

/// Backend GPU device type for DLPack metadata.
///
/// Carried on `GpuStateVector` so DLPack export can report the correct
/// `DLDeviceType` for the memory backing the state vector.
#[derive(Clone, Copy, Debug, PartialEq, Eq)]
pub enum GpuDeviceType {
/// NVIDIA CUDA backend (exported as DLPack `kDLCUDA`).
Cuda,
/// AMD ROCm/HIP backend (exported as DLPack `kDLROCM`).
Rocm,
}

#[cfg(target_os = "linux")]
use crate::gpu::cuda_ffi::{cudaFreeHost, cudaHostAlloc, cudaMemGetInfo};

Expand Down Expand Up @@ -212,6 +219,8 @@ pub struct GpuStateVector {
pub(crate) num_samples: Option<usize>,
/// CUDA device ordinal
pub device_id: usize,
/// GPU backend type used for DLPack device metadata.
pub device_type: GpuDeviceType,
}

// Safety: CudaSlice and Arc are both Send + Sync
Expand Down Expand Up @@ -290,6 +299,7 @@ impl GpuStateVector {
size_elements: _size_elements,
num_samples: None,
device_id: _device.ordinal(),
device_type: GpuDeviceType::Cuda,
})
}

Expand Down Expand Up @@ -401,6 +411,7 @@ impl GpuStateVector {
size_elements: total_elements,
num_samples: Some(num_samples),
device_id: _device.ordinal(),
device_type: GpuDeviceType::Cuda,
})
}

Expand Down Expand Up @@ -487,6 +498,7 @@ impl GpuStateVector {
size_elements: self.size_elements,
num_samples: self.num_samples,
device_id: device.ordinal(),
device_type: self.device_type,
})
}

Expand Down Expand Up @@ -562,6 +574,7 @@ impl GpuStateVector {
size_elements: self.size_elements,
num_samples: self.num_samples, // Preserve batch information
device_id: device.ordinal(),
device_type: self.device_type,
})
}

Expand Down
50 changes: 49 additions & 1 deletion qdp/qdp-python/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,25 @@ GPU-accelerated quantum state encoding for [Apache Mahout Qumat](https://github.
pip install qumat[qdp]
```

Requires CUDA-capable GPU.
Requires one of:
- NVIDIA GPU (CUDA path via `QdpEngine`)
- AMD GPU with ROCm (AMD path via `AmdQdpEngine`, with Triton-based AMD support via `TritonAmdKernel`)

Recommended environment setup:

```bash
python -m venv .venv
source .venv/bin/activate

# Install the GPU runtime for your platform first:
# - NVIDIA users: CUDA-compatible torch / triton
# - AMD users: ROCm-compatible torch / triton

uv sync --active --project qdp/qdp-python --group dev
```

Use `--active` so `uv` reuses the environment that already has the correct GPU
runtime stack.

## Usage

Expand All @@ -27,6 +45,32 @@ tensor = torch.from_dlpack(qtensor)
print(tensor) # Complex tensor on CUDA
```

### AMD ROCm Usage

```python
import qumat.qdp as qdp
import torch
from qumat_qdp import TritonAmdKernel, create_encoder_engine

# AMD ROCm engine path (ROCm builds of PyTorch expose AMD GPUs via the "cuda" device string)
engine = qdp.AmdQdpEngine(device_id=0, precision="float32")
qtensor = engine.encode(torch.randn(8, 4, device="cuda"), 2, "amplitude")
tensor = torch.from_dlpack(qtensor)
print(tensor.device, tensor.dtype) # cuda:0, complex64

# Triton-backed AMD kernel path
kernel = TritonAmdKernel(device_id=0, precision="float32")
qt = kernel.encode(torch.randn(64, 1024, device="cuda"), 10, "amplitude")
state = torch.from_dlpack(qt)

# Or let the router select the AMD Triton path automatically when available
engine_auto = create_encoder_engine(backend="auto", device_id=0, precision="float32")
qt = engine_auto.encode(torch.randn(8, 4, device="cuda"), 2, "amplitude")
state = torch.from_dlpack(qt)
```

See `qdp/qdp-python/TRITON_AMD_BACKEND.md` for Triton AMD setup and validation details.

## Encoding Methods

| Method | Description |
Expand All @@ -36,6 +80,10 @@ print(tensor) # Complex tensor on CUDA
| `basis` | Encode integer as computational basis state |
| `iqp` | IQP-style encoding with entanglement |

Backend support boundary:
- CUDA (`QdpEngine`): `amplitude`, `angle`, `basis`, `iqp`
- AMD Triton path (`TritonAmdKernel` / `backend="triton_amd"`): `amplitude`, `angle`, `basis` (no `iqp` yet)

## Input Sources

```python
Expand Down
88 changes: 88 additions & 0 deletions qdp/qdp-python/TRITON_AMD_BACKEND.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,88 @@
# Triton AMD Backend (`triton_amd`)

This document describes how to use the QDP Triton backend on AMD GPUs via ROCm.

## Prerequisites

- AMD GPU supported by ROCm
- ROCm driver/runtime installed
- PyTorch ROCm build (`torch.version.hip` is not `None`)
- Triton installed with HIP support

## Install (project environment)

```bash
uv sync --project qdp/qdp-python --group benchmark --active
```

This installs the benchmark group, including `triton`.

## Runtime capability checks

Use:

```python
from qumat_qdp import is_triton_amd_available
print(is_triton_amd_available())
```

The check validates:
- ROCm runtime is visible through PyTorch
- Triton is importable
- Triton active backend is HIP (when query is available)

## Usage

### Direct Triton backend

```python
import torch
from qumat_qdp import TritonAmdKernel

engine = TritonAmdKernel(device_id=0, precision="float32")
x = torch.randn(64, 1024, device="cuda", dtype=torch.float32)  # ROCm torch maps AMD GPUs to "cuda"
qt = engine.encode(x, num_qubits=10, encoding_method="amplitude")
state = torch.from_dlpack(qt)
```

Supported methods:
- `amplitude`
- `angle`
- `basis`

Not supported in `triton_amd`:
- `iqp` (currently CUDA backend only)

### Unified Engine Routing (recommended)

```python
from qumat_qdp import create_encoder_engine
import torch

engine = create_encoder_engine(backend="auto", device_id=0, precision="float32")
qt = engine.encode([[1.0, 0.0, 0.0, 0.0]], num_qubits=2, encoding_method="amplitude")
state = torch.from_dlpack(qt)
# auto order: triton_amd -> cuda
```

All routed backends return a unified DLPack-compatible object (`QuantumTensor` from `qumat_qdp.backend`).

## Correctness tests

Run Triton backend tests:

```bash
uv run --project qdp/qdp-python pytest qdp/qdp-python/tests/test_triton_amd_backend.py -q
uv run --project qdp/qdp-python pytest -m rocm qdp/qdp-python/tests -q
```

Tests include:
- parity against Torch reference outputs (amplitude/angle/basis)
- optional parity against CUDA backend reference (when NVIDIA CUDA path is present)

## Baseline benchmark

```bash
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_triton_amd.py \
--qubits 12 --batch-size 64 --batches 200 --encoding-method amplitude
```
2 changes: 2 additions & 0 deletions qdp/qdp-python/benchmark/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ scripts:
- `benchmark_throughput.py`: DataLoader-style throughput benchmark
that measures vectors/sec across Mahout, PennyLane, and Qiskit.
- `benchmark_latency.py`: Data-to-State latency benchmark (CPU RAM -> GPU VRAM).
- `benchmark_triton_amd.py`: Triton-on-ROCm baseline throughput/latency benchmark for AMD GPUs.

## Quick Start

Expand All @@ -32,6 +33,7 @@ To run individual benchmarks after setup:
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_e2e.py
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_latency.py
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_throughput.py
uv run --project qdp/qdp-python python qdp/qdp-python/benchmark/benchmark_triton_amd.py --qubits 12 --batch-size 64 --batches 200
```

This keeps all benchmark dependencies in the unified repo root venv (`mahout/.venv`).
Expand Down
99 changes: 99 additions & 0 deletions qdp/qdp-python/benchmark/benchmark_triton_amd.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
#!/usr/bin/env python3
#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements. See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Baseline benchmark for Triton AMD backend (ROCm)."""

from __future__ import annotations

import argparse

import torch
from qumat_qdp import TritonAmdKernel, is_triton_amd_available


def _build_input(method: str, batch_size: int, qubits: int) -> torch.Tensor:
    """Create a random CUDA input tensor shaped for the given encoding method.

    Args:
        method: Encoding method; one of ``"basis"``, ``"angle"``, ``"amplitude"``.
        batch_size: Number of vectors in the batch.
        qubits: Qubit count; amplitude inputs carry ``2**qubits`` features.

    Returns:
        A tensor on the ``"cuda"`` device (ROCm PyTorch also uses this device
        string for AMD GPUs) suitable for ``TritonAmdKernel.encode``.

    Raises:
        ValueError: If ``method`` is not a supported encoding method.
    """
    if method == "basis":
        # Basis encoding consumes integer state indices in [0, 2**qubits).
        return torch.randint(
            low=0,
            high=1 << qubits,
            size=(batch_size,),
            device="cuda",
            dtype=torch.int64,
        )
    if method == "angle":
        # Angle encoding takes one rotation angle per qubit.
        return torch.randn(batch_size, qubits, device="cuda", dtype=torch.float32)
    if method == "amplitude":
        # Amplitude encoding takes a full 2**qubits-dimensional feature vector.
        return torch.randn(batch_size, 1 << qubits, device="cuda", dtype=torch.float32)
    # Previously any unknown method silently fell through to the amplitude
    # shape; fail loudly instead (argparse choices keep valid callers safe).
    raise ValueError(f"Unsupported encoding method: {method!r}")


def main() -> int:
    """Run the Triton AMD baseline benchmark and print a summary report.

    Returns:
        Process exit code (0 on success); exits early via ``SystemExit``
        when the triton_amd backend is unavailable.
    """
    parser = argparse.ArgumentParser(
        description="Benchmark Triton AMD backend throughput/latency."
    )
    parser.add_argument("--qubits", type=int, default=12)
    parser.add_argument("--batch-size", type=int, default=64)
    parser.add_argument("--batches", type=int, default=200)
    parser.add_argument(
        "--encoding-method",
        type=str,
        default="amplitude",
        choices=["amplitude", "angle", "basis"],
    )
    parser.add_argument(
        "--precision", type=str, default="float32", choices=["float32", "float64"]
    )
    args = parser.parse_args()

    # Fail fast when ROCm/Triton is not usable, instead of erroring mid-run.
    if not is_triton_amd_available():
        raise SystemExit(
            "triton_amd backend is unavailable (requires ROCm + Triton HIP target)."
        )

    engine = TritonAmdKernel(device_id=0, precision=args.precision)
    data = _build_input(args.encoding_method, args.batch_size, args.qubits)

    # Warmup: amortize kernel compilation / autotuning before timing starts.
    for _ in range(10):
        _ = engine.encode(data, args.qubits, args.encoding_method)
    torch.cuda.synchronize()

    # Use device-side events so GPU execution time is measured, not just
    # host-side dispatch of asynchronous kernels.
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    processed = 0
    start.record()
    for _ in range(args.batches):
        _ = engine.encode(data, args.qubits, args.encoding_method)
        processed += args.batch_size
    end.record()
    # elapsed_time is only valid after both events have completed on the
    # device, so synchronize before reading it; result is in milliseconds.
    torch.cuda.synchronize()
    dt = start.elapsed_time(end) / 1000.0

    # Guard against a zero-length measurement window (degenerate arguments).
    throughput = processed / dt if dt > 0 else 0.0
    latency_ms_per_vector = (dt / processed) * 1000 if processed else 0.0

    print("TRITON AMD BASELINE")
    print(f"- Encoding: {args.encoding_method}")
    print(f"- Qubits: {args.qubits}")
    print(f"- Batch size: {args.batch_size}")
    print(f"- Batches: {args.batches}")
    print(f"- Duration: {dt:.4f} s")
    print(f"- Throughput: {throughput:.1f} vectors/sec")
    print(f"- Latency: {latency_ms_per_vector:.6f} ms/vector")
    return 0


if __name__ == "__main__":
raise SystemExit(main())
Loading
Loading