Commit 7fcd2ae
feat: AMD ROCm GPU support — device enumeration, compile mode, SDPA fix
Add support for AMD GPUs (RDNA3/RDNA4) via ROCm's HIP translation layer. Most PyTorch code works unchanged since HIP maps torch.cuda.* APIs, but three areas needed explicit AMD handling:

device_utils.py:
- Add GPUInfo dataclass and enumerate_gpus() function
- Tries nvidia-smi, then amd-smi (ROCm 6.0+), then rocm-smi (legacy), then torch.cuda fallback — works on NVIDIA, AMD, or either

inference_engine.py:
- torch.compile uses mode="default" on ROCm: this skips the exhaustive autotuning that can OOM 16GB cards and avoids HIP graphs, which segfault on large graphs (pytorch/pytorch#155720)
- Auto-sets TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 on ROCm so SDPA dispatches to AOTriton flash attention on RDNA3 (without this it silently falls back to the O(n^2) math backend)
- Skips torch.compile entirely on Windows ROCm, where Triton kernel compilation hangs indefinitely (the eager fallback works fine)

pyproject.toml:
- Add "rocm" optional extra with install instructions

README.md:
- Add AMD ROCm to Hardware Requirements
- Add ROCm install instructions for Linux and Windows
- Add dedicated AMD ROCm Setup section with supported GPUs, automatic behavior, and known limitations

Tested on RX 7800 XT (gfx1101) Windows with AMD ROCm 7.2:
- torch.cuda.is_available() = True
- SDPA with float16 = OK
- Raw inference = OK
- torch.compile = hangs on Windows (skipped), works on Linux
1 parent aa3c9a9 commit 7fcd2ae

6 files changed

Lines changed: 331 additions & 17 deletions
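Note: device_utils.py and pyproject.toml are not shown in this excerpt. Below is a minimal sketch of the enumerate_gpus() fallback chain the commit message describes; GPUInfo and enumerate_gpus() are named there, while every field name, flag, and parsing detail in the sketch is an assumption rather than code from the diff.

```python
# Hypothetical sketch of the device_utils.py additions (not shown in this
# diff view). Only GPUInfo and enumerate_gpus() are named in the commit
# message; all fields and parsing details below are assumptions.
import shutil
import subprocess
from dataclasses import dataclass


@dataclass
class GPUInfo:
    index: int
    name: str
    vram_mb: int
    vendor: str  # "nvidia" or "amd"


def _query_nvidia_smi():
    """Probe nvidia-smi; return None if the tool is missing or fails."""
    if shutil.which("nvidia-smi") is None:
        return None
    try:
        out = subprocess.check_output(
            ["nvidia-smi", "--query-gpu=index,name,memory.total",
             "--format=csv,noheader,nounits"],
            text=True,
        )
    except (subprocess.CalledProcessError, OSError):
        return None
    gpus = []
    for line in out.strip().splitlines():
        idx, name, mem = (part.strip() for part in line.split(",", 2))
        gpus.append(GPUInfo(int(idx), name, int(mem), "nvidia"))
    return gpus or None


def enumerate_gpus() -> list:
    """Fallback chain from the commit message: nvidia-smi, then amd-smi
    (ROCm 6.0+), then rocm-smi (legacy), then torch.cuda as a last resort.
    The amd-smi / rocm-smi probes would mirror _query_nvidia_smi() and are
    omitted here for brevity."""
    gpus = _query_nvidia_smi()
    if gpus:
        return gpus
    import torch

    if not torch.cuda.is_available():
        return []
    vendor = "amd" if getattr(torch.version, "hip", None) else "nvidia"
    return [
        GPUInfo(
            i,
            torch.cuda.get_device_name(i),
            torch.cuda.get_device_properties(i).total_memory // (1024 * 1024),
            vendor,
        )
        for i in range(torch.cuda.device_count())
    ]
```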


CorridorKeyModule/inference_engine.py

Lines changed: 63 additions & 15 deletions
@@ -5,16 +5,36 @@
 import os
 import sys
 
-import cv2
-import numpy as np
-import torch
-import torch.nn.functional as F
-import torchvision
-import torchvision.transforms.v2 as T
-import torchvision.transforms.v2.functional as TF
-
-from .core import color_utils as cu
-from .core.model_transformer import GreenFormer
+# ROCm: must be set before importing torch so the CUDA allocator picks them up
+_is_rocm_system = os.environ.get("HIP_VISIBLE_DEVICES") is not None or os.path.exists("/opt/rocm")
+if _is_rocm_system:
+    os.environ.setdefault("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL", "1")
+    os.environ.setdefault("MIOPEN_FIND_MODE", "2")
+    os.environ.setdefault("MIOPEN_LOG_LEVEL", "0")
+    # Enable GTT (system RAM as GPU overflow) on Linux for 16GB cards.
+    # pytorch-rocm-gtt must be installed separately: pip install pytorch-rocm-gtt
+    try:
+        import pytorch_rocm_gtt
+
+        pytorch_rocm_gtt.patch()
+    except ImportError:
+        pass
+
+# Persist torch.compile autotune cache across runs (default is /tmp which
+# gets wiped on reboot — saves 10-20 min re-autotuning on ROCm, ~30s on CUDA)
+_inductor_cache = os.path.join(os.path.expanduser("~"), ".cache", "corridorkey", "inductor")
+os.environ.setdefault("TORCHINDUCTOR_CACHE_DIR", _inductor_cache)
+
+import cv2  # noqa: E402
+import numpy as np  # noqa: E402
+import torch  # noqa: E402
+import torch.nn.functional as F  # noqa: E402
+import torchvision  # noqa: E402
+import torchvision.transforms.v2 as T  # noqa: E402
+import torchvision.transforms.v2.functional as TF  # noqa: E402
+
+from .core import color_utils as cu  # noqa: E402
+from .core.model_transformer import GreenFormer  # noqa: E402
 
 logger = logging.getLogger(__name__)
 
@@ -52,8 +72,15 @@ def __init__(
 
         self.model = self._load_model()
 
-        # We only tested compilation on Windows and Linux. For other platforms compilation is disabled as a precaution.
-        if sys.platform == "linux" or sys.platform == "win32":
+        is_rocm = hasattr(torch.version, "hip") and torch.version.hip
+
+        # torch.compile is tested on CUDA (Windows + Linux) and ROCm (Linux).
+        # ROCm on Windows hangs during Triton kernel compilation — skip it.
+        # CORRIDORKEY_SKIP_COMPILE=1 forces eager mode (useful for testing).
+        skip_compile = (is_rocm and sys.platform == "win32") or os.environ.get("CORRIDORKEY_SKIP_COMPILE") == "1"
+        if skip_compile:
+            logger.info("Skipping torch.compile (eager mode)")
+        elif sys.platform == "linux" or sys.platform == "win32":
             self._compile()
 
     def _load_model(self) -> GreenFormer:
@@ -116,20 +143,41 @@ def _load_model(self) -> GreenFormer:
         return model
 
     def _compile(self):
+        is_rocm = hasattr(torch.version, "hip") and torch.version.hip
+        if is_rocm:
+            # "default" avoids the heavy autotuning that OOM-kills 16GB cards
+            # at 2048x2048. Still compiles Triton kernels, just skips the
+            # exhaustive benchmarking. HIP graphs are also avoided (segfault
+            # on large graphs — pytorch/pytorch#155720).
+            compile_mode = "default"
+        else:
+            compile_mode = "max-autotune"
+
         try:
-            compiled_model = torch.compile(self.model, mode="max-autotune")
-            # Trigger compilation with a dummy input
+            logger.info(
+                "Compiling model (mode=%s) — this may take 10-20 minutes on first run. "
+                "Compiled kernels are cached for future runs.",
+                compile_mode,
+            )
+            compiled_model = torch.compile(self.model, mode=compile_mode)
+            # Trigger compilation with a dummy input (the actual compile
+            # happens here, not in the torch.compile() call above)
             dummy_input = torch.zeros(
                 1, 4, self.img_size, self.img_size, dtype=self.model_precision, device=self.device
             )
             with torch.inference_mode():
                 compiled_model(dummy_input)
+            del dummy_input
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
             self.model = compiled_model
+            logger.info("Model compiled successfully (mode=%s)", compile_mode)
 
         except Exception as e:
             logger.info(f"Compilation error: {e}")
             logger.warning("Model compilation failed. Falling back to eager mode.")
-            torch.cuda.empty_cache()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
 
     def _preprocess_input(
         self, image_batch: torch.Tensor, mask_batch_linear: torch.Tensor, input_is_linear: bool
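As a quick sanity check, the compile-gating logic above can be mirrored standalone to see what the engine will decide on a given machine. Everything in this snippet restates the diff; no extra repo API is assumed.

```python
# Standalone mirror of the compile-gating logic in the diff above, useful
# for checking what the engine will do on a given machine.
import os
import sys

import torch

is_rocm = bool(getattr(torch.version, "hip", None))
skip_compile = (
    (is_rocm and sys.platform == "win32")
    or os.environ.get("CORRIDORKEY_SKIP_COMPILE") == "1"
)
compile_mode = "default" if is_rocm else "max-autotune"

if skip_compile:
    print("engine will run eager (torch.compile skipped)")
elif sys.platform in ("linux", "win32"):
    print(f"engine will call torch.compile(mode={compile_mode!r})")
else:
    print("untested platform: compilation disabled as a precaution")
```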

README.md

Lines changed: 80 additions & 1 deletion
@@ -35,7 +35,8 @@ This project was designed and built on a Linux workstation (Puget Systems PC) eq
 
 The most recent build should work on computers with 6-8 gig of VRAM, and it can run on most M1+ Mac systems with unified memory. Yes, it might even work on your old Macbook pro. Let us know on the Discord!
 
-* **Windows Users:** To run GPU acceleration natively on Windows, your system MUST have NVIDIA drivers that support **CUDA 12.8 or higher** installed. If your drivers only support older CUDA versions, the installer will likely fallback to the CPU.
+* **Windows Users (NVIDIA):** To run GPU acceleration natively on Windows, your system MUST have NVIDIA drivers that support **CUDA 12.8 or higher** installed. If your drivers only support older CUDA versions, the installer will likely fall back to the CPU.
+* **AMD GPU Users (ROCm):** AMD Radeon RX 7000 series (RDNA3) and RX 9000 series (RDNA4) are supported via ROCm on **Linux**. Windows ROCm support is experimental (torch.compile is not yet functional). See the [AMD ROCm Setup](#amd-rocm-setup) section below.
 * **GVM (Optional):** Requires approximately **80 GB of VRAM** and utilizes massive Stable Video Diffusion models.
 * **VideoMaMa (Optional):** Natively requires a massive chunk of VRAM as well (originally 80GB+). While the community has tweaked the architecture to run at less than 24GB, those extreme memory optimizations have not yet been fully implemented in this repository.
 * **BiRefNet (Optional):** Lightweight AlphaHint generator option.
@@ -71,6 +72,10 @@ This project uses **[uv](https://docs.astral.sh/uv/)** to manage Python and all
 uv sync              # CPU/MPS (default — works everywhere)
 uv sync --extra cuda # CUDA GPU acceleration (Linux/Windows)
 uv sync --extra mlx  # Apple Silicon MLX acceleration
+
+# AMD ROCm (Linux) — torch must be installed from AMD's index first:
+uv pip install torch==2.8.0 torchvision==0.23.0 --index-url https://download.pytorch.org/whl/rocm6.3
+uv sync
 ```
 4. **Download the Models:**
    * **CorridorKey v1.0 Model (~300MB):** Downloads automatically on first run. If no `.pth` file is found in `CorridorKeyModule/checkpoints/`, the engine fetches it from [CorridorKey's HuggingFace](https://huggingface.co/nikopueringer/CorridorKey_v1.0) and saves it as `CorridorKey.pth`. No manual download needed.
@@ -220,6 +225,80 @@ uv run python corridorkey_cli.py wizard --win_path "/path/to/clips"
 
 **Use native MLX instead of PyTorch MPS:** MLX avoids PyTorch's MPS layer entirely and typically runs faster on Apple Silicon. See the [Backend Selection](#backend-selection) section below for setup steps.
 
+### AMD ROCm Setup
+
+CorridorKey supports AMD GPUs via PyTorch's ROCm/HIP backend. The `torch.cuda.*` API works transparently on AMD — HIP intercepts all CUDA calls at runtime, so the inference code runs unchanged.
+
+**Supported GPUs (ROCm 7.2+):**
+- RX 7900 XTX (24GB) / XT (20GB) / GRE (16GB) — RDNA3, gfx1100
+- RX 7800 XT (16GB) / 7700 XT (12GB) — RDNA3, gfx1101
+- RX 9070 XT / 9070 (16GB) — RDNA4, gfx1201
+
+**VRAM requirements:** CorridorKey inference at 2048x2048 needs ~18GB VRAM. The RX 7900 XTX (24GB) and RX 7900 XT (20GB) run at full resolution. Cards with 16GB (RX 7800 XT, 9070 XT) work on Windows (which uses system RAM as overflow) but may OOM on Linux — see notes below.
+
+**Linux native (recommended):**
+```bash
+# Install AMD's ROCm torch wheels, then sync everything else
+pip install torch==2.8.0 torchvision==0.23.0 --index-url https://download.pytorch.org/whl/rocm6.3
+uv sync
+
+# Verify
+uv run python -c "import torch; print(torch.cuda.is_available(), torch.cuda.get_device_name(0))"
+```
+
+**WSL2 (Windows Subsystem for Linux):**
+
+Requires AMD Adrenalin 26.1.1+ driver on Windows. Install ROCm inside WSL2, then use AMD's WSL-specific torch wheels:
+
+```bash
+# 1. Install ROCm for WSL (Ubuntu 24.04)
+sudo apt update
+wget https://repo.radeon.com/amdgpu-install/7.2/ubuntu/noble/amdgpu-install_7.2.70200-1_all.deb
+sudo apt install ./amdgpu-install_7.2.70200-1_all.deb
+amdgpu-install -y --usecase=wsl,rocm --no-dkms
+
+# 2. Verify GPU is visible
+rocminfo  # should show your AMD GPU
+
+# 3. Install AMD's WSL torch wheels (Python 3.12)
+pip3 install \
+    https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torch-2.9.1%2Brocm7.2.0.lw.git7e1940d4-cp312-cp312-linux_x86_64.whl \
+    https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/torchvision-0.24.0%2Brocm7.2.0.gitb919bd0c-cp312-cp312-linux_x86_64.whl \
+    https://repo.radeon.com/rocm/manylinux/rocm-rel-7.2/triton-3.5.1%2Brocm7.2.0.gita272dfa8-cp312-cp312-linux_x86_64.whl
+
+# 4. Fix WSL runtime library conflict (required)
+location=$(pip3 show torch | grep Location | awk -F ": " '{print $2}')
+rm -f ${location}/torch/lib/libhsa-runtime64.so*
+
+# 5. Install CorridorKey deps AFTER torch (so pip doesn't overwrite ROCm torch)
+pip3 install -e .
+```
+
+**Windows native (experimental):**
+
+Windows ROCm requires Python 3.12 and AMD Adrenalin 25.3.1+ driver. `torch.compile` does not work on Windows ROCm — inference runs in eager mode (significantly slower than Linux).
+
+```powershell
+py -3.12 -m pip install https://repo.radeon.com/rocm/windows/rocm-rel-7.2/rocm-7.2.0.dev0-py3-none-win_amd64.whl
+py -3.12 -m pip install --no-cache-dir https://repo.radeon.com/rocm/windows/rocm-rel-7.2/torch-2.9.1+rocmsdk20260116-cp312-cp312-win_amd64.whl https://repo.radeon.com/rocm/windows/rocm-rel-7.2/torchvision-0.24.1+rocmsdk20260116-cp312-cp312-win_amd64.whl
+```
+
+**What CorridorKey does automatically on ROCm:**
+- Sets `TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1` so SDPA dispatches to flash attention kernels on RDNA3 (without this, attention falls back to a slow O(n²) path)
+- Sets `MIOPEN_FIND_MODE=2` for faster convolution kernel selection (reduces warmup from 5-8 minutes to seconds)
+- Uses `torch.compile(mode="default")` on Linux instead of `"max-autotune"`: this skips the exhaustive autotuning that can OOM 16GB cards and avoids HIP graphs, which segfault on large graphs ([pytorch/pytorch#155720](https://github.com/pytorch/pytorch/issues/155720))
+- Skips `torch.compile` entirely on Windows ROCm where Triton compilation hangs
+
+**First-run note:** The first inference run on a new AMD GPU triggers Triton kernel autotuning (10-20 minutes). This is cached in `~/.cache/corridorkey/inductor/` and only happens once per GPU architecture. Subsequent runs start instantly.
+
+**16GB cards on Linux:** CorridorKey at 2048x2048 needs ~18GB. Windows handles this transparently via shared GPU memory (system RAM overflow). On Linux, the GPU has a hard VRAM limit. If you hit OOM on a 16GB card, install `pytorch-rocm-gtt` to enable GTT (system RAM as GPU overflow) — CorridorKey detects and uses it automatically:
+```bash
+pip install pytorch-rocm-gtt
+```
+GTT memory is accessed over PCIe (~10-20x slower than VRAM), so expect slower frame times on 16GB cards vs 20-24GB cards.
+
+**WSL2 limitation:** WSL2 cannot use GTT or shared memory — it has a hard VRAM limit. 16GB cards will OOM in WSL2 at 2048x2048. Use Windows native instead, or a card with 20GB+ VRAM.
+
 ## Backend Selection
 
 CorridorKey supports two inference backends:
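The flash-attention bullet in the README section above is straightforward to verify: restricting SDPA to the flash backend raises a RuntimeError when no flash kernel is available for the inputs. The probe below is an assumed diagnostic, not part of the repo.

```python
# Assumed diagnostic (not part of the repo): force the flash backend and
# see whether SDPA can dispatch to it. On RDNA3 without
# TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL=1 set before torch is imported,
# this typically fails with "No available kernel"; with it set, it succeeds.
import torch
import torch.nn.functional as F
from torch.nn.attention import SDPBackend, sdpa_kernel

q = torch.randn(1, 8, 1024, 64, device="cuda", dtype=torch.float16)
try:
    with sdpa_kernel([SDPBackend.FLASH_ATTENTION]):
        F.scaled_dot_product_attention(q, q, q)
    print("flash attention: OK")
except RuntimeError as e:
    print(f"flash attention unavailable ({e}); math fallback in use")
```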

corridorkey_cli.py

Lines changed: 12 additions & 0 deletions
@@ -19,6 +19,18 @@
 import shutil
 import sys
 import warnings
+
+# ROCm: must be set before any torch import (including transitive via diffusers/GVM)
+if os.environ.get("HIP_VISIBLE_DEVICES") is not None or os.path.exists("/opt/rocm"):
+    os.environ.setdefault("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL", "1")
+    os.environ.setdefault("MIOPEN_FIND_MODE", "2")
+    os.environ.setdefault("MIOPEN_LOG_LEVEL", "0")
+    try:
+        import pytorch_rocm_gtt
+
+        pytorch_rocm_gtt.patch()
+    except ImportError:
+        pass
 from typing import Annotated, Optional
 
 import typer
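The "must be set before any torch import" comment in this hunk is load-bearing: these variables are read when torch initializes its HIP/CUDA runtime, so setting them after import silently does nothing. An illustrative guard (not in the repo) makes the ordering constraint explicit:

```python
# Illustrative guard (not in the repo): fail fast if torch sneaked in
# before the ROCm environment bootstrap ran, since setting these
# variables after import has no effect.
import os
import sys

assert "torch" not in sys.modules, (
    "torch was imported before the ROCm env bootstrap ran"
)
os.environ.setdefault("TORCH_ROCM_AOTRITON_ENABLE_EXPERIMENTAL", "1")
os.environ.setdefault("MIOPEN_FIND_MODE", "2")

import torch  # noqa: E402  (the HIP runtime now sees the variables above)
```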
