From 1d47e7a58d2062961ba543d871da3524f6b616fb Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Wed, 17 Jun 2026 15:32:00 +0000
Subject: [PATCH 1/2] [ROCm] Add setup.py to build the _C extension (CUDA +
 ROCm via hipify)

The faithcontour._C extension has no build wiring on main: there is no
setup.py and pyproject declares only a pure-Python package, so a source
install never compiles _C and "from . import _C" fails at import. This
adds a setup.py with a torch CUDAExtension/BuildExtension over
_C/{bindings.cpp,kernels.cu}. On a CUDA PyTorch it builds the original
CUDA sources unchanged; on a ROCm PyTorch BuildExtension hipifies the
same sources automatically, so the extension builds for AMD GPUs with no
source changes. This complements the existing ROCm kernel/runtime
support already on main by providing the missing compiled artifact.

setup.py also carries a Windows-only /ALTERNATENAME linker directive:
c10.dll does not export the inherited c10::ValueError(SourceLocation,
string) constructor that <torch/extension.h> references, so the import
thunk is aliased to the exported c10::Error(SourceLocation, string)
(ValueError IS-A Error with no extra data members).

kernels.cu converts the int64 index and candidate buffer types and casts
from long to int64_t. This is a no-op on LP64 Linux (long == int64_t) but
is required on Windows LLP64 where long is 32-bit while the torch int64
tensors backing these buffers are 64-bit.

.gitignore adds the hipify byproducts (*.hip, *.prehip) and versioned
shared objects (*.so.*) so a ROCm build leaves the tree clean.

This work was authored with the assistance of Claude, an AI assistant.

Test Plan:
```
rm -f src/faithcontour/_C/kernels.hip src/faithcontour/_C/*.prehip
rm -rf build
cd src && HIP_VISIBLE_DEVICES=0 PYTORCH_ROCM_ARCH=gfx90a \
    python setup.py build_ext --inplace
HIP_VISIBLE_DEVICES=0 python3 agent_space/faithc_harness.py
```
Built cleanly on gfx90a (AMD Instinct MI250X, ROCm 7.2); the harness
drives all four _C bindings on GPU against a torch CPU reference and
reports all checks PASS.
---
 .gitignore                     |  3 ++
 README.md                      | 19 ++++++++++
 setup.py                       | 50 ++++++++++++++++++++++++++
 src/faithcontour/_C/kernels.cu | 66 +++++++++++++++++-----------------
 4 files changed, 105 insertions(+), 33 deletions(-)
 create mode 100644 setup.py
diff --git a/.gitignore b/.gitignore
index 3d9226c..6e6e894 100644
--- a/.gitignore
+++ b/.gitignore
@@ -10,4 +10,7 @@ assets/*.glb
 assets/*.ply
 assets/*.stl
 *.so
+*.so.*
+*.hip
+*.prehip
 src/faithcontour.egg-info/*
diff --git a/README.md b/README.md
index 4654972..7e3e1dc 100644
--- a/README.md
+++ b/README.md
@@ -107,6 +107,25 @@ pip install -e . --no-build-isolation
 pip install trimesh scipy einops
 ```
 
+<details>
+<summary><b>AMD GPU (ROCm)</b></summary>
+
+The `_C` extension also builds on AMD GPUs with ROCm. Install a ROCm build of PyTorch, then build FaithContour against it; the build hipifies the CUDA sources automatically, so no source changes are needed.
+
+```bash
+# Install PyTorch for ROCm (example for ROCm 7.2)
+pip install torch --index-url https://download.pytorch.org/whl/rocm7.2
+
+# Build FaithContour (set the arch(es) for your GPU, e.g. gfx90a, gfx1100)
+git clone https://github.com/Luo-Yihao/FaithC.git
+cd FaithC
+PYTORCH_ROCM_ARCH=gfx90a pip install -e . --no-build-isolation
+```
+
+If `PYTORCH_ROCM_ARCH` is unset, the build targets the architectures of the GPUs visible on the build host.
+
+</details>
+
 
 ## Quick Start
 
diff --git a/setup.py b/setup.py
new file mode 100644
index 0000000..b66c075
--- /dev/null
+++ b/setup.py
@@ -0,0 +1,50 @@
+# Builds the faithcontour._C extension. On a CUDA PyTorch this compiles the
+# original CUDA sources; on a ROCm PyTorch BuildExtension hipifies them so the
+# same sources build for AMD GPUs.
+
+import os
+import sys
+
+from setuptools import find_packages, setup
+from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+
+_C_DIR = os.path.join("src", "faithcontour", "_C")
+
+sources = [
+    os.path.join(_C_DIR, "bindings.cpp"),
+    os.path.join(_C_DIR, "kernels.cu"),
+]
+extra_link_args = []
+if sys.platform == "win32":
+    # c10.dll does not export the c10::ValueError(SourceLocation, string)
+    # constructor that is inherited via "using Error::Error".  Headers included
+    # through <torch/extension.h> (e.g. ATen/TensorIndexing.h) trigger
+    # TORCH_CHECK_VALUE which generates a __declspec(dllimport) reference to
+    # that constructor, causing LNK2001.  Alias it to Error(SourceLocation,
+    # string) which IS exported from c10.dll.  ValueError IS-A Error with no
+    # extra data members; the constructors are semantically identical.
+    # Alias the missing dllimport thunk for ValueError(SourceLocation,string)
+    # to Error(SourceLocation,string) which IS in c10.dll.
+    _val_imp = (
+        "__imp_??0ValueError@c10@@QEAA@USourceLocation@1@"
+        "V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z"
+    )
+    _err_imp = (
+        "__imp_??0Error@c10@@QEAA@USourceLocation@1@"
+        "V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z"
+    )
+    extra_link_args.append(f"/ALTERNATENAME:{_val_imp}={_err_imp}")
+
+setup(
+    name="faithcontour",
+    package_dir={"": "src"},
+    packages=find_packages("src"),
+    ext_modules=[
+        CUDAExtension(
+            name="faithcontour._C",
+            sources=sources,
+            extra_link_args=extra_link_args,
+        )
+    ],
+    cmdclass={"build_ext": BuildExtension},
+)
diff --git a/src/faithcontour/_C/kernels.cu b/src/faithcontour/_C/kernels.cu
index 191979a..5ffd586 100644
--- a/src/faithcontour/_C/kernels.cu
+++ b/src/faithcontour/_C/kernels.cu
@@ -42,8 +42,8 @@ __global__ void k_segment_tri_intersection_fused_float(
     int64_t num_segs,
     int64_t num_tris,
     float eps,
-    long* __restrict__ out_seg_indices,
-    long* __restrict__ out_tri_indices,
+    int64_t* __restrict__ out_seg_indices,
+    int64_t* __restrict__ out_tri_indices,
     float* __restrict__ out_dots,
     int* __restrict__ counter)
 {
@@ -168,7 +168,7 @@ std::vector<at::Tensor> segment_tri_intersection_fused_cuda(
         seg_verts.data_ptr<float>(), tris_verts.data_ptr<float>(),
         tri_aabb_min.data_ptr<float>(), tri_aabb_max.data_ptr<float>(),
         num_segs, num_tris, static_cast<float>(eps),
-        out_seg_indices.data_ptr<long>(), out_tri_indices.data_ptr<long>(),
+        out_seg_indices.data_ptr<int64_t>(), out_tri_indices.data_ptr<int64_t>(),
         out_dots.data_ptr<float>(), counter.data_ptr<int>());
 
     int final_hit_count = counter.item<int>();
@@ -286,8 +286,8 @@ __global__ void k_gen_candidates_overlap(
     const T* __restrict__ tri_min,  const T* __restrict__ tri_max,
     int64_t Na, int64_t Nt,
     int64_t a_offset, int64_t t_offset,
-    long* __restrict__ cand_a_out, long* __restrict__ cand_t_out,
-    int* __restrict__ counter, unsigned char* __restrict__ overflow, long cap, T eps)
+    int64_t* __restrict__ cand_a_out, int64_t* __restrict__ cand_t_out,
+    int* __restrict__ counter, unsigned char* __restrict__ overflow, int64_t cap, T eps)
 {
     int ai = blockIdx.x * blockDim.x + threadIdx.x;
     int ti = blockIdx.y * blockDim.y + threadIdx.y;
@@ -304,9 +304,9 @@ __global__ void k_gen_candidates_overlap(
     if (!ov) return;
 
     int idx = atomicAdd(counter, 1);
-    if ((long)idx >= cap){ *overflow = 1; return; }
-    cand_a_out[idx] = (long)(a_offset + ai);
-    cand_t_out[idx] = (long)(t_offset + ti);
+    if ((int64_t)idx >= cap){ *overflow = 1; return; }
+    cand_a_out[idx] = (int64_t)(a_offset + ai);
+    cand_t_out[idx] = (int64_t)(t_offset + ti);
 }
 
 void gen_candidates_overlap_cuda(
@@ -326,9 +326,9 @@ void gen_candidates_overlap_cuda(
             aabb_min.data_ptr<scalar_t>(), aabb_max.data_ptr<scalar_t>(),
             tri_min.data_ptr<scalar_t>(),  tri_max.data_ptr<scalar_t>(),
             Na, Nt, a_offset, t_offset,
-            cand_a_out.data_ptr<long>(), cand_t_out.data_ptr<long>(),
+            cand_a_out.data_ptr<int64_t>(), cand_t_out.data_ptr<int64_t>(),
             counter.data_ptr<int>(), overflow.data_ptr<unsigned char>(),
-            (long)cand_a_out.size(0),
+            (int64_t)cand_a_out.size(0),
             (scalar_t)eps );
     });
 
@@ -430,9 +430,9 @@ __device__ __forceinline__ int clip_with_plane(const T in_poly[MAXV][3], int in_
 
 // Narrowphase Kernels for aabb_tri_sat_clip_select
 template<typename T>
-__global__ void sat_hit_kernel(const T* __restrict__ aabbs_min, const T* __restrict__ aabbs_max, const T* __restrict__ tris_verts, const long* __restrict__ cand_a, const long* __restrict__ cand_t, int64_t K, T eps, bool* __restrict__ hit_mask, long* __restrict__ out_a_idx, long* __restrict__ out_t_idx) {
+__global__ void sat_hit_kernel(const T* __restrict__ aabbs_min, const T* __restrict__ aabbs_max, const T* __restrict__ tris_verts, const int64_t* __restrict__ cand_a, const int64_t* __restrict__ cand_t, int64_t K, T eps, bool* __restrict__ hit_mask, int64_t* __restrict__ out_a_idx, int64_t* __restrict__ out_t_idx) {
     int k=blockIdx.x*blockDim.x+threadIdx.x; if (k>=K) return;
-    long ai=cand_a[k], ti=cand_t[k];
+    int64_t ai=cand_a[k], ti=cand_t[k];
     const T* bmin=aabbs_min+ai*3; const T* bmax=aabbs_max+ai*3;
     T center[3]={(bmin[0]+bmax[0])*T(0.5),(bmin[1]+bmax[1])*T(0.5),(bmin[2]+bmax[2])*T(0.5)};
     T he[3]={(bmax[0]-bmin[0])*T(0.5),(bmax[1]-bmin[1])*T(0.5),(bmax[2]-bmin[2])*T(0.5)};
@@ -448,21 +448,21 @@ __global__ void sat_centroid_kernel(
     const T* __restrict__ aabbs_min,
     const T* __restrict__ aabbs_max,
     const T* __restrict__ tris_verts,
-    const long* __restrict__ cand_a,
-    const long* __restrict__ cand_t,
+    const int64_t* __restrict__ cand_a,
+    const int64_t* __restrict__ cand_t,
     int64_t K, T eps,
     bool* __restrict__ hit_mask,
     int* __restrict__ poly_counts,
     T* __restrict__ centroids,
     T* __restrict__ areas,
-    long* __restrict__ out_a_idx,
-    long* __restrict__ out_t_idx)
+    int64_t* __restrict__ out_a_idx,
+    int64_t* __restrict__ out_t_idx)
 {
     int k = blockIdx.x * blockDim.x + threadIdx.x;
     if (k >= K) return;
 
-    const long ai = cand_a[k];
-    const long ti = cand_t[k];
+    const int64_t ai = cand_a[k];
+    const int64_t ti = cand_t[k];
     out_a_idx[k] = ai;
     out_t_idx[k] = ti;
 
@@ -564,21 +564,21 @@ template<typename T, int MAXV>
 __global__ void sat_clip_kernel(
     const T* __restrict__ aabbs_min, const T* __restrict__ aabbs_max,
     const T* __restrict__ tris_verts,
-    const long* __restrict__ cand_a, const long* __restrict__ cand_t,
+    const int64_t* __restrict__ cand_a, const int64_t* __restrict__ cand_t,
     int64_t K, T eps,
     bool* __restrict__ hit_mask,
     int* __restrict__ poly_counts,
     T* __restrict__ poly_verts,
     T* __restrict__ centroids,
     T* __restrict__ areas,
-    long* __restrict__ out_a_idx,
-    long* __restrict__ out_t_idx)
+    int64_t* __restrict__ out_a_idx,
+    int64_t* __restrict__ out_t_idx)
 {
     int k = blockIdx.x * blockDim.x + threadIdx.x;
     if (k >= K) return;
 
-    const long ai = cand_a[k];
-    const long ti = cand_t[k];
+    const int64_t ai = cand_a[k];
+    const int64_t ti = cand_t[k];
 
     out_a_idx[k] = ai;
     out_t_idx[k] = ti;
@@ -714,9 +714,9 @@ std::vector<at::Tensor> aabb_tri_sat_clip_select_cuda(
             sat_hit_kernel<scalar_t><<<blocks,threads>>>(
                 aabbs_min.data_ptr<scalar_t>(), aabbs_max.data_ptr<scalar_t>(),
                 tris_verts.data_ptr<scalar_t>(),
-                cand_a_idx.data_ptr<long>(), cand_t_idx.data_ptr<long>(),
+                cand_a_idx.data_ptr<int64_t>(), cand_t_idx.data_ptr<int64_t>(),
                 K, (scalar_t)eps, hit_mask.data_ptr<bool>(),
-                out_a_idx.data_ptr<long>(), out_t_idx.data_ptr<long>());
+                out_a_idx.data_ptr<int64_t>(), out_t_idx.data_ptr<int64_t>());
         });
         poly_counts=torch::empty({0}, opts_i);
         poly_verts =torch::empty({0,0,3}, opts_f);
@@ -731,18 +731,18 @@ std::vector<at::Tensor> aabb_tri_sat_clip_select_cuda(
                 sat_centroid_kernel<scalar_t,8><<<blocks,threads>>>(
                     aabbs_min.data_ptr<scalar_t>(), aabbs_max.data_ptr<scalar_t>(),
                     tris_verts.data_ptr<scalar_t>(),
-                    cand_a_idx.data_ptr<long>(), cand_t_idx.data_ptr<long>(),
+                    cand_a_idx.data_ptr<int64_t>(), cand_t_idx.data_ptr<int64_t>(),
                     K,(scalar_t)eps, hit_mask.data_ptr<bool>(),
                     poly_counts.data_ptr<int>(), centroids.data_ptr<scalar_t>(), areas.data_ptr<scalar_t>(),
-                    out_a_idx.data_ptr<long>(), out_t_idx.data_ptr<long>());
+                    out_a_idx.data_ptr<int64_t>(), out_t_idx.data_ptr<int64_t>());
             } else {
                 sat_centroid_kernel<scalar_t,7><<<blocks,threads>>>(
                     aabbs_min.data_ptr<scalar_t>(), aabbs_max.data_ptr<scalar_t>(),
                     tris_verts.data_ptr<scalar_t>(),
-                    cand_a_idx.data_ptr<long>(), cand_t_idx.data_ptr<long>(),
+                    cand_a_idx.data_ptr<int64_t>(), cand_t_idx.data_ptr<int64_t>(),
                     K,(scalar_t)eps, hit_mask.data_ptr<bool>(),
                     poly_counts.data_ptr<int>(), centroids.data_ptr<scalar_t>(), areas.data_ptr<scalar_t>(),
-                    out_a_idx.data_ptr<long>(), out_t_idx.data_ptr<long>());
+                    out_a_idx.data_ptr<int64_t>(), out_t_idx.data_ptr<int64_t>());
             }
         });
     } else { // mode == 2
@@ -755,24 +755,24 @@ std::vector<at::Tensor> aabb_tri_sat_clip_select_cuda(
                 sat_clip_kernel<scalar_t,8><<<blocks,threads>>>(
                     aabbs_min.data_ptr<scalar_t>(), aabbs_max.data_ptr<scalar_t>(),
                     tris_verts.data_ptr<scalar_t>(),
-                    cand_a_idx.data_ptr<long>(), cand_t_idx.data_ptr<long>(),
+                    cand_a_idx.data_ptr<int64_t>(), cand_t_idx.data_ptr<int64_t>(),
                     K,(scalar_t)eps, hit_mask.data_ptr<bool>(),
                     poly_counts.data_ptr<int>(),
                     poly_verts.data_ptr<scalar_t>(),
                     centroids.data_ptr<scalar_t>(),
                     areas.data_ptr<scalar_t>(),
-                    out_a_idx.data_ptr<long>(), out_t_idx.data_ptr<long>());
+                    out_a_idx.data_ptr<int64_t>(), out_t_idx.data_ptr<int64_t>());
             } else {
                 sat_clip_kernel<scalar_t,7><<<blocks,threads>>>(
                     aabbs_min.data_ptr<scalar_t>(), aabbs_max.data_ptr<scalar_t>(),
                     tris_verts.data_ptr<scalar_t>(),
-                    cand_a_idx.data_ptr<long>(), cand_t_idx.data_ptr<long>(),
+                    cand_a_idx.data_ptr<int64_t>(), cand_t_idx.data_ptr<int64_t>(),
                     K,(scalar_t)eps, hit_mask.data_ptr<bool>(),
                     poly_counts.data_ptr<int>(),
                     poly_verts.data_ptr<scalar_t>(),
                     centroids.data_ptr<scalar_t>(),
                     areas.data_ptr<scalar_t>(),
-                    out_a_idx.data_ptr<long>(), out_t_idx.data_ptr<long>());
+                    out_a_idx.data_ptr<int64_t>(), out_t_idx.data_ptr<int64_t>());
             }
         });
     }

From 5e7e93aa38a53937a552b5758e060fa9b0d642ab Mon Sep 17 00:00:00 2001
From: Jeff Daily <jeff.daily@amd.com>
Date: Wed, 17 Jun 2026 14:59:35 -0700
Subject: [PATCH 2/2] [ROCm] Fix Windows HIP extension build: add .hip to MSVC
 _cpp_extensions

PyTorch's BuildExtension on Windows adds .cu/.cuh to the MSVC compiler
driver's _cpp_extensions list so the spawn wrapper can intercept those
files and route them to hipcc instead of cl.exe. However it does not add
.hip. After a PyTorch update (torch 2.9.1+rocm7.14, Jun 2026), the hipify
step renames kernels.cu to kernels.hip before MSVC's compile loop runs,
and the MSVC driver raises "Don't know how to compile *.hip" because .hip
is absent from _cpp_extensions. Fix by subclassing BuildExtension and
appending .hip to _cpp_extensions on Windows before delegating to the
parent, which installs the spawn wrapper that routes .hip -> hipcc.

This fix is Windows-only (the guard checks sys.platform == "win32" and
the hasattr guard is a no-op on Linux where clang is the host compiler).

Authored with the assistance of Claude, an AI assistant.

Test Plan:
```
export PATH="/c/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/HostX64/x64:$PATH"
VENV=/b/develop/TheRock/external-builds/pytorch/.venv
rm -f src/faithcontour/_C/kernels.hip && rm -rf build/
HIP_VISIBLE_DEVICES=0 PYTORCH_ROCM_ARCH=gfx1201 \
  ROCM_HOME=$VENV/Lib/site-packages/_rocm_sdk_devel \
  DISTUTILS_USE_SDK=1 \
  $VENV/Scripts/python.exe setup.py build_ext --inplace
HIP_VISIBLE_DEVICES=0 $VENV/Scripts/python.exe agent_space/faithc_harness_win.py
```
Built for gfx1201 (AMD Radeon RX 9070 XT, RDNA4, Windows 11); the harness
reports 17/17 PASS on all four _C kernel bindings.
---
 setup.py | 15 ++++++++++++++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index b66c075..e84be11 100644
--- a/setup.py
+++ b/setup.py
@@ -6,7 +6,20 @@
 import sys
 
 from setuptools import find_packages, setup
-from torch.utils.cpp_extension import BuildExtension, CUDAExtension
+from torch.utils.cpp_extension import BuildExtension as _BuildExtension, CUDAExtension
+
+
+class BuildExtension(_BuildExtension):
+    def build_extensions(self):
+        # On Windows, PyTorch's BuildExtension adds .cu/.cuh to the MSVC
+        # compiler's _cpp_extensions so the spawn wrapper can route those to
+        # hipcc, but does not add .hip.  The hipify step renames kernels.cu to
+        # kernels.hip before compilation; without .hip in _cpp_extensions the
+        # MSVC compile loop raises "Don't know how to compile *.hip".
+        if sys.platform == "win32" and hasattr(self.compiler, "_cpp_extensions"):
+            if ".hip" not in self.compiler._cpp_extensions:
+                self.compiler._cpp_extensions.append(".hip")
+        super().build_extensions()
 
 _C_DIR = os.path.join("src", "faithcontour", "_C")