From 1d47e7a58d2062961ba543d871da3524f6b616fb Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Wed, 17 Jun 2026 15:32:00 +0000 Subject: [PATCH 1/2] [ROCm] Add setup.py to build the _C extension (CUDA + ROCm via hipify) The faithcontour._C extension has no build wiring on main: there is no setup.py and pyproject declares only a pure-Python package, so a source install never compiles _C and "from . import _C" fails at import. This adds a setup.py with a torch CUDAExtension/BuildExtension over _C/{bindings.cpp,kernels.cu}. On a CUDA PyTorch it builds the original CUDA sources unchanged; on a ROCm PyTorch BuildExtension hipifies the same sources automatically, so the extension builds for AMD GPUs with no source changes. This complements the existing ROCm kernel/runtime support already on main by providing the missing compiled artifact. setup.py also carries a Windows-only /ALTERNATENAME linker directive: c10.dll does not export the inherited c10::ValueError(SourceLocation, string) constructor that references, so the import thunk is aliased to the exported c10::Error(SourceLocation, string) (ValueError IS-A Error with no extra data members). kernels.cu converts the int64 index and candidate buffer types and casts from long to int64_t. This is a no-op on LP64 Linux (long == int64_t) but is required on Windows LLP64 where long is 32-bit while the torch int64 tensors backing these buffers are 64-bit. .gitignore adds the hipify byproducts (*.hip, *.prehip) and versioned shared objects (*.so.*) so a ROCm build leaves the tree clean. This work was authored with the assistance of Claude, an AI assistant. Test Plan: ``` rm -f src/faithcontour/_C/kernels.hip src/faithcontour/_C/*.prehip rm -rf build cd src && HIP_VISIBLE_DEVICES=0 PYTORCH_ROCM_ARCH=gfx90a \ python setup.py build_ext --inplace HIP_VISIBLE_DEVICES=0 python3 agent_space/faithc_harness.py ``` Built cleanly on gfx90a (AMD Instinct MI250X, ROCm 7.2); the harness drives all four _C bindings on GPU against a torch CPU reference and reports all checks PASS. --- .gitignore | 3 ++ README.md | 19 ++++++++++ setup.py | 50 ++++++++++++++++++++++++++ src/faithcontour/_C/kernels.cu | 66 +++++++++++++++++----------------- 4 files changed, 105 insertions(+), 33 deletions(-) create mode 100644 setup.py diff --git a/.gitignore b/.gitignore index 3d9226c..6e6e894 100644 --- a/.gitignore +++ b/.gitignore @@ -10,4 +10,7 @@ assets/*.glb assets/*.ply assets/*.stl *.so +*.so.* +*.hip +*.prehip src/faithcontour.egg-info/* diff --git a/README.md b/README.md index 4654972..7e3e1dc 100644 --- a/README.md +++ b/README.md @@ -107,6 +107,25 @@ pip install -e . --no-build-isolation pip install trimesh scipy einops ``` +
+AMD GPU (ROCm) + +The `_C` extension also builds on AMD GPUs with ROCm. Install a ROCm build of PyTorch, then build FaithContour against it; the build hipifies the CUDA sources automatically, so no source changes are needed. + +```bash +# Install PyTorch for ROCm (example for ROCm 7.2) +pip install torch --index-url https://download.pytorch.org/whl/rocm7.2 + +# Build FaithContour (set the arch(es) for your GPU, e.g. gfx90a, gfx1100) +git clone https://github.com/Luo-Yihao/FaithC.git +cd FaithC +PYTORCH_ROCM_ARCH=gfx90a pip install -e . --no-build-isolation +``` + +If `PYTORCH_ROCM_ARCH` is unset, the build targets the architectures of the GPUs visible on the build host. + +
+ ## Quick Start diff --git a/setup.py b/setup.py new file mode 100644 index 0000000..b66c075 --- /dev/null +++ b/setup.py @@ -0,0 +1,50 @@ +# Builds the faithcontour._C extension. On a CUDA PyTorch this compiles the +# original CUDA sources; on a ROCm PyTorch BuildExtension hipifies them so the +# same sources build for AMD GPUs. + +import os +import sys + +from setuptools import find_packages, setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +_C_DIR = os.path.join("src", "faithcontour", "_C") + +sources = [ + os.path.join(_C_DIR, "bindings.cpp"), + os.path.join(_C_DIR, "kernels.cu"), +] +extra_link_args = [] +if sys.platform == "win32": + # c10.dll does not export the c10::ValueError(SourceLocation, string) + # constructor that is inherited via "using Error::Error". Headers included + # through (e.g. ATen/TensorIndexing.h) trigger + # TORCH_CHECK_VALUE which generates a __declspec(dllimport) reference to + # that constructor, causing LNK2001. Alias it to Error(SourceLocation, + # string) which IS exported from c10.dll. ValueError IS-A Error with no + # extra data members; the constructors are semantically identical. + # Alias the missing dllimport thunk for ValueError(SourceLocation,string) + # to Error(SourceLocation,string) which IS in c10.dll. + _val_imp = ( + "__imp_??0ValueError@c10@@QEAA@USourceLocation@1@" + "V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z" + ) + _err_imp = ( + "__imp_??0Error@c10@@QEAA@USourceLocation@1@" + "V?$basic_string@DU?$char_traits@D@std@@V?$allocator@D@2@@std@@@Z" + ) + extra_link_args.append(f"/ALTERNATENAME:{_val_imp}={_err_imp}") + +setup( + name="faithcontour", + package_dir={"": "src"}, + packages=find_packages("src"), + ext_modules=[ + CUDAExtension( + name="faithcontour._C", + sources=sources, + extra_link_args=extra_link_args, + ) + ], + cmdclass={"build_ext": BuildExtension}, +) diff --git a/src/faithcontour/_C/kernels.cu b/src/faithcontour/_C/kernels.cu index 191979a..5ffd586 100644 --- a/src/faithcontour/_C/kernels.cu +++ b/src/faithcontour/_C/kernels.cu @@ -42,8 +42,8 @@ __global__ void k_segment_tri_intersection_fused_float( int64_t num_segs, int64_t num_tris, float eps, - long* __restrict__ out_seg_indices, - long* __restrict__ out_tri_indices, + int64_t* __restrict__ out_seg_indices, + int64_t* __restrict__ out_tri_indices, float* __restrict__ out_dots, int* __restrict__ counter) { @@ -168,7 +168,7 @@ std::vector segment_tri_intersection_fused_cuda( seg_verts.data_ptr(), tris_verts.data_ptr(), tri_aabb_min.data_ptr(), tri_aabb_max.data_ptr(), num_segs, num_tris, static_cast(eps), - out_seg_indices.data_ptr(), out_tri_indices.data_ptr(), + out_seg_indices.data_ptr(), out_tri_indices.data_ptr(), out_dots.data_ptr(), counter.data_ptr()); int final_hit_count = counter.item(); @@ -286,8 +286,8 @@ __global__ void k_gen_candidates_overlap( const T* __restrict__ tri_min, const T* __restrict__ tri_max, int64_t Na, int64_t Nt, int64_t a_offset, int64_t t_offset, - long* __restrict__ cand_a_out, long* __restrict__ cand_t_out, - int* __restrict__ counter, unsigned char* __restrict__ overflow, long cap, T eps) + int64_t* __restrict__ cand_a_out, int64_t* __restrict__ cand_t_out, + int* __restrict__ counter, unsigned char* __restrict__ overflow, int64_t cap, T eps) { int ai = blockIdx.x * blockDim.x + threadIdx.x; int ti = blockIdx.y * blockDim.y + threadIdx.y; @@ -304,9 +304,9 @@ __global__ void k_gen_candidates_overlap( if (!ov) return; int idx = atomicAdd(counter, 1); - if ((long)idx >= cap){ *overflow = 1; return; } - cand_a_out[idx] = (long)(a_offset + ai); - cand_t_out[idx] = (long)(t_offset + ti); + if ((int64_t)idx >= cap){ *overflow = 1; return; } + cand_a_out[idx] = (int64_t)(a_offset + ai); + cand_t_out[idx] = (int64_t)(t_offset + ti); } void gen_candidates_overlap_cuda( @@ -326,9 +326,9 @@ void gen_candidates_overlap_cuda( aabb_min.data_ptr(), aabb_max.data_ptr(), tri_min.data_ptr(), tri_max.data_ptr(), Na, Nt, a_offset, t_offset, - cand_a_out.data_ptr(), cand_t_out.data_ptr(), + cand_a_out.data_ptr(), cand_t_out.data_ptr(), counter.data_ptr(), overflow.data_ptr(), - (long)cand_a_out.size(0), + (int64_t)cand_a_out.size(0), (scalar_t)eps ); }); @@ -430,9 +430,9 @@ __device__ __forceinline__ int clip_with_plane(const T in_poly[MAXV][3], int in_ // Narrowphase Kernels for aabb_tri_sat_clip_select template -__global__ void sat_hit_kernel(const T* __restrict__ aabbs_min, const T* __restrict__ aabbs_max, const T* __restrict__ tris_verts, const long* __restrict__ cand_a, const long* __restrict__ cand_t, int64_t K, T eps, bool* __restrict__ hit_mask, long* __restrict__ out_a_idx, long* __restrict__ out_t_idx) { +__global__ void sat_hit_kernel(const T* __restrict__ aabbs_min, const T* __restrict__ aabbs_max, const T* __restrict__ tris_verts, const int64_t* __restrict__ cand_a, const int64_t* __restrict__ cand_t, int64_t K, T eps, bool* __restrict__ hit_mask, int64_t* __restrict__ out_a_idx, int64_t* __restrict__ out_t_idx) { int k=blockIdx.x*blockDim.x+threadIdx.x; if (k>=K) return; - long ai=cand_a[k], ti=cand_t[k]; + int64_t ai=cand_a[k], ti=cand_t[k]; const T* bmin=aabbs_min+ai*3; const T* bmax=aabbs_max+ai*3; T center[3]={(bmin[0]+bmax[0])*T(0.5),(bmin[1]+bmax[1])*T(0.5),(bmin[2]+bmax[2])*T(0.5)}; T he[3]={(bmax[0]-bmin[0])*T(0.5),(bmax[1]-bmin[1])*T(0.5),(bmax[2]-bmin[2])*T(0.5)}; @@ -448,21 +448,21 @@ __global__ void sat_centroid_kernel( const T* __restrict__ aabbs_min, const T* __restrict__ aabbs_max, const T* __restrict__ tris_verts, - const long* __restrict__ cand_a, - const long* __restrict__ cand_t, + const int64_t* __restrict__ cand_a, + const int64_t* __restrict__ cand_t, int64_t K, T eps, bool* __restrict__ hit_mask, int* __restrict__ poly_counts, T* __restrict__ centroids, T* __restrict__ areas, - long* __restrict__ out_a_idx, - long* __restrict__ out_t_idx) + int64_t* __restrict__ out_a_idx, + int64_t* __restrict__ out_t_idx) { int k = blockIdx.x * blockDim.x + threadIdx.x; if (k >= K) return; - const long ai = cand_a[k]; - const long ti = cand_t[k]; + const int64_t ai = cand_a[k]; + const int64_t ti = cand_t[k]; out_a_idx[k] = ai; out_t_idx[k] = ti; @@ -564,21 +564,21 @@ template __global__ void sat_clip_kernel( const T* __restrict__ aabbs_min, const T* __restrict__ aabbs_max, const T* __restrict__ tris_verts, - const long* __restrict__ cand_a, const long* __restrict__ cand_t, + const int64_t* __restrict__ cand_a, const int64_t* __restrict__ cand_t, int64_t K, T eps, bool* __restrict__ hit_mask, int* __restrict__ poly_counts, T* __restrict__ poly_verts, T* __restrict__ centroids, T* __restrict__ areas, - long* __restrict__ out_a_idx, - long* __restrict__ out_t_idx) + int64_t* __restrict__ out_a_idx, + int64_t* __restrict__ out_t_idx) { int k = blockIdx.x * blockDim.x + threadIdx.x; if (k >= K) return; - const long ai = cand_a[k]; - const long ti = cand_t[k]; + const int64_t ai = cand_a[k]; + const int64_t ti = cand_t[k]; out_a_idx[k] = ai; out_t_idx[k] = ti; @@ -714,9 +714,9 @@ std::vector aabb_tri_sat_clip_select_cuda( sat_hit_kernel<<>>( aabbs_min.data_ptr(), aabbs_max.data_ptr(), tris_verts.data_ptr(), - cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), + cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), K, (scalar_t)eps, hit_mask.data_ptr(), - out_a_idx.data_ptr(), out_t_idx.data_ptr()); + out_a_idx.data_ptr(), out_t_idx.data_ptr()); }); poly_counts=torch::empty({0}, opts_i); poly_verts =torch::empty({0,0,3}, opts_f); @@ -731,18 +731,18 @@ std::vector aabb_tri_sat_clip_select_cuda( sat_centroid_kernel<<>>( aabbs_min.data_ptr(), aabbs_max.data_ptr(), tris_verts.data_ptr(), - cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), + cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), K,(scalar_t)eps, hit_mask.data_ptr(), poly_counts.data_ptr(), centroids.data_ptr(), areas.data_ptr(), - out_a_idx.data_ptr(), out_t_idx.data_ptr()); + out_a_idx.data_ptr(), out_t_idx.data_ptr()); } else { sat_centroid_kernel<<>>( aabbs_min.data_ptr(), aabbs_max.data_ptr(), tris_verts.data_ptr(), - cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), + cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), K,(scalar_t)eps, hit_mask.data_ptr(), poly_counts.data_ptr(), centroids.data_ptr(), areas.data_ptr(), - out_a_idx.data_ptr(), out_t_idx.data_ptr()); + out_a_idx.data_ptr(), out_t_idx.data_ptr()); } }); } else { // mode == 2 @@ -755,24 +755,24 @@ std::vector aabb_tri_sat_clip_select_cuda( sat_clip_kernel<<>>( aabbs_min.data_ptr(), aabbs_max.data_ptr(), tris_verts.data_ptr(), - cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), + cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), K,(scalar_t)eps, hit_mask.data_ptr(), poly_counts.data_ptr(), poly_verts.data_ptr(), centroids.data_ptr(), areas.data_ptr(), - out_a_idx.data_ptr(), out_t_idx.data_ptr()); + out_a_idx.data_ptr(), out_t_idx.data_ptr()); } else { sat_clip_kernel<<>>( aabbs_min.data_ptr(), aabbs_max.data_ptr(), tris_verts.data_ptr(), - cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), + cand_a_idx.data_ptr(), cand_t_idx.data_ptr(), K,(scalar_t)eps, hit_mask.data_ptr(), poly_counts.data_ptr(), poly_verts.data_ptr(), centroids.data_ptr(), areas.data_ptr(), - out_a_idx.data_ptr(), out_t_idx.data_ptr()); + out_a_idx.data_ptr(), out_t_idx.data_ptr()); } }); } From 5e7e93aa38a53937a552b5758e060fa9b0d642ab Mon Sep 17 00:00:00 2001 From: Jeff Daily Date: Wed, 17 Jun 2026 14:59:35 -0700 Subject: [PATCH 2/2] [ROCm] Fix Windows HIP extension build: add .hip to MSVC _cpp_extensions PyTorch's BuildExtension on Windows adds .cu/.cuh to the MSVC compiler driver's _cpp_extensions list so the spawn wrapper can intercept those files and route them to hipcc instead of cl.exe. However it does not add .hip. After a PyTorch update (torch 2.9.1+rocm7.14, Jun 2026), the hipify step renames kernels.cu to kernels.hip before MSVC's compile loop runs, and the MSVC driver raises "Don't know how to compile *.hip" because .hip is absent from _cpp_extensions. Fix by subclassing BuildExtension and appending .hip to _cpp_extensions on Windows before delegating to the parent, which installs the spawn wrapper that routes .hip -> hipcc. This fix is Windows-only (the guard checks sys.platform == "win32" and the hasattr guard is a no-op on Linux where clang is the host compiler). Authored with the assistance of Claude, an AI assistant. Test Plan: ``` export PATH="/c/Program Files (x86)/Microsoft Visual Studio/2022/BuildTools/VC/Tools/MSVC/14.44.35207/bin/HostX64/x64:$PATH" VENV=/b/develop/TheRock/external-builds/pytorch/.venv rm -f src/faithcontour/_C/kernels.hip && rm -rf build/ HIP_VISIBLE_DEVICES=0 PYTORCH_ROCM_ARCH=gfx1201 \ ROCM_HOME=$VENV/Lib/site-packages/_rocm_sdk_devel \ DISTUTILS_USE_SDK=1 \ $VENV/Scripts/python.exe setup.py build_ext --inplace HIP_VISIBLE_DEVICES=0 $VENV/Scripts/python.exe agent_space/faithc_harness_win.py ``` Built for gfx1201 (AMD Radeon RX 9070 XT, RDNA4, Windows 11); the harness reports 17/17 PASS on all four _C kernel bindings. --- setup.py | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/setup.py b/setup.py index b66c075..e84be11 100644 --- a/setup.py +++ b/setup.py @@ -6,7 +6,20 @@ import sys from setuptools import find_packages, setup -from torch.utils.cpp_extension import BuildExtension, CUDAExtension +from torch.utils.cpp_extension import BuildExtension as _BuildExtension, CUDAExtension + + +class BuildExtension(_BuildExtension): + def build_extensions(self): + # On Windows, PyTorch's BuildExtension adds .cu/.cuh to the MSVC + # compiler's _cpp_extensions so the spawn wrapper can route those to + # hipcc, but does not add .hip. The hipify step renames kernels.cu to + # kernels.hip before compilation; without .hip in _cpp_extensions the + # MSVC compile loop raises "Don't know how to compile *.hip". + if sys.platform == "win32" and hasattr(self.compiler, "_cpp_extensions"): + if ".hip" not in self.compiler._cpp_extensions: + self.compiler._cpp_extensions.append(".hip") + super().build_extensions() _C_DIR = os.path.join("src", "faithcontour", "_C")