diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
index 0b0306596..f9de9c2dc 100644
--- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
+++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h
@@ -7,6 +7,7 @@
 #include <iostream>
 #include <fstream>
 #include <sstream>
+#include <vector>
 #include <cuda_runtime.h>
 #include <fcntl.h>
 #include <unistd.h>
@@ -44,6 +45,7 @@ struct GpuConfig
 
     unsigned FBP_COUNT = 0;           // Frame Buffer Partitions
     unsigned L2_BANKS = 0;            // L2 Cache Banks (LTCs)
+    unsigned NUM_GPCS = 0;            // Graphics Processing Clusters
 };
 inline GpuConfig config;
 // Parses short flags like --sm 80 into a GpuConfig object
@@ -138,7 +140,8 @@ inline void printGpuConfig(const GpuConfig &c = config)
               << "BLOCKS_NUM: " << c.BLOCKS_NUM << "\n"
               << "TOTAL_THREADS: " << c.TOTAL_THREADS << "\n"
               << "FBP_COUNT: " << c.FBP_COUNT << "\n"
-              << "L2_BANKS: " << c.L2_BANKS << "\n";
+              << "L2_BANKS: " << c.L2_BANKS << "\n"
+              << "NUM_GPCS: " << c.NUM_GPCS << "\n";
 }
 
 // GPU error check
@@ -161,6 +164,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line,
 inline cudaDeviceProp deviceProp;
 
 // NVIDIA RM API defines
+// All control IDs and struct layouts mirror NVIDIA/open-gpu-kernel-modules driver
+// branch 580.95.05 (src/common/sdk/nvidia/inc/ctrl/ctrl{0080,2080}/ctrl*gr.h).
 #define NV_IOCTL_MAGIC 'F'
 #define NV_ESC_RM_ALLOC 0x2b
 #define NV_ESC_RM_CONTROL 0x2a
@@ -169,31 +174,51 @@ inline cudaDeviceProp deviceProp;
 #define NV01_DEVICE_0 0x00000080
 #define NV20_SUBDEVICE_0 0x00002080
 #define NV2080_CTRL_CMD_GR_GET_INFO 0x20801201
+#define NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS 0x2080120f
 
+// ctrl0080gr.h info-index values (a subset; add more as needed).
 // https://github.com/NVIDIA/open-gpu-kernel-modules/blob/580.95.05/src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080gr.h#L142
+#define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_GPCS 0x00000014
 #define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS 0x00000015
 #define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS 0x00000025
 
+#define NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_MAX_SM_COUNT 240
+
 typedef uint32_t NvHandle;
 typedef uint32_t NvV32;
 typedef uint64_t NvP64;
 
-// Query single GR info index using NVIDIA RM API
-inline unsigned queryGrInfo(uint32_t info_index)
-{
-    struct NVOS21_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectNew; NvV32 hClass; NvP64 pAllocParms; uint32_t paramsSize, status; };
-    struct NVOS54_PARAMETERS { NvHandle hClient, hObject; NvV32 cmd, flags; NvP64 params; uint32_t paramsSize, status; };
-    struct NVOS00_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectOld; uint32_t status; };
-    struct NV0080_ALLOC_PARAMETERS { uint32_t deviceId; NvHandle hClientShare, hTargetClient, hTargetDevice; NvV32 flags; uint32_t _pad0; uint64_t vaSpaceSize, vaStartInternal, vaLimitInternal; NvV32 vaMode; uint32_t _pad1; };
-    struct NV2080_ALLOC_PARAMETERS { uint32_t subDeviceId; };
-    struct NVXXXX_CTRL_XXX_INFO { uint32_t index, data; };
-    struct NV0080_CTRL_GR_ROUTE_INFO { uint32_t flags, _pad; uint64_t route; };
-    struct NV2080_CTRL_GR_GET_INFO_PARAMS { uint32_t grInfoListSize, _pad; NvP64 grInfoList; NV0080_CTRL_GR_ROUTE_INFO grRouteInfo; };
+// RM ioctl param shapes (lifted to file scope so multiple query helpers reuse them).
+struct NVOS21_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectNew; NvV32 hClass; NvP64 pAllocParms; uint32_t paramsSize, status; };
+struct NVOS54_PARAMETERS { NvHandle hClient, hObject; NvV32 cmd, flags; NvP64 params; uint32_t paramsSize, status; };
+struct NVOS00_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectOld; uint32_t status; };
+struct NV0080_ALLOC_PARAMETERS { uint32_t deviceId; NvHandle hClientShare, hTargetClient, hTargetDevice; NvV32 flags; uint32_t _pad0; uint64_t vaSpaceSize, vaStartInternal, vaLimitInternal; NvV32 vaMode; uint32_t _pad1; };
+struct NV2080_ALLOC_PARAMETERS { uint32_t subDeviceId; };
+struct NVXXXX_CTRL_XXX_INFO { uint32_t index, data; };
+struct NV0080_CTRL_GR_ROUTE_INFO { uint32_t flags, _pad; uint64_t route; };
+
+// Param struct for NV2080_CTRL_CMD_GR_GET_INFO.
+struct NV2080_CTRL_GR_GET_INFO_PARAMS { uint32_t grInfoListSize, _pad; NvP64 grInfoList; NV0080_CTRL_GR_ROUTE_INFO grRouteInfo; };
+
+// Param struct for NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS.
+// Mirrors ctrl2080gr.h:769-776 in driver branch 580.95.05.
+struct SmGpcTpcEntry { uint32_t gpcId; uint32_t tpcId; };
+struct NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_PARAMS {
+    SmGpcTpcEntry smId[NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_MAX_SM_COUNT];
+    uint32_t      smCount;
+    uint32_t      _pad;             // 8-byte align grRouteInfo
+    NV0080_CTRL_GR_ROUTE_INFO grRouteInfo;
+};
 
+// Run one NV2080-subdevice control. Sets up the alloc chain
+// (root client -> device -> subdevice), issues the control, and tears it all down.
+// Caller supplies the cmd id and a pre-built params struct; result is in-place.
+inline bool rmSubdeviceControl(uint32_t cmd, void *params, uint32_t paramsSize)
+{
     int ctl_fd = open("/dev/nvidiactl", O_RDWR);
     if (ctl_fd < 0) {
         fprintf(stderr, "DEBUG GR: Failed to open /dev/nvidiactl (errno=%d)\n", errno);
-        return 0;
+        return false;
     }
 
     auto rm_alloc = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject, uint32_t hClass, void *pParams, uint32_t size) {
@@ -202,10 +227,10 @@ inline unsigned queryGrInfo(uint32_t info_index)
         if (!success) fprintf(stderr, "DEBUG GR: rm_alloc failed for class 0x%x, status=0x%x\n", hClass, p.status);
         return success;
     };
-    auto rm_control = [&](NvHandle hClient, NvHandle hObject, uint32_t cmd, void *pParams, uint32_t size) {
-        NVOS54_PARAMETERS p = {hClient, hObject, cmd, 0, (NvP64)(uintptr_t)pParams, size, 0};
+    auto rm_control = [&](NvHandle hClient, NvHandle hObject, uint32_t cmd_, void *pParams, uint32_t size) {
+        NVOS54_PARAMETERS p = {hClient, hObject, cmd_, 0, (NvP64)(uintptr_t)pParams, size, 0};
         bool success = ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_CONTROL, NVOS54_PARAMETERS), &p) >= 0 && p.status == 0;
-        if (!success) fprintf(stderr, "DEBUG GR: rm_control failed for cmd 0x%x, status=0x%x\n", cmd, p.status);
+        if (!success) fprintf(stderr, "DEBUG GR: rm_control failed for cmd 0x%x, status=0x%x\n", cmd_, p.status);
         return success;
     };
     auto rm_free = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject) {
@@ -216,25 +241,43 @@ inline unsigned queryGrInfo(uint32_t info_index)
     NvHandle hClient = 0xCAFE0001, hDevice = 0xCAFE0002, hSubDevice = 0xCAFE0003;
     NV0080_ALLOC_PARAMETERS devParams = {0};
     NV2080_ALLOC_PARAMETERS subdevParams = {0};
-    NVXXXX_CTRL_XXX_INFO infoList[1] = {{info_index, 0}};
-    NV2080_CTRL_GR_GET_INFO_PARAMS grParams = {1, 0, (NvP64)(uintptr_t)infoList, {0, 0, 0}};
-
-    unsigned result = 0;
-    if (rm_alloc(hClient, hClient, hClient, NV01_ROOT_CLIENT, NULL, 0) &&
-        rm_alloc(hClient, hClient, hDevice, NV01_DEVICE_0, &devParams, sizeof(devParams)) &&
-        rm_alloc(hClient, hDevice, hSubDevice, NV20_SUBDEVICE_0, &subdevParams, sizeof(subdevParams)) &&
-        rm_control(hClient, hSubDevice, NV2080_CTRL_CMD_GR_GET_INFO, &grParams, sizeof(grParams))) {
-        result = infoList[0].data;
-        fprintf(stderr, "DEBUG GR: Successfully queried index 0x%x = %u\n", info_index, result);
-    } else {
-        fprintf(stderr, "DEBUG GR: Query sequence failed for index 0x%x\n", info_index);
-    }
+
+    bool ok = rm_alloc(hClient, hClient, hClient,    NV01_ROOT_CLIENT, NULL, 0)
+           && rm_alloc(hClient, hClient, hDevice,    NV01_DEVICE_0,    &devParams,    sizeof(devParams))
+           && rm_alloc(hClient, hDevice, hSubDevice, NV20_SUBDEVICE_0, &subdevParams, sizeof(subdevParams))
+           && rm_control(hClient, hSubDevice, cmd, params, paramsSize);
 
     rm_free(hClient, hDevice, hSubDevice);
     rm_free(hClient, hClient, hDevice);
     rm_free(hClient, hClient, hClient);
     close(ctl_fd);
-    return result;
+    return ok;
+}
+
+// Query a single LITTER_NUM_* / SM_VERSION / etc. value via NV2080_CTRL_CMD_GR_GET_INFO.
+inline unsigned queryGrInfo(uint32_t info_index)
+{
+    NVXXXX_CTRL_XXX_INFO infoList[1] = {{info_index, 0}};
+    NV2080_CTRL_GR_GET_INFO_PARAMS p = {1, 0, (NvP64)(uintptr_t)infoList, {0, 0, 0}};
+    if (rmSubdeviceControl(NV2080_CTRL_CMD_GR_GET_INFO, &p, sizeof(p))) {
+        fprintf(stderr, "DEBUG GR: Successfully queried index 0x%x = %u\n", info_index, infoList[0].data);
+        return infoList[0].data;
+    }
+    fprintf(stderr, "DEBUG GR: Query sequence failed for index 0x%x\n", info_index);
+    return 0;
+}
+
+// Query the per-physical-SM (GPC, TPC) mapping table via
+// NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS. The returned vector is indexed
+// by physical SM id (the same value PTX %smid returns), so SM i lives in GPC
+// result[i].gpcId and TPC result[i].tpcId. Returns empty on failure.
+inline std::vector<SmGpcTpcEntry> querySmToGpcMapping()
+{
+    NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_PARAMS p{};
+    if (!rmSubdeviceControl(NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS, &p, sizeof(p)))
+        return {};
+    fprintf(stderr, "DEBUG GR: SM_TO_GPC_TPC_MAPPINGS returned smCount=%u\n", p.smCount);
+    return std::vector<SmGpcTpcEntry>(p.smId, p.smId + p.smCount);
 }
 
 inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])
@@ -314,9 +357,10 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[])
         config.MEM_BITWIDTH = deviceProp.memoryBusWidth;
         config.CLK_FREQUENCY = clockRateKHz * 1e-3f;
 
-        // Get FBP_COUNT and L2_BANKS from NVIDIA RM API
+        // Get FBP_COUNT, L2_BANKS, NUM_GPCS from NVIDIA RM API
         config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS);
         config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS);
+        config.NUM_GPCS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_GPCS);
     }
 
     parseGpuConfigArgs(argc, argv);
diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/Makefile b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/Makefile
new file mode 100644
index 000000000..848936317
--- /dev/null
+++ b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/Makefile
@@ -0,0 +1,9 @@
+SRC = sm_gpc_mapping.cu
+
+EXE = sm_gpc_mapping
+
+# Thread-block clusters (%cluster_ctarank, cudaLaunchKernelEx with
+# cudaLaunchAttributeClusterDimension) require sm_90+.
+NVCC_FLAGS += -arch=sm_90
+
+include ../../../common/common.mk
diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/sm_gpc_mapping.cu b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/sm_gpc_mapping.cu
new file mode 100644
index 000000000..5044dc431
--- /dev/null
+++ b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/sm_gpc_mapping.cu
@@ -0,0 +1,334 @@
+// Dumps the per-SM (GPC, TPC) mapping for the local GPU using
+// NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS, then runs a small kernel that
+// captures %smid from many blocks and cross-checks the result.
+//
+// Build:   make release  (in this directory)
+// Run:     ../../../bin/sm_gpc_mapping
+//
+// Output is two streams: '#'-prefixed human/diagnostic lines on stdout,
+// and a CSV block at the end (smid,gpcId,tpcId,observed_in_kernel).
+
+#include <algorithm>
+#include <iostream>
+#include <map>
+#include <set>
+#include <vector>
+using namespace std;
+
+#include "../../../hw_def/hw_def.h"
+
+__device__ __forceinline__ unsigned get_smid()
+{
+    unsigned ret;
+    asm("mov.u32 %0, %%smid;" : "=r"(ret));
+    return ret;
+}
+
+__device__ __forceinline__ unsigned get_cluster_ctarank()
+{
+    unsigned ret;
+    asm("mov.u32 %0, %%cluster_ctarank;" : "=r"(ret));
+    return ret;
+}
+
+// Each block writes its %smid into out[blockIdx.x]. The spin keeps the block
+// resident long enough that the scheduler is forced to dispatch siblings to
+// other SMs, giving us coverage of every enabled SM.
+__global__ void capture_smid(unsigned *out, unsigned spin_iters)
+{
+    unsigned smid = get_smid();
+    if (threadIdx.x == 0)
+        out[blockIdx.x] = smid;
+
+    volatile unsigned acc = 0;
+    for (unsigned i = 0; i < spin_iters; ++i)
+        acc += i;
+    if (threadIdx.x == 0 && acc == 0xdeadbeef)
+        out[blockIdx.x] = 0xffffffffu;  // unreachable; defeats DCE
+}
+
+// Cluster-aware variant: captures both %smid and %cluster_ctarank per block.
+// Indexed by the linear block id derived from blockIdx + gridDim.
+__global__ void capture_smid_with_cluster(unsigned *smid_out,
+                                          unsigned *crank_out,
+                                          unsigned spin_iters)
+{
+    unsigned smid = get_smid();
+    unsigned crank = get_cluster_ctarank();
+
+    if (threadIdx.x == 0) {
+        unsigned linear = blockIdx.x
+                        + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z);
+        smid_out[linear] = smid;
+        crank_out[linear] = crank;
+    }
+
+    volatile unsigned acc = 0;
+    for (unsigned i = 0; i < spin_iters; ++i)
+        acc += i;
+    if (threadIdx.x == 0 && acc == 0xdeadbeef)
+        smid_out[0] = 0xffffffffu;
+}
+
+int main(int argc, char *argv[])
+{
+    initializeDeviceProp(0, argc, argv);
+
+    printf("\n# Device Name        = %s\n", deviceProp.name);
+    printf("# Compute Capability = %d.%d\n", deviceProp.major, deviceProp.minor);
+    printf("# SM_NUMBER          = %u\n", config.SM_NUMBER);
+    printf("# NUM_GPCS (RM)      = %u\n", config.NUM_GPCS);
+    printf("# FBP_COUNT, L2_BANKS = %u, %u\n", config.FBP_COUNT, config.L2_BANKS);
+
+    vector<SmGpcTpcEntry> mapping = querySmToGpcMapping();
+    printf("# RM smCount         = %zu\n", mapping.size());
+
+    if (mapping.empty()) {
+        fprintf(stderr,
+                "# ERROR: querySmToGpcMapping returned no entries. "
+                "The driver may not support NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS, "
+                "or /dev/nvidiactl access was denied.\n");
+        return 1;
+    }
+
+    // Build per-GPC and per-(GPC,TPC) reverse indexes.
+    map<uint32_t, vector<uint32_t>> gpc_to_sms;
+    map<pair<uint32_t, uint32_t>, vector<uint32_t>> gpc_tpc_to_sms;
+    uint32_t max_gpc = 0, max_tpc = 0;
+    for (size_t smid = 0; smid < mapping.size(); ++smid) {
+        auto e = mapping[smid];
+        gpc_to_sms[e.gpcId].push_back((uint32_t)smid);
+        gpc_tpc_to_sms[{e.gpcId, e.tpcId}].push_back((uint32_t)smid);
+        max_gpc = std::max(max_gpc, e.gpcId);
+        max_tpc = std::max(max_tpc, e.tpcId);
+    }
+    printf("# distinct GPCs      = %zu  (max gpcId=%u, max tpcId=%u)\n",
+           gpc_to_sms.size(), max_gpc, max_tpc);
+
+    printf("# GPC histogram (gpcId : sm_count : sm_ids):\n");
+    for (auto &kv : gpc_to_sms) {
+        printf("#   GPC %u: %zu SMs [", kv.first, kv.second.size());
+        for (size_t i = 0; i < kv.second.size(); ++i)
+            printf("%s%u", i ? "," : "", kv.second[i]);
+        printf("]\n");
+    }
+
+    // Sanity invariants printed loudly so failures are obvious.
+    bool ok_count =
+        (config.NUM_GPCS == 0) || (gpc_to_sms.size() == config.NUM_GPCS);
+    bool ok_total = (mapping.size() == config.SM_NUMBER);
+    printf("# CHECK distinct_gpcs(%zu) == NUM_GPCS(%u) : %s\n",
+           gpc_to_sms.size(), config.NUM_GPCS, ok_count ? "OK" : "MISMATCH");
+    printf("# CHECK rm_smCount(%zu)    == SM_NUMBER(%u)   : %s\n",
+           mapping.size(), config.SM_NUMBER, ok_total ? "OK" : "MISMATCH");
+
+    // Runtime %smid capture: launch enough blocks to cover every enabled SM.
+    unsigned blocks = config.SM_NUMBER * 4;
+    if (blocks < 64)
+        blocks = 64;
+    unsigned *d_out = nullptr;
+    gpuErrchk(cudaMalloc(&d_out, blocks * sizeof(unsigned)));
+    gpuErrchk(cudaMemset(d_out, 0xff, blocks * sizeof(unsigned)));
+
+    // ~10us per block at ~1.6GHz keeps blocks resident long enough for the
+    // scheduler to fan siblings out to other SMs without a slow run.
+    capture_smid<<<blocks, 32>>>(d_out, 1u << 14);
+    gpuErrchk(cudaPeekAtLastError());
+    gpuErrchk(cudaDeviceSynchronize());
+
+    vector<unsigned> h_out(blocks);
+    gpuErrchk(cudaMemcpy(h_out.data(), d_out, blocks * sizeof(unsigned),
+                         cudaMemcpyDeviceToHost));
+    cudaFree(d_out);
+
+    set<unsigned> observed_smids(h_out.begin(), h_out.end());
+    printf("# kernel observed %zu distinct %%smid values across %u blocks\n",
+           observed_smids.size(), blocks);
+
+    bool all_observed_in_table = true;
+    for (auto smid : observed_smids) {
+        if (smid >= mapping.size()) {
+            printf("# WARN: kernel saw %%smid=%u but RM table size = %zu\n",
+                   smid, mapping.size());
+            all_observed_in_table = false;
+        }
+    }
+    printf("# CHECK observed_smids subset_of rm_table        : %s\n",
+           all_observed_in_table ? "OK" : "MISMATCH");
+    printf("# CHECK observed_count(%zu) == SM_NUMBER(%u)        : %s\n",
+           observed_smids.size(), config.SM_NUMBER,
+           observed_smids.size() == config.SM_NUMBER ? "OK"
+                                                     : "PARTIAL (scheduler did not cover all SMs)");
+
+    // Stable CSV table.
+    printf("\n# CSV: smid,gpcId,tpcId,observed_in_kernel\n");
+    for (size_t smid = 0; smid < mapping.size(); ++smid) {
+        bool seen = observed_smids.count((unsigned)smid) != 0;
+        printf("%zu,%u,%u,%d\n", smid, mapping[smid].gpcId,
+               mapping[smid].tpcId, seen ? 1 : 0);
+    }
+
+    // ============================================================
+    // Cluster-shape sweep
+    // ============================================================
+    // For each cluster shape, launch one block per cluster slot covering as
+    // many SMs as fit (clusters of size K -> floor(SM_NUMBER / K) clusters).
+    // Per block we capture %smid and %cluster_ctarank, then look up gpcId/tpcId
+    // via the RM-reported mapping. This shows how blocks within a cluster
+    // distribute across one GPC's TPCs and how cluster_id rasterizes across
+    // GPCs.
+    //
+    // Note: user request listed "1x8x8" as the last shape, but cluster_size=64
+    // exceeds the maximum cluster size (8 portable / 16 non-portable on H100).
+    // Treating it as a typo for "1x8x1" so we still cover an 8-block cluster
+    // laid out along y. Edit the shapes[] array below if a different
+    // interpretation is wanted.
+    struct ClusterShape {
+        int x, y, z;
+        const char *name;
+    };
+    const ClusterShape shapes[] = {
+        {1, 1, 1, "1x1x1"}, {1, 2, 1, "1x2x1"}, {2, 1, 1, "2x1x1"},
+        {2, 2, 1, "2x2x1"}, {1, 4, 1, "1x4x1"}, {4, 1, 1, "4x1x1"},
+        {2, 4, 1, "2x4x1"}, {4, 2, 1, "4x2x1"}, {8, 1, 1, "8x1x1"},
+        {1, 8, 1, "1x8x1"}, // user typed "1x8x8"; size=64 > max, treating as typo
+    };
+
+    auto cluster_id_from_block = [](int bx, int by, int bz, int gx, int gy,
+                                    const ClusterShape &s) {
+        int cx = bx / s.x, cy = by / s.y, cz = bz / s.z;
+        int ncx = gx / s.x, ncy = gy / s.y;
+        return cx + ncx * (cy + ncy * cz);
+    };
+
+    for (const auto &s : shapes) {
+        int csize = s.x * s.y * s.z;
+        // Rounding up division to get the number of clusters needed to cover all SMs.
+        int nclusters = (int)((config.SM_NUMBER + csize - 1) / (unsigned)csize);
+        if (nclusters < 1) {
+            printf("\n# === cluster shape %s skipped (cluster_size=%d > "
+                   "SM_NUMBER=%u) ===\n",
+                   s.name, csize, config.SM_NUMBER);
+            continue;
+        }
+
+        // Lay clusters out along x: grid.x = s.x * nclusters, grid.y = s.y,
+        // grid.z = s.z. This trivially satisfies gridDim.* % clusterDim.* == 0.
+        dim3 grid(s.x * nclusters, s.y, s.z);
+        dim3 block(32, 1, 1);
+        unsigned total_blocks = grid.x * grid.y * grid.z;
+
+        unsigned *d_smid = nullptr, *d_crank = nullptr;
+        gpuErrchk(cudaMalloc(&d_smid, total_blocks * sizeof(unsigned)));
+        gpuErrchk(cudaMalloc(&d_crank, total_blocks * sizeof(unsigned)));
+        gpuErrchk(cudaMemset(d_smid, 0xff, total_blocks * sizeof(unsigned)));
+        gpuErrchk(cudaMemset(d_crank, 0xff, total_blocks * sizeof(unsigned)));
+
+        cudaLaunchConfig_t cfg = {};
+        cfg.gridDim = grid;
+        cfg.blockDim = block;
+        cfg.dynamicSmemBytes = 0;
+        cfg.stream = 0;
+
+        cudaLaunchAttribute attr[1] = {};
+        attr[0].id = cudaLaunchAttributeClusterDimension;
+        attr[0].val.clusterDim.x = s.x;
+        attr[0].val.clusterDim.y = s.y;
+        attr[0].val.clusterDim.z = s.z;
+        cfg.attrs = attr;
+        cfg.numAttrs = 1;
+
+        cudaError_t err = cudaLaunchKernelEx(&cfg, capture_smid_with_cluster,
+                                             d_smid, d_crank, 1u << 14);
+        if (err != cudaSuccess) {
+            printf("\n# === cluster shape %s launch failed: %s ===\n", s.name,
+                   cudaGetErrorString(err));
+            cudaFree(d_smid);
+            cudaFree(d_crank);
+            continue;
+        }
+        gpuErrchk(cudaDeviceSynchronize());
+
+        vector<unsigned> h_smid(total_blocks), h_crank(total_blocks);
+        gpuErrchk(cudaMemcpy(h_smid.data(), d_smid,
+                             total_blocks * sizeof(unsigned),
+                             cudaMemcpyDeviceToHost));
+        gpuErrchk(cudaMemcpy(h_crank.data(), d_crank,
+                             total_blocks * sizeof(unsigned),
+                             cudaMemcpyDeviceToHost));
+        cudaFree(d_smid);
+        cudaFree(d_crank);
+
+        // Verify single-GPC-per-cluster invariant and record per-cluster GPC.
+        map<int, set<uint32_t>> cluster_gpcs;
+        map<int, set<uint32_t>> cluster_tpcs;
+        map<int, vector<unsigned>> cluster_smids;
+        for (unsigned i = 0; i < total_blocks; ++i) {
+            unsigned smid = h_smid[i];
+            if (smid >= mapping.size())
+                continue;
+            int bx = (int)(i % grid.x);
+            int by = (int)((i / grid.x) % grid.y);
+            int bz = (int)(i / (grid.x * grid.y));
+            int cid =
+                cluster_id_from_block(bx, by, bz, grid.x, grid.y, s);
+            cluster_gpcs[cid].insert(mapping[smid].gpcId);
+            cluster_tpcs[cid].insert(mapping[smid].tpcId);
+            cluster_smids[cid].push_back(smid);
+        }
+
+        bool single_gpc = true;
+        for (auto &kv : cluster_gpcs)
+            if (kv.second.size() != 1)
+                single_gpc = false;
+
+        // Count unique SMs touched by at least one block (active SMs for this
+        // launch). Useful for showing how the kernel covers the GPU when the
+        // cluster shape doesn't tile SM_NUMBER evenly.
+        set<unsigned> active_smids;
+        for (unsigned i = 0; i < total_blocks; ++i) {
+            if (h_smid[i] < mapping.size())
+                active_smids.insert(h_smid[i]);
+        }
+
+        printf(
+            "\n# === cluster shape %s (size=%d, nclusters=%d, grid=%ux%ux%u, "
+            "blocks=%u) ===\n",
+            s.name, csize, nclusters, grid.x, grid.y, grid.z, total_blocks);
+        printf("# active SMs (touched by >=1 block) = %zu / %u\n",
+               active_smids.size(), config.SM_NUMBER);
+        printf("# CHECK every cluster -> single GPC: %s\n",
+               single_gpc ? "OK" : "MISMATCH");
+
+        // cluster_id -> gpc summary, ordered by cluster_id
+        printf("# cluster_id -> gpcId  (sm_count, smids):\n");
+        for (auto &kv : cluster_gpcs) {
+            int cid = kv.first;
+            uint32_t g = *kv.second.begin();
+            auto &sms = cluster_smids[cid];
+            printf("#   cluster %3d -> GPC %u  (%zu SMs: ", cid, g, sms.size());
+            for (size_t i = 0; i < sms.size(); ++i)
+                printf("%s%u", i ? "," : "", sms[i]);
+            printf(")\n");
+        }
+
+        // Per-block CSV.
+        printf("# CSV: linear,blockIdx_x,blockIdx_y,blockIdx_z,cluster_id,"
+               "rank_in_cluster,smid,gpcId,tpcId\n");
+        for (unsigned i = 0; i < total_blocks; ++i) {
+            int bx = (int)(i % grid.x);
+            int by = (int)((i / grid.x) % grid.y);
+            int bz = (int)(i / (grid.x * grid.y));
+            int cid = cluster_id_from_block(bx, by, bz, grid.x, grid.y, s);
+            unsigned smid = h_smid[i];
+            uint32_t gpc =
+                (smid < mapping.size()) ? mapping[smid].gpcId : 0xffffffffu;
+            uint32_t tpc =
+                (smid < mapping.size()) ? mapping[smid].tpcId : 0xffffffffu;
+            printf("%u,%d,%d,%d,%d,%u,%u,%u,%u\n", i, bx, by, bz, cid,
+                   h_crank[i], smid, gpc, tpc);
+        }
+    }
+
+    return 0;
+}
diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/visualize.py b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/visualize.py
new file mode 100755
index 000000000..a6b98ba4a
--- /dev/null
+++ b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/visualize.py
@@ -0,0 +1,341 @@
+#!/usr/bin/env python3
+"""Render GPC > TPC > SM nested ASCII boxes from sm_gpc_mapping output.
+
+Reads the structured stdout of bin/sm_gpc_mapping (or a saved log via
+--input) and prints one nested-box rendering per cluster shape, with each
+SM cell labeled by the cluster_id and rank_in_cluster of the first block
+that landed on it.
+"""
+
+import argparse
+import re
+import sys
+from collections import defaultdict
+
+
+GPC_LINE = re.compile(r"^#\s+GPC\s+(\d+):\s+(\d+)\s+SMs\s+\[([\d,]+)\]")
+SHAPE_HDR = re.compile(
+    r"^#\s+===\s+cluster\s+shape\s+(\S+)\s+"
+    r"\(size=(\d+),\s+nclusters=(\d+),\s+"
+    r"grid=(\d+)x(\d+)x(\d+),\s+blocks=(\d+)\)\s+==="
+)
+CSV_HDR = re.compile(
+    r"^#\s+CSV:\s+linear,blockIdx_x,blockIdx_y,blockIdx_z,"
+    r"cluster_id,rank_in_cluster,smid,gpcId,tpcId"
+)
+ANSI_RE = re.compile(r"\033\[[0-9;]*m")
+
+
+def vislen(s):
+    return len(ANSI_RE.sub("", s))
+
+
+PALETTE = [9, 10, 11, 12, 13, 14, 51, 87, 123, 159, 195, 219, 226, 208, 99, 250]
+
+
+def colored(text, cid, enabled):
+    if not enabled or cid is None:
+        return text
+    code = PALETTE[cid % len(PALETTE)]
+    return f"\033[38;5;{code}m{text}\033[0m"
+
+
+def parse(text):
+    gpc_to_smids = {}
+    shapes = []
+    cur = None
+    in_csv = False
+    for line in text.splitlines():
+        m = GPC_LINE.match(line)
+        if m:
+            gpc_to_smids[int(m.group(1))] = [int(x) for x in m.group(3).split(",")]
+            continue
+        m = SHAPE_HDR.match(line)
+        if m:
+            cur = {
+                "name": m.group(1),
+                "size": int(m.group(2)),
+                "nclusters": int(m.group(3)),
+                "grid": (int(m.group(4)), int(m.group(5)), int(m.group(6))),
+                "blocks": int(m.group(7)),
+                # smid -> list of (cluster_id, rank, tpc, gpc, linear) tuples,
+                # sorted by linear (the dispatch order on the SM)
+                "occ": defaultdict(list),
+            }
+            shapes.append(cur)
+            in_csv = False
+            continue
+        if cur is not None and CSV_HDR.match(line):
+            in_csv = True
+            continue
+        if in_csv and line and not line.startswith("#"):
+            try:
+                p = line.split(",")
+                linear, cid, rank, smid, gpc, tpc = (
+                    int(p[0]),
+                    int(p[4]),
+                    int(p[5]),
+                    int(p[6]),
+                    int(p[7]),
+                    int(p[8]),
+                )
+            except (IndexError, ValueError):
+                in_csv = False
+                continue
+            cur["occ"][smid].append((cid, rank, tpc, gpc, linear))
+    # sort each SM's occupant list by linear so the visualizer can render
+    # them in dispatch order
+    for s in shapes:
+        for smid in s["occ"]:
+            s["occ"][smid].sort(key=lambda t: t[4])
+    return gpc_to_smids, shapes
+
+
+def build_smid_to_tpc(shapes):
+    out = {}
+    for s in shapes:
+        for smid, occs in s["occ"].items():
+            if occs:
+                out[smid] = occs[0][2]  # tpc is identical for every occupant
+    return out
+
+
+def gpc_tpc_layout(gpc_to_smids, smid_to_tpc):
+    """gpc -> list of (tpc_id, [smids in tpc]), ordered by the lowest smid in
+    each TPC so reading L-to-R matches ascending physical SM ids. Hardware
+    tpc_id is preserved as the label, which makes the permutation visible."""
+    layout = {}
+    for gpc, smids in gpc_to_smids.items():
+        d = defaultdict(list)
+        for sm in smids:
+            tpc = smid_to_tpc.get(sm)
+            if tpc is None:
+                d[-1].append(sm)
+            else:
+                d[tpc].append(sm)
+        for k in d:
+            d[k].sort()
+        layout[gpc] = sorted(d.items(), key=lambda kv: kv[1][0])
+    return layout
+
+
+# ---------- rendering ----------
+
+TPC_WIDTH = 11      # outer
+TPC_INNER = TPC_WIDTH - 2  # 9
+TPCS_PER_ROW = 4
+GAP = " "
+
+
+def pad_to(s, vis_text, width):
+    extra = width - len(vis_text)
+    if extra <= 0:
+        return s
+    left = extra // 2
+    right = extra - left
+    return " " * left + s + " " * right
+
+
+def render_sm_lines(smid, occ, color, occ_rows):
+    """occ_rows = number of occupant lines to emit per SM cell (uniform within
+    a shape, computed from the max occupants/SM observed in the shape).
+    Returns 1 sm-id line + occ_rows occupant lines."""
+    if smid is None:
+        return ["|" + " " * TPC_INNER + "|"] * (1 + occ_rows)
+    sm_label = f"sm{smid:>3d}"
+    head = "|" + sm_label.center(TPC_INNER) + "|"
+    occs = occ.get(smid, []) if isinstance(occ, dict) else occ.get(smid, [])
+    if not occs:
+        # SM not touched in this shape's launch.
+        body = ["|" + pad_to("-", "-", TPC_INNER) + "|"]
+        body += ["|" + " " * TPC_INNER + "|"] * (occ_rows - 1)
+        return [head] + body
+    body = []
+    for i in range(occ_rows):
+        if i < len(occs):
+            cid, rank = occs[i][0], occs[i][1]
+            cr_raw = f"c{cid}:{rank}"
+            cr = colored(cr_raw, cid, color)
+            body.append("|" + pad_to(cr, cr_raw, TPC_INNER) + "|")
+        else:
+            body.append("|" + " " * TPC_INNER + "|")
+    return [head] + body
+
+
+def render_tpc(tpc_id, smids, occ, color, occ_rows):
+    """Return list of strings, each width TPC_WIDTH (visible). Height is
+    1 (top) + 2 * (1 + occ_rows) + 1 (sep) + 1 (bottom)."""
+    label = f" TPC {tpc_id} " if tpc_id >= 0 else " TPC ? "
+    extra = TPC_INNER - len(label)
+    left = extra // 2
+    right = extra - left
+    top = "+" + "-" * left + label + "-" * right + "+"
+    sep = "|" + "-" * TPC_INNER + "|"
+    bot = "+" + "-" * TPC_INNER + "+"
+    sm0 = smids[0] if len(smids) >= 1 else None
+    sm1 = smids[1] if len(smids) >= 2 else None
+    return [
+        top,
+        *render_sm_lines(sm0, occ, color, occ_rows),
+        sep,
+        *render_sm_lines(sm1, occ, color, occ_rows),
+        bot,
+    ]
+
+
+def empty_tpc_block(occ_rows):
+    h = 1 + 2 * (1 + occ_rows) + 1 + 1  # top + 2 SMs + sep + bot
+    return [" " * TPC_WIDTH] * h
+
+
+def hjoin(boxes, gap=GAP):
+    if not boxes:
+        return []
+    return [gap.join(b[r] for b in boxes) for r in range(len(boxes[0]))]
+
+
+def render_gpc_boxed(gpc_id, tpcs, occ, color, occ_rows):
+    sm_count = sum(len(smids) for _, smids in tpcs)
+    n_tpcs = len(tpcs)
+    inner_w = TPCS_PER_ROW * TPC_WIDTH + (TPCS_PER_ROW - 1) * len(GAP)
+    margin = "| "
+    rmargin = " |"
+    width = inner_w + len(margin) + len(rmargin)
+
+    label = f"GPC {gpc_id} [{sm_count} SMs]"
+    head = f"+-- {label} "
+    top = head + "-" * (width - len(head) - 1) + "+"
+    bot = "+" + "-" * (width - 2) + "+"
+
+    out = [top]
+    for i in range(0, n_tpcs, TPCS_PER_ROW):
+        chunk = tpcs[i : i + TPCS_PER_ROW]
+        boxes = [render_tpc(tid, smids, occ, color, occ_rows) for tid, smids in chunk]
+        while len(boxes) < TPCS_PER_ROW:
+            boxes.append(empty_tpc_block(occ_rows))
+        for joined in hjoin(boxes):
+            pad = inner_w - vislen(joined)
+            out.append(margin + joined + " " * pad + rmargin)
+    out.append(bot)
+    return "\n".join(out)
+
+
+def render_gpc_compact(gpc_id, tpcs, occ, color):
+    sm_count = sum(len(smids) for _, smids in tpcs)
+    # Each line: "TPC N: [ smX c0:0,c5:1 | smX c1:0 ]"
+    body_lines = []
+    for tpc_id, smids in tpcs:
+        cells = []
+        for sm in smids[:2]:
+            sm_label = f"sm{sm:>3d}"
+            occs = occ.get(sm, [])
+            if not occs:
+                cr_raw = "-"
+                cr = cr_raw
+            else:
+                cr_raw = ",".join(f"c{c}:{r}" for c, r, *_ in occs)
+                cr = ",".join(colored(f"c{c}:{r}", c, color) for c, r, *_ in occs)
+            cell_vis = f"{sm_label} {cr_raw}"
+            cell = f"{sm_label} {cr}"
+            cells.append((cell, cell_vis))
+        while len(cells) < 2:
+            cells.append(("", ""))
+        # equalize cell widths
+        cell_w = max(len(cv) for _, cv in cells) if cells else 0
+        padded = []
+        for c, cv in cells:
+            padded.append(c + " " * (cell_w - len(cv)))
+        body_vis = f"TPC {tpc_id}: [ {padded[0]} | {padded[1]} ]"
+        body_lines.append(body_vis)
+    inner_w = max(vislen(l) for l in body_lines) if body_lines else 20
+    width = inner_w + 4
+
+    label = f"GPC {gpc_id} [{sm_count} SMs]"
+    head = f"+-- {label} "
+    top = head + "-" * (width - len(head) - 1) + "+"
+    bot = "+" + "-" * (width - 2) + "+"
+
+    out = [top]
+    for line in body_lines:
+        pad = inner_w - vislen(line)
+        out.append("| " + line + " " * pad + " |")
+    out.append(bot)
+    return "\n".join(out)
+
+
+def render_topology_header(gpc_to_smids, gpc_tpc_layout_, total_smids):
+    n_gpc = len(gpc_to_smids)
+    out = [f"=== GPU topology ({total_smids} SMs, {n_gpc} GPCs) ==="]
+    for gpc, tpcs in sorted(gpc_tpc_layout_.items()):
+        sm_count = sum(len(s) for _, s in tpcs)
+        parts = []
+        for tid, smids in tpcs:
+            tag = f"TPC{tid}" if tid >= 0 else "TPC?"
+            parts.append(f"{tag}={{{','.join(str(s) for s in smids)}}}")
+        out.append(f"GPC {gpc} [{sm_count} SMs]: " + "  ".join(parts))
+    return "\n".join(out)
+
+
+def render_shape(shape, gpc_tpc_layout_, style, color, total_smids):
+    occ = shape["occ"]
+    occ_rows = max((len(v) for v in occ.values()), default=1)
+    occ_rows = max(occ_rows, 1)
+    active_sms = sum(1 for v in occ.values() if v)
+
+    out = []
+    out.append("")
+    out.append(
+        f"=== Cluster shape {shape['name']} "
+        f"(size={shape['size']}, {shape['nclusters']} clusters, {shape['blocks']} blocks) ==="
+    )
+    out.append(
+        f"--- active SMs (touched by >=1 block) = {active_sms} / {total_smids}; "
+        f"max occupants per SM in this launch = {occ_rows} ---"
+    )
+    out.append("")
+    if style == "boxed":
+        for gpc, tpcs in sorted(gpc_tpc_layout_.items()):
+            out.append(render_gpc_boxed(gpc, tpcs, occ, color, occ_rows))
+            out.append("")
+    else:
+        for gpc, tpcs in sorted(gpc_tpc_layout_.items()):
+            out.append(render_gpc_compact(gpc, tpcs, occ, color))
+            out.append("")
+    return "\n".join(out)
+
+
+def main():
+    ap = argparse.ArgumentParser(description=__doc__)
+    ap.add_argument("--input", "-i", help="Read from file instead of stdin")
+    ap.add_argument("--shape", "-s", help="Render only this cluster shape (e.g., 2x2x1)")
+    ap.add_argument("--style", choices=("boxed", "compact"), default="boxed")
+    ap.add_argument("--color", action="store_true", help="ANSI color cluster ids")
+    args = ap.parse_args()
+
+    text = open(args.input).read() if args.input else sys.stdin.read()
+    gpc_to_smids, shapes = parse(text)
+    if not gpc_to_smids:
+        print("error: no GPC histogram lines found in input", file=sys.stderr)
+        sys.exit(1)
+    if not shapes:
+        print("error: no cluster shape sections found in input", file=sys.stderr)
+        sys.exit(1)
+
+    smid_to_tpc = build_smid_to_tpc(shapes)
+    layout = gpc_tpc_layout(gpc_to_smids, smid_to_tpc)
+    total_smids = sum(len(v) for v in gpc_to_smids.values())
+
+    print(render_topology_header(gpc_to_smids, layout, total_smids))
+
+    target_shapes = (
+        [s for s in shapes if s["name"] == args.shape] if args.shape else shapes
+    )
+    if args.shape and not target_shapes:
+        print(f"error: cluster shape '{args.shape}' not found in input", file=sys.stderr)
+        sys.exit(1)
+    for s in target_shapes:
+        print(render_shape(s, layout, args.style, args.color, total_smids))
+
+
+if __name__ == "__main__":
+    main()