diff --git a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h index 0b0306596..f9de9c2dc 100644 --- a/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h +++ b/src/cuda/GPU_Microbenchmark/hw_def/common/gpuConfig.h @@ -7,6 +7,7 @@ #include #include #include +#include #include #include #include @@ -44,6 +45,7 @@ struct GpuConfig unsigned FBP_COUNT = 0; // Frame Buffer Partitions unsigned L2_BANKS = 0; // L2 Cache Banks (LTCs) + unsigned NUM_GPCS = 0; // Graphics Processing Clusters }; inline GpuConfig config; // Parses short flags like --sm 80 into a GpuConfig object @@ -138,7 +140,8 @@ inline void printGpuConfig(const GpuConfig &c = config) << "BLOCKS_NUM: " << c.BLOCKS_NUM << "\n" << "TOTAL_THREADS: " << c.TOTAL_THREADS << "\n" << "FBP_COUNT: " << c.FBP_COUNT << "\n" - << "L2_BANKS: " << c.L2_BANKS << "\n"; + << "L2_BANKS: " << c.L2_BANKS << "\n" + << "NUM_GPCS: " << c.NUM_GPCS << "\n"; } // GPU error check @@ -161,6 +164,8 @@ inline void gpuAssert(cudaError_t code, const char *file, int line, inline cudaDeviceProp deviceProp; // NVIDIA RM API defines +// All control IDs and struct layouts mirror NVIDIA/open-gpu-kernel-modules driver +// branch 580.95.05 (src/common/sdk/nvidia/inc/ctrl/ctrl{0080,2080}/ctrl*gr.h). #define NV_IOCTL_MAGIC 'F' #define NV_ESC_RM_ALLOC 0x2b #define NV_ESC_RM_CONTROL 0x2a @@ -169,31 +174,51 @@ inline cudaDeviceProp deviceProp; #define NV01_DEVICE_0 0x00000080 #define NV20_SUBDEVICE_0 0x00002080 #define NV2080_CTRL_CMD_GR_GET_INFO 0x20801201 +#define NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS 0x2080120f +// ctrl0080gr.h info-index values (a subset; add more as needed). // https://github.com/NVIDIA/open-gpu-kernel-modules/blob/580.95.05/src/common/sdk/nvidia/inc/ctrl/ctrl0080/ctrl0080gr.h#L142 +#define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_GPCS 0x00000014 #define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS 0x00000015 #define NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS 0x00000025 +#define NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_MAX_SM_COUNT 240 + typedef uint32_t NvHandle; typedef uint32_t NvV32; typedef uint64_t NvP64; -// Query single GR info index using NVIDIA RM API -inline unsigned queryGrInfo(uint32_t info_index) -{ - struct NVOS21_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectNew; NvV32 hClass; NvP64 pAllocParms; uint32_t paramsSize, status; }; - struct NVOS54_PARAMETERS { NvHandle hClient, hObject; NvV32 cmd, flags; NvP64 params; uint32_t paramsSize, status; }; - struct NVOS00_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectOld; uint32_t status; }; - struct NV0080_ALLOC_PARAMETERS { uint32_t deviceId; NvHandle hClientShare, hTargetClient, hTargetDevice; NvV32 flags; uint32_t _pad0; uint64_t vaSpaceSize, vaStartInternal, vaLimitInternal; NvV32 vaMode; uint32_t _pad1; }; - struct NV2080_ALLOC_PARAMETERS { uint32_t subDeviceId; }; - struct NVXXXX_CTRL_XXX_INFO { uint32_t index, data; }; - struct NV0080_CTRL_GR_ROUTE_INFO { uint32_t flags, _pad; uint64_t route; }; - struct NV2080_CTRL_GR_GET_INFO_PARAMS { uint32_t grInfoListSize, _pad; NvP64 grInfoList; NV0080_CTRL_GR_ROUTE_INFO grRouteInfo; }; +// RM ioctl param shapes (lifted to file scope so multiple query helpers reuse them). +struct NVOS21_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectNew; NvV32 hClass; NvP64 pAllocParms; uint32_t paramsSize, status; }; +struct NVOS54_PARAMETERS { NvHandle hClient, hObject; NvV32 cmd, flags; NvP64 params; uint32_t paramsSize, status; }; +struct NVOS00_PARAMETERS { NvHandle hRoot, hObjectParent, hObjectOld; uint32_t status; }; +struct NV0080_ALLOC_PARAMETERS { uint32_t deviceId; NvHandle hClientShare, hTargetClient, hTargetDevice; NvV32 flags; uint32_t _pad0; uint64_t vaSpaceSize, vaStartInternal, vaLimitInternal; NvV32 vaMode; uint32_t _pad1; }; +struct NV2080_ALLOC_PARAMETERS { uint32_t subDeviceId; }; +struct NVXXXX_CTRL_XXX_INFO { uint32_t index, data; }; +struct NV0080_CTRL_GR_ROUTE_INFO { uint32_t flags, _pad; uint64_t route; }; + +// Param struct for NV2080_CTRL_CMD_GR_GET_INFO. +struct NV2080_CTRL_GR_GET_INFO_PARAMS { uint32_t grInfoListSize, _pad; NvP64 grInfoList; NV0080_CTRL_GR_ROUTE_INFO grRouteInfo; }; + +// Param struct for NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS. +// Mirrors ctrl2080gr.h:769-776 in driver branch 580.95.05. +struct SmGpcTpcEntry { uint32_t gpcId; uint32_t tpcId; }; +struct NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_PARAMS { + SmGpcTpcEntry smId[NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_MAX_SM_COUNT]; + uint32_t smCount; + uint32_t _pad; // 8-byte align grRouteInfo + NV0080_CTRL_GR_ROUTE_INFO grRouteInfo; +}; +// Run one NV2080-subdevice control. Sets up the alloc chain +// (root client -> device -> subdevice), issues the control, and tears it all down. +// Caller supplies the cmd id and a pre-built params struct; result is in-place. +inline bool rmSubdeviceControl(uint32_t cmd, void *params, uint32_t paramsSize) +{ int ctl_fd = open("/dev/nvidiactl", O_RDWR); if (ctl_fd < 0) { fprintf(stderr, "DEBUG GR: Failed to open /dev/nvidiactl (errno=%d)\n", errno); - return 0; + return false; } auto rm_alloc = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject, uint32_t hClass, void *pParams, uint32_t size) { @@ -202,10 +227,10 @@ inline unsigned queryGrInfo(uint32_t info_index) if (!success) fprintf(stderr, "DEBUG GR: rm_alloc failed for class 0x%x, status=0x%x\n", hClass, p.status); return success; }; - auto rm_control = [&](NvHandle hClient, NvHandle hObject, uint32_t cmd, void *pParams, uint32_t size) { - NVOS54_PARAMETERS p = {hClient, hObject, cmd, 0, (NvP64)(uintptr_t)pParams, size, 0}; + auto rm_control = [&](NvHandle hClient, NvHandle hObject, uint32_t cmd_, void *pParams, uint32_t size) { + NVOS54_PARAMETERS p = {hClient, hObject, cmd_, 0, (NvP64)(uintptr_t)pParams, size, 0}; bool success = ioctl(ctl_fd, _IOWR(NV_IOCTL_MAGIC, NV_ESC_RM_CONTROL, NVOS54_PARAMETERS), &p) >= 0 && p.status == 0; - if (!success) fprintf(stderr, "DEBUG GR: rm_control failed for cmd 0x%x, status=0x%x\n", cmd, p.status); + if (!success) fprintf(stderr, "DEBUG GR: rm_control failed for cmd 0x%x, status=0x%x\n", cmd_, p.status); return success; }; auto rm_free = [&](NvHandle hClient, NvHandle hParent, NvHandle hObject) { @@ -216,25 +241,43 @@ inline unsigned queryGrInfo(uint32_t info_index) NvHandle hClient = 0xCAFE0001, hDevice = 0xCAFE0002, hSubDevice = 0xCAFE0003; NV0080_ALLOC_PARAMETERS devParams = {0}; NV2080_ALLOC_PARAMETERS subdevParams = {0}; - NVXXXX_CTRL_XXX_INFO infoList[1] = {{info_index, 0}}; - NV2080_CTRL_GR_GET_INFO_PARAMS grParams = {1, 0, (NvP64)(uintptr_t)infoList, {0, 0, 0}}; - - unsigned result = 0; - if (rm_alloc(hClient, hClient, hClient, NV01_ROOT_CLIENT, NULL, 0) && - rm_alloc(hClient, hClient, hDevice, NV01_DEVICE_0, &devParams, sizeof(devParams)) && - rm_alloc(hClient, hDevice, hSubDevice, NV20_SUBDEVICE_0, &subdevParams, sizeof(subdevParams)) && - rm_control(hClient, hSubDevice, NV2080_CTRL_CMD_GR_GET_INFO, &grParams, sizeof(grParams))) { - result = infoList[0].data; - fprintf(stderr, "DEBUG GR: Successfully queried index 0x%x = %u\n", info_index, result); - } else { - fprintf(stderr, "DEBUG GR: Query sequence failed for index 0x%x\n", info_index); - } + + bool ok = rm_alloc(hClient, hClient, hClient, NV01_ROOT_CLIENT, NULL, 0) + && rm_alloc(hClient, hClient, hDevice, NV01_DEVICE_0, &devParams, sizeof(devParams)) + && rm_alloc(hClient, hDevice, hSubDevice, NV20_SUBDEVICE_0, &subdevParams, sizeof(subdevParams)) + && rm_control(hClient, hSubDevice, cmd, params, paramsSize); rm_free(hClient, hDevice, hSubDevice); rm_free(hClient, hClient, hDevice); rm_free(hClient, hClient, hClient); close(ctl_fd); - return result; + return ok; +} + +// Query a single LITTER_NUM_* / SM_VERSION / etc. value via NV2080_CTRL_CMD_GR_GET_INFO. +inline unsigned queryGrInfo(uint32_t info_index) +{ + NVXXXX_CTRL_XXX_INFO infoList[1] = {{info_index, 0}}; + NV2080_CTRL_GR_GET_INFO_PARAMS p = {1, 0, (NvP64)(uintptr_t)infoList, {0, 0, 0}}; + if (rmSubdeviceControl(NV2080_CTRL_CMD_GR_GET_INFO, &p, sizeof(p))) { + fprintf(stderr, "DEBUG GR: Successfully queried index 0x%x = %u\n", info_index, infoList[0].data); + return infoList[0].data; + } + fprintf(stderr, "DEBUG GR: Query sequence failed for index 0x%x\n", info_index); + return 0; +} + +// Query the per-physical-SM (GPC, TPC) mapping table via +// NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS. The returned vector is indexed +// by physical SM id (the same value PTX %smid returns), so SM i lives in GPC +// result[i].gpcId and TPC result[i].tpcId. Returns empty on failure. +inline std::vector querySmToGpcMapping() +{ + NV2080_CTRL_GR_GET_SM_TO_GPC_TPC_MAPPINGS_PARAMS p{}; + if (!rmSubdeviceControl(NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS, &p, sizeof(p))) + return {}; + fprintf(stderr, "DEBUG GR: SM_TO_GPC_TPC_MAPPINGS returned smCount=%u\n", p.smCount); + return std::vector(p.smId, p.smId + p.smCount); } inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[]) @@ -314,9 +357,10 @@ inline unsigned initializeDeviceProp(unsigned deviceID, int argc, char *argv[]) config.MEM_BITWIDTH = deviceProp.memoryBusWidth; config.CLK_FREQUENCY = clockRateKHz * 1e-3f; - // Get FBP_COUNT and L2_BANKS from NVIDIA RM API + // Get FBP_COUNT, L2_BANKS, NUM_GPCS from NVIDIA RM API config.FBP_COUNT = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_FBPS); config.L2_BANKS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_LTCS); + config.NUM_GPCS = queryGrInfo(NV2080_CTRL_GR_INFO_INDEX_LITTER_NUM_GPCS); } parseGpuConfigArgs(argc, argv); diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/Makefile b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/Makefile new file mode 100644 index 000000000..848936317 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/Makefile @@ -0,0 +1,9 @@ +SRC = sm_gpc_mapping.cu + +EXE = sm_gpc_mapping + +# Thread-block clusters (%cluster_ctarank, cudaLaunchKernelEx with +# cudaLaunchAttributeClusterDimension) require sm_90+. +NVCC_FLAGS += -arch=sm_90 + +include ../../../common/common.mk diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/sm_gpc_mapping.cu b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/sm_gpc_mapping.cu new file mode 100644 index 000000000..5044dc431 --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/sm_gpc_mapping.cu @@ -0,0 +1,334 @@ +// Dumps the per-SM (GPC, TPC) mapping for the local GPU using +// NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS, then runs a small kernel that +// captures %smid from many blocks and cross-checks the result. +// +// Build: make release (in this directory) +// Run: ../../../bin/sm_gpc_mapping +// +// Output is two streams: '#'-prefixed human/diagnostic lines on stdout, +// and a CSV block at the end (smid,gpcId,tpcId,observed_in_kernel). + +#include +#include +#include +#include +#include +using namespace std; + +#include "../../../hw_def/hw_def.h" + +__device__ __forceinline__ unsigned get_smid() +{ + unsigned ret; + asm("mov.u32 %0, %%smid;" : "=r"(ret)); + return ret; +} + +__device__ __forceinline__ unsigned get_cluster_ctarank() +{ + unsigned ret; + asm("mov.u32 %0, %%cluster_ctarank;" : "=r"(ret)); + return ret; +} + +// Each block writes its %smid into out[blockIdx.x]. The spin keeps the block +// resident long enough that the scheduler is forced to dispatch siblings to +// other SMs, giving us coverage of every enabled SM. +__global__ void capture_smid(unsigned *out, unsigned spin_iters) +{ + unsigned smid = get_smid(); + if (threadIdx.x == 0) + out[blockIdx.x] = smid; + + volatile unsigned acc = 0; + for (unsigned i = 0; i < spin_iters; ++i) + acc += i; + if (threadIdx.x == 0 && acc == 0xdeadbeef) + out[blockIdx.x] = 0xffffffffu; // unreachable; defeats DCE +} + +// Cluster-aware variant: captures both %smid and %cluster_ctarank per block. +// Indexed by the linear block id derived from blockIdx + gridDim. +__global__ void capture_smid_with_cluster(unsigned *smid_out, + unsigned *crank_out, + unsigned spin_iters) +{ + unsigned smid = get_smid(); + unsigned crank = get_cluster_ctarank(); + + if (threadIdx.x == 0) { + unsigned linear = blockIdx.x + + gridDim.x * (blockIdx.y + gridDim.y * blockIdx.z); + smid_out[linear] = smid; + crank_out[linear] = crank; + } + + volatile unsigned acc = 0; + for (unsigned i = 0; i < spin_iters; ++i) + acc += i; + if (threadIdx.x == 0 && acc == 0xdeadbeef) + smid_out[0] = 0xffffffffu; +} + +int main(int argc, char *argv[]) +{ + initializeDeviceProp(0, argc, argv); + + printf("\n# Device Name = %s\n", deviceProp.name); + printf("# Compute Capability = %d.%d\n", deviceProp.major, deviceProp.minor); + printf("# SM_NUMBER = %u\n", config.SM_NUMBER); + printf("# NUM_GPCS (RM) = %u\n", config.NUM_GPCS); + printf("# FBP_COUNT, L2_BANKS = %u, %u\n", config.FBP_COUNT, config.L2_BANKS); + + vector mapping = querySmToGpcMapping(); + printf("# RM smCount = %zu\n", mapping.size()); + + if (mapping.empty()) { + fprintf(stderr, + "# ERROR: querySmToGpcMapping returned no entries. " + "The driver may not support NV2080_CTRL_CMD_GR_GET_SM_TO_GPC_TPC_MAPPINGS, " + "or /dev/nvidiactl access was denied.\n"); + return 1; + } + + // Build per-GPC and per-(GPC,TPC) reverse indexes. + map> gpc_to_sms; + map, vector> gpc_tpc_to_sms; + uint32_t max_gpc = 0, max_tpc = 0; + for (size_t smid = 0; smid < mapping.size(); ++smid) { + auto e = mapping[smid]; + gpc_to_sms[e.gpcId].push_back((uint32_t)smid); + gpc_tpc_to_sms[{e.gpcId, e.tpcId}].push_back((uint32_t)smid); + max_gpc = std::max(max_gpc, e.gpcId); + max_tpc = std::max(max_tpc, e.tpcId); + } + printf("# distinct GPCs = %zu (max gpcId=%u, max tpcId=%u)\n", + gpc_to_sms.size(), max_gpc, max_tpc); + + printf("# GPC histogram (gpcId : sm_count : sm_ids):\n"); + for (auto &kv : gpc_to_sms) { + printf("# GPC %u: %zu SMs [", kv.first, kv.second.size()); + for (size_t i = 0; i < kv.second.size(); ++i) + printf("%s%u", i ? "," : "", kv.second[i]); + printf("]\n"); + } + + // Sanity invariants printed loudly so failures are obvious. + bool ok_count = + (config.NUM_GPCS == 0) || (gpc_to_sms.size() == config.NUM_GPCS); + bool ok_total = (mapping.size() == config.SM_NUMBER); + printf("# CHECK distinct_gpcs(%zu) == NUM_GPCS(%u) : %s\n", + gpc_to_sms.size(), config.NUM_GPCS, ok_count ? "OK" : "MISMATCH"); + printf("# CHECK rm_smCount(%zu) == SM_NUMBER(%u) : %s\n", + mapping.size(), config.SM_NUMBER, ok_total ? "OK" : "MISMATCH"); + + // Runtime %smid capture: launch enough blocks to cover every enabled SM. + unsigned blocks = config.SM_NUMBER * 4; + if (blocks < 64) + blocks = 64; + unsigned *d_out = nullptr; + gpuErrchk(cudaMalloc(&d_out, blocks * sizeof(unsigned))); + gpuErrchk(cudaMemset(d_out, 0xff, blocks * sizeof(unsigned))); + + // ~10us per block at ~1.6GHz keeps blocks resident long enough for the + // scheduler to fan siblings out to other SMs without a slow run. + capture_smid<<>>(d_out, 1u << 14); + gpuErrchk(cudaPeekAtLastError()); + gpuErrchk(cudaDeviceSynchronize()); + + vector h_out(blocks); + gpuErrchk(cudaMemcpy(h_out.data(), d_out, blocks * sizeof(unsigned), + cudaMemcpyDeviceToHost)); + cudaFree(d_out); + + set observed_smids(h_out.begin(), h_out.end()); + printf("# kernel observed %zu distinct %%smid values across %u blocks\n", + observed_smids.size(), blocks); + + bool all_observed_in_table = true; + for (auto smid : observed_smids) { + if (smid >= mapping.size()) { + printf("# WARN: kernel saw %%smid=%u but RM table size = %zu\n", + smid, mapping.size()); + all_observed_in_table = false; + } + } + printf("# CHECK observed_smids subset_of rm_table : %s\n", + all_observed_in_table ? "OK" : "MISMATCH"); + printf("# CHECK observed_count(%zu) == SM_NUMBER(%u) : %s\n", + observed_smids.size(), config.SM_NUMBER, + observed_smids.size() == config.SM_NUMBER ? "OK" + : "PARTIAL (scheduler did not cover all SMs)"); + + // Stable CSV table. + printf("\n# CSV: smid,gpcId,tpcId,observed_in_kernel\n"); + for (size_t smid = 0; smid < mapping.size(); ++smid) { + bool seen = observed_smids.count((unsigned)smid) != 0; + printf("%zu,%u,%u,%d\n", smid, mapping[smid].gpcId, + mapping[smid].tpcId, seen ? 1 : 0); + } + + // ============================================================ + // Cluster-shape sweep + // ============================================================ + // For each cluster shape, launch one block per cluster slot covering as + // many SMs as fit (clusters of size K -> floor(SM_NUMBER / K) clusters). + // Per block we capture %smid and %cluster_ctarank, then look up gpcId/tpcId + // via the RM-reported mapping. This shows how blocks within a cluster + // distribute across one GPC's TPCs and how cluster_id rasterizes across + // GPCs. + // + // Note: user request listed "1x8x8" as the last shape, but cluster_size=64 + // exceeds the maximum cluster size (8 portable / 16 non-portable on H100). + // Treating it as a typo for "1x8x1" so we still cover an 8-block cluster + // laid out along y. Edit the shapes[] array below if a different + // interpretation is wanted. + struct ClusterShape { + int x, y, z; + const char *name; + }; + const ClusterShape shapes[] = { + {1, 1, 1, "1x1x1"}, {1, 2, 1, "1x2x1"}, {2, 1, 1, "2x1x1"}, + {2, 2, 1, "2x2x1"}, {1, 4, 1, "1x4x1"}, {4, 1, 1, "4x1x1"}, + {2, 4, 1, "2x4x1"}, {4, 2, 1, "4x2x1"}, {8, 1, 1, "8x1x1"}, + {1, 8, 1, "1x8x1"}, // user typed "1x8x8"; size=64 > max, treating as typo + }; + + auto cluster_id_from_block = [](int bx, int by, int bz, int gx, int gy, + const ClusterShape &s) { + int cx = bx / s.x, cy = by / s.y, cz = bz / s.z; + int ncx = gx / s.x, ncy = gy / s.y; + return cx + ncx * (cy + ncy * cz); + }; + + for (const auto &s : shapes) { + int csize = s.x * s.y * s.z; + // Rounding up division to get the number of clusters needed to cover all SMs. + int nclusters = (int)((config.SM_NUMBER + csize - 1) / (unsigned)csize); + if (nclusters < 1) { + printf("\n# === cluster shape %s skipped (cluster_size=%d > " + "SM_NUMBER=%u) ===\n", + s.name, csize, config.SM_NUMBER); + continue; + } + + // Lay clusters out along x: grid.x = s.x * nclusters, grid.y = s.y, + // grid.z = s.z. This trivially satisfies gridDim.* % clusterDim.* == 0. + dim3 grid(s.x * nclusters, s.y, s.z); + dim3 block(32, 1, 1); + unsigned total_blocks = grid.x * grid.y * grid.z; + + unsigned *d_smid = nullptr, *d_crank = nullptr; + gpuErrchk(cudaMalloc(&d_smid, total_blocks * sizeof(unsigned))); + gpuErrchk(cudaMalloc(&d_crank, total_blocks * sizeof(unsigned))); + gpuErrchk(cudaMemset(d_smid, 0xff, total_blocks * sizeof(unsigned))); + gpuErrchk(cudaMemset(d_crank, 0xff, total_blocks * sizeof(unsigned))); + + cudaLaunchConfig_t cfg = {}; + cfg.gridDim = grid; + cfg.blockDim = block; + cfg.dynamicSmemBytes = 0; + cfg.stream = 0; + + cudaLaunchAttribute attr[1] = {}; + attr[0].id = cudaLaunchAttributeClusterDimension; + attr[0].val.clusterDim.x = s.x; + attr[0].val.clusterDim.y = s.y; + attr[0].val.clusterDim.z = s.z; + cfg.attrs = attr; + cfg.numAttrs = 1; + + cudaError_t err = cudaLaunchKernelEx(&cfg, capture_smid_with_cluster, + d_smid, d_crank, 1u << 14); + if (err != cudaSuccess) { + printf("\n# === cluster shape %s launch failed: %s ===\n", s.name, + cudaGetErrorString(err)); + cudaFree(d_smid); + cudaFree(d_crank); + continue; + } + gpuErrchk(cudaDeviceSynchronize()); + + vector h_smid(total_blocks), h_crank(total_blocks); + gpuErrchk(cudaMemcpy(h_smid.data(), d_smid, + total_blocks * sizeof(unsigned), + cudaMemcpyDeviceToHost)); + gpuErrchk(cudaMemcpy(h_crank.data(), d_crank, + total_blocks * sizeof(unsigned), + cudaMemcpyDeviceToHost)); + cudaFree(d_smid); + cudaFree(d_crank); + + // Verify single-GPC-per-cluster invariant and record per-cluster GPC. + map> cluster_gpcs; + map> cluster_tpcs; + map> cluster_smids; + for (unsigned i = 0; i < total_blocks; ++i) { + unsigned smid = h_smid[i]; + if (smid >= mapping.size()) + continue; + int bx = (int)(i % grid.x); + int by = (int)((i / grid.x) % grid.y); + int bz = (int)(i / (grid.x * grid.y)); + int cid = + cluster_id_from_block(bx, by, bz, grid.x, grid.y, s); + cluster_gpcs[cid].insert(mapping[smid].gpcId); + cluster_tpcs[cid].insert(mapping[smid].tpcId); + cluster_smids[cid].push_back(smid); + } + + bool single_gpc = true; + for (auto &kv : cluster_gpcs) + if (kv.second.size() != 1) + single_gpc = false; + + // Count unique SMs touched by at least one block (active SMs for this + // launch). Useful for showing how the kernel covers the GPU when the + // cluster shape doesn't tile SM_NUMBER evenly. + set active_smids; + for (unsigned i = 0; i < total_blocks; ++i) { + if (h_smid[i] < mapping.size()) + active_smids.insert(h_smid[i]); + } + + printf( + "\n# === cluster shape %s (size=%d, nclusters=%d, grid=%ux%ux%u, " + "blocks=%u) ===\n", + s.name, csize, nclusters, grid.x, grid.y, grid.z, total_blocks); + printf("# active SMs (touched by >=1 block) = %zu / %u\n", + active_smids.size(), config.SM_NUMBER); + printf("# CHECK every cluster -> single GPC: %s\n", + single_gpc ? "OK" : "MISMATCH"); + + // cluster_id -> gpc summary, ordered by cluster_id + printf("# cluster_id -> gpcId (sm_count, smids):\n"); + for (auto &kv : cluster_gpcs) { + int cid = kv.first; + uint32_t g = *kv.second.begin(); + auto &sms = cluster_smids[cid]; + printf("# cluster %3d -> GPC %u (%zu SMs: ", cid, g, sms.size()); + for (size_t i = 0; i < sms.size(); ++i) + printf("%s%u", i ? "," : "", sms[i]); + printf(")\n"); + } + + // Per-block CSV. + printf("# CSV: linear,blockIdx_x,blockIdx_y,blockIdx_z,cluster_id," + "rank_in_cluster,smid,gpcId,tpcId\n"); + for (unsigned i = 0; i < total_blocks; ++i) { + int bx = (int)(i % grid.x); + int by = (int)((i / grid.x) % grid.y); + int bz = (int)(i / (grid.x * grid.y)); + int cid = cluster_id_from_block(bx, by, bz, grid.x, grid.y, s); + unsigned smid = h_smid[i]; + uint32_t gpc = + (smid < mapping.size()) ? mapping[smid].gpcId : 0xffffffffu; + uint32_t tpc = + (smid < mapping.size()) ? mapping[smid].tpcId : 0xffffffffu; + printf("%u,%d,%d,%d,%d,%u,%u,%u,%u\n", i, bx, by, bz, cid, + h_crank[i], smid, gpc, tpc); + } + } + + return 0; +} diff --git a/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/visualize.py b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/visualize.py new file mode 100755 index 000000000..a6b98ba4a --- /dev/null +++ b/src/cuda/GPU_Microbenchmark/ubench/system/sm_gpc_mapping/visualize.py @@ -0,0 +1,341 @@ +#!/usr/bin/env python3 +"""Render GPC > TPC > SM nested ASCII boxes from sm_gpc_mapping output. + +Reads the structured stdout of bin/sm_gpc_mapping (or a saved log via +--input) and prints one nested-box rendering per cluster shape, with each +SM cell labeled by the cluster_id and rank_in_cluster of the first block +that landed on it. +""" + +import argparse +import re +import sys +from collections import defaultdict + + +GPC_LINE = re.compile(r"^#\s+GPC\s+(\d+):\s+(\d+)\s+SMs\s+\[([\d,]+)\]") +SHAPE_HDR = re.compile( + r"^#\s+===\s+cluster\s+shape\s+(\S+)\s+" + r"\(size=(\d+),\s+nclusters=(\d+),\s+" + r"grid=(\d+)x(\d+)x(\d+),\s+blocks=(\d+)\)\s+===" +) +CSV_HDR = re.compile( + r"^#\s+CSV:\s+linear,blockIdx_x,blockIdx_y,blockIdx_z," + r"cluster_id,rank_in_cluster,smid,gpcId,tpcId" +) +ANSI_RE = re.compile(r"\033\[[0-9;]*m") + + +def vislen(s): + return len(ANSI_RE.sub("", s)) + + +PALETTE = [9, 10, 11, 12, 13, 14, 51, 87, 123, 159, 195, 219, 226, 208, 99, 250] + + +def colored(text, cid, enabled): + if not enabled or cid is None: + return text + code = PALETTE[cid % len(PALETTE)] + return f"\033[38;5;{code}m{text}\033[0m" + + +def parse(text): + gpc_to_smids = {} + shapes = [] + cur = None + in_csv = False + for line in text.splitlines(): + m = GPC_LINE.match(line) + if m: + gpc_to_smids[int(m.group(1))] = [int(x) for x in m.group(3).split(",")] + continue + m = SHAPE_HDR.match(line) + if m: + cur = { + "name": m.group(1), + "size": int(m.group(2)), + "nclusters": int(m.group(3)), + "grid": (int(m.group(4)), int(m.group(5)), int(m.group(6))), + "blocks": int(m.group(7)), + # smid -> list of (cluster_id, rank, tpc, gpc, linear) tuples, + # sorted by linear (the dispatch order on the SM) + "occ": defaultdict(list), + } + shapes.append(cur) + in_csv = False + continue + if cur is not None and CSV_HDR.match(line): + in_csv = True + continue + if in_csv and line and not line.startswith("#"): + try: + p = line.split(",") + linear, cid, rank, smid, gpc, tpc = ( + int(p[0]), + int(p[4]), + int(p[5]), + int(p[6]), + int(p[7]), + int(p[8]), + ) + except (IndexError, ValueError): + in_csv = False + continue + cur["occ"][smid].append((cid, rank, tpc, gpc, linear)) + # sort each SM's occupant list by linear so the visualizer can render + # them in dispatch order + for s in shapes: + for smid in s["occ"]: + s["occ"][smid].sort(key=lambda t: t[4]) + return gpc_to_smids, shapes + + +def build_smid_to_tpc(shapes): + out = {} + for s in shapes: + for smid, occs in s["occ"].items(): + if occs: + out[smid] = occs[0][2] # tpc is identical for every occupant + return out + + +def gpc_tpc_layout(gpc_to_smids, smid_to_tpc): + """gpc -> list of (tpc_id, [smids in tpc]), ordered by the lowest smid in + each TPC so reading L-to-R matches ascending physical SM ids. Hardware + tpc_id is preserved as the label, which makes the permutation visible.""" + layout = {} + for gpc, smids in gpc_to_smids.items(): + d = defaultdict(list) + for sm in smids: + tpc = smid_to_tpc.get(sm) + if tpc is None: + d[-1].append(sm) + else: + d[tpc].append(sm) + for k in d: + d[k].sort() + layout[gpc] = sorted(d.items(), key=lambda kv: kv[1][0]) + return layout + + +# ---------- rendering ---------- + +TPC_WIDTH = 11 # outer +TPC_INNER = TPC_WIDTH - 2 # 9 +TPCS_PER_ROW = 4 +GAP = " " + + +def pad_to(s, vis_text, width): + extra = width - len(vis_text) + if extra <= 0: + return s + left = extra // 2 + right = extra - left + return " " * left + s + " " * right + + +def render_sm_lines(smid, occ, color, occ_rows): + """occ_rows = number of occupant lines to emit per SM cell (uniform within + a shape, computed from the max occupants/SM observed in the shape). + Returns 1 sm-id line + occ_rows occupant lines.""" + if smid is None: + return ["|" + " " * TPC_INNER + "|"] * (1 + occ_rows) + sm_label = f"sm{smid:>3d}" + head = "|" + sm_label.center(TPC_INNER) + "|" + occs = occ.get(smid, []) if isinstance(occ, dict) else occ.get(smid, []) + if not occs: + # SM not touched in this shape's launch. + body = ["|" + pad_to("-", "-", TPC_INNER) + "|"] + body += ["|" + " " * TPC_INNER + "|"] * (occ_rows - 1) + return [head] + body + body = [] + for i in range(occ_rows): + if i < len(occs): + cid, rank = occs[i][0], occs[i][1] + cr_raw = f"c{cid}:{rank}" + cr = colored(cr_raw, cid, color) + body.append("|" + pad_to(cr, cr_raw, TPC_INNER) + "|") + else: + body.append("|" + " " * TPC_INNER + "|") + return [head] + body + + +def render_tpc(tpc_id, smids, occ, color, occ_rows): + """Return list of strings, each width TPC_WIDTH (visible). Height is + 1 (top) + 2 * (1 + occ_rows) + 1 (sep) + 1 (bottom).""" + label = f" TPC {tpc_id} " if tpc_id >= 0 else " TPC ? " + extra = TPC_INNER - len(label) + left = extra // 2 + right = extra - left + top = "+" + "-" * left + label + "-" * right + "+" + sep = "|" + "-" * TPC_INNER + "|" + bot = "+" + "-" * TPC_INNER + "+" + sm0 = smids[0] if len(smids) >= 1 else None + sm1 = smids[1] if len(smids) >= 2 else None + return [ + top, + *render_sm_lines(sm0, occ, color, occ_rows), + sep, + *render_sm_lines(sm1, occ, color, occ_rows), + bot, + ] + + +def empty_tpc_block(occ_rows): + h = 1 + 2 * (1 + occ_rows) + 1 + 1 # top + 2 SMs + sep + bot + return [" " * TPC_WIDTH] * h + + +def hjoin(boxes, gap=GAP): + if not boxes: + return [] + return [gap.join(b[r] for b in boxes) for r in range(len(boxes[0]))] + + +def render_gpc_boxed(gpc_id, tpcs, occ, color, occ_rows): + sm_count = sum(len(smids) for _, smids in tpcs) + n_tpcs = len(tpcs) + inner_w = TPCS_PER_ROW * TPC_WIDTH + (TPCS_PER_ROW - 1) * len(GAP) + margin = "| " + rmargin = " |" + width = inner_w + len(margin) + len(rmargin) + + label = f"GPC {gpc_id} [{sm_count} SMs]" + head = f"+-- {label} " + top = head + "-" * (width - len(head) - 1) + "+" + bot = "+" + "-" * (width - 2) + "+" + + out = [top] + for i in range(0, n_tpcs, TPCS_PER_ROW): + chunk = tpcs[i : i + TPCS_PER_ROW] + boxes = [render_tpc(tid, smids, occ, color, occ_rows) for tid, smids in chunk] + while len(boxes) < TPCS_PER_ROW: + boxes.append(empty_tpc_block(occ_rows)) + for joined in hjoin(boxes): + pad = inner_w - vislen(joined) + out.append(margin + joined + " " * pad + rmargin) + out.append(bot) + return "\n".join(out) + + +def render_gpc_compact(gpc_id, tpcs, occ, color): + sm_count = sum(len(smids) for _, smids in tpcs) + # Each line: "TPC N: [ smX c0:0,c5:1 | smX c1:0 ]" + body_lines = [] + for tpc_id, smids in tpcs: + cells = [] + for sm in smids[:2]: + sm_label = f"sm{sm:>3d}" + occs = occ.get(sm, []) + if not occs: + cr_raw = "-" + cr = cr_raw + else: + cr_raw = ",".join(f"c{c}:{r}" for c, r, *_ in occs) + cr = ",".join(colored(f"c{c}:{r}", c, color) for c, r, *_ in occs) + cell_vis = f"{sm_label} {cr_raw}" + cell = f"{sm_label} {cr}" + cells.append((cell, cell_vis)) + while len(cells) < 2: + cells.append(("", "")) + # equalize cell widths + cell_w = max(len(cv) for _, cv in cells) if cells else 0 + padded = [] + for c, cv in cells: + padded.append(c + " " * (cell_w - len(cv))) + body_vis = f"TPC {tpc_id}: [ {padded[0]} | {padded[1]} ]" + body_lines.append(body_vis) + inner_w = max(vislen(l) for l in body_lines) if body_lines else 20 + width = inner_w + 4 + + label = f"GPC {gpc_id} [{sm_count} SMs]" + head = f"+-- {label} " + top = head + "-" * (width - len(head) - 1) + "+" + bot = "+" + "-" * (width - 2) + "+" + + out = [top] + for line in body_lines: + pad = inner_w - vislen(line) + out.append("| " + line + " " * pad + " |") + out.append(bot) + return "\n".join(out) + + +def render_topology_header(gpc_to_smids, gpc_tpc_layout_, total_smids): + n_gpc = len(gpc_to_smids) + out = [f"=== GPU topology ({total_smids} SMs, {n_gpc} GPCs) ==="] + for gpc, tpcs in sorted(gpc_tpc_layout_.items()): + sm_count = sum(len(s) for _, s in tpcs) + parts = [] + for tid, smids in tpcs: + tag = f"TPC{tid}" if tid >= 0 else "TPC?" + parts.append(f"{tag}={{{','.join(str(s) for s in smids)}}}") + out.append(f"GPC {gpc} [{sm_count} SMs]: " + " ".join(parts)) + return "\n".join(out) + + +def render_shape(shape, gpc_tpc_layout_, style, color, total_smids): + occ = shape["occ"] + occ_rows = max((len(v) for v in occ.values()), default=1) + occ_rows = max(occ_rows, 1) + active_sms = sum(1 for v in occ.values() if v) + + out = [] + out.append("") + out.append( + f"=== Cluster shape {shape['name']} " + f"(size={shape['size']}, {shape['nclusters']} clusters, {shape['blocks']} blocks) ===" + ) + out.append( + f"--- active SMs (touched by >=1 block) = {active_sms} / {total_smids}; " + f"max occupants per SM in this launch = {occ_rows} ---" + ) + out.append("") + if style == "boxed": + for gpc, tpcs in sorted(gpc_tpc_layout_.items()): + out.append(render_gpc_boxed(gpc, tpcs, occ, color, occ_rows)) + out.append("") + else: + for gpc, tpcs in sorted(gpc_tpc_layout_.items()): + out.append(render_gpc_compact(gpc, tpcs, occ, color)) + out.append("") + return "\n".join(out) + + +def main(): + ap = argparse.ArgumentParser(description=__doc__) + ap.add_argument("--input", "-i", help="Read from file instead of stdin") + ap.add_argument("--shape", "-s", help="Render only this cluster shape (e.g., 2x2x1)") + ap.add_argument("--style", choices=("boxed", "compact"), default="boxed") + ap.add_argument("--color", action="store_true", help="ANSI color cluster ids") + args = ap.parse_args() + + text = open(args.input).read() if args.input else sys.stdin.read() + gpc_to_smids, shapes = parse(text) + if not gpc_to_smids: + print("error: no GPC histogram lines found in input", file=sys.stderr) + sys.exit(1) + if not shapes: + print("error: no cluster shape sections found in input", file=sys.stderr) + sys.exit(1) + + smid_to_tpc = build_smid_to_tpc(shapes) + layout = gpc_tpc_layout(gpc_to_smids, smid_to_tpc) + total_smids = sum(len(v) for v in gpc_to_smids.values()) + + print(render_topology_header(gpc_to_smids, layout, total_smids)) + + target_shapes = ( + [s for s in shapes if s["name"] == args.shape] if args.shape else shapes + ) + if args.shape and not target_shapes: + print(f"error: cluster shape '{args.shape}' not found in input", file=sys.stderr) + sys.exit(1) + for s in target_shapes: + print(render_shape(s, layout, args.style, args.color, total_smids)) + + +if __name__ == "__main__": + main()