Skip to content

Expose Container info for MIG enabled GPU #272

@krishh85

Description

@krishh85

Currently it doesn't seem like container/pod/namespace information is emitted from dcgm-exporter when MIG is enabled on a GPU. This is important when we need to aggregate GPU utilization across containers/cgroups. The container info seems to be emitted only for GPUs that have MIG disabled.

Version Info
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |

dcgm_fi_prof_gr_engine_active{gpu="0",uuid="GPU-ed1353d6-52ba-8793-7230-4d5d3eb68167",device="nvidia0",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="3g.40gb",GPU_I_ID="1",container_name="",pod_name="",pod_namespace=""} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="0",uuid="GPU-ed1353d6-52ba-8793-7230-4d5d3eb68167",device="nvidia0",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="4g.40gb",GPU_I_ID="2",container_name="",pod_name="",pod_namespace=""} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="1",uuid="GPU-6043e4b2-feaa-8010-34c7-61d1a01576bb",device="nvidia1",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="3g.40gb",GPU_I_ID="1",container_name="",pod_name="",pod_namespace=""} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="1",uuid="GPU-6043e4b2-feaa-8010-34c7-61d1a01576bb",device="nvidia1",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="4g.40gb",GPU_I_ID="2",container_name="",pod_name="",pod_namespace=""} 0.001359
dcgm_fi_prof_gr_engine_active{gpu="2",uuid="GPU-2dddf823-c00c-0fd6-3e84-ad84d322de02",device="nvidia2",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="3g.40gb",GPU_I_ID="2",container_name="",pod_name="",pod_namespace=""} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="2",uuid="GPU-2dddf823-c00c-0fd6-3e84-ad84d322de02",device="nvidia2",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="4g.40gb",GPU_I_ID="1",container_name="",pod_name="",pod_namespace=""} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="3",uuid="GPU-af2c63a8-8c8b-21cc-581a-bab6bba89d08",device="nvidia3",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="3g.40gb",GPU_I_ID="2",container_name="",pod_name="",pod_namespace=""} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="3",uuid="GPU-af2c63a8-8c8b-21cc-581a-bab6bba89d08",device="nvidia3",modelName="NVIDIA A100 80GB PCIe",GPU_I_PROFILE="4g.40gb",GPU_I_ID="1",container_name="",pod_name="",pod_namespace=""} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="4",uuid="GPU-931446e6-0413-2864-5b76-11227dcda4ae",device="nvidia4",modelName="NVIDIA A100 80GB PCIe",container_name="-grpc-pod",pod_name="d85np-t6mdg",pod_namespace="ai-1"} 0.008246
dcgm_fi_prof_gr_engine_active{gpu="5",uuid="GPU-b3c602ed-2a60-c7e6-a685-a4a4ef0fb831",device="nvidia5",modelName="NVIDIA A100 80GB PCIe",container_name="language-models",pod_name="64bl2-8rsmr-tnrmr",pod_namespace="ai-7"} 0.000000
dcgm_fi_prof_gr_engine_active{gpu="6",uuid="GPU-3f5d3fb2-5308-160b-ebe4-cf4c3c1e8b1d",device="nvidia6",modelName="NVIDIA A100 80GB PCIe",container_name="llm",pod_name="-8lrkr-xqd9x",pod_namespace="ai-2"} 0.151145
dcgm_fi_prof_gr_engine_active{gpu="7",uuid="GPU-d41aa3aa-2bf3-27e9-1a2d-4f54cca57cbb",device="nvidia7",modelName="NVI

Metadata

Metadata

Assignees

Labels

bug: Something isn't working

Type

No type
No fields configured for issues without a type.

Projects

No projects

Milestone

No milestone

Relationships

None yet

Development

No branches or pull requests

Issue actions