diff --git a/makani/utils/inference/inferencer.py b/makani/utils/inference/inferencer.py
index 7f8999b..ea41634 100644
--- a/makani/utils/inference/inferencer.py
+++ b/makani/utils/inference/inferencer.py
@@ -19,7 +19,6 @@
 import numpy as np
 from tqdm import tqdm
 
-import pynvml
 
 import datetime as dt
 import h5py as h5
@@ -47,6 +46,7 @@
 
 # checkpoint helpers
 from makani.utils.checkpoint_helpers import get_latest_checkpoint_version
+from makani.utils.training.training_helpers import get_memory_usage
 
 class Inferencer(Driver):
     """
@@ -70,11 +70,6 @@ def __init__(self, params: Optional[YParams] = None, world_rank: Optional[int] =
         if self.log_to_wandb:
             self._init_wandb(self.params, job_type="inference")
 
-        # nvml stuff
-        if self.log_to_screen:
-            pynvml.nvmlInit()
-            self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device.index)
-
         # set amp_parameters
         if hasattr(self.params, "amp_mode") and (self.params.amp_mode != "none"):
             self.amp_enabled = True
@@ -754,9 +749,8 @@ def score_model(
         # log parameters
         if self.log_to_screen:
             # log memory usage so far
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
-            max_mem_gb = torch.cuda.max_memory_allocated(device=self.device) / (1024.0 * 1024.0 * 1024.0)
-            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb} GB ({max_mem_gb} GB for pytorch)")
+            all_mem_gb, max_mem_gb = get_memory_usage(self.device)
+            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb:.2f} GB ({max_mem_gb:.2f} GB for pytorch)")
 
             # announce training start
             self.logger.info("Starting Scoring...")
diff --git a/makani/utils/training/autoencoder_trainer.py b/makani/utils/training/autoencoder_trainer.py
index f0802e9..8ad689a 100644
--- a/makani/utils/training/autoencoder_trainer.py
+++ b/makani/utils/training/autoencoder_trainer.py
@@ -21,9 +21,6 @@
 import numpy as np
 from tqdm import tqdm
 
-# gpu info
-import pynvml
-
 # torch
 import torch
 import torch.amp as amp
@@ -56,7 +53,7 @@
 from makani.utils.checkpoint_helpers import get_latest_checkpoint_version
 
 # weight normalizing helper
-from makani.utils.training.training_helpers import clip_grads
+from makani.utils.training.training_helpers import get_memory_usage, clip_grads
 
 class AutoencoderTrainer(Driver):
     """
@@ -79,11 +76,6 @@ def __init__(self, params: Optional[YParams] = None, world_rank: Optional[int] =
         tens = torch.ones(1, device=self.device)
         dist.all_reduce(tens, group=comm.get_group("data"))
 
-        # nvml stuff
-        if self.log_to_screen:
-            pynvml.nvmlInit()
-            self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device.index)
-
         # set amp_parameters
         if hasattr(self.params, "amp_mode") and (self.params.amp_mode != "none"):
             self.amp_enabled = True
@@ -327,9 +319,8 @@ def train(self, training_profiler=None, validation_profiler=None):
         # log parameters
         if self.log_to_screen:
             # log memory usage so far
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
-            max_mem_gb = torch.cuda.max_memory_allocated(device=self.device) / (1024.0 * 1024.0 * 1024.0)
-            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb} GB ({max_mem_gb} GB for pytorch)")
+            all_mem_gb, max_mem_gb = get_memory_usage(self.device)
+            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb:.2f} GB ({max_mem_gb:.2f} GB for pytorch)")
 
             # announce training start
             self.logger.info("Starting Training Loop...")
@@ -721,7 +712,7 @@ def get_pad(nchar):
             self.logger.info(f"Performance Parameters:")
             self.logger.info(print_prefix + "training steps: {}".format(train_logs["train_steps"]))
             self.logger.info(print_prefix + "validation steps: {}".format(valid_logs["base"]["validation steps"]))
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
+            all_mem_gb, _ = get_memory_usage(self.device)
             self.logger.info(print_prefix + f"memory footprint [GB]: {all_mem_gb:.2f}")
             for key in timing_logs.keys():
                 self.logger.info(print_prefix + key + ": {:.2f}".format(timing_logs[key]))
diff --git a/makani/utils/training/deterministic_trainer.py b/makani/utils/training/deterministic_trainer.py
index 4b832a3..c1be7a8 100644
--- a/makani/utils/training/deterministic_trainer.py
+++ b/makani/utils/training/deterministic_trainer.py
@@ -21,9 +21,6 @@
 import numpy as np
 from tqdm import tqdm
 
-# gpu info
-import pynvml
-
 # torch
 import torch
 from torch import amp
@@ -59,7 +56,7 @@
 from makani.utils.checkpoint_helpers import get_latest_checkpoint_version
 
 # weight normalizing helper
-from makani.utils.training.training_helpers import clip_grads
+from makani.utils.training.training_helpers import get_memory_usage, clip_grads
 
 class Trainer(Driver):
     """
@@ -86,11 +83,6 @@ def __init__(self, params: Optional[YParams] = None, world_rank: Optional[int] =
         dist.all_reduce(tens, group=comm.get_group("data"))
         self.timers["nccl init"] = timer.time
 
-        # nvml stuff
-        if self.log_to_screen:
-            pynvml.nvmlInit()
-            self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device.index)
-
         # set amp_parameters
         if hasattr(self.params, "amp_mode") and (self.params.amp_mode != "none"):
             self.amp_enabled = True
@@ -371,9 +363,8 @@ def train(self, training_profiler=None, validation_profiler=None):
         # log parameters
         if self.log_to_screen:
             # log memory usage so far
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
-            max_mem_gb = torch.cuda.max_memory_allocated(device=self.device) / (1024.0 * 1024.0 * 1024.0)
-            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb} GB ({max_mem_gb} GB for pytorch)")
+            all_mem_gb, max_mem_gb = get_memory_usage(self.device)
+            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb:.2f} GB ({max_mem_gb:.2f} GB for pytorch)")
 
             # announce training start
             self.logger.info("Starting Training Loop...")
@@ -727,7 +718,7 @@ def get_pad(nchar):
             self.logger.info(f"Performance Parameters:")
             self.logger.info(print_prefix + "training steps: {}".format(train_logs["train_steps"]))
             self.logger.info(print_prefix + "validation steps: {}".format(valid_logs["base"]["validation steps"]))
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
+            all_mem_gb, _ = get_memory_usage(self.device)
             self.logger.info(print_prefix + f"memory footprint [GB]: {all_mem_gb:.2f}")
             for key in timing_logs.keys():
                 self.logger.info(print_prefix + key + ": {:.2f}".format(timing_logs[key]))
diff --git a/makani/utils/training/ensemble_trainer.py b/makani/utils/training/ensemble_trainer.py
index 63234b3..603e30f 100644
--- a/makani/utils/training/ensemble_trainer.py
+++ b/makani/utils/training/ensemble_trainer.py
@@ -21,9 +21,6 @@
 import numpy as np
 from tqdm import tqdm
 
-# gpu info
-import pynvml
-
 # torch
 import torch
 from torch import amp
@@ -35,9 +32,6 @@
 # timers
 from makani.utils.profiling import Timer
 
-# for the manipulation of state dict
-from collections import OrderedDict
-
 # makani depenedencies
 from makani.utils import LossHandler, MetricsHandler
 from makani.utils.driver import Driver
@@ -64,7 +58,7 @@
 from makani.utils.checkpoint_helpers import get_latest_checkpoint_version
 
 # weight normalizing helper
-from makani.utils.training.training_helpers import clip_grads
+from makani.utils.training.training_helpers import get_memory_usage, clip_grads
 
 class EnsembleTrainer(Trainer):
     """
@@ -91,11 +85,6 @@ def __init__(self, params: Optional[YParams] = None, world_rank: Optional[int] =
         dist.all_reduce(tens, group=comm.get_group("data"))
         self.timers["nccl init"] = timer.time
 
-        # nvml stuff
-        if self.log_to_screen:
-            pynvml.nvmlInit()
-            self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device.index)
-
         # set amp_parameters
         if hasattr(self.params, "amp_mode") and (self.params.amp_mode != "none"):
             self.amp_enabled = True
@@ -367,9 +356,8 @@ def train(self, training_profiler=None, validation_profiler=None):
         # log parameters
         if self.log_to_screen:
             # log memory usage so far
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
-            max_mem_gb = torch.cuda.max_memory_allocated(device=self.device) / (1024.0 * 1024.0 * 1024.0)
-            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb} GB ({max_mem_gb} GB for pytorch)")
+            all_mem_gb, max_mem_gb = get_memory_usage(self.device)
+            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb:.2f} GB ({max_mem_gb:.2f} GB for pytorch)")
 
             # announce training start
             self.logger.info("Starting Ensemble Training Loop...")
@@ -776,7 +764,7 @@ def get_pad(nchar):
             self.logger.info(f"Performance Parameters:")
             self.logger.info(print_prefix + "training steps: {}".format(train_logs["train_steps"]))
             self.logger.info(print_prefix + "validation steps: {}".format(valid_logs["base"]["validation steps"]))
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
+            all_mem_gb, _ = get_memory_usage(self.device)
             self.logger.info(print_prefix + f"memory footprint [GB]: {all_mem_gb:.2f}")
             for key in timing_logs.keys():
                 self.logger.info(print_prefix + key + ": {:.2f}".format(timing_logs[key]))
diff --git a/makani/utils/training/stochastic_trainer.py b/makani/utils/training/stochastic_trainer.py
index f8edd2a..9017bea 100644
--- a/makani/utils/training/stochastic_trainer.py
+++ b/makani/utils/training/stochastic_trainer.py
@@ -21,9 +21,6 @@
 import numpy as np
 from tqdm import tqdm
 
-# gpu info
-import pynvml
-
 # torch
 import torch
 import torch.optim as optim
@@ -60,7 +57,7 @@
 from makani.utils.checkpoint_helpers import get_latest_checkpoint_version
 
 # weight normalizing helper
-from makani.utils.training.training_helpers import normalize_weights, clip_grads
+from makani.utils.training.training_helpers import get_memory_usage, normalize_weights, clip_grads
 
 
 class StochasticTrainer(Driver):
@@ -87,11 +84,6 @@ def __init__(self, params: Optional[YParams] = None, world_rank: Optional[int] =
         tens = torch.ones(1, device=self.device)
         dist.all_reduce(tens, group=comm.get_group("data"))
 
-        # nvml stuff
-        if self.log_to_screen:
-            pynvml.nvmlInit()
-            self.nvml_handle = pynvml.nvmlDeviceGetHandleByIndex(self.device.index)
-
         # set amp_parameters
         if hasattr(self.params, "amp_mode") and (self.params.amp_mode != "none"):
             self.amp_enabled = True
@@ -346,9 +338,8 @@ def train(self):
         # log parameters
         if self.log_to_screen:
             # log memory usage so far
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
-            max_mem_gb = torch.cuda.max_memory_allocated(device=self.device) / (1024.0 * 1024.0 * 1024.0)
-            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb} GB ({max_mem_gb} GB for pytorch)")
+            all_mem_gb, max_mem_gb = get_memory_usage(self.device)
+            self.logger.info(f"Scaffolding memory high watermark: {all_mem_gb:.2f} GB ({max_mem_gb:.2f} GB for pytorch)")
 
             # announce training start
             self.logger.info("Starting Training Loop...")
@@ -712,7 +703,7 @@ def get_pad(nchar):
             self.logger.info(f"Performance Parameters:")
             self.logger.info(print_prefix + "training steps: {}".format(train_logs["train_steps"]))
             self.logger.info(print_prefix + "validation steps: {}".format(valid_logs["base"]["validation steps"]))
-            all_mem_gb = pynvml.nvmlDeviceGetMemoryInfo(self.nvml_handle).used / (1024.0 * 1024.0 * 1024.0)
+            all_mem_gb, _ = get_memory_usage(self.device)
             self.logger.info(print_prefix + f"memory footprint [GB]: {all_mem_gb:.2f}")
             for key in timing_logs.keys():
                 self.logger.info(print_prefix + key + ": {:.2f}".format(timing_logs[key]))
diff --git a/makani/utils/training/training_helpers.py b/makani/utils/training/training_helpers.py
index 98273be..f64b621 100644
--- a/makani/utils/training/training_helpers.py
+++ b/makani/utils/training/training_helpers.py
@@ -19,6 +19,14 @@
 from makani.utils import comm
 
 
+def get_memory_usage(device):
+    free_mem, total_mem = torch.cuda.mem_get_info(device=device)
+    allocated_mem_gb = (total_mem - free_mem) / (1024.0 * 1024.0 * 1024.0)
+    torch_mem_gb = torch.cuda.max_memory_allocated(device=device) / (1024.0 * 1024.0 * 1024.0)
+
+    return allocated_mem_gb, torch_mem_gb
+
+
 def normalize_weights(model, eps=1e-5):
     for param in model.parameters():
         # numel = torch.tensor(param.numel(), dtype=torch.long, device=param.device)
diff --git a/pyproject.toml b/pyproject.toml
index 1b8176a..acb6843 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -58,7 +58,6 @@ dependencies = [
     "wandb>=0.13.7",
     "numba",
     "tqdm>=4.60.0",
-    "pynvml>=10.0.0",
     "jsbeautifier",
     "more-itertools",
     "importlib-metadata",