-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathtorch_utils.py
More file actions
49 lines (39 loc) · 1.42 KB
/
torch_utils.py
File metadata and controls
49 lines (39 loc) · 1.42 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
"""PyTorch setup utilities for DDP and device configuration."""
import os
import random
import numpy as np
import torch
from torch.distributed import destroy_process_group, init_process_group
def pytorch_setup(cfg):
    """Configure the process for training and seed all RNGs.

    Distributed (DDP) mode is detected from the ``RANK`` environment
    variable: when it is set, the NCCL process group is initialized and the
    process is pinned to ``cuda:<LOCAL_RANK>``. Otherwise the function runs
    in single-process mode and picks CUDA, then MPS, then CPU.

    Args:
        cfg: Configuration object; only ``cfg.system.seed`` is read.

    Returns:
        A ``(local_rank, world_size, device, master_process)`` tuple.
        ``local_rank`` is ``None`` and ``world_size`` is ``1`` in
        single-process mode; ``master_process`` is ``True`` for rank 0 and
        for single-process runs.
    """
    env_rank = int(os.environ.get("RANK", -1))

    if env_rank == -1:
        # Single-process run: no process group, this process is the master.
        local_rank, world_size = None, 1
        master_process, seed_offset = True, 0
        if torch.cuda.is_available():
            device = "cuda"
        elif torch.backends.mps.is_available():
            device = "mps"
        else:
            device = "cpu"
    else:
        # DDP run: bring up the process group before reading the remaining
        # rank/world-size variables from the environment.
        init_process_group(backend="nccl")
        rank = env_rank
        local_rank = int(os.environ["LOCAL_RANK"])
        world_size = int(os.environ["WORLD_SIZE"])
        device = f"cuda:{local_rank}"
        torch.cuda.set_device(device)
        master_process = rank == 0
        # Offset the seed by rank so each process gets its own RNG stream.
        seed_offset = rank

    seed = cfg.system.seed + seed_offset
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Permit TF32 in CUDA matmuls and in cuDNN.
    torch.backends.cuda.matmul.allow_tf32 = True
    torch.backends.cudnn.allow_tf32 = True

    return local_rank, world_size, device, master_process
def destroy_ddp():
    """Tear down the distributed process group, if one is initialized.

    Does nothing when no process group is initialized. Otherwise calls
    ``torch.distributed.barrier()`` and then destroys the process group.
    """
    if not torch.distributed.is_initialized():
        return
    # Barrier first, then tear the group down.
    torch.distributed.barrier()
    destroy_process_group()