import os
import socket

import torch
import torch.distributed as dist


def run(rank, size, hostname, gpu, ngpus_per_node):
    """All-reduce a one-element tensor across every rank."""
    print(f"I am {rank} of {size} in {hostname}")
    # Build a group containing every rank (equivalent to [0, 1, 2, 3] for the
    # four-process example this script targets).
    group = dist.new_group(list(range(size)))
    # The tensor is created on the GPU selected via torch.cuda.set_device().
    tensor = torch.ones(1).cuda()
    # dist.reduce_op is deprecated; use dist.ReduceOp instead.
    dist.all_reduce(tensor, op=dist.ReduceOp.SUM, group=group)
    print(f"Rank {rank} has data {tensor[0]}")


def init_processes(rank, size, ngpus_per_node, hostname, fn, backend='nccl'):
    """Initialize the distributed environment, then hand control to fn."""
    # The launch node's IP address (exported by SLURM) serves as the rendezvous point.
    os.environ['MASTER_ADDR'] = os.environ['SLURM_LAUNCH_NODE_IPADDR']
    os.environ['MASTER_PORT'] = '8933'
    dist.init_process_group(backend, init_method='env://', rank=rank, world_size=size)
    print("Initialized Rank:", dist.get_rank())

    hostname = socket.gethostname()
    ip_address = socket.gethostbyname(hostname)
    print(ip_address)

    # Map each process to a local GPU. This assumes two GPUs per node and
    # consecutive ranks placed on the same node.
    gpu = dist.get_rank() % 2
    torch.cuda.set_device(gpu)

    fn(rank, size, hostname, gpu, ngpus_per_node)


if __name__ == "__main__":
    # SLURM starts one process per task: SLURM_NPROCS is the total task count
    # and SLURM_PROCID is this task's global rank.
    world_size = int(os.environ['SLURM_NPROCS'])
    world_rank = int(os.environ['SLURM_PROCID'])
    ngpus_per_node = torch.cuda.device_count()
    hostname = socket.gethostname()
    init_processes(world_rank, world_size, ngpus_per_node, hostname, run, backend='nccl')
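

# --- Alternative GPU selection (illustrative sketch, not used above) ---------
# The rank % 2 mapping in init_processes assumes exactly two GPUs per node and
# that consecutive ranks land on the same node. When the script is launched
# with srun, SLURM also exports SLURM_LOCALID, the node-local task index, which
# generalises to any GPU count and task placement. The helper below is a sketch
# under that assumption; its name is hypothetical and it is defined here only
# for illustration, not called by this script.
def pick_gpu_from_slurm_localid():
    # SLURM_LOCALID numbers the tasks on each node starting from 0.
    local_rank = int(os.environ.get('SLURM_LOCALID', 0))
    gpu = local_rank % max(torch.cuda.device_count(), 1)
    torch.cuda.set_device(gpu)
    return gpu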