import argparse
import logging
import os

import torch
import torch.distributed as c10d

logging.basicConfig(
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s", level=logging.INFO
)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description="Simple script to simulate NCCL errors. The script is "
        "supposed to be run on multiple different nodes simultaneously with "
        "appropriate rank and world_size. The script runs an allreduce() on "
        "the rank 0 node and aborts all the other nodes to simulate an error "
        "in NCCL."
    )
parser.add_argument("addr", help="address of the master node to connect to.")
22
parser.add_argument("port", help="port of the master node to connect to.")
23
parser.add_argument("rank", help="rank of this node")
24
parser.add_argument("world_size", help="number of nodes in process group")
25
args = parser.parse_args()
27
world_size = int(args.world_size)
30
store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
31
process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
    # A collective that every rank joins, so it should complete normally.
    logging.info("Running first allreduce")
    process_group.allreduce(torch.rand(10).cuda(rank)).wait()

    if rank == 0:
        # Rank 0 issues a second allreduce that the other ranks never join;
        # once they abort, this collective is expected to hang or fail,
        # which simulates the NCCL error this script exists to produce.
        logging.info("Running second allreduce only on rank 0")
        work = process_group.allreduce(torch.rand(10).cuda(rank))
        logging.info("Waiting for allreduce to complete...")
        work.wait()
        logging.info("Second allreduce successful: %s", work.is_success())
    else:
        logging.info("Aborting all other ranks.")
        os.abort()