import argparse
import logging
import os

import torch
import torch.distributed as c10d

logging.basicConfig(format='%(asctime)s - %(name)s - %(levelname)s - %(message)s', level=logging.INFO)
if __name__ == "__main__":
    parser = argparse.ArgumentParser(
        description='Simple script to simulate NCCL errors. The script is '
        'supposed to be run on multiple different nodes simultaneously with '
        'appropriate rank and world_size. The script runs an allreduce() on '
        'the rank 0 node and aborts all the other nodes to simulate an error '
        'in NCCL.')
    parser.add_argument('addr', help='address of the master node to connect to.')
    parser.add_argument('port', help='port of the master node to connect to.')
    parser.add_argument('rank', help='rank of this node')
    parser.add_argument('world_size', help='number of nodes in process group')
    args = parser.parse_args()
    rank = int(args.rank)
    world_size = int(args.world_size)
    port = int(args.port)

    # Rank 0 hosts the TCPStore used for rendezvous; the other ranks connect to it.
    store = c10d.TCPStore(args.addr, port, world_size, rank == 0)
    process_group = c10d.ProcessGroupNCCL(store, rank, world_size)
    logging.info('Running first allreduce')
    process_group.allreduce(torch.rand(10).cuda(rank)).wait()
    if rank == 0:
        # Rank 0 issues a second allreduce that the other ranks never join.
        logging.info('Running second allreduce only on rank 0')
        work = process_group.allreduce(torch.rand(10).cuda(rank))
        logging.info('Waiting for allreduce to complete...')
        work.wait()
        logging.info('Second allreduce successful: %s', work.is_success())
    else:
        # All other ranks abort to simulate a failure during the collective.
        logging.info('Aborting all other ranks.')
        os.abort()
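
# Example invocation (a sketch; the filename, hostname, and port below are
# assumptions, not part of the original script). Note that `rank` is also used
# as the CUDA device index, so each node needs a GPU at that index.
#
#   # on the node hosting the store (rank 0):
#   python simulate_nccl_errors.py master-hostname 29500 0 2
#
#   # on the other node (rank 1):
#   python simulate_nccl_errors.py master-hostname 29500 1 2
#
# Rank 0's second allreduce should then hang or surface a NCCL error, since
# rank 1 aborts instead of joining the collective.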