# pytorch — torch/distributed/device_mesh.py (553 lines · 24.1 KB)
1# Copyright (c) Meta Platforms, Inc. and affiliates
2import logging3import math4from typing import Dict, List, Optional, Tuple, TYPE_CHECKING, Union5
6import torch7
8from torch.distributed import is_available9
10from ..utils._typing_utils import not_none11
12__all__ = ["init_device_mesh", "DeviceMesh"]13
14
if not is_available():
    import sys

    # We need to create the stubs when distributed is not available.
    # Otherwise, we would fail the doc tests (```./.ci/pytorch/docs-test.sh```),
    # since it would try to import ``torch.distributed.device_mesh`` or
    # ``torch.distributed.init_device_mesh`` but cannot find them.

    class _DeviceMeshStub:
        pass

    def _init_device_mesh_stub():
        pass

    # Patch the stubs directly onto the already-registered module object so
    # that ``torch.distributed.device_mesh.DeviceMesh`` resolves even when
    # the distributed backend was compiled out.
    sys.modules["torch.distributed.device_mesh"].DeviceMesh = _DeviceMeshStub  # type: ignore[attr-defined]
    sys.modules[
        "torch.distributed.device_mesh"
    ].init_device_mesh = _init_device_mesh_stub  # type: ignore[attr-defined]

else:
    from torch.distributed.distributed_c10d import (
        _find_pg_by_ranks_and_tag,
        _get_default_group,
        _get_group_tag,
        get_rank,
        get_world_size,
        init_process_group,
        is_initialized,
        new_group,
        ProcessGroup,
    )
    # NOTE(review): in the upstream file the remainder of the module is nested
    # under this ``else`` branch — verify indentation when reassembling.
    # Module-level logger for DeviceMesh diagnostics.
    logger = logging.getLogger(__name__)

    # only import numpy typing when type checking
    if TYPE_CHECKING:
        try:
            from numpy.typing import ArrayLike
        except ImportError:
            logger.warning(
                "DeviceMesh requires numpy >= 1.21 to be installed for type checking"
            )
59class _MeshEnv:60def __init__(self) -> None:61self.mesh_stack: List[DeviceMesh] = []62self.child_to_parent_mapping: Dict[DeviceMesh, DeviceMesh] = {}63
64def get_current_mesh(self) -> "DeviceMesh":65if len(self.mesh_stack) == 0:66raise RuntimeError("No device mesh is currently active!")67return self.mesh_stack[-1]68
69def create_child_mesh(70self, device_mesh: "DeviceMesh", mesh_dim: int, mesh_dim_name: str71) -> "DeviceMesh":72# swap the current dim to the last dim then reshape to flatten out other73# dims, so we can just extract the list of ranks which contains cur_rank.74cur_rank = device_mesh.get_rank()75pg_ranks_by_dim = device_mesh.mesh.swapdims(-1, mesh_dim).reshape(76-1, device_mesh.mesh.size(mesh_dim)77)78
79for mesh_1d in pg_ranks_by_dim:80sub_mesh = DeviceMesh(81device_mesh.device_type,82mesh_1d,83mesh_dim_names=(mesh_dim_name,),84)85if cur_rank in mesh_1d:86res_sub_mesh = sub_mesh87
88res_sub_mesh._dim_group_infos = [device_mesh._dim_group_infos[mesh_dim]] # type: ignore[possibly-undefined]89# Assign the current DeviceMesh as the parent of the child DeviceMesh.90self.child_to_parent_mapping[res_sub_mesh] = device_mesh91return res_sub_mesh92
93def get_parent_mesh(self, device_mesh: "DeviceMesh") -> Optional["DeviceMesh"]:94return self.child_to_parent_mapping.get(device_mesh, None)95
96def get_parent_mesh_dim(self, device_mesh: "DeviceMesh") -> Optional[int]:97"""98Return the index of the mesh dim in the parent mesh.
99The device_mesh passed in needs to be sliced out from a parent mesh.
100"""
101parent_mesh = self.get_parent_mesh(device_mesh)102child_mesh_dim_names = device_mesh.mesh_dim_names103if parent_mesh and child_mesh_dim_names:104assert (105len(child_mesh_dim_names) == 1106), "The child mesh can only be a 1D mesh."107child_mesh_dim_name = child_mesh_dim_names[0]108return self.get_mesh_dim_by_name(parent_mesh, child_mesh_dim_name)109return None110
111@staticmethod112def num_devices_per_host(device_type: str) -> int:113return _get_device_handle(device_type).device_count()114
115@staticmethod116def num_hosts(device_type: str) -> int:117# ProcessGroup can't tell us this info so we have to infer it, assume118# homogeneous hardware for now119return get_world_size() // _MeshEnv.num_devices_per_host(device_type)120
121def get_mesh_dim_by_name(122self, device_mesh: "DeviceMesh", mesh_dim_name: str123) -> int:124if (125device_mesh.mesh_dim_names is None126or len(device_mesh.mesh_dim_names) == 0127):128raise KeyError(129"No `mesh_dim_names` found.",130)131if mesh_dim_name not in device_mesh.mesh_dim_names:132raise KeyError(133f"Mesh dimension '{mesh_dim_name}' does not exist.",134f"Available mesh dimensions are: mesh_dim_names={device_mesh.mesh_dim_names}",135)136return not_none(device_mesh.mesh_dim_names.index(mesh_dim_name))137
138_mesh_resources: _MeshEnv = _MeshEnv()139
140def _get_device_handle(device_type: str = "cuda"):141"""142Get the module corresponding to the device_type which is cuda or cuda-like device.
143For example, when the device_type is cuda, the module `torch.cuda` is returned.
144Return None when there is no corresponding module for device_type, otherwise
145return the corresponding module.
146"""
147return getattr(torch, device_type, None)148
149class DeviceMesh:150"""151DeviceMesh represents a mesh of devices, where layout of devices could be
152represented as a n-d dimension array, and each value of the n-d dimensional
153array is the global id of the default process group ranks.
154
155DeviceMesh could be used to describe the layout of devices across the cluster,
156and serves as a proxy for communication among the device lists within the cluster.
157
158DeviceMesh can be used as a context manager.
159
160.. note::
161DeviceMesh follows SPMD programming model, which means the same PyTorch Python program
162is running on all processes/ranks in the cluster. Therefore, users need to make sure the
163`mesh` array (which describes the layout of devices) should be identical across all ranks.
164Inconsistent `mesh` will lead to silent hang.
165
166Args:
167device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
168mesh (ndarray): A multi-dimensional array or an integer tensor describing the layout
169of devices, where the IDs are global IDs of the default process group.
170
171Returns:
172DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
173
174The following program runs on each process/rank in an SPMD manner. In this example, we have 2
175hosts with 4 GPUs each.
176A reduction over the first dimension of mesh will reduce across
177columns (0, 4), .. and (3, 7), a reduction over the second dimension
178of mesh reduces across rows (0, 1, 2, 3) and (4, 5, 6, 7).
179
180Example::
181>>> # xdoctest: +SKIP("no rank")
182>>> from torch.distributed.device_mesh import DeviceMesh
183>>>
184>>> # Initialize device mesh as (2, 4) to represent the topology
185>>> # of cross-host(dim 0), and within-host (dim 1).
186>>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
187"""
188
189device_type: str190mesh: torch.Tensor191mesh_dim_names: Optional[Tuple[str, ...]]192
193def __init__(194self,195device_type: str,196mesh: Union[torch.Tensor, "ArrayLike"],197*,198mesh_dim_names: Optional[Tuple[str, ...]] = None,199) -> None:200self.device_type = device_type201if isinstance(mesh, torch.Tensor) and mesh.device.type != "cpu":202raise ValueError(f"`mesh` must be a CPU tensor, got {mesh}")203self.mesh = (204mesh.detach().cpu()205if isinstance(mesh, torch.Tensor)206else torch.tensor(mesh, dtype=torch.int)207)208self.mesh_dim_names = mesh_dim_names209
210# private field to pre-generate DeviceMesh's hash211self._flatten_mesh_list = tuple(self.mesh.flatten().tolist())212self._hash = hash((self._flatten_mesh_list, self.mesh.shape, id(self)))213
214# Skip process group initialization if xla device.215# TODO(yeounoh) implement DeviceMesh backend and register XLA backend.216if device_type != "xla":217# always try to create default (world) pg, even if it is not initialized218# already. The world pg is used for device mesh identity (rank) on each219# process (we need to know if the current global rank is in the mesh or not).220self._get_or_create_default_group()221self._init_process_groups()222
223def _get_or_create_default_group(self):224default_initialized = is_initialized()225if not default_initialized:226init_process_group()227
228world_size = get_world_size()229if self.mesh.numel() > world_size:230raise RuntimeError(231f"Mesh should not be bigger than default world size, but found {self.mesh.numel()} ranks!"232)233
234device_handle = _get_device_handle(self.device_type)235# TODO: if user want to pass pg_options, offer a way to do it236if not default_initialized and device_handle:237# automatically set the current cuda/cuda-like device base on num of gpu devices available in each host238# NOTE: This device selection would only work for homogeneous hardware.239num_devices_per_host = device_handle.device_count()240if (241world_size > num_devices_per_host242and world_size % num_devices_per_host != 0243):244raise RuntimeError(245f"DeviceMesh only support homogeneous hardware, but found "246f"{world_size} ranks and {num_devices_per_host} {self.device_type} devices!"247)248device_handle.set_device(get_rank() % num_devices_per_host)249
250# calculate the coordinates of the current global rank on the mesh251rank_coords = (self.mesh == get_rank()).nonzero()252assert rank_coords.size(0) in (0, 1)253self._coordinate_on_dim: Optional[List[int]] = (254rank_coords[0].tolist() if rank_coords.size(0) > 0 else None255)256return _get_default_group()257
258def _init_process_groups(self):259# tag/ranks/group_name associated with each mesh dimension, each260# mesh dimension should have one sub-group per rank261#262# TODO(yifu): remove tag and ranks once we fully migrate to native263# functional collectives. See details in:264# https://github.com/pytorch/pytorch/issues/93173#issuecomment-1907095208265dim_group_infos: List[Tuple[str, List[int], str]] = []266
267if self.mesh.ndim == 1 and self.mesh.numel() == get_world_size():268# if the mesh is the same as world_pg, we just append the default269# pg to the first dim groups, as new_group cannot have the exact270# same ranks as world271dim_group_infos.append(272(273_get_group_tag(_get_default_group()),274list(range(get_world_size())),275_get_default_group().group_name,276)277)278else:279# create sub pgs base on the mesh argument specified280for dim in range(self.mesh.ndim):281# swap the current dim to the last dim282# then reshape to flatten out other dims283pg_ranks_by_dim = self.mesh.swapdims(-1, dim).reshape(284-1, self.mesh.size(dim)285)286# multi-dim mesh, create subgroups by looping over the pg_ranks287# for each dim and append the groups288for dim_mesh in pg_ranks_by_dim:289subgroup_ranks = dim_mesh.tolist()290
291# We temporarily revert the re-use subgroup, since it breaks two internal tests.292# Temporarily reverting to resolve test timeout while root-causing.293# TODO: Add two tests to cover internal tests scenarios and re-enable reuse subgroup if exists.294dim_group = new_group(ranks=subgroup_ranks)295
296# only add to dim_groups if the current rank in the subgroup297if self.get_rank() in subgroup_ranks:298if len(dim_group_infos) > dim:299raise RuntimeError(300f"Each device mesh dimension should get only one process group, but got {self.get_rank} "301f"in {subgroup_ranks}!"302)303dim_group_infos.append(304(305_get_group_tag(not_none(dim_group)),306subgroup_ranks,307dim_group.group_name,308)309)310self._dim_group_infos = dim_group_infos311
312def __enter__(self) -> "DeviceMesh":313# set this mesh as the current mesh in mesh env314_mesh_resources.mesh_stack.append(self)315return self316
317# pyre-fixme[2]: Parameter must be annotated.318def __exit__(self, exc_type, exc_value, exc_traceback) -> None:319# pop this mesh from mesh env320_mesh_resources.mesh_stack.pop()321
322def __repr__(self) -> str:323device_mesh_repr = (324f"DeviceMesh({self.mesh.tolist()})"325if not self.mesh_dim_names326else f"DeviceMesh({self.mesh.tolist()}, mesh_dim_names={self.mesh_dim_names})"327)328return device_mesh_repr329
330def __hash__(self):331return self._hash332
333def __eq__(self, other: object) -> bool:334if not isinstance(other, DeviceMesh):335return False336if id(self.mesh) == id(other.mesh):337return True338return (339self.mesh.shape == other.mesh.shape340and self._flatten_mesh_list == other._flatten_mesh_list341)342
343def __getitem__(self, mesh_dim_name: str) -> "DeviceMesh":344"""345Slice the current DeviceMesh based on the mesh_dim_name given to create a child
346DeviceMesh.
347
348Args:
349mesh_dim_name (str): the name of the mesh dimension of the parent DeviceMesh
350to create a child DeviceMesh for.
351Returns:
352A :class:`DeviceMesh` object
353
354The following program runs on each process/rank in an SPMD manner. In this example, we have 2
355hosts with 4 GPUs each.
356Calling mesh["tp"] on rank 0, 1, 2, 3 would return a 1D child DeviceMesh:([0, 1, 2, 3]).
357Calling mesh["tp"] on rank 4, 5, 6, 7 would return a 1D child DeviceMesh:([4, 5, 6, 7]).
358Calling mesh["dp"] on rank 0, 4 would return a 1D child DeviceMesh:([0, 4]).
359Calling mesh["dp"] on rank 1, 5 would return a 1D child DeviceMesh:([1, 5]).
360Calling mesh["dp"] on rank 2, 6 would return a 1D child DeviceMesh:([2, 6]).
361Calling mesh["dp"] on rank 3, 7 would return a 1D child DeviceMesh:([3, 7]).
362
363Example::
364>>> # xdoctest: +SKIP("no rank")
365>>> from torch.distributed.device_mesh import DeviceMesh
366>>>
367>>> # Initialize device mesh as (2, 4) to represent the topology
368>>> # of cross-host(dim 0), and within-host (dim 1).
369>>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
370"""
371if self.mesh.ndim == 1:372if self.mesh_dim_names and mesh_dim_name == self.mesh_dim_names[0]:373return self374else:375raise RuntimeError(376f"Invalid mesh_dim_name {mesh_dim_name} specified."377)378
379mesh_dim = _mesh_resources.get_mesh_dim_by_name(self, mesh_dim_name)380submesh = _mesh_resources.create_child_mesh(self, mesh_dim, mesh_dim_name)381
382return submesh383
384def get_group(385self, mesh_dim: Optional[Union[int, str]] = None386) -> Union[ProcessGroup, List[ProcessGroup]]:387"""388Returns a list of ProcessGroups corresponding to the mesh dimensions, or
389returns a single ProcessGroup if mesh_dim is specified or the given mesh has
390only one mesh dimension.
391
392Args:
393mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
394of the mesh dimension. Default is None.
395
396Returns:
397A list of :class:`ProcessGroup` object when `mesh_dim` is not specified for
398a DeviceMesh with more than 1 dimension; otherwise, returns a single
399:class:`ProcessGroup` object.
400"""
401if not hasattr(self, "_dim_group_infos"):402raise RuntimeError("DeviceMesh process groups not initialized!")403
404if self.mesh.ndim == 1:405return not_none(406_find_pg_by_ranks_and_tag(*self._dim_group_infos[0][:2])407)408
409if mesh_dim is not None:410if isinstance(mesh_dim, str):411mesh_dim = _mesh_resources.get_mesh_dim_by_name(self, mesh_dim)412return not_none(413_find_pg_by_ranks_and_tag(*self._dim_group_infos[mesh_dim][:2])414)415else:416dim_groups = []417for ith_dim in range(self.mesh.ndim):418dim_groups.append(419not_none(420_find_pg_by_ranks_and_tag(421*self._dim_group_infos[ith_dim][:2]422)423)424)425return dim_groups426
427def size(self, mesh_dim: Optional[int] = None) -> int:428return self.mesh.numel() if mesh_dim is None else self.mesh.size(mesh_dim)429
430@property431def ndim(self) -> int:432return self.mesh.ndim433
434@property435def shape(self) -> Tuple[int, ...]:436return tuple(self.mesh.shape)437
438def get_rank(self) -> int:439"""440Returns the current global rank.
441"""
442return get_rank()443
444def get_local_rank(self, mesh_dim: Optional[Union[int, str]] = None) -> int:445"""446Returns the local rank of the given mesh_dim of the DeviceMesh.
447
448Args:
449mesh_dim (str/int, optional): it can be the name of the mesh dimension or the index
450of the mesh dimension. Default is None.
451
452Returns:
453An integer denotes the local rank.
454
455The following program runs on each process/rank in an SPMD manner. In this example, we have 2
456hosts with 4 GPUs each.
457Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 0, 1, 2, 3 would return 0.
458Calling mesh_2d.get_local_rank(mesh_dim=0) on rank 4, 5, 6, 7 would return 1.
459Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 0, 4 would return 0.
460Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 1, 5 would return 1.
461Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 2, 6 would return 2.
462Calling mesh_2d.get_local_rank(mesh_dim=1) on rank 3, 7 would return 3.
463
464Example::
465>>> # xdoctest: +SKIP("no rank")
466>>> from torch.distributed.device_mesh import DeviceMesh
467>>>
468>>> # Initialize device mesh as (2, 4) to represent the topology
469>>> # of cross-host(dim 0), and within-host (dim 1).
470>>> mesh = DeviceMesh(device_type="cuda", mesh=[[0, 1, 2, 3],[4, 5, 6, 7]])
471"""
472if self.ndim > 1 and mesh_dim is None:473raise RuntimeError(474f"Found the DeviceMesh have {self.mesh.ndim} dimensions",475"Optional kwarg `mesh_dim` needs to be specified when device_mesh.ndim > 1.",476)477elif mesh_dim is None:478mesh_dim = 0479
480mesh_dim_group = not_none(self.get_group(mesh_dim))481assert isinstance(482mesh_dim_group, ProcessGroup483), "We expect ProcessGroup before calling `get_rank`!"484return not_none(get_rank(mesh_dim_group))485
486def get_coordinate(self) -> Optional[List[int]]:487"""488Return the relative indices of this rank relative to all
489dimensions of the mesh. If this rank is not part of the mesh, return None.
490"""
491return self._coordinate_on_dim if self._coordinate_on_dim else None492
493def init_device_mesh(494device_type: str,495mesh_shape: Tuple[int, ...],496*,497mesh_dim_names: Optional[Tuple[str, ...]] = None,498) -> DeviceMesh:499"""500Initializes a `DeviceMesh` based on `device_type`, `mesh_shape`, and `mesh_dim_names` parameters.
501
502This creates a DeviceMesh with an n-dimensional array layout, where `n` is the length of `mesh_shape`.
503If `mesh_dim_names` is provided, each dimension is labeled as `mesh_dim_names[i]`.
504
505.. note::
506`init_device_mesh` follows SPMD programming model, meaning the same PyTorch Python program
507runs on all processes/ranks in the cluster. Ensure `mesh_shape` (the dimensions of the nD array
508describing device layout) is identical across all ranks. Inconsistent `mesh_shape` may lead to hanging.
509
510.. note::
511If no process group is found, init_device_mesh will initialize distributed process group/groups
512required for distributed communications behind the scene.
513
514Args:
515device_type (str): The device type of the mesh. Currently supports: "cpu", "cuda/cuda-like".
516mesh_shape (Tuple[int]): A tuple defining the dimensions of the multi-dimensional array
517describing the layout of devices.
518mesh_dim_names (Tuple[str], optional): A tuple of mesh dimension names to assign to each dimension
519of the multi-dimensional array describing the layout of devices. Its length must match the length
520of `mesh_shape`. Each string in `mesh_dim_names` must be unique.
521
522Returns:
523DeviceMesh: A :class:`DeviceMesh` object representing the device layout.
524
525Example::
526>>> # xdoctest: +SKIP("no rank")
527>>> from torch.distributed.device_mesh import init_device_mesh
528>>>
529>>> mesh_1d = init_device_mesh("cuda", mesh_shape=(8,))
530>>> mesh_2d = init_device_mesh("cuda", mesh_shape=(2, 8), mesh_dim_names=("dp", "tp"))
531
532"""
533if mesh_dim_names is not None:534if len(set(mesh_dim_names)) != len(mesh_dim_names):535raise RuntimeError(536"Each mesh_dim_name must be unique.",537f"Found repeated mesh_dim_name in mesh_dim_names {mesh_dim_names}",538)539
540if len(mesh_shape) != len(mesh_dim_names):541raise RuntimeError(542"mesh_shape and mesh_dim_names should have same length!",543f"Found len(mesh_dim_names): {len(mesh_dim_names)} and len(mesh_shape):{len(mesh_shape)}.",544)545
546mesh = torch.arange(math.prod(mesh_shape)).view(mesh_shape)547device_mesh = DeviceMesh(548device_type=device_type,549mesh=mesh,550mesh_dim_names=mesh_dim_names,551)552
553return device_mesh554