pytorch
45 lines · 2.6 KB
1from typing import Set
2
3import torch.nn as nn
4
5
6def _annotate_modules_for_dynamo(
7module: nn.Module,
8ignored_modules: Set[nn.Module],
9use_orig_params: bool,
10):
11"""
12Annotates the submodules in ``module`` 's tree, except those in
13``ignored_modules``, indicating that the submodules are FSDP-managed and
14saving the ``use_orig_params`` setting passed to the FSDP constructor.
15"""
16for submodule in module.modules():
17if submodule not in ignored_modules:
18"""[note: Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
19
20Dynamo doesn't get to see this instance (FullyShardedDataParallel) during tracing, since
21it skips tracing all the torch.distributed.fsdp code.
22- Why? Running the FSDP code eagerly avoids lots of issues trying to trace complex hooks, and also
23gets us graph-breaks on FSDP module boundaries which we want anyway for comm ops.
24- However, we _also_ want dynamo to treat the wrapped module inside FSDP 'unspecially' (*),
25and we need a way to indicate to dynamo which modules are wrapped by FSDP.
26
27(*) UnspecializedNNModules in dynamo are traced-through without any assumptions, and with thorough
28guards. NNModules otherwise are 'specialized', meaning there is less overhead due to assuming
29their code is well-behaved.
30
31One particular issue with specialized NNModules for FSDP is that the
32views created for orig_params are captured into the compiled graph on the first iteration, and while
33they are always going to point to the correct flatparameter and give correct results, their order
34of creation influences the order of backward execution, preventing overlap of comm and computation
35during backward. We need to _use_ the new parameter views created on each forward iteration, in
36order for backward to interleave hooks with compute per layer. UnspecializedNNModule lets us achieve
37this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
38"""
39submodule._is_fsdp_managed_module = True # type: ignore[assignment]
40
41# Dynamo only supports FSDP with use_orig_params=True.
42# This is hacky, but I could not think of another way to add an assertion to dynamo
43# for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
44# FSDP module directly
45submodule._fsdp_use_orig_params = use_orig_params # type: ignore[assignment]
46