pytorch
45 lines · 2.6 KB
1from typing import Set
2
3import torch.nn as nn
4
5
6def _annotate_modules_for_dynamo(
7module: nn.Module,
8ignored_modules: Set[nn.Module],
9use_orig_params: bool,
10):
11"""
12Annotates the submodules in ``module`` 's tree, except those in
13``ignored_modules``, indicating that the submodules are FSDP-managed and
14saving the ``use_orig_params`` setting passed to the FSDP constructor.
15"""
16for submodule in module.modules():
17if submodule not in ignored_modules:
18"""[note: Dynamo treats FSDP wrapped modules as UnspecializedNNModule]
19
20Dynamo doesn't get to see this instance (FullyShardedDataParallel) during tracing, since
21it skips tracing all the torch.distributed.fsdp code.
22- Why? Running the FSDP code eagerly avoids lots of issues trying to trace complex hooks, and also
23gets us graph-breaks on FSDP module boundaries which we want anyway for comm ops.
24- However, we _also_ want dynamo to treat the wrapped module inside FSDP 'unspecially' (*),
25and we need a way to indicate to dynamo which modules are wrapped by FSDP.
26
27(*) UnspecializedNNModules in dynamo are traced-through without any assumptions, and with thorough
28guards. NNModules otherwise are 'specialized', meaning there is less overhead due to assuming
29their code is well-behaved.
30
31One particular issue with specialized NNModules for FSDP is that the
32views created for orig_params are captured into the compiled graph on the first iteration, and while
33they are always going to point to the correct flatparameter and give correct results, their order
34of creation influences the order of backward execution, preventing overlap of comm and computation
35during backward. We need to _use_ the new parameter views created on each forward iteration, in
36order for backward to interleave hooks with compute per layer. UnspecializedNNModule lets us achieve
37this by capturing the module code more 'functionally' and passing parameters in as inputs each time.
38"""
39submodule._is_fsdp_managed_module = True # type: ignore[assignment]
40
41# Dynamo only supports FSDP with use_orig_params=True.
42# This is hacky, but I could not think of another way to add an assertion to dynamo
43# for this, since Dynamo skips all the FSDP code frames and thus can't inspect the
44# FSDP module directly
45submodule._fsdp_use_orig_params = use_orig_params # type: ignore[assignment]
46