pytorch-lightning

Форк
0
176 строк · 7.3 Кб
1
# Copyright The Lightning AI team.
2
#
3
# Licensed under the Apache License, Version 2.0 (the "License");
4
# you may not use this file except in compliance with the License.
5
# You may obtain a copy of the License at
6
#
7
#     http://www.apache.org/licenses/LICENSE-2.0
8
#
9
# Unless required by applicable law or agreed to in writing, software
10
# distributed under the License is distributed on an "AS IS" BASIS,
11
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
# See the License for the specific language governing permissions and
13
# limitations under the License.
14
import os
15
import queue
16
from typing import TYPE_CHECKING, Any, Callable, Optional, Union
17

18
import torch.multiprocessing as mp
19
from typing_extensions import override
20

21
from lightning.fabric.accelerators.xla import _XLA_AVAILABLE
22
from lightning.fabric.strategies.launchers.xla import _rank_teardown
23
from lightning.fabric.utilities import move_data_to_device
24
from lightning.pytorch.strategies.launchers.multiprocessing import (
25
    _GlobalStateSnapshot,
26
    _MultiProcessingLauncher,
27
    _WorkerOutput,
28
)
29
from lightning.pytorch.trainer.states import TrainerFn
30
from lightning.pytorch.utilities.rank_zero import rank_zero_debug
31

32
if TYPE_CHECKING:
33
    import lightning.pytorch as pl
34

35

36
class _XLALauncher(_MultiProcessingLauncher):
    r"""Process launcher for XLA supported hardware, built on `torch_xla`'s :func:`xmp.spawn`.

    The main process that invokes this launcher starts N worker processes which all execute the function handed
    to :meth:`launch`, and joins them at the end. Each worker process has a rank between 0 and N - 1.

    Note:
        - Every object used with this launcher has to be pickleable.
        - The entry point of the program/script must be guarded by ``if __name__ == "__main__"``.

    Args:
        strategy: A reference to the strategy this launcher is used with

    """

    def __init__(self, strategy: "pl.strategies.XLAStrategy") -> None:
        """Set up the launcher for ``strategy``, always with the ``"fork"`` start method.

        Raises:
            ModuleNotFoundError: If ``_XLA_AVAILABLE`` evaluates to false. Its string form is used as the message.

        """
        xla_missing = not _XLA_AVAILABLE
        if xla_missing:
            raise ModuleNotFoundError(str(_XLA_AVAILABLE))
        super().__init__(strategy=strategy, start_method="fork")

    @property
    @override
    def is_interactive_compatible(self) -> bool:
        """Report this launcher as compatible with interactive environments; the value is always ``True``."""
        return True

    @override
    def launch(self, function: Callable, *args: Any, trainer: Optional["pl.Trainer"] = None, **kwargs: Any) -> Any:
        """Run ``function`` in parallel worker processes and wait until all of them have joined.

        ``function`` may return a value, but only a single worker output is read back from the return queue in
        the main process (the workers only enqueue it when their local rank is 0).

        Arguments:
            function: The entry point executed by every launched process.
            *args: Optional positional arguments forwarded to ``function``.
            trainer: Optional reference to the :class:`~lightning.pytorch.trainer.trainer.Trainer`. When given,
                the worker output is used to restore results in the main process after the processes join.
            **kwargs: Optional keyword arguments forwarded to ``function``.

        Returns:
            The worker output itself when ``trainer`` is ``None``, otherwise its ``trainer_results``.

        Raises:
            NotImplementedError: If this launcher already ran a fit and ``trainer`` is in the fitting state again.

        """
        refit_requested = self._already_fit and trainer is not None and trainer.state.fn == TrainerFn.FITTING
        if refit_requested:
            # this restriction goes away once https://github.com/Lightning-AI/lightning/issues/18775 is resolved
            raise NotImplementedError(
                "Calling `trainer.fit()` twice on the same Trainer instance using a spawn-based strategy is not"
                " supported. You can work around this by creating a new Trainer instance and passing the"
                " `fit(ckpt_path=...)` argument."
            )

        # a manager-backed queue is used because pjrt requires the queue to be serializable
        result_queue = mp.Manager().Queue()

        import torch_xla.distributed.xla_multiprocessing as xmp

        num_processes = self._strategy.num_processes
        # `nprocs` is only forwarded when it equals 1, which avoids the "Unsupported nprocs" warning: with 1 the
        # launched function is called directly, and without the argument all devices are used
        extra_spawn_args = {"nprocs": num_processes} if num_processes == 1 else {}

        context = xmp.spawn(
            self._wrapping_function,
            args=(trainer, function, args, kwargs, result_queue),
            start_method=self._start_method,
            join=False,  # joining happens below so that the process references can be stored first
            **extra_spawn_args,
        )
        # no context is returned when xla did not actually create processes (only 1 device)
        if context is not None:
            self.procs = context.processes
            all_joined = False
            while not all_joined:
                all_joined = context.join()

        output = result_queue.get()
        if trainer is None:
            return output

        did_fit = trainer.state.fn == TrainerFn.FITTING
        self._already_fit |= did_fit
        self._recover_results_in_main_process(output, trainer)
        return output.trainer_results

    @override
    def _wrapping_function(
        self,
        # Unlike torch's multiprocessing, XLA's multiprocessing passes the global index rather than the local one
        # https://github.com/pytorch/xla/blob/v1.13.0/torch_xla/distributed/xla_multiprocessing.py#L321
        process_idx: int,
        trainer: Optional["pl.Trainer"],
        function: Callable,
        args: Any,
        kwargs: Any,
        return_queue: Union[mp.SimpleQueue, queue.Queue],
        global_states: Optional[_GlobalStateSnapshot] = None,
    ) -> None:
        """Execute ``function`` inside a worker and publish its result through ``return_queue``.

        Arguments:
            process_idx: Index handed in by XLA's multiprocessing. It is not read by this method.
            trainer: Optional trainer; when given, the result is wrapped via ``_collect_rank_zero_results``.
            function: The callable to execute.
            args: Positional arguments for ``function``.
            kwargs: Keyword arguments for ``function``.
            return_queue: Queue that receives the result (moved to CPU) when the local rank is 0.
            global_states: Accepted as part of the signature; it is not read by this method.

        """
        import torch_xla.core.xla_model as xm

        if len(xm.get_xla_supported_devices()) > 1:
            # inside the spawned process `get_xla_supported_devices` yields the logical devices (2 for v2/v3 and
            # 1 for v4); more than one means multithreading, in which case the objects have to be deep-copied
            from copy import deepcopy

            cloned = deepcopy((trainer, function, args, kwargs))
            trainer, function, args, kwargs = cloned

        results = function(*args, **kwargs)

        if trainer is not None:
            results = self._collect_rank_zero_results(trainer, results)

        local_rank = self._strategy.local_rank
        if local_rank == 0:
            cpu_results = move_data_to_device(results, "cpu")
            return_queue.put(cpu_results)

        _rank_teardown(local_rank)

    @override
    def _collect_rank_zero_results(self, trainer: "pl.Trainer", results: Any) -> Optional["_WorkerOutput"]:
        """Bundle everything the main process needs from the worker with local rank 0.

        When the trainer is fitting, every process computes the state dict and passes it to the strategy's
        checkpoint IO for saving as ``.temp.ckpt`` below ``trainer.default_root_dir``.

        Arguments:
            trainer: The trainer whose state, checkpoint callback and extra results are collected.
            results: The value returned by the launched function.

        Returns:
            ``None`` when the local rank is not 0, otherwise a ``_WorkerOutput`` holding the best model path, the
            path of the saved weights, the trainer state, ``results`` and the extra results.

        """
        rank_zero_debug("Collecting results from rank 0 process.")
        ckpt_callback = trainer.checkpoint_callback
        best_model_path = None
        if ckpt_callback and hasattr(ckpt_callback, "best_model_path"):
            best_model_path = ckpt_callback.best_model_path

        # store the last weights
        weights_path = None
        if trainer.state.fn == TrainerFn.FITTING:
            # the state_dict has to be computed on all processes in case Metrics are present
            module_state = self._strategy.lightning_module_state_dict()
            weights_path = os.path.join(trainer.default_root_dir, ".temp.ckpt")
            self._strategy.checkpoint_io.save_checkpoint(module_state, weights_path)

        # `local_rank` is used here because each VM has its own filesystem in TPU Pod Training
        if self._strategy.local_rank != 0:
            return None

        # extra result data from the trainer that is sent to the main process as well
        extra_results = self.get_extra_results(trainer)

        return _WorkerOutput(best_model_path, weights_path, trainer.state, results, extra_results)
177

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.