# pytorch-lightning: .azure/gpu-tests-fabric.yml
# Python package
# Create and test a Python package on multiple Python versions.
# Add steps that analyze code, save the dist with the build record, publish to a PyPI-compatible index, and more:
# https://docs.microsoft.com/azure/devops/pipelines/languages/python

# CI trigger: which pushes start this pipeline.
trigger:
  # every tag
  tags:
    include:
      - "*"
  # pushes to the default branch, to release branches, and to tag refs
  branches:
    include:
      - "master"
      - "release/*"
      - "refs/tags/*"

# Pull-request trigger: filtered by branch and by the paths a PR touches.
pr:
  branches:
    include:
      - "master"
      - "release/*"
  paths:
    # Fabric sources, tests, examples, requirements, and the CI helpers they rely on
    include:
      - ".actions/*"
      - ".azure/gpu-tests-fabric.yml"
      - "examples/fabric/**"
      - "examples/run_fabric_examples.sh"
      - "tests/run_standalone_*.sh"
      - "requirements/fabric/**"
      - "src/lightning/__init__.py"
      - "src/lightning/__setup__.py"
      - "src/lightning/__version__.py"
      - "src/lightning/fabric/**"
      - "src/lightning_fabric/*"
      - "tests/tests_fabric/**"
      - "pyproject.toml"  # includes pytest config
    # documentation-only changes are excluded from the path filter
    exclude:
      - "requirements/*/docs.txt"
      - "*.md"
      - "**/*.md"

jobs:
  # One job definition; the matrix below expands it into one run per package flavour.
  - job: testing
    # how long to run the job before automatically cancelling
    timeoutInMinutes: "20"
    # how much time to give 'run always even if cancelled tasks' before stopping them
    cancelTimeoutInMinutes: "2"
    pool: lit-rtx-3090
    variables:
      # A shell command substitution: bash evaluates it wherever $(DEVICES) is pasted
      # into a script, printing the part of the agent name after the last underscore.
      DEVICES: $( python -c 'print("$(Agent.Name)".split("_")[-1])' )
      FREEZE_REQUIREMENTS: "1"
      PIP_CACHE_DIR: "/var/tmp/pip"
      PL_RUN_CUDA_TESTS: "1"
    container:
      # `image` is supplied per matrix entry below
      image: $(image)
      # default shm size is 64m. Increase it to avoid:
      # 'Error while creating shared memory: unhandled system error, NCCL version 2.7.8'
      options: "--gpus=all --shm-size=2gb  -v /var/tmp:/var/tmp"
    strategy:
      matrix:
        # TODO: Upgrade to Python 3.11
        "Fabric | latest":
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
          PACKAGE_NAME: "fabric"
        "Lightning | latest":
          image: "pytorchlightning/pytorch_lightning:base-cuda-py3.10-torch2.2-cuda12.1.0"
          PACKAGE_NAME: "lightning"
    workspace:
      clean: all
    steps:
      # Publish the variables later steps read: visible GPUs, CUDA major+minor digits,
      # the torch wheel index URL, and the package name used as the coverage source.
      - bash: |
          echo "##vso[task.setvariable variable=CUDA_VISIBLE_DEVICES]$(DEVICES)"
          cuda_ver=$(python -c "import torch ; print(''.join(map(str, torch.version.cuda.split('.')[:2])))")
          echo "##vso[task.setvariable variable=CUDA_VERSION_MM]$cuda_ver"
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/cu${cuda_ver}/torch_stable.html"
          scope=$(python -c 'n = "$(PACKAGE_NAME)" ; print(dict(fabric="lightning_fabric").get(n, n))')
          echo "##vso[task.setvariable variable=COVERAGE_SOURCE]$scope"
        displayName: "set env. vars"
      # Overrides TORCH_URL with the torch test-wheel index for jobs whose name ends with 'future'.
      # NOTE(review): no matrix entry above ends with 'future', so this step looks
      # inactive at the moment - confirm it is kept on purpose.
      - bash: |
          echo "##vso[task.setvariable variable=TORCH_URL]https://download.pytorch.org/whl/test/cu${CUDA_VERSION_MM}/torch_test.html"
        condition: endsWith(variables['Agent.JobName'], 'future')
        displayName: "set env. vars 4 future"

      # Diagnostics only: echo the computed variables and describe the GPU / Python setup.
      - bash: |
          echo $(DEVICES)
          echo $CUDA_VISIBLE_DEVICES
          echo $CUDA_VERSION_MM
          echo $TORCH_URL
          echo $COVERAGE_SOURCE
          whereis nvidia
          nvidia-smi
          which python && which pip
          python --version
          pip --version
          pip list
        displayName: "Image info & NVIDIA"

      # Run the downloaded adjust-torch-versions.py over every requirements file,
      # passing the torch version already installed in the image.
      # The loop iterates the glob directly instead of parsing `ls` output, so file
      # names are never word-split; arguments are quoted for the same reason.
      - bash: |
          PYTORCH_VERSION=$(python -c "import torch; print(torch.__version__.split('+')[0])")
          pip install -q wget packaging
          python -m wget https://raw.githubusercontent.com/Lightning-AI/utilities/main/scripts/adjust-torch-versions.py
          for fpath in requirements/**/*.txt; do
            python ./adjust-torch-versions.py "$fpath" "${PYTORCH_VERSION}"
          done
        displayName: "Adjust dependencies"

      # The 'lightning' flavour installs the "fabric-dev" extra; any other flavour installs "dev".
      - bash: |
          extra=$(python -c "print({'lightning': 'fabric-'}.get('$(PACKAGE_NAME)', ''))")
          pip install -e ".[${extra}dev]" pytest-timeout -U --find-links="${TORCH_URL}"
        displayName: "Install package & dependencies"

      # Sanity checks: exactly two GPUs must be visible and bitsandbytes must import.
      - bash: |
          set -e
          python requirements/collect_env_details.py
          python -c "import torch ; mgpu = torch.cuda.device_count() ; assert mgpu == 2, f'GPU: {mgpu}'"
          python -c "import bitsandbytes"
        displayName: "Env details"

      - bash: python -m pytest lightning_fabric
        workingDirectory: src
        # without succeeded this could run even if the job has already failed
        condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
        displayName: "Testing: Fabric doctests"

      # For the standalone 'fabric' flavour, rewrite `lightning.fabric` imports in the
      # tests and examples to `lightning_fabric`.
      - bash: |
          pip install -q -r .actions/requirements.txt
          python .actions/assistant.py copy_replace_imports --source_dir="./tests/tests_fabric" \
            --source_import="lightning.fabric" \
            --target_import="lightning_fabric"
          python .actions/assistant.py copy_replace_imports --source_dir="./examples/fabric" \
            --source_import="lightning.fabric" \
            --target_import="lightning_fabric"
        # without succeeded this could run even if the job has already failed
        condition: and(succeeded(), eq(variables['PACKAGE_NAME'], 'fabric'))
        displayName: "Adjust tests & examples"

      - bash: python -m coverage run --source ${COVERAGE_SOURCE} -m pytest . -v --durations=50
        workingDirectory: tests/tests_fabric/
        displayName: "Testing: fabric standard"
        timeoutInMinutes: "10"

      - bash: bash ../run_standalone_tests.sh "."
        workingDirectory: tests/tests_fabric/
        env:
          PL_STANDALONE_TESTS_SOURCE: $(COVERAGE_SOURCE)
        displayName: "Testing: fabric standalone"
        timeoutInMinutes: "10"

      # Produce coverage reports and upload them with the Codecov uploader.
      - bash: |
          python -m coverage report
          python -m coverage xml
          python -m coverage html

          # https://docs.codecov.com/docs/codecov-uploader
          curl -Os https://uploader.codecov.io/latest/linux/codecov
          chmod +x codecov
          ./codecov --token=$(CODECOV_TOKEN) --commit=$(Build.SourceVersion) \
            --flags=gpu,pytest,${COVERAGE_SOURCE} --name="GPU-coverage" --env=linux,azure
          ls -l
        workingDirectory: tests/tests_fabric/
        displayName: "Statistics"

      # Uses `bash:` like every other step in this job; runs the examples on one GPU,
      # then on two GPUs with the ddp strategy.
      - bash: |
          set -e
          bash run_fabric_examples.sh --accelerator=cuda --devices=1
          bash run_fabric_examples.sh --accelerator=cuda --devices=2 --strategy ddp
        workingDirectory: examples/
        displayName: "Testing: fabric examples"
# End of .azure/gpu-tests-fabric.yml (pytorch-lightning)