pytorch-lightning

Форк
0
175 строк · 5.9 Кб
1
name: Test PyTorch - TPU

on:
  push:
    branches: [master, "release/*"]
  # The pull-request trigger is `pull_request_target`, not `pull_request`;
  # expressions below that inspect `github.event_name` must account for that.
  pull_request_target:
    branches: [master, "release/*"]
    types: [opened, reopened, ready_for_review, labeled, synchronize]

concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # `github.event_name` is 'pull_request_target' for PR runs, so an exact
  # comparison with 'pull_request' is never true. Match on the prefix so that
  # superseded PR runs are cancelled while push runs are left to finish.
  # NOTE(review): the group key uses the head branch name, not the PR number —
  # confirm that two PRs sharing a head branch name should share a group.
  cancel-in-progress: ${{ startsWith(github.event_name, 'pull_request') }}

defaults:
  run:
    shell: bash
17

18
jobs:
  test-on-tpus:
    runs-on: ubuntu-22.04
    # Runs for pushes to master, or for pull-request events when the PR
    # carries the 'run TPU' label.
    if: |
      (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
      (startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.labels.*.name, 'run TPU'))
    timeout-minutes: 30
    strategy:
      fail-fast: false
      matrix:
        pkg-name:
          - "fabric"
          - "pytorch"
        accelerator_type:
          - "v4-8"
    env:
      XLA_VER: "2.0"
      # PR number when the event carries a pull request, otherwise 'master'
      PR_NUMBER: ${{ github.event.pull_request.number || 'master' }}
      # PR head commit when the event carries a pull request, otherwise the event's own commit
      SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      CLOUDSDK_CORE_DISABLE_PROMPTS: 1 # default to --quiet
    steps:
37
      - name: Set env
        run: |
          # choose the --zone for the requested accelerator: https://cloud.google.com/tpu/docs/regions-zones
          case "${{ matrix.accelerator_type }}" in
            v4*) zone="us-central2-b" ;;
            *) zone="us-west4-a" ;;
          esac
          # export the zone to all later steps of this job
          echo "CLOUDSDK_COMPUTE_ZONE=${zone}" >> $GITHUB_ENV
45

46
      # For pull-request events this checks out the PR head commit; for push
      # events the expression is empty and the action uses its default ref.
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}
          # The checked-out tree is later zipped and copied to the TPU VM, where
          # its test script is run. Do not leave the workflow token in the
          # repository's git config.
          persist-credentials: false
49
      - name: Set up Python
        uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      # Authenticate with the service-account key stored in the repository secrets
      - name: Authenticate to Google Cloud
        uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}

      - name: Set up gcloud
        uses: google-github-actions/setup-gcloud@v2
57

58
      - name: Time-based job cleanup
        if: always()
        run: |
          # Delete TPU VMs older than 35 minutes, except those with "keepalive" in their name.
          gcloud compute tpus tpu-vm list --format='value(name,createTime)' > creation_times.txt
          cat creation_times.txt

          # nothing listed: nothing to clean up
          if [[ ! -s creation_times.txt ]]; then
            echo "No existing jobs"
            exit 0
          fi

          max_age=$((35 * 60))
          jobs_deleted=false
          while read -r job_name created_at; do
            # names containing "keepalive" are never deleted
            case "$job_name" in
              *keepalive*)
                echo "Skipping $job_name, has keepalive in name"
                continue
                ;;
            esac

            # age in seconds: current Unix time minus creation Unix time
            created_epoch=$(date -d "${created_at}" +%s)
            now_epoch=$(date +%s)
            age=$((now_epoch - created_epoch))

            # still within the timeout: leave it alone
            if ((age <= max_age)); then
              echo "Skipping $job_name, alive for $age seconds"
              continue
            fi

            # past the timeout: delete the job
            gcloud compute tpus tpu-vm delete "$job_name" --async
            jobs_deleted=true
          done < creation_times.txt

          if [[ "$jobs_deleted" == true ]]; then
            sleep 5
            # diagnostics
            gcloud compute tpus tpu-vm list
          fi
99

100
      - name: Update script
        shell: python
        run: |
          # Replace the {PYTORCH_VERSION} placeholder in the TPU test script with the value of XLA_VER.
          import os

          script_path = 'tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh'
          with open(script_path) as script:
              content = script.read().replace('{PYTORCH_VERSION}', os.environ["XLA_VER"])
          # show the rewritten script in the job log
          print(content)
          with open(script_path, "w") as script:
              script.write(content)
111

112
      - name: Create node
        id: tpu-create
        # TPU capacity is very limited so this workflow's success is optional. continue normally if creation fails
        continue-on-error: true
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          if [[ "${{ matrix.accelerator_type }}" == v4* ]]; then
            gcloud compute tpus tpu-vm create "$JOB_NAME" \
              --accelerator-type="${{ matrix.accelerator_type }}" \
              --version="tpu-vm-v4-pt-$XLA_VER" \
              --preemptible
          else
            # No create command is defined for other accelerator types. Fail so the
            # step outcome is not 'success' and "Run tests" is skipped, rather than
            # letting it run against a node that was never created.
            echo "Unsupported accelerator type: ${{ matrix.accelerator_type }}" >&2
            exit 1
          fi
125

126
      - name: Run tests
        # only when the TPU node was actually created by the previous step
        if: steps.tpu-create.outcome == 'success'
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          set -uex

          # zip-copy-unzip the repository
          # NOTE(review): `-x .git/` may exclude only the directory entry and not its
          # contents — confirm whether `-x ".git/*"` was intended
          zip -q -r repo.zip . -x .git/
          gcloud compute tpus tpu-vm scp --worker=all repo.zip "$JOB_NAME":~
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; unzip -q -o repo.zip"

          # run script; capture the exit code with `||` because `set -e` would
          # otherwise abort the step here and the coverage file would never be fetched
          exit_code=0
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; bash tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh" || exit_code=$?

          # pull out the coverage file; a failed copy still fails the step, but it
          # must not mask the exit code of a failed test run
          if ! gcloud compute tpus tpu-vm scp "$JOB_NAME":~/coverage.xml .; then
            echo "Failed to retrieve coverage.xml from $JOB_NAME" >&2
            if ((exit_code == 0)); then
              exit_code=1
            fi
          fi

          exit "$exit_code"
146

147
      - name: Cleanup job
        if: always()
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          # Delete this run's TPU VM when it appears in the listing; otherwise just report it.
          if gcloud compute tpus tpu-vm list | grep -q "$JOB_NAME"; then
            # diagnostics
            gcloud compute tpus tpu-vm describe "$JOB_NAME"

            # delete the job
            gcloud compute tpus tpu-vm delete "$JOB_NAME" --async
            sleep 5

            # diagnostics
            gcloud compute tpus tpu-vm list
          else
            echo "$JOB_NAME wasn't created"
          fi
166

167
      # Upload the coverage.xml fetched from the TPU VM; `continue-on-error`
      # keeps a failed upload from failing the job.
      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        continue-on-error: true
        with:
          name: TPU-coverage
          file: coverage.xml
          flags: tpu,pytest
          token: ${{ secrets.CODECOV_TOKEN }}
          fail_ci_if_error: false
176

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.