# pytorch-lightning
# 175 lines · 5.9 KB
name: Test PyTorch - TPU

# Triggers: pushes to the main/release branches, plus pull requests that
# target those branches.
on:
  push:
    branches:
      - master
      - "release/*"
  # `pull_request_target` is used (not `pull_request`), so `github.event_name`
  # is 'pull_request_target' for every PR-triggered run of this workflow.
  pull_request_target:
    branches:
      - master
      - "release/*"
    types:
      - opened
      - reopened
      - ready_for_review
      - labeled
      - synchronize

# One concurrency group per workflow / ref / PR head branch.
concurrency:
  group: ${{ github.workflow }}-${{ github.ref }}-${{ github.head_ref }}
  # PR runs are triggered through `pull_request_target`, so the event name is
  # never exactly 'pull_request'. Match every `pull_request*` event so that a
  # newer PR run cancels the one it supersedes; pushes are never cancelled.
  cancel-in-progress: ${{ startsWith(github.event_name, 'pull_request') }}

# Every `run` step uses bash unless the step sets its own `shell`.
defaults:
  run:
    shell: bash

jobs:
  # Creates a TPU VM per matrix entry, runs the package's TPU test script on it
  # over SSH, pulls back the coverage report and always deletes the VM.
  test-on-tpus:
    runs-on: ubuntu-22.04
    # run only on a push to master, or on a PR that carries the 'run TPU' label
    if: |
      (github.event_name == 'push' && github.ref == 'refs/heads/master') ||
      (startsWith(github.event_name, 'pull_request') && contains(github.event.pull_request.labels.*.name, 'run TPU'))
    strategy:
      fail-fast: false
      matrix:
        pkg-name: ["fabric", "pytorch"]
        accelerator_type: ["v4-8"]
    timeout-minutes: 30
    env:
      XLA_VER: "2.0"
      # PR number for PR events, 'master' otherwise (used in the TPU VM name)
      PR_NUMBER: ${{ github.event.pull_request.number || 'master' }}
      # PR head commit for PR events, the pushed commit otherwise
      SHA: ${{ github.event.pull_request.head.sha || github.sha }}
      CLOUDSDK_CORE_DISABLE_PROMPTS: "1" # default to --quiet
    steps:
      - name: Set env
        run: |
          # define --zone: https://cloud.google.com/tpu/docs/regions-zones
          if [[ "${{ matrix.accelerator_type }}" == v4* ]]; then
            echo "CLOUDSDK_COMPUTE_ZONE=us-central2-b" >> "$GITHUB_ENV"
          else
            echo "CLOUDSDK_COMPUTE_ZONE=us-west4-a" >> "$GITHUB_ENV"
          fi

      # SECURITY NOTE: on `pull_request_target` this checks out the PR head
      # commit while later steps use repository secrets. The job-level `if`
      # restricts PR runs to those labelled 'run TPU'; keep that gate in place.
      - uses: actions/checkout@v4
        with:
          ref: ${{ github.event.pull_request.head.sha }}
      - uses: actions/setup-python@v5
        with:
          python-version: "3.10"

      - uses: google-github-actions/auth@v2
        with:
          credentials_json: ${{ secrets.GKE_SA_KEY_BASE64 }}
      - uses: "google-github-actions/setup-gcloud@v2"

      # Deletes TPU VMs older than 35 minutes (slightly above this job's
      # 30-minute timeout), except those with "keepalive" in their name.
      - name: Time-based job cleanup
        if: always()
        run: |
          gcloud compute tpus tpu-vm list --format='value(name,createTime)' > creation_times.txt
          cat creation_times.txt

          if [ ! -s "creation_times.txt" ]; then
            echo "No existing jobs"
            exit 0
          fi

          jobs_deleted=false
          while read -r job_name created_at; do
            # Skip jobs with "keepalive" in the name
            if [[ "$job_name" == *"keepalive"* ]]; then
              echo "Skipping $job_name, has keepalive in name"
              continue
            fi

            # Convert the creation time to Unix timestamp
            created_timestamp=$(date -d "${created_at}" +%s)

            # Calculate the difference between the current time and the creation time
            current_timestamp=$(date +%s)
            age=$((current_timestamp - created_timestamp))

            # Check if the age has surpassed a timeout
            if ((age > 35 * 60)); then
              # delete the job
              gcloud compute tpus tpu-vm delete "$job_name" --async
              jobs_deleted=true
            else
              echo "Skipping $job_name, alive for $age seconds"
            fi
          done < creation_times.txt

          if [ "$jobs_deleted" = true ]; then
            sleep 5
            # diagnostics
            gcloud compute tpus tpu-vm list
          fi

      # Substitutes the `{PYTORCH_VERSION}` placeholder in the package's TPU
      # test script with XLA_VER, in place.
      - name: Update script
        run: |
          import os
          fname = f'tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh'
          with open(fname) as fopen:
              data = fopen.read()
          data = data.replace('{PYTORCH_VERSION}', os.environ["XLA_VER"])
          print(data)
          with open(fname, "w") as fopen:
              fopen.write(data)
        shell: python

      - name: Create node
        id: tpu-create
        # TPU capacity is very limited so this workflow's success is optional. continue normally if creation fails
        continue-on-error: true
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          if [[ "${{ matrix.accelerator_type }}" == v4* ]]; then
            gcloud compute tpus tpu-vm create "$JOB_NAME" \
              --accelerator-type=${{ matrix.accelerator_type }} \
              --version="tpu-vm-v4-pt-$XLA_VER" \
              --preemptible
          fi

      - name: Run tests
        if: steps.tpu-create.outcome == 'success'
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          set -uex

          # zip-copy-unzip the repository
          # NOTE(review): `-x .git/` may exclude only the directory entry itself,
          # not the files below it; confirm whether `.git/*` was intended.
          zip -q -r repo.zip . -x .git/
          gcloud compute tpus tpu-vm scp --worker=all repo.zip "$JOB_NAME":~
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; unzip -q -o repo.zip"

          # run script; capture its status with `||` because under `set -e` a bare
          # failing command would abort here, before the coverage file is pulled
          exit_code=0
          gcloud compute tpus tpu-vm ssh "$JOB_NAME" --worker=all --command="cd ~; bash tests/tests_${{ matrix.pkg-name }}/run_tpu_tests.sh" || exit_code=$?

          # pull out the coverage file
          if ! gcloud compute tpus tpu-vm scp "$JOB_NAME":~/coverage.xml .; then
            echo "Could not retrieve coverage.xml from $JOB_NAME"
            # keep the test failure code if there is one; otherwise a missing
            # report still fails the step
            if [ "$exit_code" -eq 0 ]; then
              exit_code=1
            fi
          fi

          exit "$exit_code"

      # Runs regardless of earlier outcomes so the TPU VM is not left behind.
      - name: Cleanup job
        if: always()
        env:
          JOB_NAME: ${{ env.PR_NUMBER }}-${{ matrix.pkg-name }}-${{ matrix.accelerator_type }}-${{ env.SHA }}
        run: |
          if ! gcloud compute tpus tpu-vm list | grep -q "$JOB_NAME"; then
            echo "$JOB_NAME wasn't created"
            exit 0
          fi

          # diagnostics
          gcloud compute tpus tpu-vm describe "$JOB_NAME"

          # delete the job
          gcloud compute tpus tpu-vm delete "$JOB_NAME" --async
          sleep 5

          # diagnostics
          gcloud compute tpus tpu-vm list

      - name: Upload coverage to Codecov
        uses: codecov/codecov-action@v4
        # coverage upload is best-effort and must not fail the job
        continue-on-error: true
        with:
          token: ${{ secrets.CODECOV_TOKEN }}
          file: coverage.xml
          flags: tpu,pytest
          name: TPU-coverage
          fail_ci_if_error: false
