# pytorch
# Original listing: 90 lines · 2.7 KB
# Composite action that prepares a ROCm CI runner: it points Docker at the
# per-user socket, clears leftover Docker state, runs a series of runner
# health checks, and exports the environment/flags later steps need.
name: Setup ROCm host

description: Set up ROCm host for CI

runs:
  using: composite
  steps:
    # Export DOCKER_HOST for all subsequent steps, pointing at the Docker
    # socket under the current user's /run/user/<uid> directory.
    # NOTE(review): presumably a rootless Docker daemon - confirm.
    - name: Set DOCKER_HOST
      shell: bash
      run: echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"

    # Best-effort cleanup (continue-on-error): print, then delete, any Docker
    # client config left behind on the runner.
    # NOTE(review): config.json can hold registry auth entries; confirm that
    # printing it to the job log is acceptable.
    - name: Remove leftover Docker config file
      shell: bash
      continue-on-error: true
      run: |
        set -ex

        cat ~/.docker/config.json || true
        # https://stackoverflow.com/questions/64455468/error-when-logging-into-ecr-with-docker-login-error-saving-credentials-not
        rm -f ~/.docker/config.json

    # Stop every running container (tolerating the "nothing running" case)
    # and then remove all stopped containers.
    - name: Stop all running docker containers
      if: always()
      shell: bash
      run: |
        # ignore expansion of "docker ps -q" since it could be empty
        # shellcheck disable=SC2046
        docker stop $(docker ps -q) || true
        # Prune all stopped containers.
        docker container prune -f

    # Dump OS / ROCm version information and the current user into the log.
    # Missing files are tolerated; only `whoami` can fail this step.
    - name: Runner health check system info
      if: always()
      shell: bash
      run: |
        cat /etc/os-release || true
        cat /etc/apt/sources.list.d/rocm.list || true
        cat /opt/rocm/.info/version || true
        whoami

    # Fails the step if rocm-smi exits non-zero.
    - name: Runner health check rocm-smi
      if: always()
      shell: bash
      run: |
        rocm-smi

    # Fails the step if rocminfo exits non-zero.
    - name: Runner health check rocminfo
      if: always()
      shell: bash
      run: |
        rocminfo

    # Count the rocminfo "Name: ... gfx*" lines and require at least two.
    # The script runs under bash's errexit/pipefail, and `grep -c` exits 1
    # when it counts zero matches, so the pipeline is guarded with `|| true`;
    # without the guard the script would abort on the assignment and the
    # zero-GPU diagnostic below could never be printed.
    - name: Runner health check GPU count
      if: always()
      shell: bash
      run: |
        ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx' || true)
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ $ngpu -eq 0 ]]; then
          echo "Error: Failed to detect any GPUs on the runner"
          echo "$msg"
          exit 1
        fi
        if [[ $ngpu -eq 1 ]]; then
          echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs"
          echo "$msg"
          exit 1
        fi

    # Delegates to the repository-local diskspace-cleanup action.
    - name: Runner diskspace health check
      uses: ./.github/actions/diskspace-cleanup
      if: always()

    # Runs only when an earlier step has failed: kills the runsvc.sh process.
    # NOTE(review): presumably this takes the unhealthy self-hosted runner
    # offline so it stops picking up jobs - confirm.
    - name: Runner health check disconnect on failure
      if: ${{ failure() }}
      shell: bash
      run: |
        killall runsvc.sh

    # Append every GITHUB* and CI* environment variable to a per-run file in
    # /tmp so it can be handed to docker later.
    - name: Preserve github env variables for use in docker
      shell: bash
      run: |
        env | grep '^GITHUB' >> "/tmp/github_env_${GITHUB_RUN_ID}"
        env | grep '^CI' >> "/tmp/github_env_${GITHUB_RUN_ID}"

    # Export the docker device/group arguments used to expose the GPUs.
    - name: ROCm set GPU_FLAG
      shell: bash
      run: |
        # All GPUs are visible to the runner; visibility, if needed, will be set by run_test.py.
        echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
91