pytorch

Форк
0
90 строк · 2.7 Кб
1
# Composite action that prepares a ROCm runner host for a CI job.
# The steps below clean up leftover Docker state, run runner health checks,
# and export environment values for later steps.
name: Setup ROCm host

description: Set up ROCm host for CI

runs:
  # Composite action: every `run` step must declare its own `shell`.
  using: composite
  steps:
8
    - name: Set DOCKER_HOST
      shell: bash
      run: |
        # Append DOCKER_HOST to GITHUB_ENV so later steps use the Docker socket
        # located under /run/user/<uid of the current user>.
        echo "DOCKER_HOST=unix:///run/user/$(id -u)/docker.sock" >> "${GITHUB_ENV}"
11

12
    - name: Remove leftover Docker config file
      # Best-effort cleanup: a failure here must not fail the job.
      continue-on-error: true
      shell: bash
      run: |
        set -ex

        # Dump the existing config (if any) to the log before removing it.
        # NOTE(review): this file may contain registry auth data; confirm that
        # writing it to the job log is acceptable.
        cat ~/.docker/config.json || true
        # Background on the credential-saving error this works around:
        # https://stackoverflow.com/questions/64455468/error-when-logging-into-ecr-with-docker-login-error-saving-credentials-not
        rm -f ~/.docker/config.json
21

22
    - name: Stop all running docker containers
      shell: bash
      # Runs even when an earlier step has failed.
      if: always()
      run: |
        # "docker ps -q" may print nothing, so its expansion is deliberately left
        # unquoted; "|| true" tolerates the resulting "docker stop" failure.
        # shellcheck disable=SC2046
        docker stop $(docker ps -q) || true
        # Remove every container that is now stopped.
        docker container prune -f
31

32
    - name: Runner health check system info
      shell: bash
      if: always()
      run: |
        # Print OS, ROCm apt source and ROCm version details for diagnostics.
        # Any of these files may be absent, so a failed read is ignored.
        for info_file in /etc/os-release /etc/apt/sources.list.d/rocm.list /opt/rocm/.info/version; do
          cat "${info_file}" || true
        done
        # Record which user the job runs as.
        whoami
40

41
    # Health check: the step fails if rocm-smi exits non-zero.
    - name: Runner health check rocm-smi
      shell: bash
      if: always()
      run: rocm-smi
46

47
    # Health check: the step fails if rocminfo exits non-zero.
    - name: Runner health check rocminfo
      shell: bash
      if: always()
      run: rocminfo
52

53
    # Fails the job unless rocminfo reports at least two GPUs.
    - name: Runner health check GPU count
      if: always()
      shell: bash
      run: |
        # Count rocminfo lines of the form "Name: ... gfx...".
        # "grep -c" still prints 0 when nothing matches, but exits with status 1.
        # GitHub runs "shell: bash" steps with "-e -o pipefail", so without
        # "|| true" the script would abort on this assignment and never reach
        # the diagnostics below.
        ngpu=$(rocminfo | grep -c -E 'Name:.*\sgfx' || true)
        msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
        if [[ $ngpu -eq 0 ]]; then
            echo "Error: Failed to detect any GPUs on the runner"
            echo "$msg"
            exit 1
        fi
        if [[ $ngpu -eq 1 ]]; then
            echo "Error: only 1 GPU detected, at least 2 GPUs are needed for distributed jobs"
            echo "$msg"
            exit 1
        fi
69

70
    # Delegates to the repository-local diskspace-cleanup action; runs even
    # when an earlier step has failed.
    - name: Runner diskspace health check
      if: always()
      uses: ./.github/actions/diskspace-cleanup
73

74
    # Only runs when an earlier step has failed.
    # NOTE(review): runsvc.sh is presumably the runner service script, so
    # killing it takes this runner offline; confirm against the runner install.
    - name: Runner health check disconnect on failure
      shell: bash
      if: ${{ failure() }}
      run: killall runsvc.sh
79

80
    - name: Preserve github env variables for use in docker
      shell: bash
      run: |
        # Append every GITHUB* and CI* environment variable to a per-run file
        # under /tmp.
        env_file="/tmp/github_env_${GITHUB_RUN_ID}"
        env | grep '^GITHUB' >> "${env_file}"
        env | grep '^CI' >> "${env_file}"
85

86
    # Exports GPU_FLAG (device and group options) through GITHUB_ENV.
    # All GPUs are visible to the runner; visibility, if needed, will be set by run_test.py.
    - name: ROCm set GPU_FLAG
      shell: bash
      run: echo "GPU_FLAG=--device=/dev/mem --device=/dev/kfd --device=/dev/dri --group-add video --group-add daemon" >> "${GITHUB_ENV}"
91

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.