3
description: Set up XPU host for CI
8
- name: Clean all stopped docker containers
12
# Prune all stopped containers.
# Skip the prune when another runner on this node is already running one.
# pgrep never reports itself, so unlike `ps -ef | grep -c ...` the check does
# not depend on the matcher showing up in its own process listing.
if ! pgrep -f "docker container prune" > /dev/null; then
docker container prune -f
19
- name: Runner health check system info
23
# Print the OS release file and the oneAPI / Intel GPU apt source lists.
# Each read is best-effort: a missing or unreadable file does not fail the step.
for info_file in /etc/os-release /etc/apt/sources.list.d/oneAPI.list /etc/apt/sources.list.d/intel-gpu-jammy.list; do
cat "${info_file}" || true
done
28
- name: Runner health check xpu-smi
34
- name: Runner health check GPU count
38
# Count the 'Device Name' lines printed by `xpu-smi discovery`.
# grep -c still prints 0 when nothing matches but exits with status 1; the
# `|| true` keeps that status from aborting a shell running with
# errexit/pipefail, so the diagnostic below is actually reached.
ngpu=$(xpu-smi discovery | grep -c -E 'Device Name' || true)
msg="Please file an issue on pytorch/pytorch reporting the faulty runner. Include a link to the runner logs so the runner can be identified"
if [[ $ngpu -eq 0 ]]; then
echo "Error: Failed to detect any GPUs on the runner"
46
- name: Runner diskspace health check
47
# Runs the repository-local action at .github/actions/diskspace-cleanup.
uses: ./.github/actions/diskspace-cleanup
50
- name: Runner health check disconnect on failure
56
- name: Preserve github env variables for use in docker
59
# Append every environment entry starting with GITHUB, then every entry
# starting with CI, to a per-run file under /tmp keyed by GITHUB_RUN_ID.
github_env_file="/tmp/github_env_${GITHUB_RUN_ID}"
for prefix in GITHUB CI; do
env | grep "^${prefix}" >> "${github_env_file}"
done
62
- name: XPU set GPU_FLAG
65
# Look up the GID of the `render` group and add it (together with `video`)
# to the device flags exported as GPU_FLAG for container creation.
# getent resolves the group by its exact name; grepping /etc/group for
# "render" could also match other lines containing that text and produce
# several GIDs, which would corrupt the GITHUB_ENV entry.
render_gid=$(getent group render | cut -d: -f3)
echo "GPU_FLAG=--device=/dev/mem --device=/dev/dri --group-add video --group-add $render_gid" >> "${GITHUB_ENV}"