# SkyPilot task: train ResNet on the fake_imagenet dataset using one V100 on AWS.
name: resnet-app

# Hardware request: a single V100 accelerator on AWS.
resources:
  cloud: aws
  accelerators:
    V100: 1

# Input location and its size hint.
# NOTE(review): the number is presumably an estimated size in GB - confirm
# against the SkyPilot task spec.
# The bucket name matches the one passed to --data_dir in the run section
# ("cloud-tpu-test-datasets", plural).
inputs: {
  gs://cloud-tpu-test-datasets/fake_imagenet: 70,
}

# Output location and its size hint (same presumed unit as inputs).
# The key matches the --model_dir value used in the run section.
outputs: {
  resnet-model-dir: 0.1,
}

# file_mounts: {
#   /tmp/fake_imagenet: gs://cloud-tpu-test-datasets/fake_imagenet,
# }

# One-time environment preparation: fetch the model code at a pinned commit
# and make sure a 'resnet' conda env with the pinned dependencies exists.
setup: |
  # '|| true' tolerates a failed clone, e.g. when the checkout already exists.
  git clone https://github.com/concretevitamin/tpu || true
  cd tpu
  git checkout 9459fee

  # Load conda's shell functions so 'conda activate' works in this shell.
  . "$(conda info --base)/etc/profile.d/conda.sh"
  pip install --upgrade pip

  # Test the activation directly instead of reading $? afterwards, so the
  # check cannot be broken by a command inserted in between.
  if conda activate resnet; then
    echo "conda env exists"
  else
    conda create -n resnet python=3.7 -y
    conda activate resnet
    conda install cudatoolkit=11.0 -y
    pip install tensorflow==2.4.0 pyyaml
    pip install protobuf==3.20

    # Automatically set CUDNN envvars when conda activate is run
    mkdir -p "$CONDA_PREFIX/etc/conda/activate.d"
    echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> "$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh"
    echo 'export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$CUDNN_PATH/lib:$LD_LIBRARY_PATH' >> "$CONDA_PREFIX/etc/conda/activate.d/env_vars.sh"

    # Install the cloned repo's 'models' package in editable mode.
    cd models
    pip install -e .
  fi

# Training job: activate the 'resnet' conda env and run resnet_main.py without
# TPU support, reading from the GCS dataset and writing to resnet-model-dir.
run: |
  cd tpu
  # Load conda's shell functions so 'conda activate' works in this shell.
  . "$(conda info --base)/etc/profile.d/conda.sh"
  conda activate resnet

  # Point XLA at the CUDA installation directory.
  export XLA_FLAGS='--xla_gpu_cuda_data_dir=/usr/local/cuda/'
  python -u models/official/resnet/resnet_main.py --use_tpu=False \
      --mode=train --train_batch_size=256 --train_steps=250 \
      --iterations_per_loop=125 \
      --data_dir=gs://cloud-tpu-test-datasets/fake_imagenet \
      --model_dir=resnet-model-dir \
      --amp --xla --loss_scale=128