# SkyPilot task: runs ResNet in inference mode on the fake_imagenet dataset.
#
# Usage:
#   sky launch -c infer resnet_inference_app.yaml
#   sky down infer

# Task name.
name: resnet-inference

# Hardware requested for the task: one V100 accelerator.
resources:
  accelerators:
    V100: 1

# Makes the contents of the S3 bucket available at /tmp/resnet-model-dir.
# The `run` section passes this same path as --model_dir.
file_mounts:
  /tmp/resnet-model-dir:
    source: s3://mluo-resnet-model-dir

# Setup script: fetches the model code and prepares the `resnet` conda
# environment. Written so that it can be re-run on a machine where the
# repository and/or the environment already exist.
setup: |
  # `|| true` keeps a re-run going when the clone fails (e.g. the `tpu`
  # directory is already there from a previous run).
  git clone https://github.com/concretevitamin/tpu || true
  # Stop here if the repository is missing, instead of running the remaining
  # steps from the wrong directory.
  cd tpu || exit 1
  git checkout gpu_train

  # Load conda's shell functions so that `conda activate` works in this shell.
  . "$(conda info --base)/etc/profile.d/conda.sh"
  # Runs before the `resnet` env is activated, so this upgrades pip in
  # whichever environment is active at this point.
  pip install --upgrade pip

  # Probe for the environment: activation fails if `resnet` does not exist.
  conda activate resnet

  # `$?` is the exit status of the `conda activate` probe above; do not insert
  # any command between the two.
  if [ $? -eq 0 ]; then
    echo "conda env exists"
  else
    conda create -n resnet python=3.7 -y
    conda activate resnet
    conda install cudatoolkit=11.0 -y
    pip install tensorflow==2.4.0 pyyaml
    pip install protobuf==3.20
    # Editable install of the package under tpu/models.
    cd models
    pip install -e .
  fi

# Run script: activates the `resnet` conda environment and launches
# resnet_main.py in inference mode against the mounted model directory.
run: |
  # Stop here if the repository is missing rather than launching from the
  # wrong directory.
  cd tpu || exit 1
  # Load conda's shell functions so that `conda activate` works in this shell.
  . "$(conda info --base)/etc/profile.d/conda.sh"
  conda activate resnet

  export XLA_FLAGS='--xla_gpu_cuda_data_dir=/usr/local/cuda/'
  # --model_dir must match the mount path declared under `file_mounts`.
  python -u models/official/resnet/resnet_main.py --use_tpu=False \
    --mode=infer --data_dir=gs://cloud-tpu-test-datasets/fake_imagenet \
    --model_dir=/tmp/resnet-model-dir --amp --xla --loss_scale=128 \
    --infer_batch_size=8 --infer_steps=10000