7
# Directory of the repository checkout; the setup step cd's into this path to
# run `git checkout 9459fee`, pinning the code to a fixed commit.
workdir = '~/Downloads/tpu'
9
# Path at which the dataset is exposed to the task: it is the key used in the
# storage-mounts mapping and the value passed to the training script as
# --data_dir.
data_mount_path = '/tmp/imagenet'
13
'git clone https://github.com/concretevitamin/tpu '
17
subprocess.run(f'cd {workdir} && git checkout 9459fee',
24
pip install --upgrade pip
26
conda activate resnet && exists=1 || exists=0
27
if [ $exists -eq 0 ]; then
28
conda create -n resnet python=3.7 -y
30
conda install cudatoolkit=11.0 -y
31
pip install tensorflow==2.4.0 pyyaml
32
pip install protobuf==3.20
33
mkdir -p $CONDA_PREFIX/etc/conda/activate.d
34
echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
35
echo 'export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$CUDNN_PATH/lib:$LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
36
cd models && pip install -e .
43
export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\'
44
python -u models/official/resnet/resnet_main.py --use_tpu=False \
45
--mode=train --train_batch_size=256 --train_steps=250 \
46
--iterations_per_loop=125 \
47
--data_dir={data_mount_path} \
48
--model_dir=resnet-model-dir \
49
--amp --xla --loss_scale=128
55
# Storage object whose source is the S3 bucket below; it is attached to the
# task under data_mount_path via train.set_storage_mounts.
storage = sky.Storage(source="s3://imagenet-bucket")
67
train.set_storage_mounts({
68
data_mount_path: storage,
71
# Declare the task's input location (the same bucket used as the storage
# source) together with its estimated size in gigabytes.
train.set_inputs('s3://imagenet-bucket', estimated_size_gigabytes=150)
72
# Declare the task's output and its estimated size in gigabytes; the name
# matches the --model_dir value passed to the training script.
train.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
74
sky.Resources(sky.AWS(), 'p3.2xlarge'),