skypilot

Форк
0
/
resnet_app_storage.py 
77 строк · 2.7 Кб
1
import subprocess
2

3
import sky
4

5
with sky.Dag() as dag:
    # Local checkout of the training code.  It is passed to the task as its
    # working directory and will be synced to the remote.
    workdir = '~/Downloads/tpu'

    # Bucket holding the dataset.  Named once so that the storage mount and
    # the declared task input below cannot drift apart.
    data_bucket = 's3://imagenet-bucket'
    # Remote path where the bucket is mounted; also passed as --data_dir.
    data_mount_path = '/tmp/imagenet'

    # Clone the repo locally to workdir unless a checkout is already there.
    # The explicit existence test keeps re-runs idempotent, while a genuine
    # clone failure (network, auth, ...) still makes check=True raise
    # CalledProcessError here instead of being masked by `|| true`.
    # shell=True is required so that the shell expands the leading `~`.
    subprocess.run(
        f'[ -e {workdir}/.git ] || '
        'git clone https://github.com/concretevitamin/tpu '
        f'{workdir}',
        shell=True,
        check=True)
    # Pin the checkout to a fixed commit.
    subprocess.run(f'cd {workdir} && git checkout 9459fee',
                   shell=True,
                   check=True)

    # The setup command.  Will be run under the working directory.
    # It creates the `resnet` conda environment only when activating it
    # fails, so repeated setups skip the installation steps.
    setup = """\
        set -e
        pip install --upgrade pip
        conda init bash
        conda activate resnet && exists=1 || exists=0
        if [ $exists -eq 0 ]; then
            conda create -n resnet python=3.7 -y
            conda activate resnet
            conda install cudatoolkit=11.0 -y
            pip install tensorflow==2.4.0 pyyaml
            pip install protobuf==3.20
            mkdir -p $CONDA_PREFIX/etc/conda/activate.d
            echo 'CUDNN_PATH=$(dirname $(python -c "import nvidia.cudnn;print(nvidia.cudnn.__file__)"))' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
            echo 'export LD_LIBRARY_PATH=$CONDA_PREFIX/lib/:$CUDNN_PATH/lib:$LD_LIBRARY_PATH' >> $CONDA_PREFIX/etc/conda/activate.d/env_vars.sh
            cd models && pip install -e .
        fi
        """

    # The command to run.  Will be run under the working directory.
    # NOTE: this is a non-raw string, so each trailing backslash-newline is
    # consumed by Python and the python invocation reaches the shell as a
    # single line.
    run = f"""\
        conda activate resnet
        export XLA_FLAGS=\'--xla_gpu_cuda_data_dir=/usr/local/cuda/\'
        python -u models/official/resnet/resnet_main.py --use_tpu=False \
            --mode=train --train_batch_size=256 --train_steps=250 \
            --iterations_per_loop=125 \
            --data_dir={data_mount_path} \
            --model_dir=resnet-model-dir \
            --amp --xla --loss_scale=128
        """

    # If no storage backend is specified, SkyPilot's optimizer chooses the
    # backend in which the bucket is stored.
    # S3 example:
    storage = sky.Storage(source=data_bucket)
    # GCS example:
    # storage = sky.Storage(name="imagenet_test_mluo", source="gs://imagenet_test_mluo")
    # The source can also be a local directory:
    # storage = sky.Storage(name="imagenet-bucket", source="~/imagenet-data/")

    train = sky.Task(
        'train',
        workdir=workdir,
        setup=setup,
        run=run,
    )
    # Mount the dataset bucket at the path the training command reads from.
    train.set_storage_mounts({
        data_mount_path: storage,
    })

    # Declare the task's input and output together with size estimates (GB).
    train.set_inputs(data_bucket, estimated_size_gigabytes=150)
    train.set_outputs('resnet-model-dir', estimated_size_gigabytes=0.1)
    # Candidate resources for the task: an AWS p3.2xlarge instance.
    train.set_resources({
        sky.Resources(sky.AWS(), 'p3.2xlarge'),
    })

sky.launch(dag)
78

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.