skypilot

Форк
0
103 строки · 3.3 Кб
1
# Example: a distributed DeepSpeed job (DeepSpeed-Chat) on 2 VMs.
2
#
3
# This takes care of constructing a "hostfile" to pass to DeepSpeed.
4
#
5
# Usage:
6
#
7
#   $ sky launch sky.yaml -r --down -c ds
8
#
9
#   # Optional: After the job starts running, you can log into the two nodes and
10
#   # check gpustat:
11
#   $ ssh ds
12
#   $ gpustat -i
13
#   $ ssh ds-worker1
14
#   $ gpustat -i
15

16
# Hardware request for each node of the job.
resources:
  # Default: a single A100 (GCP, Lambda).
  accelerators: 'A100:1'
  # Alternatives -- swap the line above for one of these:
  #   accelerators: A100-80GB:1  # Azure, GCP, SCP
  #   accelerators: A10G:1  # AWS. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh.
  #   accelerators: T4:1  # AWS, Azure, GCP. Will OOM for (1) single_node/run_1.3b_lora.sh (2) multi_node/run_66b.sh.

# The job is distributed across two nodes.
num_nodes: 2
22

23
# Prepares the DeepSpeedExamples checkout, the 'deepspeed' conda env and the
# system packages DeepSpeed needs. Every step is safe to repeat, so a setup
# that was interrupted part-way is completed by running it again.
setup: |
  # `|| true`: cloning fails harmlessly when the directory already exists.
  git clone https://github.com/microsoft/DeepSpeedExamples.git || true
  # Stop here if the checkout is missing, rather than running the following
  # git/pip commands in the wrong directory.
  cd DeepSpeedExamples || exit 1
  # Pin the examples to a fixed commit.
  git checkout d7c42b4f34df91035e7ed3e0c51500bb53d0bc71

  # Create the conda env only when it cannot be activated yet.
  if conda activate deepspeed; then
    echo 'conda env exists'
  else
    conda create -n deepspeed python=3.8 -y
    conda activate deepspeed
  fi

  # Install Python dependencies unconditionally (not only when the env was
  # just created): pip is a no-op for satisfied requirements, and this repairs
  # an env left half-installed by an earlier, interrupted setup.
  pip install deepspeed
  # Subshell keeps the working directory of the rest of the script unchanged.
  (cd applications/DeepSpeed-Chat && pip install -r requirements.txt)

  # Required by DeepSpeed in multi-node settings.
  #
  # NOTE(skypilot): DeepSpeed uses `pdsh` to log into each node and calls
  # `ninja --version`; so it has to be installed system-wide rather than in
  # the above 'deepspeed' conda env.
  #
  # Refresh the package index first so the install does not fail on stale
  # package lists.
  sudo apt-get update
  sudo apt-get -y install pdsh ninja-build
47

48
file_mounts:
  # DeepSpeed runs commands on the nodes over passwordless SSH; this mount
  # provides the key it needs for that.
  ~/.ssh/id_rsa: '~/.ssh/sky-key'
51

52
# Job command. Only the node with rank 0 starts the DeepSpeed launcher; it
# is given a hostfile listing every node.
run: |
  # Make a pipeline fail when any stage fails. Without this, the
  # `deepspeed ... | tee` pipeline below would return tee's exit status and a
  # crashed training run would be reported as successful.
  set -o pipefail

  cd DeepSpeedExamples
  conda activate deepspeed

  # Launch on the first node only
  if [ "${SKYPILOT_NODE_RANK}" == "0" ]; then

    # Prepare a hostfile: one "<ip> slots=<gpus per node>" line for each
    # line of SKYPILOT_NODE_IPS.
    HOSTFILE_PATH="/tmp/hostfile.${SKYPILOT_TASK_ID}"
    python -c "import os;n_gpus=os.environ['SKYPILOT_NUM_GPUS_PER_NODE'];print('\n'.join([f'{ip} slots={n_gpus}' for ip in os.environ['SKYPILOT_NODE_IPS'].splitlines()]))" > "${HOSTFILE_PATH}"

    echo "*******************************************"
    echo "Hostfile: ${HOSTFILE_PATH}"
    cat "${HOSTFILE_PATH}"
    echo "*******************************************"

    ################ Your launch command goes here ################

    cd applications/DeepSpeed-Chat/training/step1_supervised_finetuning/

    # Adapted from: training_scripts/single_node/run_1.3b_lora.sh
    # Note the additional argument: --hostfile $HOSTFILE_PATH
    # Alternatively, you can move HOSTFILE_PATH to /job/hostfile:
    #   sudo mkdir -p /job; sudo chmod 777 /job; mv ${HOSTFILE_PATH} /job/hostfile

    OUTPUT_PATH=./output
    mkdir -p "${OUTPUT_PATH}"
    # `2>&1` sends stderr through tee as well, so error output also ends up
    # in training.log instead of only on the console.
    deepspeed \
      --hostfile "${HOSTFILE_PATH}" \
      main.py \
      --data_path Dahoas/rm-static Dahoas/full-hh-rlhf Dahoas/synthetic-instruct-gptj-pairwise yitingxie/rlhf-reward-datasets \
      --data_split 2,4,4 \
      --model_name_or_path facebook/opt-1.3b \
      --per_device_train_batch_size 8 \
      --per_device_eval_batch_size 8 \
      --max_seq_len 512 \
      --learning_rate 1e-3 \
      --weight_decay 0.1 \
      --num_train_epochs 16 \
      --gradient_accumulation_steps 1 \
      --lr_scheduler_type cosine \
      --num_warmup_steps 0 \
      --seed 1234 \
      --zero_stage 0 \
      --lora_dim 128 \
      --lora_module_name decoder.layers. \
      --only_optimize_lora \
      --deepspeed \
      --output_dir "${OUTPUT_PATH}" \
      2>&1 | tee "${OUTPUT_PATH}/training.log"

  fi
104

Использование cookies

Мы используем файлы cookie в соответствии с Политикой конфиденциальности и Политикой использования cookies.

Нажимая кнопку «Принимаю», Вы даете АО «СберТех» согласие на обработку Ваших персональных данных в целях совершенствования нашего веб-сайта и Сервиса GitVerse, а также повышения удобства их использования.

Запретить использование cookies Вы можете самостоятельно в настройках Вашего браузера.