# skypilot
# 87 lines · 2.3 KB
# Pipeline-level header document: names the whole job. The task documents
# that make up the pipeline follow, each introduced by a `---` marker.
name: bert_qa

---

# Task "train": fine-tunes bert-base-uncased on the SQuAD dataset with the
# Hugging Face question-answering example (run_qa.py) for one epoch and
# writes checkpoints under /checkpoint.
name: train

resources:
  accelerators: V100:1

# Assume your working directory is under `~/transformers`.
# To make this example work, please run the following command:
# git clone https://github.com/huggingface/transformers.git ~/transformers -b v4.30.1
workdir: ~/transformers

file_mounts:
  /checkpoint:
    name: test-bert-train-eval  # NOTE: Fill in your bucket name
    mode: MOUNT

# Installs the checked-out transformers tree in editable mode, then the
# example's requirements with a CUDA 11.3 build of torch, then wandb.
setup: |
  pip install -e .
  cd examples/pytorch/question-answering/
  pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
  pip install wandb

# The output directory and the wandb run name are both keyed by
# $SKYPILOT_TASK_ID, so each run writes to its own directory under
# /checkpoint/bert_qa/.
run: |
  cd examples/pytorch/question-answering/
  python run_qa.py \
    --model_name_or_path bert-base-uncased \
    --dataset_name squad \
    --do_train \
    --per_device_train_batch_size 12 \
    --learning_rate 3e-5 \
    --num_train_epochs 1 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --report_to wandb \
    --run_name $SKYPILOT_TASK_ID \
    --output_dir /checkpoint/bert_qa/$SKYPILOT_TASK_ID \
    --save_total_limit 10 \
    --save_steps 1000
  echo Model saved to /checkpoint/bert_qa/$SKYPILOT_TASK_ID

envs:
  WANDB_API_KEY: null  # NOTE: Fill in your wandb key

---

# Task "eval": runs run_qa.py in evaluation mode (--do_eval) on SQuAD against
# the model directory /checkpoint/bert_qa/<id>, where <id> is the first line
# of $SKYPILOT_TASK_IDS.
name: eval

resources:
  accelerators: T4:1

workdir: ~/transformers

file_mounts:
  /checkpoint:
    name: test-bert-train-eval  # NOTE: Fill in your bucket name
    mode: MOUNT

# Installs the checked-out transformers tree in editable mode, then the
# example's requirements with a CUDA 11.3 build of torch, then wandb.
setup: |
  pip install -e .
  cd examples/pytorch/question-answering/
  pip install -r requirements.txt torch==1.12.1+cu113 --extra-index-url https://download.pytorch.org/whl/cu113
  pip install wandb

# `sed -n 1p` keeps only the first line of $SKYPILOT_TASK_IDS; that ID selects
# the checkpoint directory to load and is reused as the output directory.
# NOTE(review): only --do_eval is passed, so the training-oriented flags
# (--per_device_train_batch_size, --learning_rate, --num_train_epochs,
# --save_total_limit, --save_steps) are presumably unused here — confirm
# before removing them.
run: |
  FIRST_TASK_UNIQUE_ID=$(echo "$SKYPILOT_TASK_IDS" | sed -n 1p)
  echo Load model from /checkpoint/bert_qa/$FIRST_TASK_UNIQUE_ID
  cd examples/pytorch/question-answering/
  python run_qa.py \
    --model_name_or_path /checkpoint/bert_qa/$FIRST_TASK_UNIQUE_ID \
    --dataset_name squad \
    --do_eval \
    --per_device_train_batch_size 12 \
    --learning_rate 3e-5 \
    --num_train_epochs 50 \
    --max_seq_length 384 \
    --doc_stride 128 \
    --report_to wandb \
    --run_name $SKYPILOT_TASK_ID \
    --output_dir /checkpoint/bert_qa/$FIRST_TASK_UNIQUE_ID \
    --save_total_limit 10 \
    --save_steps 1000

envs:
  WANDB_API_KEY: null  # NOTE: Fill in your wandb key