# SkyPilot task: run axolotl training in Docker on a single spot A100.
#
# Usage:
#
#   Unmanaged spot (no auto-recovery; for debugging):
#     HF_TOKEN=abc BUCKET=<unique-name> sky launch -c axolotl-spot axolotl-spot.yaml --env HF_TOKEN --env BUCKET -i30 --down
#
#   Managed spot (auto-recovery; for full runs):
#     HF_TOKEN=abc BUCKET=<unique-name> sky spot launch -n axolotl-spot axolotl-spot.yaml --env HF_TOKEN --env BUCKET

name: axolotl

# One A100 on a spot (preemptible) instance.
resources:
  accelerators: A100:1
  cloud: gcp  # optional
  use_spot: true

# Local directory used as the task's working directory; the run commands
# below mount ~/sky_workdir into the containers as /sky_workdir.
workdir: mistral

# Bucket named by the BUCKET env var, mounted at /sky-notebook.
# NOTE(review): presumably the checkpoint location used by
# qlora-checkpoint.yaml so a recovered spot job can resume - confirm.
file_mounts:
  /sky-notebook:
    name: ${BUCKET}
    mode: MOUNT

# Pre-pull the pinned axolotl image.
setup: |
  docker pull winglian/axolotl:main-py3.10-cu118-2.0.1

run: |
  # Log in to Hugging Face. Both containers bind-mount the host's
  # /root/.cache, presumably so the stored login is visible to the
  # training container below.
  docker run --gpus all \
    -v ~/sky_workdir:/sky_workdir \
    -v /root/.cache:/root/.cache \
    winglian/axolotl:main-py3.10-cu118-2.0.1 \
    huggingface-cli login --token "${HF_TOKEN}"

  # Launch training with the config shipped in the workdir; the bucket
  # mount is passed through to the container at the same path.
  docker run --gpus all \
    -v ~/sky_workdir:/sky_workdir \
    -v /root/.cache:/root/.cache \
    -v /sky-notebook:/sky-notebook \
    winglian/axolotl:main-py3.10-cu118-2.0.1 \
    accelerate launch -m axolotl.cli.train /sky_workdir/qlora-checkpoint.yaml

# Placeholder defaults; override on the command line with --env (see Usage).
envs:
  HF_TOKEN: <your-huggingface-token>  # TODO: Replace with huggingface token
  BUCKET: <a-unique-bucket-name-to-use>