toolbox/training/supervised-finetune.bash

#!/usr/bin/env bash
set -euxo pipefail
# Tunables; each can be overridden from the environment.
export BATCH_SIZE="${BATCH_SIZE:-2}"
export MODEL="${MODEL:-EleutherAI/pythia-1.3b-deduped}"
export NUMBER_OF_GPUS="${NUMBER_OF_GPUS:-1}"
export OUTPUT_DIR="${OUTPUT_DIR:-checkpoints}"
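# Timestamp used to name this run's log file under "$OUTPUT_DIR/logs".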
LOG_NAME=$(date "+%Y-%m-%d_%H-%M-%S")
# Set HuggingFace Datasets to offline mode by default: since we're using local
# JSON files, hitting their servers means something went wrong. If you're doing
# something else, adjust this accordingly.
export HF_DATASETS_OFFLINE=1
# HuggingFace Transformers, on the other hand, should be allowed to reach
# their servers, e.g. to download the pre-trained model on the first run.
# Uncomment once the model is cached to force fully offline operation:
# export TRANSFORMERS_OFFLINE=1
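# If you'd rather run fully offline, one way to warm the cache beforehand is
# a one-liner along these lines (a sketch; assumes network access and the
# same model name as $MODEL above):
#   python -c 'import transformers as t; m = "EleutherAI/pythia-1.3b-deduped"; t.AutoModelForCausalLM.from_pretrained(m); t.AutoTokenizer.from_pretrained(m)'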
mkdir -p "$OUTPUT_DIR/logs"
mkdir -p "$OUTPUT_DIR/runs"
torchrun \
  --nproc_per_node "$NUMBER_OF_GPUS" \
  --master_port 19198 \
  ./colossalai/run_sft.py \
  --train_file "./data/train.json" \
  --validation_file "./data/eval.json" \
  --learning_rate "5.0e-5" \
  --checkpointing_steps 64 \
  --block_size 1024 \
  --mem_cap 0 \
  --lr_scheduler_type "cosine" \
  --num_warmup_steps 100 \
  --model_name_or_path "$MODEL" \
  --output_dir "$OUTPUT_DIR" \
  --num_train_epochs 1 \
  --per_device_eval_batch_size "$BATCH_SIZE" \
  --per_device_train_batch_size "$BATCH_SIZE" "$@" \
  2>&1 | tee "$OUTPUT_DIR/logs/$LOG_NAME.log"