Megatron-DeepSpeed-GPU-多机训练

发布于:2024-04-05 ⋅ 阅读:(28) ⋅ 点赞:(0)


本文演示了使用 Megatron-DeepSpeed 在多台 GPU 机器上进行训练的操作步骤（示例中使用 worker_1、worker_2 两个节点）。

1.从ngc拉取pytorch:24.03-py3镜像

# Pull the NGC PyTorch 24.03 (py3) image; the container started in the next step runs from it.
docker pull nvcr.io/nvidia/pytorch:24.03-py3

2.安装nvidia-docker、创建容器

# Install nvidia-docker2 from NVIDIA's apt repository, then start an
# interactive container from the nvcr.io/nvidia/pytorch:24.03-py3 image.
cd /mnt
# -f makes curl fail on HTTP errors instead of piping an error page onward;
# -S still prints the error even though -s silences the progress meter.
curl -fsSL https://nvidia.github.io/nvidia-docker/gpgkey | apt-key add -
# Sourced in a subshell so the os-release variables do not leak into this shell.
distribution="$(. /etc/os-release; echo "${ID}${VERSION_ID}")"
curl -fsSL "https://nvidia.github.io/nvidia-docker/${distribution}/nvidia-docker.list" \
  | tee /etc/apt/sources.list.d/nvidia-docker.list
apt-get update
apt-get install -y nvidia-docker2
# --net=host      : the container shares the host network stack.
# -v "$PWD":/home : the current host directory (/mnt) is mounted at /home,
#                   which -w also makes the working directory.
# --rm            : the container is removed when the shell exits.
nvidia-docker run -ti -e NVIDIA_VISIBLE_DEVICES=all --privileged \
  --net=host -v "$PWD":/home \
  -w /home --rm nvcr.io/nvidia/pytorch:24.03-py3 /bin/bash

3.安装Megatron-DeepSpeed环境

# Python packages installed alongside Megatron-DeepSpeed.
pip install transformers
pip install deepspeed
# Clone the sources and pin them to a fixed commit. The && chain stops at the
# first failure, so checkout and install never run outside the cloned tree
# (e.g. when the clone itself failed and the cd did not happen).
git clone https://github.com/microsoft/Megatron-DeepSpeed \
  && cd Megatron-DeepSpeed \
  && git checkout 3c5f47563f697702c1e305fa01b7563f54b747fc \
  && python3 setup.py install

4.安装openmpi和ssh服务，并配置sshd与节点别名

# Install the SSH server and the Open MPI packages.
apt update
apt install -y openssh-server
apt install -y openmpi-bin openmpi-doc libopenmpi-dev

# Start from a clean ~/.ssh and create a passphrase-less RSA key at exactly the
# path that the IdentityFile entries written to ~/.ssh/config refer to, so the
# result does not depend on ssh-keygen's default key type or on prompts.
mkdir -p ~/.ssh && chmod 700 ~/.ssh
rm -rf ~/.ssh/*
ssh-keygen -t rsa -N '' -f ~/.ssh/id_rsa

# Rewrite only the (possibly commented-out) directive lines themselves. The
# looser pattern '^.*Port.*$' would also overwrite unrelated lines that merely
# contain the word, such as GatewayPorts.
sed -i 's/^[#[:space:]]*PermitRootLogin[[:space:]].*$/PermitRootLogin yes/' /etc/ssh/sshd_config
sed -i 's/^[#[:space:]]*Port[[:space:]].*$/Port 2223/' /etc/ssh/sshd_config

# Demo root password -- change it for anything but a throwaway container.
# It is passed to printf as data (never as the format string) and is no longer
# exported, so child processes do not inherit it in their environment.
passwd=Hello123
printf '%s\n%s\n' "${passwd}" "${passwd}" | passwd root
 
# Generate the sshd launcher script. The quoted 'EOF' delimiter keeps the body
# literal, so nothing in it is expanded by the current shell.
cat >/usr/bin/run.sh <<'EOF'
#!/bin/bash
mkdir  -p /run/sshd
source ~/.bashrc
/usr/sbin/sshd -D
EOF
# 755 instead of 777: a script that is run as root must not be world-writable.
chmod 755 /usr/bin/run.sh
# Run the launcher in the background, detached from hangups of this shell.
nohup /usr/bin/run.sh &
  
# Write the SSH client aliases worker_1 / worker_2 (tee also echoes the text).
# The quoted 'EOF' delimiter keeps the body literal. Port 2223 matches the Port
# value set in /etc/ssh/sshd_config earlier in this step.
# NOTE(review): the 192.168.1.x addresses look environment-specific -- confirm
# and replace them with the real IPs of your nodes.
tee ~/.ssh/config <<-'EOF'
Host worker_1
        User  root
        Hostname 192.168.1.100
        port 2223
        IdentityFile ~/.ssh/id_rsa
Host worker_2
        User  root
        Hostname 192.168.1.101
        port 2223
        IdentityFile ~/.ssh/id_rsa        
EOF

5.拷贝公钥

# Install the local public key on every node alias defined in ~/.ssh/config.
for node in worker_1 worker_2; do
  ssh-copy-id "${node}"
done

6.安装pdsh

# Download, build and install pdsh 2.29 from source, configured with ssh support.
wget https://storage.googleapis.com/google-code-archive-downloads/v2/code.google.com/pdsh/pdsh-2.29.tar.bz2
tar -xf pdsh-2.29.tar.bz2
cd pdsh-2.29
./configure --with-ssh
# Bound the parallelism to the CPU count: a bare `make -j` places no limit on
# the number of simultaneous jobs.
make -j"$(nproc)"
make install
# Copy the installed binary from /usr/local/bin into /usr/bin as well.
cp /usr/local/bin/pdsh /usr/bin/

7.安装指定版本的protobuf(3.20.1)

# Install protobuf pinned to exactly version 3.20.1.
pip install --upgrade protobuf==3.20.1

8.准备数据集

# Fetch the sample corpus plus the GPT-2 vocab/merges files, unpack the corpus,
# and tokenize it with Megatron's preprocessing tool.
cd /home/Megatron-DeepSpeed
for url in \
  https://huggingface.co/bigscience/misc-test-data/resolve/main/stas/oscar-1GB.jsonl.xz \
  https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-vocab.json \
  https://s3.amazonaws.com/models.huggingface.co/bert/gpt2-merges.txt
do
  wget "${url}"
done
xz --decompress oscar-1GB.jsonl.xz

# Output files are named after --output-prefix; presumably this yields the
# my-gpt2_text_document dataset that the training command's --data-path uses
# (TODO confirm against tools/preprocess_data.py).
preprocess_args=(
  --input oscar-1GB.jsonl
  --output-prefix my-gpt2
  --vocab-file gpt2-vocab.json
  --dataset-impl mmap
  --tokenizer-type GPT2BPETokenizer
  --merge-file gpt2-merges.txt
  --append-eod
  --workers 8
)
python3 tools/preprocess_data.py "${preprocess_args[@]}"

9.创建配置文件

# Create the DeepSpeed hostfile: one line per node alias, one slot each.
# tee writes the file and also echoes the lines to stdout.
cd /home/Megatron-DeepSpeed
printf '%s\n' \
  'worker_1 slots=1' \
  'worker_2 slots=1' \
  | tee hostfile

# Write the DeepSpeed JSON config (tee also echoes it). The quoted 'EOF'
# delimiter keeps the body literal. No comments may be placed inside the body:
# it is written verbatim and must remain valid JSON.
# The values mirror flags of the training command in step 10:
#   train_micro_batch_size_per_gpu 1 <-> --micro-batch-size 1
#   train_batch_size 16              <-> --global-batch-size 16
#   gradient_clipping 1.0            <-> --clip-grad 1.0
#   zero_optimization.stage 1        <-> --zero-stage 1
#   fp16.enabled true                <-> --fp16
tee ds_config.json <<-'EOF'
{
  "train_micro_batch_size_per_gpu": 1,
  "train_batch_size": 16,
  "gradient_clipping": 1.0,
  "zero_optimization": {
    "stage": 1
  },
  "fp16": {
    "enabled": true,
    "loss_scale": 0,
    "loss_scale_window": 500,
    "hysteresis": 2,
    "min_loss_scale": 1,
    "initial_scale_power": 12
  },
  "steps_per_print": 2000,
  "wall_clock_breakdown": false
}
EOF

10.开始测试

# Environment for the run.
export MAX_JOBS=8
# Verbose NCCL logging.
export NCCL_DEBUG=info
# NOTE(review): enp5s0 is a host-specific interface name -- replace it with the
# network interface that actually connects your nodes.
export NCCL_SOCKET_IFNAME=enp5s0
# Disable the InfiniBand transport in NCCL.
export NCCL_IB_DISABLE=1

# Launch pretrain_gpt.py on the nodes listed in ./hostfile through the
# DeepSpeed launcher.
# --log-interval is now given exactly once (10). The original command passed it
# twice (1, then 10); assuming the script parses its flags with argparse, the
# last occurrence wins, so the effective value is unchanged and only the dead,
# misleading first flag was removed.
deepspeed --hostfile ./hostfile pretrain_gpt.py \
  --tensor-model-parallel-size 2 \
  --pipeline-model-parallel-size 1 \
  --distributed-backend nccl \
  --num-layers 2 \
  --hidden-size 8 \
  --num-attention-heads 2 \
  --seq-length 512 \
  --max-position-embeddings 512 \
  --micro-batch-size 1 \
  --rampup-batch-size 2 2 1_000 \
  --global-batch-size 16 \
  --train-samples 10_000 \
  --optimizer adam \
  --adam-beta1 0.9 \
  --adam-beta2 0.95 \
  --adam-eps 1e-8 \
  --lr 1e-4 \
  --lr-warmup-samples 5 \
  --min-lr 1e-6 \
  --lr-decay-style cosine \
  --lr-decay-samples 12 \
  --clip-grad 1.0 \
  --weight-decay 1e-1 \
  --fp16 \
  --partition-activations \
  --seed 42 \
  --vocab-file gpt2-vocab.json \
  --merge-file gpt2-merges.txt \
  --exit-interval 100 --log-interval 10 \
  --save-interval 50 --eval-interval 100 \
  --eval-iters 10 --checkpoint-activations \
  --save checkpoints/gpt2_4 \
  --data-path my-gpt2_text_document \
  --tensorboard-dir output_dir/tensorboard \
  --tensorboard-queue-size 5 \
  --log-timers-to-tensorboard \
  --log-batch-size-to-tensorboard \
  --log-validation-ppl-to-tensorboard \
  --deepspeed \
  --deepspeed_config ./ds_config.json \
  --zero-stage 1 --deepspeed-activation-checkpointing