diff --git a/.github/workflows/workflow_finetune_gaudi2.yml b/.github/workflows/workflow_finetune_gaudi2.yml
new file mode 100644
index 00000000..a79db5b9
--- /dev/null
+++ b/.github/workflows/workflow_finetune_gaudi2.yml
@@ -0,0 +1,102 @@
+name: Finetune
+
+on:
+  workflow_call:
+    inputs:
+      ci_type:
+        type: string
+        default: 'pr'
+      runner_container_image:
+        type: string
+        default: '127.0.0.1:5000/llmray-build'
+      runner_config_path:
+        type: string
+        default: '/home/ci/llm-ray-actions-runner'
+      code_checkout_path:
+        type: string
+        default: '/home/ci/actions-runner/_work/llm-on-ray/llm-on-ray'
+      model_cache_path:
+        type: string
+        default: '/scratch-2/huggingface/cache'
+
+concurrency:
+  group: ${{ github.workflow }}-${{ github.event.pull_request.number || github.ref }}-ft-gaudi2
+  cancel-in-progress: true
+
+jobs:
+  finetune:
+    name: finetune
+    strategy:
+      matrix:
+        model: [ EleutherAI/gpt-j-6b, meta-llama/Llama-2-7b-chat-hf, gpt2, bigscience/bloom-560m, facebook/opt-125m, mosaicml/mpt-7b, meta-llama/Llama-2-7b-hf, mistralai/Mistral-7B-v0.1, google/gemma-2b]
+        isPR:
+          - ${{inputs.ci_type == 'pr'}}
+
+        exclude:
+          - { isPR: true }
+        include:
+          - { model: "EleutherAI/gpt-j-6b"}
+          - { model: "meta-llama/Llama-2-7b-chat-hf"}
+          - { model: "mistralai/Mistral-7B-v0.1"}
+          - { model: "google/gemma-2b"}
+
+    runs-on: gaudi2
+
+    defaults:
+      run:
+        shell: bash
+    container:
+      image: ${{ inputs.runner_container_image }}
+      env:
+        http_proxy:
+        https_proxy:
+        SHELL: bash -eo pipefail
+      volumes:
+        - /var/run/docker.sock:/var/run/docker.sock
+        - ${{ inputs.runner_config_path }}:/root/actions-runner-config
+
+    steps:
+      - name: Checkout
+        uses: actions/checkout@v4
+
+      - name: Load environment variables
+        run: cat /root/actions-runner-config/.env >> $GITHUB_ENV
+
+      - name: Build Docker Image
+        run: |
+          DF_SUFFIX=".habana"
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          build_and_prune_gaudi ${TARGET} ${DF_SUFFIX}
+
+      - name: Start Docker Container
+        run: |
+          TARGET="finetune"
+          code_checkout_path=${{ inputs.code_checkout_path }}
+          model_cache_path=${{ inputs.model_cache_path }}
+          source dev/scripts/ci-functions.sh
+          start_docker_gaudi ${TARGET} ${code_checkout_path} ${model_cache_path} ${{env.HF_ACCESS_TOKEN}}
+
+      - name: Run Finetune Test
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          finetune_test_gaudi ${{ matrix.model }}
+
+      - name: Run PEFT-LoRA Test
+        run: |
+          source dev/scripts/ci-functions.sh
+          peft_lora_test_gaudi ${{ matrix.model }}
+
+      - name: Stop Ray
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          stop_ray ${TARGET}
+
+      - name: Stop Container
+        if: success() || failure()
+        run: |
+          TARGET="finetune"
+          source dev/scripts/ci-functions.sh
+          stop_container ${TARGET}
diff --git a/.github/workflows/workflow_orders_on_merge.yml b/.github/workflows/workflow_orders_on_merge.yml
index 632f880b..ea1ef0ff 100644
--- a/.github/workflows/workflow_orders_on_merge.yml
+++ b/.github/workflows/workflow_orders_on_merge.yml
@@ -24,9 +24,13 @@ jobs:
 #    needs: Lint
 #    uses: ./.github/workflows/workflow_inference_gaudi2.yml
 
-  # Finetune:
-  #   needs: Lint
-  #   uses: ./.github/workflows/workflow_finetune.yml
+#  Finetune:
+#    needs: Lint
+#    uses: ./.github/workflows/workflow_finetune.yml
+
+#  Finetune_Gaudi:
+#    needs: Lint
+#    uses: ./.github/workflows/workflow_finetune_gaudi2.yml
 
 #  Benchmark:
 #    needs: Lint
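Note on the matrix above: when the caller passes `ci_type: 'pr'`, `isPR` is `true`, the `exclude: - { isPR: true }` rule drops every model/isPR combination, and the `include` entries re-add only the four smoke-test models; non-PR runs exercise the full nine-model list. For reference, the job's Build/Start/Test/Stop steps compose as in the following sketch, which assumes a Gaudi2 host with the Habana Docker runtime and an exported `HF_ACCESS_TOKEN` (illustrative; only gated models such as Llama-2 need it):

```bash
#!/usr/bin/env bash
# Hedged local approximation of the workflow's step sequence,
# run from the repo root. Paths and the token variable are assumptions.
set -eo pipefail

source dev/scripts/ci-functions.sh

TARGET="finetune"
DF_SUFFIX=".habana"

# Build Docker Image: builds ${TARGET}:habana and prunes leftovers.
build_and_prune_gaudi "${TARGET}" "${DF_SUFFIX}"

# Start Docker Container: mounts the checkout and an optional model cache,
# then logs in to Hugging Face if a token is provided.
start_docker_gaudi "${TARGET}" "$(pwd)" "${HOME}/.cache/huggingface" "${HF_ACCESS_TOKEN:-}"

# Run Finetune Test and PEFT-LoRA Test for one matrix model.
finetune_test_gaudi "EleutherAI/gpt-j-6b"
peft_lora_test_gaudi "EleutherAI/gpt-j-6b"

# Stop Ray / Stop Container, mirroring the final workflow steps.
stop_ray "${TARGET}"
stop_container "${TARGET}"
```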
diff --git a/.github/workflows/workflow_orders_on_pr.yml b/.github/workflows/workflow_orders_on_pr.yml
index cac4bfac..01292c86 100644
--- a/.github/workflows/workflow_orders_on_pr.yml
+++ b/.github/workflows/workflow_orders_on_pr.yml
@@ -24,6 +24,15 @@ jobs:
 #    needs: Lint
 #    uses: ./.github/workflows/workflow_inference_gaudi2.yml
 
-  # Finetune:
-  #   needs: Lint
-  #   uses: ./.github/workflows/workflow_finetune.yml
+#  Finetune:
+#    needs: Lint
+#    uses: ./.github/workflows/workflow_finetune.yml
+
+#  Finetune_Gaudi:
+#    needs: Lint
+#    uses: ./.github/workflows/workflow_finetune_gaudi2.yml
+
+#  Benchmark:
+#    needs: Lint
+#    uses: ./.github/workflows/workflow_test_benchmark.yml
+
diff --git a/dev/scripts/ci-functions.sh b/dev/scripts/ci-functions.sh
index fde5f071..73107f67 100644
--- a/dev/scripts/ci-functions.sh
+++ b/dev/scripts/ci-functions.sh
@@ -31,7 +31,27 @@ build_and_prune() {
 
     # Build Docker image and perform cleaning operation
     docker build ./ "${docker_args[@]}" -f dev/docker/ci/Dockerfile${DF_SUFFIX} -t ${TARGET}:latest && yes | docker container prune && yes | docker image prune -f
+}
+
+build_and_prune_gaudi() {
+    # Set TARGET and DF_SUFFIX using the passed-in parameters
+    local TARGET=$1
+    local DF_SUFFIX=$2
+    local PYTHON_V=$3
+
+    docker_args=()
+    docker_args+=("--build-arg=CACHEBUST=1")
+
+    if [ -n "$PYTHON_V" ]; then
+        docker_args+=("--build-arg=python_v=${PYTHON_V}")
+    fi
+
+    echo "Build Docker image and perform cleaning operation"
+    echo "docker build ./ ${docker_args[@]} -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f"
+
+    # Build Docker image and perform cleaning operation
+    docker build ./ "${docker_args[@]}" -f dev/docker/Dockerfile${DF_SUFFIX} -t ${TARGET}:habana && yes | docker container prune && yes | docker image prune -f
 }
 
 start_docker() {
@@ -68,6 +88,41 @@ start_docker() {
     docker run -tid "${docker_args[@]}" "${TARGET}:latest"
 }
 
+start_docker_gaudi() {
+    local TARGET=$1
+    local code_checkout_path=$2
+    local model_cache_path=$3
+    local HF_TOKEN=$4
+
+    # stop and remove a running container with this name, if any
+    cid=$(docker ps -q --filter "name=${TARGET}")
+    if [[ ! -z "$cid" ]]; then docker stop $cid && docker rm $cid; fi
+    # check and remove exited container
+    cid=$(docker ps -a -q --filter "name=${TARGET}")
+    if [[ ! -z "$cid" ]]; then docker rm $cid; fi
+    docker ps -a
+
+    docker_args=()
+    docker_args+=("-v=${code_checkout_path}:${CODE_CHECKOUT_PATH_LOCAL}")
+
+    if [ -z "$model_cache_path" ]; then
+        echo "no cache path"
+    else
+        docker_args+=("-v=${model_cache_path}:${MODEL_CACHE_PATH_LOACL}")
+    fi
+
+    docker_args+=("--runtime=habana")
+    docker_args+=("--name=${TARGET}")
+    docker_args+=("--hostname=${TARGET}-container")
+
+    echo "docker run -tid ${docker_args[@]} ${TARGET}:habana"
+    docker run -tid "${docker_args[@]}" "${TARGET}:habana"
+    if [ -z "$HF_TOKEN" ]; then
+        echo "no hf token"
+    else
+        docker exec "${TARGET}" bash -c "huggingface-cli login --token ${HF_TOKEN}"
+    fi
+}
+
 install_dependencies(){
 
     local TARGET=$1
@@ -225,3 +280,22 @@ peft_lora_test(){
     docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune.yaml"
 }
 
+finetune_test_gaudi(){
+    local model=$1
+    echo Set finetune source config :
+    docker exec "finetune" bash -c "pip install --upgrade-strategy eager optimum[habana]"
+    docker exec "finetune" bash -c "source \$(python -c 'import oneccl_bindings_for_pytorch as torch_ccl;print(torch_ccl.cwd)')/env/setvars.sh; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --head --node-ip-address 127.0.0.1 --ray-debugger-external; RAY_SERVE_ENABLE_EXPERIMENTAL_STREAMING=1 ray start --address='127.0.0.1:6379' --ray-debugger-external"
+    echo Set "${model}" patch_yaml_config :
+    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path llm_on_ray/finetune/finetune_gaudi.yaml --models ${model}"
+    echo Start "${model}" finetune :
+    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+}
+
+peft_lora_test_gaudi(){
+    local model=$1
+    docker exec "finetune" bash -c "rm -rf /tmp/llm-ray/*"
+    echo Set "${model}" patch_yaml_config :
+    docker exec "finetune" bash -c "python dev/scripts/patch_yaml_config.py --conf_path llm_on_ray/finetune/finetune_gaudi.yaml --models ${model} --peft_lora"
+    echo Start "${model}" peft lora finetune :
+    docker exec "finetune" bash -c "llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml"
+}
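`start_docker_gaudi` has to be idempotent on a long-lived Gaudi2 runner: a stale container still holding `--name=finetune` would make the next `docker run` fail. Its stop-then-remove idiom, extracted into a standalone helper for clarity (a sketch; `remove_container_if_present` is a hypothetical name, the docker calls are verbatim from the function above):

```bash
# Remove any running or exited container with the given name so that a
# subsequent "docker run --name=<name>" cannot collide.
remove_container_if_present() {
    local name=$1
    local cid
    # Stop and remove a running container with this name, if any.
    cid=$(docker ps -q --filter "name=${name}")
    if [[ -n "$cid" ]]; then docker stop "$cid" && docker rm "$cid"; fi
    # Remove an exited container with this name, if any.
    cid=$(docker ps -a -q --filter "name=${name}")
    if [[ -n "$cid" ]]; then docker rm "$cid"; fi
}

remove_container_if_present "finetune"
```

One caveat worth knowing: `--filter "name=..."` matches substrings, so a target name that is a prefix of another container's name would match both; the CI sidesteps this by using the single distinct `finetune` target per runner.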
diff --git a/llm_on_ray/finetune/finetune_gaudi.yaml b/llm_on_ray/finetune/finetune_gaudi.yaml
new file mode 100644
index 00000000..a972fe91
--- /dev/null
+++ b/llm_on_ray/finetune/finetune_gaudi.yaml
@@ -0,0 +1,37 @@
+General:
+  base_model: EleutherAI/gpt-j-6b
+  gpt_base_model: true
+  output_dir: /tmp/llm-ray/output
+  save_strategy: no
+  config:
+    trust_remote_code: false
+    use_auth_token: null
+  lora_config:
+    task_type: CAUSAL_LM
+    r: 8
+    lora_alpha: 32
+    lora_dropout: 0.1
+  enable_gradient_checkpointing: false
+Dataset:
+  train_file: examples/data/sample_finetune_data_small.jsonl
+  group: true
+  max_length: 512
+  block_size: 512
+  shuffle: false
+  validation_file: null
+  validation_split_percentage: 5
+Training:
+  optimizer: adamw_torch
+  batch_size: 2
+  epochs: 3
+  learning_rate: 1.0e-05
+  lr_scheduler: linear
+  weight_decay: 0.0
+  mixed_precision: bf16
+  device: hpu
+  num_training_workers: 2
+  resources_per_worker:
+    CPU: 32
+  accelerate_mode: DDP
+  gradient_accumulation_steps: 1
+  logging_steps: 10
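The new `finetune_gaudi.yaml` is effectively a template: `base_model` and the `lora_config` section are defaults that `dev/scripts/patch_yaml_config.py` rewrites per matrix model before each run (presumably `--peft_lora` is what activates the LoRA section; that script is not part of this diff). Inside the running `finetune` container, the two-step sequence looks like this sketch, with the model chosen for illustration:

```bash
# Patch the Gaudi config for one model, then launch finetuning with it.
# Mirrors finetune_test_gaudi/peft_lora_test_gaudi above.
docker exec finetune bash -c "
  python dev/scripts/patch_yaml_config.py \
      --conf_path llm_on_ray/finetune/finetune_gaudi.yaml \
      --models google/gemma-2b --peft_lora &&
  llm_on_ray-finetune --config_file llm_on_ray/finetune/finetune_gaudi.yaml
"
```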