
Commit c3a53f3

Add lora test using qwen (#16161)
### Summary

Use Qwen3 0.6B with unsloth (instead of Llama 1B with torchtune) for the LoRA test.

1. Smaller model / quicker test.
2. Eventually remove the dependency on torchtune.
3. Qwen is not gated on HF.

TODO: add quantized test after #15951

```
Expected result prefix:
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>
+ echo 'Actual result:
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>
PyTorchObserver {"prompt_tokens":15,"generated_tokens":65,"model_load_start_ms":1765320124550,"model_load_end_ms":1765320127516,"inference_start_ms":1765320152867,"inference_end_ms":1765320178119,"prompt_eval_end_ms":1765320153334,"first_token_ms":1765320153334,"aggregate_sampling_time_ms":19,"SCALING_FACTOR_UNITS_PER_SECOND":1000}'
Actual result:
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>
PyTorchObserver {"prompt_tokens":15,"generated_tokens":65,"model_load_start_ms":1765320124550,"model_load_end_ms":1765320127516,"inference_start_ms":1765320152867,"inference_end_ms":1765320178119,"prompt_eval_end_ms":1765320153334,"first_token_ms":1765320153334,"aggregate_sampling_time_ms":19,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
+ echo Success
Success
```
1 parent 8d06ed3 commit c3a53f3

5 files changed, +148 -180 lines changed

.ci/scripts/test_llama_lora.sh

Lines changed: 0 additions & 172 deletions
This file was deleted.

.ci/scripts/test_lora.sh

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

cmake_install_executorch_libraries() {
  echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
  rm -rf cmake-out
  cmake --workflow llm-release
}

cmake_build_llama_runner() {
  echo "Building llama runner"
  pushd extension/llm/tokenizers
  echo "Updating tokenizers submodule"
  git submodule update --init
  popd
  make llama-cpu
}

cleanup_files() {
  echo "Deleting downloaded and generated files"
  rm -rf "${HF_QWEN_PATH}/"
  rm -rf "${HF_ADAPTER_PATH}/"
  rm -rf *.pte *.ptd
  rm result*.txt
}

# Hosting lora adapter in personal repo for now.
python -m pip install -q huggingface_hub
HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math"
HF_ADAPTER_PATH=$(
  bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
    --model_id "${HF_ADAPTER_REPO}" \
    --files "adapter_config.json" "adapter_model.safetensors"
)

### SINGLE LORA PTE ###
# Export LoRA PTE file.
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
  +base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
  +base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
  +export.output_name="qwen_lora_math_full.pte"

# Capture the path of the downloaded qwen artifacts
HF_QWEN_PATH=$(python -c "from huggingface_hub import snapshot_download; print(snapshot_download('unsloth/Qwen3-0.6B'))")
echo "Model downloaded to: $HF_QWEN_PATH"

### BUILD LLAMA RUNNER.
cmake_install_executorch_libraries
cmake_build_llama_runner

# Runner constants.
RUNTIME_ARGS="--tokenizer_path=${HF_QWEN_PATH}/ --temperature=0 --seq_len=100 --warmup=1"
PROMPT="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant"
EXPECTED_PREFIX="
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>"

# Run llama runner on single lora PTE file.
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math_full.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT=$(cat result.txt)
if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT}"
  # Do not clean up files if test passes, as they're re-used in the next test.
  echo "Success"
else
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT}"
  echo "Failure; results not the same"
  cleanup_files
  exit 1
fi

### PROGRAM DATA SEPARATION ###
# Export LoRA PTE, LoRA PTD, foundation PTD file.
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
  +base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
  +base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
  +export.output_name="qwen_lora_math.pte" \
  +export.foundation_weights_file="qwen_foundation.ptd" \
  +export.lora_weights_file="qwen_lora_math.ptd"

# Run llama runner on PTE, PTD files.
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math.pte --data_paths="qwen_foundation.ptd,qwen_lora_math.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT2=$(cat result2.txt)
if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT2}"
  echo "Success"
else
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT2}"
  echo "Failure; results not the same"
  cleanup_files
  exit 1
fi

cleanup_files
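For local runs, a minimal sketch of invoking the new test the same way CI does (assuming an ExecuTorch development environment; both commands mirror the workflow steps in pull.yml below):

```
# Minimal sketch: run the new LoRA test locally, mirroring the CI steps below.
# Assumes an ExecuTorch dev environment with Python and CMake available.
bash examples/models/llama/install_requirements.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/test_lora.sh
```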

.github/workflows/pull.yml

Lines changed: 4 additions & 7 deletions
@@ -728,8 +728,8 @@ jobs:
         # run llama runner in eager mode
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh
 
-  test-llama-lora-linux:
-    name: test-llama-lora-linux
+  test-lora-linux:
+    name: test-lora-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -752,11 +752,8 @@ jobs:
         # Install llama requirements
         bash examples/models/llama/install_requirements.sh
 
-        # install a recent version of torchtune (>= 20250730)
-        PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-
         # run llama runner in eager mode
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_lora.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_lora.sh
 
   test-mediatek-models-linux:
     name: test-mediatek-models-linux
@@ -863,7 +860,7 @@ jobs:
         source .ci/scripts/setup-emscripten.sh
 
         export PNPM_VERSION=10.24.0
-
+
         curl -fsSL https://get.pnpm.io/install.sh | env PNPM_VERSION=$PNPM_VERSION SHELL="$(which bash)" sh -
 
         export PNPM_HOME="$HOME/.local/share/pnpm"

examples/models/qwen3/README.md

Lines changed: 2 additions & 1 deletion
@@ -62,8 +62,9 @@ With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your
 cmake-out/examples/models/llama/llama_main \
     --model_path qwen3_0_6b.pte \
     --tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
-    --prompt="Who is the president of the US?"
+    --prompt="<|im_start|>user Who is the president of the US?<|im_end|><|im_start|>assistant"
 ```
+Note that you have to apply the chat template manually for the C++ runner.
 
 To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
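The README note above says the chat template must be applied by hand for the C++ runner. As a sketch (not part of this commit), the templated prompt could also be generated with the Hugging Face tokenizer, assuming the `transformers` package is installed; the exact whitespace may differ slightly from the hand-written prompt:

```
# Sketch only: print a chat-templated prompt for the C++ runner.
# Assumes `transformers` is installed; output whitespace may differ slightly.
python -c "from transformers import AutoTokenizer; t = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B'); print(t.apply_chat_template([{'role': 'user', 'content': 'Who is the president of the US?'}], tokenize=False, add_generation_prompt=True))"
```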

examples/models/qwen3/config/qwen3_xnnpack.yaml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
base:
  model_class: "qwen3_0_6b"
  params: "examples/models/qwen3/config/0_6b_config.json"
  metadata: '{"get_bos_id": 151644, "get_eos_ids":[151645]}'

model:
  use_kv_cache: True
  use_sdpa_with_kv_cache: True
  dtype_override: fp32

export:
  max_seq_length: 2048
  max_context_length: 2048

backend:
  xnnpack:
    enabled: True
    extended_ops: True
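For context, this is the config consumed by `extension.llm.export.export_llm` in `.ci/scripts/test_lora.sh` above. A plain (non-LoRA) export with it might look like the sketch below; the output file name is illustrative and not part of this commit:

```
# Sketch: base-model export using this config (output name is illustrative).
python -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
  +export.output_name="qwen3_0_6b.pte"
```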
