
Commit c3a53f3

Add lora test using qwen (#16161)
### Summary

Use Qwen3 0.6B with unsloth (instead of Llama 1B with torchtune) for the LoRA test.

1. Smaller model / quicker test.
2. Eventually remove the dependency on torchtune.
3. Qwen is not gated on HF.

TODO: add quantized test after #15951

```
Expected result prefix:
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>
+ echo 'Actual result:
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>
PyTorchObserver {"prompt_tokens":15,"generated_tokens":65,"model_load_start_ms":1765320124550,"model_load_end_ms":1765320127516,"inference_start_ms":1765320152867,"inference_end_ms":1765320178119,"prompt_eval_end_ms":1765320153334,"first_token_ms":1765320153334,"aggregate_sampling_time_ms":19,"SCALING_FACTOR_UNITS_PER_SECOND":1000}'
Actual result:
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>
PyTorchObserver {"prompt_tokens":15,"generated_tokens":65,"model_load_start_ms":1765320124550,"model_load_end_ms":1765320127516,"inference_start_ms":1765320152867,"inference_end_ms":1765320178119,"prompt_eval_end_ms":1765320153334,"first_token_ms":1765320153334,"aggregate_sampling_time_ms":19,"SCALING_FACTOR_UNITS_PER_SECOND":1000}
+ echo Success
Success
```
1 parent 8d06ed3 commit c3a53f3

5 files changed, +148 -180 lines changed

.ci/scripts/test_llama_lora.sh

Lines changed: 0 additions & 172 deletions
This file was deleted.

.ci/scripts/test_lora.sh

Lines changed: 124 additions & 0 deletions
@@ -0,0 +1,124 @@
#!/bin/bash
# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.
#
# This source code is licensed under the BSD-style license found in the
# LICENSE file in the root directory of this source tree.

set -exu
# shellcheck source=/dev/null
source "$(dirname "${BASH_SOURCE[0]}")/utils.sh"

cmake_install_executorch_libraries() {
  echo "Installing libexecutorch.a, libextension_module.so, libportable_ops_lib.a"
  rm -rf cmake-out
  cmake --workflow llm-release
}

cmake_build_llama_runner() {
  echo "Building llama runner"
  pushd extension/llm/tokenizers
  echo "Updating tokenizers submodule"
  git submodule update --init
  popd
  make llama-cpu
}

cleanup_files() {
  echo "Deleting downloaded and generated files"
  rm -rf "${HF_QWEN_PATH}/"
  rm -rf "${HF_ADAPTER_PATH}/"
  rm -rf *.pte *.ptd
  rm result*.txt
}

# Hosting lora adapter in personal repo for now.
python -m pip install -q huggingface_hub
HF_ADAPTER_REPO="lucylq/qwen3_06B_lora_math"
HF_ADAPTER_PATH=$(
  bash "$(dirname "${BASH_SOURCE[0]}")/download_hf_hub.sh" \
    --model_id "${HF_ADAPTER_REPO}" \
    --files "adapter_config.json" "adapter_model.safetensors"
)

### SINGLE LORA PTE ###
# Export LoRA PTE file.
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
  +base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
  +base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
  +export.output_name="qwen_lora_math_full.pte"

# Capture the path of the downloaded qwen artifacts
HF_QWEN_PATH=$(python -c "from huggingface_hub import snapshot_download; print(snapshot_download('unsloth/Qwen3-0.6B'))")
echo "Model downloaded to: $HF_QWEN_PATH"

### BUILD LLAMA RUNNER.
cmake_install_executorch_libraries
cmake_build_llama_runner

# Runner constants.
RUNTIME_ARGS="--tokenizer_path=${HF_QWEN_PATH}/ --temperature=0 --seq_len=100 --warmup=1"
PROMPT="<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant"
EXPECTED_PREFIX="
<|im_start|>user Calculate 15% of 80?<|im_end|><|im_start|>assistant
To calculate 15% of 80, we can multiply 80 by 0.15.
80 * 0.15 = 12
So, 15% of 80 is 12.
#### 12
The answer is: 12<|im_end|>"

# Run llama runner on single lora PTE file.
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math_full.pte --prompt="${PROMPT}" ${RUNTIME_ARGS} > result.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT=$(cat result.txt)
if [[ "${RESULT}" == "${EXPECTED_PREFIX}"* ]]; then
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT}"
  # Do not clean up files if test passes, as they're re-used in the next test.
  echo "Success"
else
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT}"
  echo "Failure; results not the same"
  cleanup_files
  exit 1
fi

### PROGRAM DATA SEPARATION ###
# Export LoRA PTE, LoRA PTD, foundation PTD file.
$PYTHON_EXECUTABLE -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
  +base.adapter_checkpoint="${HF_ADAPTER_PATH}/adapter_model.safetensors" \
  +base.adapter_config="${HF_ADAPTER_PATH}/adapter_config.json" \
  +export.output_name="qwen_lora_math.pte" \
  +export.foundation_weights_file="qwen_foundation.ptd" \
  +export.lora_weights_file="qwen_lora_math.ptd"

# Run llama runner on PTE, PTD files.
NOW=$(date +"%H:%M:%S")
echo "Starting to run llama runner at ${NOW}"
# shellcheck source=/dev/null
cmake-out/examples/models/llama/llama_main --model_path=qwen_lora_math.pte --data_paths="qwen_foundation.ptd,qwen_lora_math.ptd" --prompt="${PROMPT}" ${RUNTIME_ARGS} > result2.txt
NOW=$(date +"%H:%M:%S")
echo "Finished at ${NOW}"

RESULT2=$(cat result2.txt)
if [[ "${RESULT2}" == "${EXPECTED_PREFIX}"* ]]; then
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT2}"
  echo "Success"
else
  echo "Expected result prefix: ${EXPECTED_PREFIX}"
  echo "Actual result: ${RESULT2}"
  echo "Failure; results not the same"
  cleanup_files
  exit 1
fi

cleanup_files
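For local runs, a minimal sketch of invoking the new test the same way CI does (assuming an ExecuTorch development environment; both commands mirror the workflow steps in pull.yml below):

```
# Minimal sketch: run the new LoRA test locally, mirroring the CI steps below.
# Assumes an ExecuTorch dev environment with Python and CMake available.
bash examples/models/llama/install_requirements.sh
PYTHON_EXECUTABLE=python bash .ci/scripts/test_lora.sh
```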

.github/workflows/pull.yml

Lines changed: 4 additions & 7 deletions
@@ -728,8 +728,8 @@ jobs:
         # run llama runner in eager mode
         PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_runner_eager.sh
 
-  test-llama-lora-linux:
-    name: test-llama-lora-linux
+  test-lora-linux:
+    name: test-lora-linux
     uses: pytorch/test-infra/.github/workflows/linux_job_v2.yml@main
     permissions:
       id-token: write
@@ -752,11 +752,8 @@ jobs:
         # Install llama requirements
         bash examples/models/llama/install_requirements.sh
 
-        # install a recent version of torchtune (>= 20250730)
-        PYTHON_EXECUTABLE=python python -m pip install torchtune==0.7.0.dev20250929 --extra-index-url https://download.pytorch.org/whl/nightly/cpu
-
         # run llama runner in eager mode
-        PYTHON_EXECUTABLE=python bash .ci/scripts/test_llama_lora.sh
+        PYTHON_EXECUTABLE=python bash .ci/scripts/test_lora.sh
 
   test-mediatek-models-linux:
     name: test-mediatek-models-linux
@@ -863,7 +860,7 @@ jobs:
         source .ci/scripts/setup-emscripten.sh
 
         export PNPM_VERSION=10.24.0
-
+
         curl -fsSL https://get.pnpm.io/install.sh | env PNPM_VERSION=$PNPM_VERSION SHELL="$(which bash)" sh -
 
         export PNPM_HOME="$HOME/.local/share/pnpm"

examples/models/qwen3/README.md

Lines changed: 2 additions & 1 deletion
@@ -62,8 +62,9 @@ With ExecuTorch's sample c++ runner (see the Llama README's [Step 3: Run on your
 cmake-out/examples/models/llama/llama_main \
     --model_path qwen3_0_6b.pte \
     --tokenizer_path ~/.cache/huggingface/hub/models--Qwen--Qwen3-0.6B/snapshots/a9c98e602b9d36d2a2f7ba1eb0f5f31e4e8e5143/tokenizer.json \
-    --prompt="Who is the president of the US?"
+    --prompt="<|im_start|>user Who is the president of the US?<|im_end|><|im_start|>assistant"
 ```
+Note that you have to apply the chat template manually for the C++ runner.
 
 To run the model on an example iOS or Android app, see the Llama README's [Step 5: Build Mobile apps](../llama/README.md#step-5-build-mobile-apps) section.
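The README note above says the chat template must be applied by hand for the C++ runner. As a sketch (not part of this commit), the templated prompt could also be generated with the Hugging Face tokenizer, assuming the `transformers` package is installed; the exact whitespace may differ slightly from the hand-written prompt:

```
# Sketch only: print a chat-templated prompt for the C++ runner.
# Assumes `transformers` is installed; output whitespace may differ slightly.
python -c "from transformers import AutoTokenizer; t = AutoTokenizer.from_pretrained('Qwen/Qwen3-0.6B'); print(t.apply_chat_template([{'role': 'user', 'content': 'Who is the president of the US?'}], tokenize=False, add_generation_prompt=True))"
```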

examples/models/qwen3/config/qwen3_xnnpack.yaml

Lines changed: 18 additions & 0 deletions
@@ -0,0 +1,18 @@
base:
  model_class: "qwen3_0_6b"
  params: "examples/models/qwen3/config/0_6b_config.json"
  metadata: '{"get_bos_id": 151644, "get_eos_ids":[151645]}'

model:
  use_kv_cache: True
  use_sdpa_with_kv_cache: True
  dtype_override: fp32

export:
  max_seq_length: 2048
  max_context_length: 2048

backend:
  xnnpack:
    enabled: True
    extended_ops: True
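For context, this is the config consumed by `extension.llm.export.export_llm` in `.ci/scripts/test_lora.sh` above. A plain (non-LoRA) export with it might look like the sketch below; the output file name is illustrative and not part of this commit:

```
# Sketch: base-model export using this config (output name is illustrative).
python -m extension.llm.export.export_llm \
  --config examples/models/qwen3/config/qwen3_xnnpack.yaml \
  +export.output_name="qwen3_0_6b.pte"
```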
