38 changes: 37 additions & 1 deletion examples/models/whisper/README.md
@@ -7,6 +7,7 @@ wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
streams decoded text pieces through a callback.

The runner assumes:

- `model.pte` contains both Whisper encoder and decoder entry points named
`encoder` and `text_decoder`.
- Depending on the export configuration, model weights can optionally be stored in a companion
@@ -23,16 +24,19 @@ module to generate the spectrogram tensor.
CPU, CUDA, and Metal builds are currently supported.

For CPU:

```bash
make whisper-cpu
```

For CUDA:

```bash
make whisper-cuda
```

For Metal:

```bash
make whisper-metal
```
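
Each target builds the runner binary used in the run commands later in this README. As a quick sanity check (the path below is the one those run commands use), you can confirm the binary exists after the build:

```bash
# Verify the runner binary was produced by the build.
ls -lh cmake-out/examples/models/whisper/whisper_runner
```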
@@ -56,6 +60,7 @@ optimum-cli export executorch \
```

This command generates:

- `model.pte` — Compiled Whisper model
- `aoti_cuda_blob.ptd` — Weight data file for CUDA backend

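For reference, a CUDA run of the runner points at both of these files. The sketch below mirrors the Metal invocation shown later in this README and uses the flags defined in `main.cpp`; the audio and preprocessor paths are placeholders for whatever you produced in the preprocessing step:

```bash
# Sketch of a CUDA invocation: model program plus the CUDA weight blob.
cmake-out/examples/models/whisper/whisper_runner \
--model_path model.pte \
--data_path aoti_cuda_blob.ptd \
--tokenizer_path ./ \
--audio_path output.wav \
--processor_path whisper_preprocessor.pte \
--temperature 0
```
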
@@ -71,6 +76,7 @@ optimum-cli export executorch \
```

This command generates:

- `model.pte` — Compiled Whisper model
- `aoti_metal_blob.ptd` — Weight data file for Metal backend

@@ -106,19 +112,20 @@ optimum-cli export executorch \
--output_dir ./
```

### Download Tokenizer

Download the tokenizer files required for inference, matching your model version:

**For Whisper Small:**

```bash
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_map.json -o special_tokens_map.json
```

**For Whisper Large v2:**

```bash
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer.json -o tokenizer.json
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer_config.json -o tokenizer_config.json
@@ -166,3 +173,32 @@ cmake-out/examples/models/whisper/whisper_runner \
--processor_path whisper_preprocessor.pte \
--temperature 0
```

### Performance Benchmarking

Use the `--benchmark` flag to display performance metrics including model load time, inference time, and tokens per second.

```bash
cmake-out/examples/models/whisper/whisper_runner \
--model_path model.pte \
--data_path aoti_metal_blob.ptd \
--tokenizer_path ./ \
--audio_path output.wav \
--processor_path whisper_preprocessor.pte \
--temperature 0 \
--benchmark
```

Note: To see the benchmark logs, you must enable logging in the build configuration.
Edit [CMakePresets.json](./CMakePresets.json) in the root directory and set `EXECUTORCH_ENABLE_LOGGING` to `ON` for the `llm-release` preset:

```json
{
"name": "llm-release",
...
"cacheVariables": {
...
"EXECUTORCH_ENABLE_LOGGING": "ON"
}
},
```
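
After flipping the flag, rebuild the runner for your backend so the logging setting takes effect (assuming the same make targets from the build section):

```bash
# Rebuild with logging enabled; pick the target for your backend.
make whisper-metal   # or: make whisper-cpu / make whisper-cuda
```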
49 changes: 46 additions & 3 deletions examples/models/whisper/main.cpp
@@ -44,21 +44,27 @@ DEFINE_double(
0.0,
"Sampling temperature. 0.0 performs greedy decoding.");
DEFINE_int32(max_new_tokens, 128, "Maximum number of tokens to generate.");
DEFINE_bool(
benchmark,
false,
"Display performance metrics (model load time, inference time, tokens/sec).");

using ::executorch::extension::from_blob;
using ::executorch::extension::Module;

int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

if (FLAGS_audio_path.empty()) {
ET_LOG(Error, "audio_path flag must be provided.");
return 1;
}

::executorch::extension::TensorPtr features;
std::vector<float> audio_data;
std::unique_ptr<Module> processor;

audio_data =
executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
ET_LOG(
@@ -98,13 +104,25 @@ int main(int argc, char** argv) {
tensor.mutable_data_ptr<float>()[0]);
features = std::make_shared<::executorch::aten::Tensor>(std::move(tensor));

// Track model load time
double model_load_start_ms = ::executorch::extension::llm::time_in_ms();
executorch::extension::asr::AsrRunner runner(
FLAGS_model_path, FLAGS_data_path, FLAGS_tokenizer_path);
auto load_err = runner.load();
if (load_err != ::executorch::runtime::Error::Ok) {
ET_LOG(Error, "Failed to load Whisper model.");
return 1;
}
double model_load_end_ms = ::executorch::extension::llm::time_in_ms();
double model_load_time_ms = model_load_end_ms - model_load_start_ms;

if (FLAGS_benchmark) {
ET_LOG(
Info,
"Model load time: %.2f ms (%.2f seconds)",
model_load_time_ms,
model_load_time_ms / 1000.0);
}

executorch::extension::asr::AsrTranscribeConfig config;
config.max_new_tokens = FLAGS_max_new_tokens;
@@ -115,16 +133,41 @@ int main(int argc, char** argv) {
config.decoder_start_token_id = 50258;
ET_LOG(Info, "Using decoder_start_token_id=50258");

// Track inference time
double inference_start_ms = ::executorch::extension::llm::time_in_ms();
auto result =
runner.transcribe(features, config, [&](const std::string& piece) {
::executorch::extension::llm::safe_printf(piece.c_str());
fflush(stdout);
});
double inference_end_ms = ::executorch::extension::llm::time_in_ms();

if (!result.ok()) {
ET_LOG(Error, "Transcription failed.");
return 1;
}

// Calculate and log performance metrics
double inference_time_ms = inference_end_ms - inference_start_ms;
int64_t num_tokens = result.get().size();
double tokens_per_sec = (num_tokens * 1000.0) / inference_time_ms;

if (FLAGS_benchmark) {
ET_LOG(
Info,
"Inference time: %.2f ms (%.2f seconds)",
inference_time_ms,
inference_time_ms / 1000.0);
ET_LOG(Info, "Generated tokens: %lld", (long long)num_tokens);
ET_LOG(Info, "Tokens per second: %.2f", tokens_per_sec);
ET_LOG(
Info,
"=== Performance Summary === Model Load: %.2f ms | Inference: %.2f ms | Tokens: %lld | Speed: %.2f tok/s",
model_load_time_ms,
inference_time_ms,
(long long)num_tokens,
tokens_per_sec);
}

return 0;
}