diff --git a/examples/models/whisper/README.md b/examples/models/whisper/README.md
index 329ef55e8b6..1e2edd24970 100644
--- a/examples/models/whisper/README.md
+++ b/examples/models/whisper/README.md
@@ -7,6 +7,7 @@ wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
 streams decoded text pieces through a callback.
 
 The runner assumes:
+
 - `model.pte` contains both Whisper encoder and decoder entry points named
   `encoder` and `text_decoder`.
 - (Optional) Depending on export configurations, model weights can be optionally stored in a companion
@@ -23,16 +24,19 @@ module to generate the spectrogram tensor.
 Currently we have CUDA and Metal build support.
 
 For CPU:
+
 ```
 make whisper-cpu
 ```
 
 For CUDA:
+
 ```
 make whisper-cuda
 ```
 
 For Metal:
+
 ```
 make whisper-metal
 ```
@@ -56,6 +60,7 @@ optimum-cli export executorch \
 ```
 
 This command generates:
+
 - `model.pte` — Compiled Whisper model
 - `aoti_cuda_blob.ptd` — Weight data file for CUDA backend
 
@@ -71,6 +76,7 @@ optimum-cli export executorch \
 ```
 
 This command generates:
+
 - `model.pte` — Compiled Whisper model
 - `aoti_metal_blob.ptd` — Weight data file for Metal backend
 
@@ -106,12 +112,12 @@ optimum-cli export executorch \
   --output_dir ./
 ```
 
-
 ### Download Tokenizer
 
 Download the tokenizer files required for inference according to your model version:
 
 **For Whisper Small:**
+
 ```bash
 curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
 curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
@@ -119,6 +125,7 @@ curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_
 ```
 
 **For Whisper Large v2:**
+
 ```bash
 curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer.json -o tokenizer.json
 curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer_config.json -o tokenizer_config.json
@@ -166,3 +173,32 @@ cmake-out/examples/models/whisper/whisper_runner \
   --processor_path whisper_preprocessor.pte \
   --temperature 0
 ```
+
+### Performance Benchmarking
+
+Use the `--benchmark` flag to display performance metrics, including model load time, inference time, and tokens per second.
+
+```bash
+cmake-out/examples/models/whisper/whisper_runner \
+  --model_path model.pte \
+  --data_path aoti_metal_blob.ptd \
+  --tokenizer_path ./ \
+  --audio_path output.wav \
+  --processor_path whisper_preprocessor.pte \
+  --temperature 0 \
+  --benchmark
+```
+
+Note: To see the benchmark logs, you must enable logging in the build configuration.
+Edit [CMakePresets.json](../../../CMakePresets.json) in the repository root and set `EXECUTORCH_ENABLE_LOGGING` to `ON` for the `llm-release` preset:
+
+```json
+  {
+    "name": "llm-release",
+    ...
+    "cacheVariables": {
+      ...
+      "EXECUTORCH_ENABLE_LOGGING": "ON"
+    }
+  },
+```
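The tokens-per-second figure reported by `--benchmark` is plain wall-clock arithmetic: generated token count divided by elapsed time. The following is an illustrative sketch only, not code from this change; the `tokens_per_second` helper is a hypothetical name, and the runner itself uses the ExecuTorch timing utilities shown in the `main.cpp` diff below.

```cpp
// Illustrative sketch: deriving tokens/sec from a token count and an
// elapsed wall-clock time in milliseconds.
#include <cstdint>
#include <cstdio>

// Hypothetical helper, not part of the whisper runner.
static double tokens_per_second(int64_t num_tokens, double elapsed_ms) {
  if (elapsed_ms <= 0.0) {
    return 0.0;  // guard against degenerate timings
  }
  return (static_cast<double>(num_tokens) * 1000.0) / elapsed_ms;
}

int main() {
  // Example: 128 generated tokens in 2000 ms is 64.00 tok/s.
  std::printf("%.2f tok/s\n", tokens_per_second(128, 2000.0));
  return 0;
}
```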
diff --git a/examples/models/whisper/main.cpp b/examples/models/whisper/main.cpp
index 080106c8915..28bcbcfc4b6 100644
--- a/examples/models/whisper/main.cpp
+++ b/examples/models/whisper/main.cpp
@@ -44,21 +44,27 @@ DEFINE_double(
     0.0,
     "Sampling temperature. 0.0 performs greedy decoding.");
 DEFINE_int32(max_new_tokens, 128, "Maximum number of tokens to generate.");
+DEFINE_bool(
+    benchmark,
+    false,
+    "Display performance metrics (model load time, inference time, tokens/sec).");
 
 using ::executorch::extension::from_blob;
 using ::executorch::extension::Module;
 
 int main(int argc, char** argv) {
   gflags::ParseCommandLineFlags(&argc, &argv, true);
-  ::executorch::extension::TensorPtr features;
-  std::vector<float> audio_data;
-  std::unique_ptr<Module> processor;
   if (FLAGS_audio_path.empty()) {
     ET_LOG(Error, "audio_path flag must be provided.");
     return 1;
   }
+  ::executorch::extension::TensorPtr features;
+  std::vector<float> audio_data;
+  std::unique_ptr<Module> processor;
+
   audio_data =
       executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
   ET_LOG(
@@ -98,6 +104,8 @@ int main(int argc, char** argv) {
       tensor.mutable_data_ptr<float>()[0]);
   features = std::make_shared<::executorch::aten::Tensor>(std::move(tensor));
 
+  // Track model load time
+  double model_load_start_ms = ::executorch::extension::llm::time_in_ms();
   executorch::extension::asr::AsrRunner runner(
       FLAGS_model_path, FLAGS_data_path, FLAGS_tokenizer_path);
   auto load_err = runner.load();
@@ -105,6 +113,16 @@ int main(int argc, char** argv) {
     ET_LOG(Error, "Failed to load Whisper model.");
     return 1;
   }
+  double model_load_end_ms = ::executorch::extension::llm::time_in_ms();
+  double model_load_time_ms = model_load_end_ms - model_load_start_ms;
+
+  if (FLAGS_benchmark) {
+    ET_LOG(
+        Info,
+        "Model load time: %.2f ms (%.2f seconds)",
+        model_load_time_ms,
+        model_load_time_ms / 1000.0);
+  }
 
   executorch::extension::asr::AsrTranscribeConfig config;
   config.max_new_tokens = FLAGS_max_new_tokens;
@@ -115,16 +133,41 @@ int main(int argc, char** argv) {
   config.decoder_start_token_id = 50258;
   ET_LOG(Info, "Using decoder_start_token_id=50258");
 
+  // Track inference time
+  double inference_start_ms = ::executorch::extension::llm::time_in_ms();
   auto result =
       runner.transcribe(features, config, [&](const std::string& piece) {
         ::executorch::extension::llm::safe_printf(piece.c_str());
         fflush(stdout);
       });
+  double inference_end_ms = ::executorch::extension::llm::time_in_ms();
 
   if (!result.ok()) {
     ET_LOG(Error, "Transcription failed.");
     return 1;
   }
 
+  // Calculate and log performance metrics
+  double inference_time_ms = inference_end_ms - inference_start_ms;
+  int64_t num_tokens = result.get().size();
+  double tokens_per_sec = (num_tokens * 1000.0) / inference_time_ms;
+
+  if (FLAGS_benchmark) {
+    ET_LOG(
+        Info,
+        "Inference time: %.2f ms (%.2f seconds)",
+        inference_time_ms,
+        inference_time_ms / 1000.0);
+    ET_LOG(Info, "Generated tokens: %lld", (long long)num_tokens);
+    ET_LOG(Info, "Tokens per second: %.2f", tokens_per_sec);
+    ET_LOG(
+        Info,
+        "=== Performance Summary === Model Load: %.2f ms | Inference: %.2f ms | Tokens: %lld | Speed: %.2f tok/s",
+        model_load_time_ms,
+        inference_time_ms,
+        (long long)num_tokens,
+        tokens_per_sec);
+  }
+
   return 0;
 }
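The diff above measures each phase with explicit start/end timestamps. For reference only, the same kind of measurement can be expressed as a small scoped timer built on `std::chrono`; this is a hedged sketch of the general pattern, not code from this change, and `ScopedTimerMs` is a hypothetical name rather than an ExecuTorch API.

```cpp
// Sketch of a std::chrono-based scoped timer that prints elapsed wall-clock
// time in milliseconds when it goes out of scope. Illustrative only.
#include <chrono>
#include <cstdio>
#include <string>

class ScopedTimerMs {
 public:
  explicit ScopedTimerMs(std::string label)
      : label_(std::move(label)), start_(std::chrono::steady_clock::now()) {}

  ~ScopedTimerMs() {
    const auto end = std::chrono::steady_clock::now();
    const double ms =
        std::chrono::duration<double, std::milli>(end - start_).count();
    // Reported on scope exit, after the timed work has finished.
    std::printf("%s: %.2f ms\n", label_.c_str(), ms);
  }

 private:
  std::string label_;
  std::chrono::steady_clock::time_point start_;
};

int main() {
  {
    ScopedTimerMs timer("Model load");
    // ... load the model here ...
  }
  {
    ScopedTimerMs timer("Inference");
    // ... run transcription here ...
  }
  return 0;
}
```

A scoped timer keeps the measurement next to the work it times, at the cost of needing an extra block scope; the explicit start/end variables used in `main.cpp` make it easier to reuse the measured values later, for example in the summary log line.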