38 changes: 37 additions & 1 deletion examples/models/whisper/README.md
@@ -7,6 +7,7 @@ wraps a bundled `.pte` program and optional `.ptd` weight file, loads the
streams decoded text pieces through a callback.

The runner assumes:

- `model.pte` contains both Whisper encoder and decoder entry points named
`encoder` and `text_decoder`.
- Depending on the export configuration, model weights can optionally be stored in a companion
@@ -23,16 +24,19 @@ module to generate the spectrogram tensor.
CPU, CUDA, and Metal builds are currently supported.

For CPU:

```bash
make whisper-cpu
```

For CUDA:

```bash
make whisper-cuda
```

For Metal:

```bash
make whisper-metal
```
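
Each target builds the runner binary used in the run commands later in this README. As a quick sanity check (the path below is the one those run commands use), you can confirm the binary exists after the build:

```bash
# Verify the runner binary was produced by the build.
ls -lh cmake-out/examples/models/whisper/whisper_runner
```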
@@ -56,6 +60,7 @@ optimum-cli export executorch \
```

This command generates:

- `model.pte` — Compiled Whisper model
- `aoti_cuda_blob.ptd` — Weight data file for CUDA backend

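For reference, a CUDA run of the runner points at both of these files. The sketch below mirrors the Metal invocation shown later in this README and uses the flags defined in `main.cpp`; the audio and preprocessor paths are placeholders for whatever you produced in the preprocessing step:

```bash
# Sketch of a CUDA invocation: model program plus the CUDA weight blob.
cmake-out/examples/models/whisper/whisper_runner \
--model_path model.pte \
--data_path aoti_cuda_blob.ptd \
--tokenizer_path ./ \
--audio_path output.wav \
--processor_path whisper_preprocessor.pte \
--temperature 0
```
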
@@ -71,6 +76,7 @@ optimum-cli export executorch \
```

This command generates:

- `model.pte` — Compiled Whisper model
- `aoti_metal_blob.ptd` — Weight data file for Metal backend

@@ -106,19 +112,20 @@ optimum-cli export executorch \
--output_dir ./
```

### Download Tokenizer

Download the tokenizer files required for inference, matching your model version:

**For Whisper Small:**

```bash
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer.json -o tokenizer.json
curl -L https://huggingface.co/openai/whisper-small/resolve/main/tokenizer_config.json -o tokenizer_config.json
curl -L https://huggingface.co/openai/whisper-small/resolve/main/special_tokens_map.json -o special_tokens_map.json
```

**For Whisper Large v2:**

```bash
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer.json -o tokenizer.json
curl -L https://huggingface.co/openai/whisper-large-v2/resolve/main/tokenizer_config.json -o tokenizer_config.json
@@ -166,3 +173,32 @@ cmake-out/examples/models/whisper/whisper_runner \
--processor_path whisper_preprocessor.pte \
--temperature 0
```

### Performance Benchmarking

Use the `--benchmark` flag to display performance metrics including model load time, inference time, and tokens per second.

```bash
cmake-out/examples/models/whisper/whisper_runner \
--model_path model.pte \
--data_path aoti_metal_blob.ptd \
--tokenizer_path ./ \
--audio_path output.wav \
--processor_path whisper_preprocessor.pte \
--temperature 0 \
--benchmark
```

Note: To see the benchmark logs, you must enable logging in the build configuration.
Edit [CMakePresets.json](./CMakePresets.json) in the root directory and set `EXECUTORCH_ENABLE_LOGGING` to `ON` for the `llm-release` preset:

```json
{
"name": "llm-release",
...
"cacheVariables": {
...
"EXECUTORCH_ENABLE_LOGGING": "ON"
}
},
```
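
After flipping the flag, rebuild the runner for your backend so the logging setting takes effect (assuming the same make targets from the build section):

```bash
# Rebuild with logging enabled; pick the target for your backend.
make whisper-metal   # or: make whisper-cpu / make whisper-cuda
```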
49 changes: 46 additions & 3 deletions examples/models/whisper/main.cpp
@@ -44,21 +44,27 @@ DEFINE_double(
0.0,
"Sampling temperature. 0.0 performs greedy decoding.");
DEFINE_int32(max_new_tokens, 128, "Maximum number of tokens to generate.");
DEFINE_bool(
benchmark,
false,
"Display performance metrics (model load time, inference time, tokens/sec).");

using ::executorch::extension::from_blob;
using ::executorch::extension::Module;

int main(int argc, char** argv) {
gflags::ParseCommandLineFlags(&argc, &argv, true);

if (FLAGS_audio_path.empty()) {
ET_LOG(Error, "audio_path flag must be provided.");
return 1;
}

::executorch::extension::TensorPtr features;
std::vector<float> audio_data;
std::unique_ptr<Module> processor;

audio_data =
executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
ET_LOG(
@@ -98,13 +104,25 @@ int main(int argc, char** argv) {
tensor.mutable_data_ptr<float>()[0]);
features = std::make_shared<::executorch::aten::Tensor>(std::move(tensor));

// Track model load time
double model_load_start_ms = ::executorch::extension::llm::time_in_ms();
executorch::extension::asr::AsrRunner runner(
FLAGS_model_path, FLAGS_data_path, FLAGS_tokenizer_path);
auto load_err = runner.load();
if (load_err != ::executorch::runtime::Error::Ok) {
ET_LOG(Error, "Failed to load Whisper model.");
return 1;
}
double model_load_end_ms = ::executorch::extension::llm::time_in_ms();
double model_load_time_ms = model_load_end_ms - model_load_start_ms;

if (FLAGS_benchmark) {
ET_LOG(
Info,
"Model load time: %.2f ms (%.2f seconds)",
model_load_time_ms,
model_load_time_ms / 1000.0);
}

executorch::extension::asr::AsrTranscribeConfig config;
config.max_new_tokens = FLAGS_max_new_tokens;
@@ -115,16 +133,41 @@ int main(int argc, char** argv) {
config.decoder_start_token_id = 50258;
ET_LOG(Info, "Using decoder_start_token_id=50258");

// Track inference time
double inference_start_ms = ::executorch::extension::llm::time_in_ms();
auto result =
runner.transcribe(features, config, [&](const std::string& piece) {
::executorch::extension::llm::safe_printf(piece.c_str());
fflush(stdout);
});
double inference_end_ms = ::executorch::extension::llm::time_in_ms();

if (!result.ok()) {
ET_LOG(Error, "Transcription failed.");
return 1;
}

// Calculate and log performance metrics
double inference_time_ms = inference_end_ms - inference_start_ms;
int64_t num_tokens = result.get().size();
double tokens_per_sec = (num_tokens * 1000.0) / inference_time_ms;

if (FLAGS_benchmark) {
ET_LOG(
Info,
"Inference time: %.2f ms (%.2f seconds)",
inference_time_ms,
inference_time_ms / 1000.0);
ET_LOG(Info, "Generated tokens: %lld", (long long)num_tokens);
ET_LOG(Info, "Tokens per second: %.2f", tokens_per_sec);
ET_LOG(
Info,
"=== Performance Summary === Model Load: %.2f ms | Inference: %.2f ms | Tokens: %lld | Speed: %.2f tok/s",
model_load_time_ms,
inference_time_ms,
(long long)num_tokens,
tokens_per_sec);
}

return 0;
}