diff --git a/examples/models/parakeet/README.md b/examples/models/parakeet/README.md
index 649a2225536..98f64fee239 100644
--- a/examples/models/parakeet/README.md
+++ b/examples/models/parakeet/README.md
@@ -183,3 +183,34 @@ DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner
 | `--tokenizer_path` | Path to tokenizer file (default: `tokenizer.json`) |
 | `--data_path` | Path to data file (.ptd) for delegate data (required for CUDA) |
 | `--timestamps` | Timestamp output mode: `none\|token\|word\|segment\|all` (default: `segment`) |
+| `--warmup` | Run warmup iterations before timed inference (default: `false`) |
+| `--warmup_iterations` | Number of warmup iterations (default: `1`, only used if `--warmup` is set) |
+
+### Benchmarking
+
+To measure inference time with proper warmup:
+
+```bash
+# Run with warmup (1 iteration by default)
+DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner \
+  --model_path examples/models/parakeet/parakeet_metal/model.pte \
+  --audio_path /path/to/audio.wav \
+  --tokenizer_path examples/models/parakeet/parakeet_metal/tokenizer.model \
+  --warmup
+
+# Run with multiple warmup iterations
+DYLD_LIBRARY_PATH=/usr/lib ./cmake-out/examples/models/parakeet/parakeet_runner \
+  --model_path examples/models/parakeet/parakeet_metal/model.pte \
+  --audio_path /path/to/audio.wav \
+  --tokenizer_path examples/models/parakeet/parakeet_metal/tokenizer.model \
+  --warmup --warmup_iterations=3
+```
+
+The runner will output inference performance statistics including:
+- **Audio duration**: Length of the input audio in seconds
+- **Preprocessor time**: Time to convert audio to mel spectrogram
+- **Encoder time**: Time to run the acoustic encoder
+- **Decoder time**: Time to run TDT greedy decoding
+- **Total inference time**: End-to-end inference time
+- **Real-time factor (RTF)**: Ratio of inference time to audio duration (lower is better)
+- **Speed**: How many times faster than real-time (higher is better)
diff --git a/examples/models/parakeet/main.cpp b/examples/models/parakeet/main.cpp
index cb0df6cf72f..7ccf004f0a3 100644
--- a/examples/models/parakeet/main.cpp
+++ b/examples/models/parakeet/main.cpp
@@ -52,6 +52,11 @@ DEFINE_string(
     timestamps,
     "segment",
     "Timestamp output mode: none|token|word|segment|all");
+DEFINE_bool(warmup, false, "Run warmup iterations before timed inference.");
+DEFINE_int32(
+    warmup_iterations,
+    1,
+    "Number of warmup iterations (only used if --warmup is set).");
 
 using ::executorch::extension::from_blob;
 using ::executorch::extension::Module;
@@ -329,6 +334,124 @@ std::vector<int64_t> greedy_decode_executorch(
   return hypothesis;
 }
 
+struct InferenceStats {
+  long preprocessor_ms = 0;
+  long encoder_ms = 0;
+  long decoder_ms = 0;
+  long total_inference_ms = 0;
+  double audio_duration_s = 0.0;
+  size_t num_tokens = 0;
+};
+
+struct InferenceResult {
+  std::vector<int64_t> tokens;
+  ::executorch::extension::TensorPtr f_proj;
+  int64_t encoded_len = 0;
+  InferenceStats stats;
+  bool ok = false;
+};
+
+InferenceResult run_inference(
+    Module& model,
+    const std::vector<float>& audio_data,
+    int64_t blank_id,
+    int64_t num_rnn_layers,
+    int64_t pred_hidden,
+    int64_t sample_rate,
+    bool collect_stats = true) {
+  InferenceResult result;
+  result.stats.audio_duration_s =
+      static_cast<double>(audio_data.size()) / static_cast<double>(sample_rate);
+
+  long inference_start = executorch::extension::llm::time_in_ms();
+
+  // Create audio tensor
+  auto audio_tensor = from_blob(
+      const_cast<float*>(audio_data.data()),
+      {static_cast<::executorch::aten::SizesType>(audio_data.size())},
+      ::executorch::aten::ScalarType::Float);
+  std::vector<int64_t> audio_len_data = {
+      static_cast<int64_t>(audio_data.size())};
+  auto audio_len_tensor = from_blob(
+      audio_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
+
+  // Run preprocessor
+  long preprocessor_start = executorch::extension::llm::time_in_ms();
+  auto proc_result = model.execute(
+      "preprocessor",
+      std::vector<::executorch::runtime::EValue>{
+          audio_tensor, audio_len_tensor});
+  if (!proc_result.ok()) {
+    ET_LOG(Error, "Preprocessor forward failed.");
+    return result;
+  }
+  long preprocessor_end = executorch::extension::llm::time_in_ms();
+
+  auto& proc_outputs = proc_result.get();
+  auto mel = proc_outputs[0].toTensor();
+  auto mel_len_tensor_out = proc_outputs[1].toTensor();
+  int64_t mel_len_value = mel_len_tensor_out.const_data_ptr<int64_t>()[0];
+
+  std::vector<int64_t> mel_len_data = {mel_len_value};
+  auto mel_len =
+      from_blob(mel_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
+
+  // Run encoder
+  long encoder_start = executorch::extension::llm::time_in_ms();
+  auto enc_result = model.execute(
+      "encoder", std::vector<::executorch::runtime::EValue>{mel, mel_len});
+  if (!enc_result.ok()) {
+    ET_LOG(Error, "Encoder forward failed.");
+    return result;
+  }
+  long encoder_end = executorch::extension::llm::time_in_ms();
+
+  auto& enc_outputs = enc_result.get();
+  auto f_proj = enc_outputs[0].toTensor();
+  result.encoded_len = enc_outputs[1].toTensor().const_data_ptr<int64_t>()[0];
+
+  // Run decoder
+  long decoder_start = executorch::extension::llm::time_in_ms();
+  result.tokens = greedy_decode_executorch(
+      model, f_proj, result.encoded_len, blank_id, num_rnn_layers, pred_hidden);
+  long decoder_end = executorch::extension::llm::time_in_ms();
+
+  long inference_end = executorch::extension::llm::time_in_ms();
+
+  if (collect_stats) {
+    result.stats.preprocessor_ms = preprocessor_end - preprocessor_start;
+    result.stats.encoder_ms = encoder_end - encoder_start;
+    result.stats.decoder_ms = decoder_end - decoder_start;
+    result.stats.total_inference_ms = inference_end - inference_start;
+    result.stats.num_tokens = result.tokens.size();
+  }
+
+  result.ok = true;
+  return result;
+}
+
+void print_inference_stats(const InferenceStats& stats) {
+  double rtf = (stats.audio_duration_s > 0)
+      ? (static_cast<double>(stats.total_inference_ms) / 1000.0) /
+          stats.audio_duration_s
+      : 0.0;
+  double speed_ratio = (rtf > 0) ? 1.0 / rtf : 0.0;
+
+  std::cout << "\n--- Inference Performance Statistics ---" << std::endl;
+  std::cout << "Audio duration: " << stats.audio_duration_s << " s"
+            << std::endl;
+  std::cout << "Preprocessor time: " << stats.preprocessor_ms << " ms"
+            << std::endl;
+  std::cout << "Encoder time: " << stats.encoder_ms << " ms" << std::endl;
+  std::cout << "Decoder time: " << stats.decoder_ms << " ms ("
+            << stats.num_tokens << " tokens)" << std::endl;
+  std::cout << "Total inference time: " << stats.total_inference_ms << " ms"
+            << std::endl;
+  std::cout << "Real-time factor (RTF): " << rtf << std::endl;
+  std::cout << "Speed: " << speed_ratio << "x real-time" << std::endl;
+  std::cout << "-----------------------------------------" << std::endl;
+}
+
 } // namespace
 
 int main(int argc, char** argv) {
@@ -369,61 +492,6 @@ int main(int argc, char** argv) {
       ::executorch::extension::llm::load_wav_audio_data(FLAGS_audio_path);
   ET_LOG(Info, "Loaded %zu audio samples", audio_data.size());
 
-  auto audio_tensor = from_blob(
-      audio_data.data(),
-      {static_cast<::executorch::aten::SizesType>(audio_data.size())},
-      ::executorch::aten::ScalarType::Float);
-  std::vector<int64_t> audio_len_data = {
-      static_cast<int64_t>(audio_data.size())};
-  auto audio_len_tensor = from_blob(
-      audio_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
-
-  ET_LOG(Info, "Running preprocessor...");
-  auto proc_result = model->execute(
-      "preprocessor",
-      std::vector<::executorch::runtime::EValue>{
-          audio_tensor, audio_len_tensor});
-  if (!proc_result.ok()) {
-    ET_LOG(Error, "Preprocessor forward failed.");
-    return 1;
-  }
-  auto& proc_outputs = proc_result.get();
-  auto mel = proc_outputs[0].toTensor();
-  auto mel_len_tensor_out = proc_outputs[1].toTensor();
-  int64_t mel_len_value = mel_len_tensor_out.const_data_ptr<int64_t>()[0];
-
-  // Create mel_len tensor for encoder
-  std::vector<int64_t> mel_len_data = {mel_len_value};
-  auto mel_len =
-      from_blob(mel_len_data.data(), {1}, ::executorch::aten::ScalarType::Long);
-
-  ET_LOG(
-      Info,
-      "Mel spectrogram shape: [%ld, %ld, %ld], mel_len: %lld",
-      static_cast<long>(mel.sizes()[0]),
-      static_cast<long>(mel.sizes()[1]),
-      static_cast<long>(mel.sizes()[2]),
-      static_cast<long long>(mel_len_value));
-
-  ET_LOG(Info, "Running encoder...");
-  auto enc_result = model->execute(
-      "encoder", std::vector<::executorch::runtime::EValue>{mel, mel_len});
-  if (!enc_result.ok()) {
-    ET_LOG(Error, "Encoder forward failed.");
-    return 1;
-  }
-  auto& enc_outputs = enc_result.get();
-  auto f_proj = enc_outputs[0].toTensor(); // [B, T, joint_hidden]
-  int64_t encoded_len = enc_outputs[1].toTensor().const_data_ptr<int64_t>()[0];
-
-  ET_LOG(
-      Info,
-      "Encoder output (f_proj) shape: [%ld, %ld, %ld], len=%ld",
-      static_cast<long>(f_proj.sizes()[0]),
-      static_cast<long>(f_proj.sizes()[1]),
-      static_cast<long>(f_proj.sizes()[2]),
-      static_cast<long>(encoded_len));
-
   // Query model metadata from constant_methods
   std::vector<::executorch::runtime::EValue> empty_inputs;
   auto num_rnn_layers_result = model->execute("num_rnn_layers", empty_inputs);
@@ -465,10 +533,46 @@ int main(int argc, char** argv) {
       window_stride,
       encoder_subsampling_factor);
 
-  ET_LOG(Info, "Running TDT greedy decode...");
-  auto decoded_tokens = greedy_decode_executorch(
-      *model, f_proj, encoded_len, blank_id, num_rnn_layers, pred_hidden);
-
+  // Run warmup iterations if requested
+  if (FLAGS_warmup && FLAGS_warmup_iterations > 0) {
+    ET_LOG(
+        Info,
+        "Running %d warmup iteration(s)...",
+        FLAGS_warmup_iterations);
+    for (int i = 0; i < FLAGS_warmup_iterations; ++i) {
+      ET_LOG(Info, "Warmup iteration %d/%d", i + 1,
+          FLAGS_warmup_iterations);
+      auto warmup_result = run_inference(
+          *model,
+          audio_data,
+          blank_id,
+          num_rnn_layers,
+          pred_hidden,
+          sample_rate,
+          false);
+      if (!warmup_result.ok) {
+        ET_LOG(Error, "Warmup iteration %d failed.", i + 1);
+        return 1;
+      }
+    }
+    ET_LOG(Info, "Warmup complete.");
+  }
+  // Run timed inference
+  ET_LOG(Info, "Running timed inference...");
+  auto inference_result = run_inference(
+      *model,
+      audio_data,
+      blank_id,
+      num_rnn_layers,
+      pred_hidden,
+      sample_rate,
+      true);
+  if (!inference_result.ok) {
+    ET_LOG(Error, "Inference failed.");
+    return 1;
+  }
+
+  auto& decoded_tokens = inference_result.tokens;
   ET_LOG(Info, "Decoded %zu tokens", decoded_tokens.size());
 
   // Load tokenizer
@@ -492,6 +596,9 @@ int main(int argc, char** argv) {
   executorch::backends::metal::print_metal_backend_stats();
 #endif // ET_BUILD_METAL
 
+  // Print inference timing stats
+  print_inference_stats(inference_result.stats);
+
   if (!timestamp_mode.enabled()) {
     return 0;
   }
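For reference, the warmup-then-measure pattern that the new `--warmup` flags implement can be reduced to a few lines. Below is a minimal, self-contained sketch of that pattern, deliberately independent of ExecuTorch: it uses plain `std::chrono` instead of `executorch::extension::llm::time_in_ms()`, and `work()` is a hypothetical stand-in for `run_inference`.

```cpp
#include <chrono>
#include <iostream>

// Hypothetical stand-in for the real run_inference() call.
static void work() {
  // ... model execution would go here ...
}

int main() {
  const int warmup_iterations = 3;

  // Untimed warmup runs absorb one-time costs (kernel/shader
  // compilation, allocator growth, cache warming).
  for (int i = 0; i < warmup_iterations; ++i) {
    work();
  }

  // Only the post-warmup run is timed.
  const auto start = std::chrono::steady_clock::now();
  work();
  const auto end = std::chrono::steady_clock::now();
  const auto ms =
      std::chrono::duration_cast<std::chrono::milliseconds>(end - start)
          .count();
  std::cout << "Timed run: " << ms << " ms\n";
  return 0;
}
```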
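The RTF and speed figures the runner reports are plain ratios, computed as in `print_inference_stats` above. A standalone sketch with made-up numbers (the 10 s of audio and 2500 ms of inference time are illustrative, not measured results):

```cpp
#include <iostream>

int main() {
  // Illustrative numbers only, not measured results.
  const double audio_duration_s = 10.0;  // length of the input clip
  const long total_inference_ms = 2500;  // end-to-end inference time

  // Same ratios print_inference_stats() reports.
  const double rtf = (total_inference_ms / 1000.0) / audio_duration_s;
  const double speed = 1.0 / rtf;

  std::cout << "RTF: " << rtf << " (lower is better)\n";  // 0.25
  std::cout << "Speed: " << speed << "x real-time\n";     // 4x
  return 0;
}
```

An RTF below 1.0 means the model transcribes faster than the audio plays back.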