|
| 1 | +#!/bin/bash |
| 2 | +# Run FLEURS full multilingual benchmark (100 samples x 24 languages = 2,400 samples) with sleep prevention. |
| 3 | +# |
| 4 | +# Benchmarks all 24 languages supported by Parakeet TDT v3: |
| 5 | +# Best (WER < 5%): en_us, es_419, it_it, fr_fr, de_de |
| 6 | +# Good (5-10%): ru_ru, nl_nl, pl_pl, uk_ua, sk_sk |
| 7 | +# Moderate (10-15%): cs_cz, bg_bg, hr_hr, ro_ro, fi_fi |
| 8 | +# Lower (>15%): hu_hu, sv_se, et_ee, da_dk, lt_lt, el_gr, mt_mt, lv_lv, sl_si |
| 9 | +# |
| 10 | +# Usage: |
| 11 | +# ./Scripts/fleurs_full_benchmark.sh |
| 12 | +# |
| 13 | +# The script downloads FLEURS data automatically if needed. |
| 14 | +# Uses caffeinate to prevent sleep so you can close the lid. |
| 15 | +# Results are saved to benchmark_results/ with timestamps. |
| 16 | + |
# Strict mode: abort on errors, on unset variables, and on failures anywhere
# in a pipeline.
set -euo pipefail

# Paths are resolved relative to this script so it can be started from any
# working directory.
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
PROJECT_DIR="$(cd "${SCRIPT_DIR}/.." && pwd)"
RESULTS_DIR="${PROJECT_DIR}/benchmark_results"

# One timestamp per run; it is embedded in the log name and every result file.
TIMESTAMP="$(date +"%Y%m%d_%H%M%S")"
LOG_FILE="${RESULTS_DIR}/fleurs_full_benchmark_${TIMESTAMP}.log"
SAMPLES_PER_LANG=100

# All 24 supported languages, grouped by the WER tier listed in the header.
LANGUAGES=()
LANGUAGES+=("en_us" "es_419" "it_it" "fr_fr" "de_de")   # Best (WER < 5%)
LANGUAGES+=("ru_ru" "nl_nl" "pl_pl" "uk_ua" "sk_sk")    # Good (WER 5-10%)
LANGUAGES+=("cs_cz" "bg_bg" "hr_hr" "ro_ro" "fi_fi")    # Moderate (WER 10-15%)
LANGUAGES+=(                                            # Lower (WER > 15%)
  "hu_hu" "sv_se" "et_ee" "da_dk" "lt_lt" "el_gr" "mt_mt" "lv_lv" "sl_si"
)

# Directory that verify_models inspects for the Parakeet v3 model files.
MODELS_DIR="${HOME}/Library/Application Support/FluidAudio/Models"

# The log file lives in RESULTS_DIR, so it must exist before the first log call.
mkdir -p "${RESULTS_DIR}"
| 41 | + |
#######################################
# Print a timestamped message to stdout and append it to the run log.
# Globals:   LOG_FILE (read; appended to)
# Arguments: message words, joined with single spaces
# Outputs:   "[HH:MM:SS] message" on stdout and in LOG_FILE
#######################################
log() {
  local stamp
  stamp="$(date '+%H:%M:%S')"
  printf '[%s] %s\n' "$stamp" "$*" | tee -a "$LOG_FILE"
}
| 45 | + |
#######################################
# Check that every Parakeet v3 model artifact is present on disk.
# Every missing artifact is reported (not only the first) so that all gaps
# are visible after a single run.
# Globals:   MODELS_DIR (read)
# Arguments: none
# Outputs:   one "MISSING v3: <path>" line per absent artifact, via log
# Returns:   0 if all artifacts exist, 1 otherwise
#######################################
verify_models() {
  local v3_dir="$MODELS_DIR/parakeet-tdt-0.6b-v3"
  local -a required=(
    Preprocessor.mlmodelc
    Encoder.mlmodelc
    Decoder.mlmodelc
    JointDecision.mlmodelc
    parakeet_vocab.json
  )
  # Loop variable is local so it cannot clobber a caller's global.
  local artifact
  local status=0

  for artifact in "${required[@]}"; do
    # -e rather than -f: the existence test must accept any file type.
    if [[ ! -e "$v3_dir/$artifact" ]]; then
      log "MISSING v3: $v3_dir/$artifact"
      status=1
    fi
  done
  return "$status"
}
| 57 | + |
# Stop before any benchmarking if the model files are not in place.
log "=== Verifying Parakeet v3 models ==="
verify_models || {
  log ""
  log "ERROR: Parakeet v3 models missing."
  log "Please run ASR benchmark first to download models."
  exit 1
}
log "Parakeet v3 models verified. FLEURS data will download automatically if needed."

# Announce the size of the run and where its output goes.
language_count=${#LANGUAGES[@]}
total_samples=$(( SAMPLES_PER_LANG * language_count ))
log "=== FLEURS full benchmark: $SAMPLES_PER_LANG samples x $language_count languages = $total_samples total ==="
log "Results directory: $RESULTS_DIR"
| 69 | + |
cd "$PROJECT_DIR"

# Absolute path to the benchmark CLI produced by the release build.
CLI="$PROJECT_DIR/.build/release/fluidaudiocli"

# Build the release binary only when it is not already present.
if [[ ! -x "$CLI" ]]; then
  log "Building release binary..."
  # The complete build output goes to the log file so a failed build can be
  # diagnosed afterwards; only the final line is echoed to the terminal.
  # With `set -e` and `pipefail` a failing build still aborts the script here.
  swift build -c release 2>&1 | tee -a "$LOG_FILE" | tail -1
fi

# Fail with a clear message if the expected binary is missing after the build.
if [[ ! -x "$CLI" ]]; then
  log "ERROR: release binary not found at $CLI"
  exit 1
fi
| 78 | + |
# Hold power assertions for the lifetime of this script.
#   -s  prevent system sleep (per caffeinate(1), only valid on AC power)
#   -i  prevent idle sleep
#   -w  release the assertions once the given PID (this script) exits
# NOTE(review): whether the machine stays awake with the lid closed depends
# on hardware and power settings — confirm before relying on it.
caffeinate -s -i -w "$$" &
CAFFEINATE_PID=$!
log "caffeinate started (PID $CAFFEINATE_PID) — safe to close the lid"
| 84 | + |
SUITE_START=$(date +%s)

# Human-readable names, index-aligned with LANGUAGES.
# es_419 is the Latin America locale (419 is the UN M.49 region code), not Spain.
LANG_NAMES=(
  "English (US)" "Spanish (Latin America)" "Italian (Italy)" "French (France)" "German (Germany)"
  "Russian (Russia)" "Dutch (Netherlands)" "Polish (Poland)" "Ukrainian (Ukraine)" "Slovak (Slovakia)"
  "Czech (Czechia)" "Bulgarian (Bulgaria)" "Croatian (Croatia)" "Romanian (Romania)" "Finnish (Finland)"
  "Hungarian (Hungary)" "Swedish (Sweden)" "Estonian (Estonia)" "Danish (Denmark)" "Lithuanian (Lithuania)"
  "Greek (Greece)" "Maltese (Malta)" "Latvian (Latvia)" "Slovenian (Slovenia)"
)

# The two arrays are indexed in parallel. Catch a mismatch up front instead
# of hitting an unbound-variable error part-way through a long run.
if (( ${#LANG_NAMES[@]} != ${#LANGUAGES[@]} )); then
  log "ERROR: LANG_NAMES has ${#LANG_NAMES[@]} entries but LANGUAGES has ${#LANGUAGES[@]}"
  exit 1
fi

# Run the benchmark once per language; each run writes its own JSON file.
for i in "${!LANGUAGES[@]}"; do
  lang="${LANGUAGES[$i]}"
  name="${LANG_NAMES[$i]}"
  label="fleurs_${lang}"
  output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"

  log "--- [$((i+1))/${#LANGUAGES[@]}] $name ($lang): starting ($SAMPLES_PER_LANG samples) ---"
  start_time=$(date +%s)

  # Capture the pipeline status instead of letting `set -e` end the script
  # silently, so the log records which language failed. The script still
  # stops, with the same exit status as before.
  rc=0
  "$CLI" fleurs-benchmark \
    --languages "$lang" \
    --samples "$SAMPLES_PER_LANG" \
    --output "$output_file" \
    2>&1 | tee -a "$LOG_FILE" || rc=$?
  if (( rc != 0 )); then
    log "ERROR: $name ($lang) benchmark failed with exit status $rc"
    exit "$rc"
  fi

  end_time=$(date +%s)
  elapsed=$(( end_time - start_time ))
  log "--- $name: finished in ${elapsed}s — $output_file ---"
done
| 115 | + |
# Break the total wall-clock time down into hours / minutes / seconds.
SUITE_END=$(date +%s)
SUITE_ELAPSED=$(( SUITE_END - SUITE_START ))
SUITE_HOURS=$(( SUITE_ELAPSED / 3600 ))
SUITE_MINS=$(( (SUITE_ELAPSED % 3600) / 60 ))
SUITE_SECS=$(( SUITE_ELAPSED % 60 ))

log "=== All benchmarks complete in ${SUITE_HOURS}h ${SUITE_MINS}m ${SUITE_SECS}s ==="
log "Results:"
# `ls` exits non-zero when the glob matches nothing. Under `set -e` with
# `pipefail` that would end the script before the summary, so the failure is
# handled explicitly here.
ls -lh "$RESULTS_DIR"/*_"${TIMESTAMP}".json 2>/dev/null | tee -a "$LOG_FILE" \
  || log "(no result files found for this run)"

# Header of the per-language metrics table.
log ""
log "=== WER Summary ($SAMPLES_PER_LANG samples per language) ==="
log ""
printf "%-30s %10s %10s %10s\n" "Language" "WER%" "CER%" "RTFx" | tee -a "$LOG_FILE"
printf "%-30s %10s %10s %10s\n" "------------------------------" "----------" "----------" "----------" | tee -a "$LOG_FILE"
| 132 | + |
#######################################
# Read the summary metrics from one benchmark result file.
# Arguments: $1 - path to a result JSON with summary.averageWER,
#                 summary.averageCER and summary.averageRTFx
# Outputs:   "<WER%>\t<CER%>\t<RTFx>" on stdout, separated by real tabs;
#            "N/A\tN/A\tN/A" when the file is missing or cannot be parsed
# Returns:   0 in all cases
#######################################
extract_metrics() {
  local json_file="$1"
  # Real tab characters: bash's builtin echo does not expand "\t", and the
  # caller splits this line with `cut -f`.
  local fallback=$'N/A\tN/A\tN/A'

  if [[ ! -f "$json_file" ]]; then
    printf '%s\n' "$fallback"
    return 0
  fi

  # The path is passed as an argument rather than spliced into the Python
  # source, so quotes or other special characters in it cannot break the
  # program. Python's stderr is dropped on purpose: any failure is reported
  # as the N/A row.
  python3 - "$json_file" 2>/dev/null <<'PY' || printf '%s\n' "$fallback"
import json
import sys

with open(sys.argv[1]) as fh:
    summary = json.load(fh)["summary"]

wer = round(summary["averageWER"] * 100, 2)
cer = round(summary["averageCER"] * 100, 2)
rtfx = round(summary["averageRTFx"], 1)
print(f"{wer}\t{cer}\t{rtfx}")
PY
}
| 148 | + |
# One table row per language, in the order the benchmarks were run.
for i in "${!LANGUAGES[@]}"; do
  lang="${LANGUAGES[$i]}"
  name="${LANG_NAMES[$i]}"
  json_file="$RESULTS_DIR/fleurs_${lang}_${TIMESTAMP}.json"

  # extract_metrics prints one line; the three values are taken from its
  # tab-separated fields.
  metrics="$(extract_metrics "$json_file")"
  wer="$(cut -f1 <<<"$metrics")"
  cer="$(cut -f2 <<<"$metrics")"
  rtfx="$(cut -f3 <<<"$metrics")"

  row_label="$name ($lang)"
  printf "%-30s %9s%% %9s%% %9sx\n" "$row_label" "$wer" "$cer" "$rtfx" \
    | tee -a "$LOG_FILE"
done
| 161 | + |
# Closing report: total sample count and where the result files are.
total_processed=$(( SAMPLES_PER_LANG * ${#LANGUAGES[@]} ))
log ""
log "✅ Full FLEURS benchmark complete"
log "Total samples processed: ${total_processed}"
log "Results saved to: ${RESULTS_DIR}/*_${TIMESTAMP}.json"

# No explicit cleanup of caffeinate: it was started with -w $$, so it exits
# when this script's process does.
0 commit comments