Skip to content

Commit 17e6b70

Browse files
Alex-Wengg and claude committed
chore: Add FLEURS benchmark scripts and apply swift-format
- Add fleurs_full_benchmark.sh: Benchmarks all 24 FLEURS languages (2,400 samples)
- Add fleurs_subset_benchmark.sh: Benchmarks 5 key languages (500 samples)
- Apply swift-format indentation fixes (3-space → 4-space for continuations)
- Apply swift-format trailing comma conventions

Scripts used to establish baseline WER results documented in: Documentation/fleurs-full-benchmark-baseline.md

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
1 parent eef13c8 commit 17e6b70

4 files changed

Lines changed: 435 additions & 19 deletions

File tree

Scripts/fleurs_full_benchmark.sh

Lines changed: 167 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,167 @@
1+
#!/bin/bash
2+
# Run FLEURS full multilingual benchmark (100 samples x 24 languages = 2,400 samples) with sleep prevention.
3+
#
4+
# Benchmarks all 24 languages supported by Parakeet TDT v3:
5+
# Best (WER < 5%): en_us, es_419, it_it, fr_fr, de_de
6+
# Good (5-10%): ru_ru, nl_nl, pl_pl, uk_ua, sk_sk
7+
# Moderate (10-15%): cs_cz, bg_bg, hr_hr, ro_ro, fi_fi
8+
# Lower (>15%): hu_hu, sv_se, et_ee, da_dk, lt_lt, el_gr, mt_mt, lv_lv, sl_si
9+
#
10+
# Usage:
11+
# ./Scripts/fleurs_full_benchmark.sh
12+
#
13+
# The script downloads FLEURS data automatically if needed.
14+
# Uses caffeinate to prevent sleep so you can close the lid.
15+
# Results are saved to benchmark_results/ with timestamps.
16+
17+
# Strict mode: stop on a failing command, on use of an unset variable,
# and when any stage of a pipeline fails.
set -o errexit -o nounset -o pipefail

# Paths are derived from this script's own location, so the script can be
# started from any working directory.
SCRIPT_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
PROJECT_DIR=$(cd "$SCRIPT_DIR/.." && pwd)
RESULTS_DIR="$PROJECT_DIR/benchmark_results"

# A single timestamp tags the log file and every per-language JSON of this run.
TIMESTAMP=$(date +"%Y%m%d_%H%M%S")
LOG_FILE="$RESULTS_DIR/fleurs_full_benchmark_${TIMESTAMP}.log"
SAMPLES_PER_LANG=100

# All 24 supported languages, appended one performance tier at a time.
LANGUAGES=()
# Best performing (WER < 5%)
LANGUAGES+=("en_us" "es_419" "it_it" "fr_fr" "de_de")
# Good performance (WER 5-10%)
LANGUAGES+=("ru_ru" "nl_nl" "pl_pl" "uk_ua" "sk_sk")
# Moderate performance (WER 10-15%)
LANGUAGES+=("cs_cz" "bg_bg" "hr_hr" "ro_ro" "fi_fi")
# Lower performance (WER > 15%)
LANGUAGES+=("hu_hu" "sv_se" "et_ee" "da_dk" "lt_lt" "el_gr" "mt_mt" "lv_lv" "sl_si")

# Directory whose parakeet-tdt-0.6b-v3 subfolder is checked by verify_models.
MODELS_DIR="$HOME/Library/Application Support/FluidAudio/Models"

# The log file lives in the results directory, so it must exist before logging.
mkdir -p "$RESULTS_DIR"
41+
42+
#######################################
# Print a message prefixed with the current time as [HH:MM:SS] and append
# the same line to the run log.
# Globals:   LOG_FILE (appended to)
# Arguments: message words; combined into one line via "$*"
# Outputs:   the formatted line on stdout
#######################################
log() {
  printf '[%s] %s\n' "$(date '+%H:%M:%S')" "$*" | tee -a "$LOG_FILE"
}
45+
46+
# Verify Parakeet v3 models exist
47+
#######################################
# Check that every Parakeet TDT v3 model artifact is present on disk.
# Globals:   MODELS_DIR (read)
# Arguments: none
# Outputs:   one "MISSING v3: <path>" line via log for the first absent artifact
# Returns:   0 when all artifacts exist, 1 as soon as one is missing
#######################################
verify_models() {
  local v3_dir="$MODELS_DIR/parakeet-tdt-0.6b-v3"
  local -a required=(
    Preprocessor.mlmodelc
    Encoder.mlmodelc
    Decoder.mlmodelc
    JointDecision.mlmodelc
    parakeet_vocab.json
  )
  # Keep the loop variable local so the check never overwrites a caller's variable.
  local artifact

  for artifact in "${required[@]}"; do
    # -e rather than -f: this accepts directories as well as regular files.
    if [[ ! -e "$v3_dir/$artifact" ]]; then
      log "MISSING v3: $v3_dir/$artifact"
      return 1
    fi
  done
  return 0
}
57+
58+
# --- Preflight: stop early when the Parakeet v3 model files are not installed ---
log "=== Verifying Parakeet v3 models ==="
verify_models || {
  log ""
  log "ERROR: Parakeet v3 models missing."
  log "Please run ASR benchmark first to download models."
  exit 1
}
log "Parakeet v3 models verified. FLEURS data will download automatically if needed."

log "=== FLEURS full benchmark: $SAMPLES_PER_LANG samples x ${#LANGUAGES[@]} languages = $(( SAMPLES_PER_LANG * ${#LANGUAGES[@]} )) total ==="
log "Results directory: $RESULTS_DIR"

# Work from the project root for the build and the benchmark runs.
cd "$PROJECT_DIR"

# --- Build: compile the release CLI only when no executable exists yet ---
CLI="$PROJECT_DIR/.build/release/fluidaudiocli"
if [[ ! -x "$CLI" ]]; then
  log "Building release binary..."
  # Only the final line of the build output is shown and logged.
  swift build -c release 2>&1 | tail -1 | tee -a "$LOG_FILE"
fi

# --- Sleep prevention ---
# caffeinate -s: prevent system sleep (per the man page, honoured only on AC power)
# caffeinate -i: prevent idle sleep
# caffeinate -w: hold the assertions until this script's process ($$) exits
caffeinate -s -i -w "$$" &
CAFFEINATE_PID=$!
log "caffeinate started (PID $CAFFEINATE_PID) — safe to close the lid"
84+
85+
SUITE_START=$(date +%s)

# Human-readable names, index-aligned with LANGUAGES: LANG_NAMES[i] labels LANGUAGES[i].
# es_419 is Latin American Spanish (UN M.49 region 419), not Spain.
LANG_NAMES=(
  "English (US)" "Spanish (Latin America)" "Italian (Italy)" "French (France)" "German (Germany)"
  "Russian (Russia)" "Dutch (Netherlands)" "Polish (Poland)" "Ukrainian (Ukraine)" "Slovak (Slovakia)"
  "Czech (Czechia)" "Bulgarian (Bulgaria)" "Croatian (Croatia)" "Romanian (Romania)" "Finnish (Finland)"
  "Hungarian (Hungary)" "Swedish (Sweden)" "Estonian (Estonia)" "Danish (Denmark)" "Lithuanian (Lithuania)"
  "Greek (Greece)" "Maltese (Malta)" "Latvian (Latvia)" "Slovenian (Slovenia)"
)

# The two arrays are paired by index. Catch a mismatch now instead of hitting
# an unbound-variable error part-way through the run.
if (( ${#LANG_NAMES[@]} != ${#LANGUAGES[@]} )); then
  log "ERROR: LANG_NAMES has ${#LANG_NAMES[@]} entries but LANGUAGES has ${#LANGUAGES[@]}."
  exit 1
fi

# Failed languages are counted rather than aborting the suite, so one bad
# language does not discard the remaining runs. The EXIT trap still makes the
# script exit non-zero when any language failed.
failed_count=0
failed_langs=""
trap 'rc=$?; if (( rc == 0 && failed_count > 0 )); then exit 1; fi' EXIT

# Run all languages, one CLI invocation and one JSON result file per language.
for i in "${!LANGUAGES[@]}"; do
  lang="${LANGUAGES[$i]}"
  name="${LANG_NAMES[$i]}"
  label="fleurs_${lang}"
  output_file="$RESULTS_DIR/${label}_${TIMESTAMP}.json"

  log "--- [$((i+1))/${#LANGUAGES[@]}] $name ($lang): starting ($SAMPLES_PER_LANG samples) ---"
  start_time=$(date +%s)

  # Running the pipeline as an `if` condition keeps `set -e`/`pipefail` from
  # terminating the script when the CLI exits non-zero.
  if "$CLI" fleurs-benchmark \
    --languages "$lang" \
    --samples "$SAMPLES_PER_LANG" \
    --output "$output_file" \
    2>&1 | tee -a "$LOG_FILE"; then
    run_ok=1
  else
    run_ok=0
  fi

  end_time=$(date +%s)
  elapsed=$(( end_time - start_time ))

  if (( run_ok )); then
    log "--- $name: finished in ${elapsed}s — $output_file ---"
  else
    failed_count=$(( failed_count + 1 ))
    failed_langs="${failed_langs:+$failed_langs }$lang"
    log "--- $name: FAILED after ${elapsed}s — continuing with the next language ---"
  fi
done

if (( failed_count > 0 )); then
  log "WARNING: $failed_count of ${#LANGUAGES[@]} languages failed: $failed_langs"
fi
115+
116+
# Total wall-clock time of the suite, split into hours / minutes / seconds.
SUITE_END=$(date +%s)
SUITE_ELAPSED=$(( SUITE_END - SUITE_START ))
SUITE_HOURS=$(( SUITE_ELAPSED / 3600 ))
SUITE_MINS=$(( (SUITE_ELAPSED % 3600) / 60 ))
SUITE_SECS=$(( SUITE_ELAPSED % 60 ))

log "=== All benchmarks complete in ${SUITE_HOURS}h ${SUITE_MINS}m ${SUITE_SECS}s ==="
log "Results:"
# List this run's JSON files. An unmatched glob stays as the literal pattern,
# so test the first entry for existence. A failing `ls` inside a pipeline
# would otherwise end the script under `set -e`/`pipefail` with no message.
result_files=("$RESULTS_DIR"/*_"${TIMESTAMP}".json)
if [[ -e "${result_files[0]}" ]]; then
  ls -lh "${result_files[@]}" | tee -a "$LOG_FILE"
else
  log "(no result files found for run ${TIMESTAMP})"
fi

# Header of the per-language metrics table. The sample count comes from the
# same variable that was passed to the CLI, so it cannot drift.
log ""
log "=== WER Summary ($SAMPLES_PER_LANG samples per language) ==="
log ""
printf "%-30s %10s %10s %10s\n" "Language" "WER%" "CER%" "RTFx" | tee -a "$LOG_FILE"
printf "%-30s %10s %10s %10s\n" "------------------------------" "----------" "----------" "----------" | tee -a "$LOG_FILE"
132+
133+
#######################################
# Read WER, CER and RTFx from one benchmark result file.
# Arguments: $1 - path to a result JSON with summary.averageWER,
#                 summary.averageCER and summary.averageRTFx
# Outputs:   one line "<wer>\t<cer>\t<rtfx>" with real TAB separators, where
#            WER/CER are percentages rounded to 2 decimals and RTFx is rounded
#            to 1 decimal; "N/A\tN/A\tN/A" (also TAB separated) when the file
#            is missing or cannot be parsed
#######################################
extract_metrics() {
  local json_file="$1"
  # $'...' yields real TAB characters, so `cut -f` can split the fallback
  # the same way it splits a successful result.
  local fallback=$'N/A\tN/A\tN/A'

  if [[ ! -f "$json_file" ]]; then
    printf '%s\n' "$fallback"
    return 0
  fi

  # The path is passed as an argument rather than spliced into the Python
  # source, so quotes or other special characters in it cannot break the code.
  # Python errors (bad JSON, missing keys) are silenced on purpose; the
  # fallback line is printed instead.
  python3 - "$json_file" 2>/dev/null <<'PY' || printf '%s\n' "$fallback"
import json
import sys

with open(sys.argv[1]) as fh:
    summary = json.load(fh)['summary']
wer = round(summary['averageWER'] * 100, 2)
cer = round(summary['averageCER'] * 100, 2)
rtfx = round(summary['averageRTFx'], 1)
print(f'{wer}\t{cer}\t{rtfx}')
PY
}
148+
149+
# One table row per language: look up this run's JSON, pull the three
# tab-separated metrics out of it, and print them aligned under the header.
for idx in "${!LANGUAGES[@]}"; do
  code="${LANGUAGES[$idx]}"
  display="${LANG_NAMES[$idx]}"
  result_json="$RESULTS_DIR/fleurs_${code}_${TIMESTAMP}.json"

  row=$(extract_metrics "$result_json")
  # Fields are TAB separated: 1 = WER, 2 = CER, 3 = RTFx.
  wer=$(cut -f1 <<<"$row")
  cer=$(cut -f2 <<<"$row")
  rtfx=$(cut -f3 <<<"$row")

  printf "%-30s %9s%% %9s%% %9sx\n" "$display ($code)" "$wer" "$cer" "$rtfx" \
    | tee -a "$LOG_FILE"
done

log ""
log "✅ Full FLEURS benchmark complete"
log "Total samples processed: $(( SAMPLES_PER_LANG * ${#LANGUAGES[@]} ))"
log "Results saved to: $RESULTS_DIR/*_${TIMESTAMP}.json"

# No explicit cleanup of caffeinate: it was started with -w $$ and therefore
# exits on its own once this script's process ends.

0 commit comments

Comments
 (0)