diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx
index 72358ae72c..b67b3fa7ce 100644
--- a/apps/llm/app/index.tsx
+++ b/apps/llm/app/index.tsx
@@ -29,12 +29,6 @@ export default function Home() {
>
LLM Structured Output
- router.navigate('voice_chat/')}
- >
- Voice Chat
-
router.navigate('multimodal_llm/')}
diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx
deleted file mode 100644
index 23ab70bff4..0000000000
--- a/apps/llm/app/voice_chat/index.tsx
+++ /dev/null
@@ -1,311 +0,0 @@
-import { useContext, useEffect, useState } from 'react';
-import {
- Keyboard,
- KeyboardAvoidingView,
- Platform,
- StyleSheet,
- Text,
- TouchableOpacity,
- TouchableWithoutFeedback,
- View,
-} from 'react-native';
-import SWMIcon from '../../assets/icons/swm_icon.svg';
-import Spinner from '../../components/Spinner';
-import ErrorBanner from '../../components/ErrorBanner';
-import {
- useSpeechToText,
- useLLM,
- QWEN3_0_6B_QUANTIZED,
- QWEN3_1_7B_QUANTIZED,
- LLAMA3_2_1B_SPINQUANT,
- WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
- WHISPER_BASE_EN,
- WHISPER_SMALL_EN,
- LLMProps,
- SpeechToTextProps,
-} from 'react-native-executorch';
-import { ModelPicker, ModelOption } from '../../components/ModelPicker';
-import PauseIcon from '../../assets/icons/pause_icon.svg';
-import MicIcon from '../../assets/icons/mic_icon.svg';
-import StopIcon from '../../assets/icons/stop_icon.svg';
-import ColorPalette from '../../colors';
-import Messages from '../../components/Messages';
-import { AudioManager, AudioRecorder } from 'react-native-audio-api';
-import DeviceInfo from 'react-native-device-info';
-import { useIsFocused } from '@react-navigation/native';
-import { useSafeAreaInsets } from 'react-native-safe-area-context';
-import { GeneratingContext } from '../../context';
-
-type LLMModelSources = LLMProps['model'];
-type STTModelSources = SpeechToTextProps['model'];
-
-const LLM_MODELS: ModelOption[] = [
- { label: 'Qwen3 0.6B', value: QWEN3_0_6B_QUANTIZED },
- { label: 'Qwen3 1.7B', value: QWEN3_1_7B_QUANTIZED },
- { label: 'Llama 1B', value: LLAMA3_2_1B_SPINQUANT },
-];
-
-const STT_MODELS: ModelOption[] = [
- { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
- { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
- { label: 'Whisper Base', value: WHISPER_BASE_EN },
- { label: 'Whisper Small', value: WHISPER_SMALL_EN },
-];
-
-export default function VoiceChatScreenWrapper() {
- const isFocused = useIsFocused();
-
- return isFocused ? <VoiceChatScreen /> : null;
-}
-
-function VoiceChatScreen() {
- const { bottom } = useSafeAreaInsets();
- const [isRecording, setIsRecording] = useState(false);
- const [liveTranscription, setLiveTranscription] = useState('');
- const [selectedLLM, setSelectedLLM] =
- useState<LLMModelSources>(QWEN3_0_6B_QUANTIZED);
- const [selectedSTT, setSelectedSTT] =
- useState<STTModelSources>(WHISPER_TINY_EN);
- const [error, setError] = useState<string | null>(null);
-
- const [recorder] = useState(() => new AudioRecorder());
-
- const { setGlobalGenerating } = useContext(GeneratingContext);
-
- const llm = useLLM({ model: selectedLLM });
- const speechToText = useSpeechToText({
- model: selectedSTT,
- });
-
- useEffect(() => {
- setGlobalGenerating(llm.isGenerating || speechToText.isGenerating);
- }, [llm.isGenerating, speechToText.isGenerating, setGlobalGenerating]);
-
- useEffect(() => {
- AudioManager.setAudioSessionOptions({
- iosCategory: 'playAndRecord',
- iosMode: 'spokenAudio',
- iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
- });
- AudioManager.requestRecordingPermissions();
- }, []);
-
- const handleRecordPress = async () => {
- if (isRecording) {
- setIsRecording(false);
- recorder.stop();
- speechToText.streamStop();
- } else {
- setIsRecording(true);
- setLiveTranscription('');
-
- const sampleRate = 16000;
- recorder.onAudioReady(
- {
- sampleRate,
- bufferLength: 0.1 * sampleRate,
- channelCount: 1,
- },
- ({ buffer }) => {
- speechToText.streamInsert(buffer.getChannelData(0));
- }
- );
- recorder.start();
-
- let finalResult = '';
-
- try {
- for await (const result of speechToText.stream()) {
- const text = result.committed.text + result.nonCommitted.text;
- setLiveTranscription(text);
- finalResult = text;
- }
- } catch (e) {
- setError(e instanceof Error ? e.message : String(e));
- } finally {
- if (finalResult.trim().length > 0) {
- await llm.sendMessage(finalResult);
- setLiveTranscription('');
- }
- }
- }
- };
-
- useEffect(() => {
- if (llm.error) setError(String(llm.error));
- }, [llm.error]);
-
- useEffect(() => {
- if (speechToText.error) setError(String(speechToText.error));
- }, [speechToText.error]);
-
- return (!llm.isReady || !speechToText.isReady) &&
- !llm.error &&
- !speechToText.error ? (
-
- ) : (
-
-
-
-
- Qwen 3 x Whisper
-
- setError(null)} />
- {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? (
-
- 0
- ? [
- ...llm.messageHistory,
- {
- role: 'user',
- content: liveTranscription,
- },
- ]
- : llm.messageHistory
- }
- llmResponse={llm.response}
- isGenerating={llm.isGenerating}
- deleteMessage={llm.deleteMessage}
- />
-
- ) : (
-
- Hello! 👋
-
- Tap the mic and speak to me. I'll transcribe your voice and
- respond using a language model — all on-device.
-
-
- )}
-
- setSelectedLLM(m)}
- />
- setSelectedSTT(m)}
- />
-
-
- {DeviceInfo.isEmulatorSync() ? (
-
-
- recording disabled on emulator
-
-
- ) : (
- <>
- {llm.isGenerating ? (
-
-
-
- ) : (
-
- {isRecording ? (
-
- ) : (
-
- )}
-
- )}
- >
- )}
-
-
-
- );
-}
-
-const styles = StyleSheet.create({
- keyboardAvoidingView: {
- flex: 1,
- },
- topContainer: {
- height: 68,
- width: '100%',
- alignItems: 'center',
- justifyContent: 'center',
- },
- chatContainer: {
- flex: 10,
- width: '100%',
- },
- textModelName: {
- color: ColorPalette.primary,
- },
- helloMessageContainer: {
- flex: 10,
- width: '100%',
- alignItems: 'center',
- justifyContent: 'center',
- },
- helloText: {
- fontFamily: 'medium',
- fontSize: 30,
- color: ColorPalette.primary,
- },
- bottomHelloText: {
- fontFamily: 'regular',
- fontSize: 20,
- lineHeight: 28,
- textAlign: 'center',
- color: ColorPalette.primary,
- },
- bottomContainer: {
- height: 100,
- width: '100%',
- justifyContent: 'center',
- alignItems: 'center',
- paddingHorizontal: 16,
- },
- recordTouchable: {
- height: '100%',
- justifyContent: 'center',
- alignItems: 'center',
- },
- recordingInfo: {
- width: '100%',
- display: 'flex',
- justifyContent: 'center',
- alignItems: 'center',
- },
- emulatorBox: {
- padding: 10,
- margin: 10,
- borderWidth: 1,
- borderRadius: 8,
- borderColor: 'gray',
- justifyContent: 'center',
- alignItems: 'center',
- },
- emulatorWarning: {
- color: 'gray',
- fontSize: 16,
- },
-});
diff --git a/apps/speech/package.json b/apps/speech/package.json
index 93e07755dd..47de1396a3 100644
--- a/apps/speech/package.json
+++ b/apps/speech/package.json
@@ -20,7 +20,7 @@
"metro-config": "^0.83.0",
"react": "19.2.5",
"react-native": "0.83.4",
- "react-native-audio-api": "0.12.0",
+ "react-native-audio-api": "0.12.2",
"react-native-device-info": "^15.0.2",
"react-native-executorch": "workspace:*",
"react-native-executorch-expo-resource-fetcher": "workspace:*",
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index dfd39c15b4..ad4f6505c8 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -14,21 +14,25 @@ import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
import {
useSpeechToText,
WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
+ WHISPER_TINY_EN_COREML,
WHISPER_BASE_EN,
+ WHISPER_BASE_EN_COREML,
WHISPER_SMALL_EN,
TranscriptionResult,
SpeechToTextProps,
+ WHISPER_SMALL_EN_COREML,
} from 'react-native-executorch';
import { ModelPicker, ModelOption } from '../components/ModelPicker';
type STTModelSources = SpeechToTextProps['model'];
const MODELS: ModelOption[] = [
- { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
- { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
- { label: 'Whisper Base', value: WHISPER_BASE_EN },
- { label: 'Whisper Small', value: WHISPER_SMALL_EN },
+ { label: 'Whisper Tiny EN (XNNPACK)', value: WHISPER_TINY_EN },
+ { label: 'Whisper Tiny EN (CoreML)', value: WHISPER_TINY_EN_COREML },
+ { label: 'Whisper Base EN (XNNPACK)', value: WHISPER_BASE_EN },
+ { label: 'Whisper Base EN (CoreML)', value: WHISPER_BASE_EN_COREML },
+ { label: 'Whisper Small EN (XNNPACK)', value: WHISPER_SMALL_EN },
+ { label: 'Whisper Small EN (CoreML)', value: WHISPER_SMALL_EN_COREML },
];
import FontAwesome from '@expo/vector-icons/FontAwesome';
import {
@@ -45,9 +49,12 @@ import ErrorBanner from '../components/ErrorBanner';
const isSimulator = DeviceInfo.isEmulatorSync();
+const DEFAULT_MODEL =
+ Platform.OS === 'ios' ? WHISPER_BASE_EN_COREML : WHISPER_TINY_EN;
+
export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
const [selectedModel, setSelectedModel] =
- useState<STTModelSources>(WHISPER_TINY_EN);
+ useState<STTModelSources>(DEFAULT_MODEL);
const model = useSpeechToText({
model: selectedModel,
@@ -148,7 +155,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
recorder.current.onAudioReady(
{
sampleRate,
- bufferLength: 0.1 * sampleRate,
+ bufferLength: 0.1 * sampleRate, // 100 ms
channelCount: 1,
},
({ buffer }) => {
@@ -178,6 +185,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
try {
const streamIter = model.stream({
verbose: enableTimestamps,
+ timeout: 100,
});
for await (const { committed, nonCommitted } of streamIter) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index a20fd7b1bc..ec47586266 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -599,8 +599,7 @@ inline jsi::Value getJsiValue(const Segment &seg, jsi::Runtime &runtime) {
jsi::Object wordObj(runtime);
wordObj.setProperty(
runtime, "word",
- jsi::String::createFromUtf8(runtime, seg.words[i].content +
- seg.words[i].punctations));
+ jsi::String::createFromUtf8(runtime, seg.words[i].content));
wordObj.setProperty(runtime, "start",
static_cast<double>(seg.words[i].start));
wordObj.setProperty(runtime, "end", static_cast<double>(seg.words[i].end));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 4b58c5039b..9537642d58 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -94,7 +94,7 @@ TranscriptionResult wordsToResult(const std::vector<Word> &words,
std::string fullText;
for (const auto &w : words) {
- fullText += w.content + w.punctations;
+ fullText += w.content;
}
res.text = fullText;
@@ -115,7 +115,8 @@ TranscriptionResult wordsToResult(const std::vector<Word> &words,
} // namespace
void SpeechToText::stream(std::shared_ptr callback,
- std::string languageOption, bool verbose) {
+ std::string languageOption, bool verbose,
+ uint32_t timeout) {
if (isStreaming_) {
throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
"Streaming is already in progress!");
@@ -158,10 +159,10 @@ void SpeechToText::stream(std::shared_ptr callback,
// running transcriptions too rapidly (before the audio buffer is filled
// with a significant amount of new data) can cause the streamer to commit
// wrong phrases.
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ std::this_thread::sleep_for(std::chrono::milliseconds(timeout));
}
- std::vector<Word> finalWords = streamer_->finish();
+ std::vector<Word> finalWords = streamer_->finish(options);
TranscriptionResult finalRes =
wordsToResult(finalWords, languageOption, verbose);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index ade835869c..ec51862793 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -42,7 +42,8 @@ class SpeechToText {
// Stream
void stream(std::shared_ptr callback,
- std::string languageOption, bool enableTimestamps);
+ std::string languageOption, bool enableTimestamps,
+ uint32_t timeout);
void streamStop();
void streamInsert(std::span<float> waveform);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
index 357309391d..efe6cc2819 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
@@ -36,7 +36,7 @@ class OnlineASR {
virtual ProcessResult process(const DecodingOptions &options) = 0;
- virtual std::vector<Word> finish() = 0;
+ virtual std::vector<Word> finish(const DecodingOptions &options) = 0;
virtual void reset() = 0;
};
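For orientation, SpeechToText::stream() (above) drives this interface in a poll loop. Below is a minimal sketch of the contract under the new signature; the stand-in types and the runStream() helper are illustrations, not the real definitions:

```cpp
#include <chrono>
#include <cstdint>
#include <string>
#include <thread>
#include <vector>

// Simplified stand-ins for the real Word / DecodingOptions / ProcessResult.
struct Word { std::string content; float start; float end; };
struct DecodingOptions { std::string language = "en"; bool verbose = false; };
struct ProcessResult { std::vector<Word> committed; std::vector<Word> nonCommitted; };

// Shape of the schema::OnlineASR contract after this change: finish() now
// takes DecodingOptions so it can run one final process() pass internally.
struct OnlineASR {
  virtual bool isReady() const = 0;
  virtual ProcessResult process(const DecodingOptions &options) = 0;
  virtual std::vector<Word> finish(const DecodingOptions &options) = 0;
  virtual void reset() = 0;
  virtual ~OnlineASR() = default;
};

// Poll loop mirroring SpeechToText::stream(): sleep `timeout` ms between
// iterations so the audio buffer can accumulate enough new samples for a
// meaningful transcription pass (see the comment in SpeechToText.cpp).
void runStream(OnlineASR &streamer, const DecodingOptions &options,
               const bool &stopRequested, uint32_t timeout) {
  while (!stopRequested) {
    if (streamer.isReady()) {
      ProcessResult res = streamer.process(options);
      // ...emit res.committed / res.nonCommitted to the callback here...
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(timeout));
  }
  std::vector<Word> tail = streamer.finish(options); // flush remaining words
  // ...emit `tail`, then prepare for the next session...
  streamer.reset();
}
```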
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
index e7319f95b5..2343d1faab 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
@@ -4,13 +4,14 @@
namespace rnexecutorch::models::speech_to_text {
+/**
+ * Essentially an alternative representation of a token,
+ * with word-level timestamps attached.
+ */
struct Word {
std::string content;
float start;
float end;
-
- std::string
- punctations; // Trailing punctations which appear after the main content
};
} // namespace rnexecutorch::models::speech_to_text
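With the punctations field removed, a trailing punctuation mark now travels as its own zero-duration Word (see the ASR.cpp change below, which emits it with start == end at the end of the preceding word). A small illustration under that assumption, with made-up timestamps:

```cpp
#include <string>
#include <vector>

// Mirrors common/types/Word.h after this change.
struct Word {
  std::string content;
  float start;
  float end;
};

// "Hello, world." as it would now be segmented: punctuation marks become
// separate, instantaneous words anchored to the end of the word they follow.
std::vector<Word> exampleWords() {
  return {
      {"Hello", 0.00f, 0.42f},
      {",", 0.42f, 0.42f}, // start == end: zero-duration punctuation "word"
      {"world", 0.55f, 0.98f},
      {".", 0.98f, 0.98f},
  };
}
```

Since wordsToResult() simply concatenates content, the final transcription text comes out the same as with the old content + punctations pairing.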
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index d1debeb0f0..d2555a79fa 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -138,8 +138,9 @@ executorch::aten::Tensor ASR::decode(std::span tokens,
positionShape, cachePositions.data(), ScalarType::Long);
const auto encoderOutputSize = static_cast<int32_t>(encoderOutput.size());
- std::vector<int32_t> encShape = {1, constants::kNumFrames,
- encoderOutputSize / constants::kNumFrames};
+ std::vector<int32_t> encShape = {
+ 1, static_cast<int32_t>(constants::kNumFrames),
+ encoderOutputSize / static_cast<int32_t>(constants::kNumFrames)};
auto encoderTensor = executorch::extension::make_tensor_ptr(
std::move(encShape), const_cast(encoderOutput.data()),
ScalarType::Float);
@@ -262,11 +263,21 @@ ASR::generate(std::span<float> waveform, const DecodingOptions &options,
std::vector<float> scores;
uint64_t startPos = 0;
- while (std::cmp_less_equal(startPos + sequenceIds.size(),
- constants::kMaxDecodeLength)) {
- executorch::aten::Tensor logitsTensor =
- this->decode(sequenceIds, encoderFeatures, startPos);
+ // Prefill: feed each initial token individually so decode() always sees 1
+ // token
+ std::span firstToken(sequenceIds.data(), 1);
+ executorch::aten::Tensor logitsTensor =
+ this->decode(firstToken, encoderFeatures, startPos);
+ ++startPos;
+ for (size_t i = 1; i < sequenceIds.size(); ++i) {
+ std::span single(sequenceIds.data() + i, 1);
+ logitsTensor = this->decode(single, encoderFeatures, startPos);
+ ++startPos;
+ }
+
+ // Autoregressive decoding: always 1 token at a time
+ while (std::cmp_less(startPos, constants::kMaxDecodeLength)) {
const size_t logitsInnerDim = logitsTensor.size(1);
const size_t logitsDictSize = logitsTensor.size(2);
const float *logitsData = logitsTensor.const_data_ptr<float>() +
@@ -302,15 +313,16 @@ ASR::generate(std::span waveform, const DecodingOptions &options,
nextProb = probs[nextId];
}
- // Move the startPos pointer by the amount of tokens we processed
- startPos += sequenceIds.size();
- sequenceIds = {nextId};
cachedTokens.push_back(nextId);
scores.push_back(nextProb);
if (nextId == endOfTranscriptionToken_) {
break;
}
+
+ std::span single(&cachedTokens.back(), 1);
+ logitsTensor = this->decode(single, encoderFeatures, startPos);
+ ++startPos;
}
return {.tokens = std::vector(cachedTokens.cbegin() +
@@ -437,7 +449,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens,
const float wEnd = wStart + timePerChar * wSize;
prevCharCount += wSize;
- // We store punctations separately to other characters.
+ // Detect and extract trailing punctuations.
std::string puncts = "";
while (!w.empty() && constants::kPunctations.contains(w.back())) {
puncts += w.back();
@@ -445,7 +457,14 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens,
}
std::reverse(puncts.begin(), puncts.end());
- wordObjs.emplace_back(std::move(w), wStart, wEnd, std::move(puncts));
+ // Add the core word.
+ wordObjs.emplace_back(std::move(w), wStart, wEnd);
+
+ // If punctuation was present, add it as a separate "word" with an
+ // instantaneous timestamp at the end of the original word.
+ if (!puncts.empty()) {
+ wordObjs.emplace_back(std::move(puncts), wEnd, wEnd);
+ }
}
return wordObjs;
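The reworked generate() loop is a standard prefill-then-decode pattern: the prompt is fed one token at a time so decode() always sees exactly one token, and generation then proceeds strictly one token per step. A condensed sketch of that control flow; decode(), argmax(), the vocabulary size, and the int64_t token type below are stand-ins rather than the real ExecuTorch calls:

```cpp
#include <algorithm>
#include <cstdint>
#include <span>
#include <vector>

constexpr size_t kMaxDecodeLength = 128;

// Stub standing in for the ExecuTorch decoder call in ASR::decode():
// returns vocab-sized logits for the single token at cache position startPos.
std::vector<float> decode(std::span<const int64_t> /*tokens*/,
                          size_t /*startPos*/) {
  return std::vector<float>(51864, 0.0f); // vocab size is illustrative
}

int64_t argmax(const std::vector<float> &logits) {
  return std::max_element(logits.begin(), logits.end()) - logits.begin();
}

// promptIds is assumed non-empty (Whisper always has <|startoftranscript|>).
std::vector<int64_t> generate(std::vector<int64_t> promptIds, int64_t eotId) {
  size_t startPos = 0;

  // Prefill: feed each prompt token individually so decode() always sees
  // exactly one token.
  std::vector<float> logits;
  for (size_t i = 0; i < promptIds.size(); ++i) {
    logits = decode(std::span(promptIds.data() + i, 1), startPos);
    ++startPos;
  }

  // Autoregressive phase: sample from the last logits, then decode the newly
  // sampled token at the next cache position, one token per iteration.
  std::vector<int64_t> out = std::move(promptIds);
  while (startPos < kMaxDecodeLength) {
    const int64_t next = argmax(logits);
    out.push_back(next);
    if (next == eotId) {
      break;
    }
    logits = decode(std::span(&out.back(), 1), startPos);
    ++startPos;
  }
  return out; // the real code returns only the tokens past the prompt
}
```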
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
index 0b284345ec..62a9f968f7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
@@ -9,34 +9,37 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants {
// Maximum duration of each audio chunk to process (in seconds)
// It is intentionally set to 29 since otherwise only the last chunk would be
// correctly transcribed due to the model's positional encoding limit
-constexpr static int32_t kChunkSize = 29;
+inline constexpr size_t kChunkSize = 29;
// Sampling rate expected by Whisper and the model's audio pipeline (16 kHz)
-constexpr static int32_t kSamplingRate = 16000;
-constexpr static int32_t kSamplesPerMilisecond = kSamplingRate / 1000;
+inline constexpr size_t kSamplingRate = 16000;
+inline constexpr size_t kSamplesPerMilisecond = kSamplingRate / 1000;
+
+inline constexpr size_t kMaxSamples = kChunkSize * kSamplingRate;
// The maximum number of tokens the decoder can generate per chunk
-constexpr static int32_t kMaxDecodeLength = 128;
+inline constexpr size_t kMaxDecodeLength = 128;
// Minimum allowed chunk length before processing (in audio samples)
-constexpr static int32_t kMinChunkSamples = 1 * kSamplingRate;
+inline constexpr size_t kMinChunkSamples = 1 * kSamplingRate;
// Number of mel frames output by the encoder (derived from input spectrogram)
-constexpr static int32_t kNumFrames = 1500;
+inline constexpr size_t kNumFrames = 1500;
// Time precision used by Whisper timestamps: each token spans 0.02 seconds
-constexpr static float kTimePrecision = 0.02f;
+inline constexpr float kTimePrecision = 0.02f;
// Special characters serving as pause / end of sentence
-static const std::unordered_set<char> kPunctations = {',', '.', '?',
+inline const std::unordered_set<char> kPunctations = {',', '.', '?',
'!', ':', ';'};
+inline const std::unordered_set<char> kEosPunctations = {'.', '?', '!', ';'};
// Special token constants
namespace tokens {
-static const std::string kStartOfTranscript = "<|startoftranscript|>";
-static const std::string kEndOfTranscript = "<|endoftext|>";
-static const std::string kBeginTimestamp = "<|0.00|>";
-static const std::string kBlankAudio = "[BLANK_AUDIO]";
+inline const std::string kStartOfTranscript = "<|startoftranscript|>";
+inline const std::string kEndOfTranscript = "<|endoftext|>";
+inline const std::string kBeginTimestamp = "<|0.00|>";
+inline const std::string kBlankAudio = "[BLANK_AUDIO]";
} // namespace tokens
} // namespace rnexecutorch::models::speech_to_text::whisper::constants
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
deleted file mode 100644
index ce365e4e44..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-#include "HypothesisBuffer.h"
-#include "Params.h"
-#include "Utils.h"
-
-#include
-#include
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-void HypothesisBuffer::insert(std::span<Word> words, float offset) {
- // Step 1 - decide which words should be considered as fresh.
- fresh_.clear();
-
- // We try to find the last committed word in a transcription string.
- // Everything beyond that word will be considered as fresh.
- // To make the algorithm more resilient to repeated strings of words,
- // we check also the preceeding words as well as timestamps (with liberal
- // range).
- size_t firstFreshWordIdx = 0;
- if (!committed_.empty()) {
- std::optional<size_t> lastMatchingWordIdx =
- findCommittedSuffix(words, params::kStreamCommitedSuffixSearchSize,
- params::kStreamMaxOverlapTimestampDiff1,
- params::kStreamWordsPerErrorRate);
- firstFreshWordIdx = lastMatchingWordIdx.value_or(0);
- }
-
- bool isCompletelyFresh = firstFreshWordIdx == 0;
- for (size_t i = firstFreshWordIdx; i < words.size(); i++) {
- const auto &word = words[i];
-
- // Global start is a beginning timestamp relative only to the beginning of
- // the current streaming process.
- const float startGlobal = word.start + offset;
- const float endGlobal = word.end + offset;
-
- if (!isCompletelyFresh ||
- startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) {
- fresh_.emplace_back(word.content, startGlobal, endGlobal,
- word.punctations);
- }
- }
-
- // Step 2 - we have already selected the fresh words. Now it's time to
- // correct any mistakes and remove the words which overlap with already
- // commited segments - to avoid duplicates.
- if (!fresh_.empty() && !committed_.empty()) {
- // Calculate the largest overlapping fragment size.
- // Note that we use size limit (kStreamMaxOverlapSize) for efficiency of the
- // algorithm, and timestamp difference limit
- // (kStreamMaxOverlapTimestampDiff) to avoid removing correct fragments
- // which were just repeated after some time.
- size_t overlapSize = utils::findLargestOverlapingFragment(
- committed_, fresh_, params::kStreamMaxOverlapSize,
- params::kStreamMaxOverlapTimestampDiff2);
-
- if (overlapSize > 0) {
- fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize);
- }
- }
-}
-
-std::deque<Word> HypothesisBuffer::commit() {
- std::deque<Word> toCommit = {};
-
- // Find a stable prefix: words that haven't changed between last and current
- // iteration.
- while (!fresh_.empty() && !hypothesis_.empty() &&
- fresh_.front().content == hypothesis_.front().content) {
- // The last word from the fresh_ buffer must also match punctations with the
- // hypothesis. This is done in order to ensure correct punctation marks in
- // the resulting transcription.
- if (fresh_.size() == 1 &&
- fresh_.front().punctations != hypothesis_.front().punctations) {
- break;
- }
-
- // Take timestamps from the hypothesis, but actual content from the fresh
- // buffer.
- toCommit.emplace_back(std::move(fresh_.front().content),
- hypothesis_.front().start, hypothesis_.front().end,
- std::move(fresh_.front().punctations));
- fresh_.pop_front();
- hypothesis_.pop_front();
- }
-
- // Save the last committed word timestamp.
- // This will mark the end of the entire committed sequence.
- if (!toCommit.empty()) {
- lastCommittedTime_ = toCommit.back().end;
- }
-
- // The remaining words from the fresh buffer (uncommitted phrase)
- // become a hypothesis for the next iteration.
- hypothesis_ = std::move(fresh_);
- fresh_.clear();
-
- // The last step is to commit the selected words.
- committed_.insert(committed_.end(), toCommit.cbegin(), toCommit.cend());
-
- return toCommit;
-}
-
-void HypothesisBuffer::releaseCommits(size_t wordsToKeep) {
- if (committed_.size() > wordsToKeep) {
- size_t nWordsToErase = committed_.size() - wordsToKeep;
- committed_.erase(committed_.begin(), committed_.begin() + nWordsToErase);
- }
-}
-
-void HypothesisBuffer::reset() {
- fresh_.clear();
- hypothesis_.clear();
- committed_.clear();
-
- lastCommittedTime_ = 0.f;
-}
-
-std::optional<size_t> HypothesisBuffer::findCommittedSuffix(
- std::span<Word> words, size_t nCommitted,
- float timestampDiffTolerance, size_t wordsPerMistake) {
- if (words.empty() || committed_.empty() || nCommitted == 0) {
- return std::nullopt;
- }
-
- // Determine the subset size of committed words to check against.
- size_t committedToMatchSize = std::min(nCommitted, committed_.size());
-
- // Iterate backwards through 'words' to find the most recent occurrence of a
- // suffix of 'committed_' (or the full 'committed_' sequence).
- for (int32_t i = static_cast<int32_t>(words.size()) - 1; i >= 0; --i) {
- bool match = true;
- size_t matchedCount = 0;
- size_t contentMistakeCount = 0;
-
- // Linearly interpolate tolerance if we are at the beginning and can't check
- // all committed words.
- float effectiveTolerance = timestampDiffTolerance;
- if (i < static_cast<int32_t>(committedToMatchSize) - 1) {
- effectiveTolerance *=
- static_cast<float>(i + 1) / static_cast<float>(committedToMatchSize);
- }
-
- // Try to match backwards from words[i] and committed_.back()
- for (size_t j = 0; j < committedToMatchSize; ++j) {
- int32_t wordsIdx = i - static_cast<int32_t>(j);
- int32_t committedIdx =
- static_cast<int32_t>(committed_.size()) - 1 - static_cast<int32_t>(j);
-
- if (wordsIdx < 0) {
- // We reached the beginning of the words span.
- // The algorithm allows matching a partial prefix if it's at the start.
- break;
- }
-
- const Word &w1 = words[wordsIdx];
- const Word &w2 = committed_[committedIdx];
-
- // Check timestamps within tolerance
- if (std::max(std::abs(w1.start - w2.start), std::abs(w1.end - w2.end)) >
- effectiveTolerance) {
- match = false;
- break;
- }
-
- // Allow sparse content mismatches while still treating the overall
- // sequence as matching.
- if (utils::equalsIgnoreCase(w1.content, w2.content)) {
- matchedCount++;
- } else {
- contentMistakeCount++;
- }
-
- // Early exit if mistake count already exceeds what we can recover from
- // given the remaining words to check.
- if (wordsPerMistake > 0) {
- size_t remainingToMatch = committedToMatchSize - 1 - j;
- size_t maxPossibleMatched = matchedCount + remainingToMatch;
- if (contentMistakeCount > (maxPossibleMatched / wordsPerMistake)) {
- match = false;
- break;
- }
- }
- }
-
- // One content mistake is allowed per M matched words.
- size_t maxAllowedMistakes =
- (wordsPerMistake == 0) ? 0 : (matchedCount / wordsPerMistake);
-
- if (match && matchedCount > 0 &&
- contentMistakeCount <= maxAllowedMistakes) {
- return static_cast<size_t>(i);
- }
- }
-
- return std::nullopt;
-}
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
deleted file mode 100644
index 25833ec01b..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-
-#include "../common/types/Word.h"
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-/**
- * A buffer for managing streaming transcription hypotheses.
- * This class handles stabilization of the transcription result by tracking
- * "fresh" hypotheses and "committing" them once they are stable across updates.
- */
-class HypothesisBuffer {
-public:
- /**
- * Inserts new words into the fresh_ buffer.
- * Words are filtered based on the last committed time and checked for
- * overlaps with existing committed words to prevent duplicates.
- *
- * @param newWords A span of recently generated words.
- * @param offset Time offset to adjust the word timestamps.
- */
- void insert(std::span<Word> words, float offset);
-
- /**
- * Attempts to commit words present in the fresh_ buffer.
- * A phrase from fresh_ buffer can only be committed if it also appears
- * in the hypothesis_ buffer (uncommitted words from previous iteration).
- *
- * Uncommitted words become a 'hypothesis' and are moved into the hypothesis_
- * buffer.
- *
- * @return A sequence of words committed in the current iteration.
- */
- std::deque commit();
-
- /**
- * Shrinks the committed_ buffer by erasing all words except N latest ones.
- *
- * Used primarily to relieve increasing memory usage during very
- * long streaming sessions.
- *
- * @param wordsToKeep - number of trailing words to be kept in.
- */
- void releaseCommits(size_t wordsToKeep);
-
- /**
- * Resets all the stored buffers and state variables to the initial state
- */
- void reset();
-
- // Declare a friendship with OnlineASR to allow it to access the internal
- // state of stored buffers.
- friend class OnlineASR;
-
-private:
- // Finds the most recent occurance of given committed string of words
- // in a custom span of words.
- // Returns the index of the last matching word (or nullopt if not present).
- std::optional<size_t> findCommittedSuffix(std::span<Word> words,
- size_t nCommitted,
- float timestampDiffTolerance = 1.F,
- size_t wordsPerMistake = 4);
-
- // Stored buffers
- // The lifecycle of a correct result word looks as following:
- // fresh buffer -> hypothesis buffer -> commited
- std::deque<Word>
- fresh_; // 'New' words from current iterations, which require some checks
- // before they go into hypothesis_ buffer.
- std::deque<Word>
- hypothesis_; // Words potentially to be commited, stored between
- // iterations (obtained from fresh_ buffer).
- std::deque<Word> committed_; // A history of already commited words.
-
- float lastCommittedTime_ = 0.0f;
-};
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
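For reference, the mechanism deleted here was an agreement-based stabilizer: a word was only committed once two consecutive transcription passes produced it at the head of the pending sequence. A stripped-down illustration of that stable-prefix commit, using bare strings instead of Word:

```cpp
#include <deque>
#include <string>
#include <vector>

// A word is committed once it appears at the front of both the previous
// hypothesis and the fresh transcription, i.e. the stable common prefix.
std::vector<std::string>
commitStablePrefix(std::deque<std::string> &hypothesis,
                   std::deque<std::string> &fresh) {
  std::vector<std::string> committed;
  while (!fresh.empty() && !hypothesis.empty() &&
         fresh.front() == hypothesis.front()) {
    committed.push_back(fresh.front());
    fresh.pop_front();
    hypothesis.pop_front();
  }
  // Whatever was not confirmed becomes the hypothesis for the next pass.
  hypothesis.assign(fresh.begin(), fresh.end());
  fresh.clear();
  return committed;
}
```

The cost of this scheme is latency (every word must survive two passes) plus the overlap and suffix bookkeeping above; the replacement in OnlineASR.cpp commits at sentence boundaries instead.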
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index ded2183201..188c77d80d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -1,35 +1,43 @@
+#include "OnlineASR.h"
+
#include
#include
-#include
-#include
+#include
#include "Constants.h"
-#include "OnlineASR.h"
#include "Params.h"
#include "Utils.h"
namespace rnexecutorch::models::speech_to_text::whisper::stream {
-namespace {
-std::vector<Word> move_to_vector(std::deque<Word> &container) {
- return std::vector<Word>(std::make_move_iterator(container.begin()),
- std::make_move_iterator(container.end()));
+OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
+ // Reserve an expected amount of memory for audio buffer.
+ audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate);
}
-} // namespace
-OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
- // Reserve a minimal expected amount of memory for audio buffer.
- audioBuffer_.reserve(static_cast<size_t>(2 * params::kStreamChunkThreshold *
- constants::kSamplingRate));
+bool OnlineASR::isReady() const {
+ std::scoped_lock lock(streamingMutex_);
+
+ return audioBuffer_.size() >= constants::kMinChunkSamples;
}
void OnlineASR::insertAudioChunk(std::span<float> audio) {
- std::scoped_lock lock(audioBufferMutex_);
+ std::scoped_lock lock(streamingMutex_);
+
audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
-}
-bool OnlineASR::isReady() const {
- return audioBuffer_.size() >= constants::kMinChunkSamples;
+ // Automatic buffer cleanup.
+ //
+ // This prevents the audio buffer from growing indefinitely during continuous
+ // streaming. It is particularly useful when VAD (Voice Activity Detection)
+ // is used and elements are inserted but not processed for a long time.
+ // The condition should not trigger during normal streaming, that is, when
+ // the process() method is called regularly at reasonable intervals.
+ if (audioBuffer_.size() > constants::kMaxSamples) {
+ // Note that results are not actually committed now, but saved for
+ // a later call of process().
+ memory_.toCommit = commitAndClean(memory_.transcript);
+ }
}
ProcessResult OnlineASR::process(const DecodingOptions &options) {
@@ -38,126 +46,213 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
// Copy the audio buffer to avoid keeping the lock during the entire
// transcription process.
{
- std::scoped_lock lock(audioBufferMutex_);
+ std::scoped_lock lock(streamingMutex_);
audioCopy = audioBuffer_;
}
- std::vector<Segment> transcriptions = asr_->transcribe(audioBuffer_, options);
+ // Obtain a transcription for the current audio buffer state.
+ // It's very unlikely that the buffer will exceed Whisper's maximum capacity,
+ // but for absolute safety we additionally clip it.
+ std::span input(
+ audioCopy.begin(),
+ audioCopy.begin() + std::min(constants::kMaxSamples, audioCopy.size()));
- if (transcriptions.empty()) {
- return {.committed = {}, .nonCommitted = {}};
- }
+ std::vector<Segment> transcriptions = asr_->transcribe(input, options);
// Flatten segments into a single word sequence.
+ // This is basically our 'nonCommitted' part for now.
std::vector<Word> words;
- words.reserve(transcriptions.front().words.size());
-
for (auto &segment : transcriptions) {
- words.insert(words.end(), std::make_move_iterator(segment.words.begin()),
- std::make_move_iterator(segment.words.end()));
+ std::move(segment.words.begin(), segment.words.end(),
+ std::back_inserter(words));
}
- hypothesisBuffer_.insert(words, bufferTimeOffset_);
-
- // Apply fix for timestamps.
- if (!hypothesisBuffer_.fresh_.empty()) {
- size_t noNewWords = hypothesisBuffer_.fresh_.size();
- float establishedEnd = hypothesisBuffer_.lastCommittedTime_;
- float newBegin = hypothesisBuffer_.fresh_.front().start;
- const float newEnd = hypothesisBuffer_.fresh_.back().end;
- float shift = 0.F;
- for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) {
- const float originalEnd = hypothesisBuffer_.fresh_[i].end;
-
- if (i < hypothesisBuffer_.hypothesis_.size() &&
- utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content,
- hypothesisBuffer_.hypothesis_[i].content)) {
- hypothesisBuffer_.fresh_[i].start =
- hypothesisBuffer_.hypothesis_[i].start;
- hypothesisBuffer_.fresh_[i].end = hypothesisBuffer_.hypothesis_[i].end;
- shift = hypothesisBuffer_.fresh_[i].end - originalEnd;
-
- establishedEnd = hypothesisBuffer_.hypothesis_[i].end;
- newBegin = hypothesisBuffer_.fresh_[i].end;
- noNewWords--;
- continue;
- }
-
- // In case of a new word, we apply timestamp range scaling
- // based on timestamps established in previous iterations.
- const float freshDuration = newEnd - establishedEnd;
- const float epsilon = std::max(
- 0.F, 0.85F * (freshDuration -
- static_cast(noNewWords /
- params::kStreamWordsPerSecond)));
- float scale =
- (freshDuration - epsilon) / std::max(newEnd - newBegin, 0.2F);
- hypothesisBuffer_.fresh_[i].start =
- shift + (hypothesisBuffer_.fresh_[i].start - newEnd) * scale + newEnd;
- hypothesisBuffer_.fresh_[i].end =
- shift + (hypothesisBuffer_.fresh_[i].end - newEnd) * scale + newEnd;
+ // Acquire the lock for the rest of the method (extensive use of audioBuffer_).
+ std::scoped_lock lock(streamingMutex_);
+
+ // Step 1: examine all previously saved EOS points.
+ // The idea is to remove entries which have changed or no longer exist
+ // due to the model correcting its output.
+ for (size_t i = 0; i < memory_.eos.size(); i++) {
+ const auto &eos = memory_.eos[i];
+ if (eos.position >= words.size() || !utils::isEos(words[eos.position]) ||
+ (eos.position > 0 &&
+ eos.preceeding != words[eos.position - 1].content)) {
+ memory_.eos.erase(memory_.eos.begin() + i, memory_.eos.end());
+ break;
}
}
- auto committed = hypothesisBuffer_.commit();
- auto nonCommitted = hypothesisBuffer_.hypothesis_;
+ // Step 2: check if the newest EOS character from the transcript should be
+ // saved to the memory_.eos vector.
+ auto lastEosIt = std::find_if(words.rbegin(), words.rend(), utils::isEos);
+ if (lastEosIt != words.rend()) {
+ size_t lastEosIndex = std::distance(words.begin(), lastEosIt.base()) - 1;
- // We want to save the most recent end of sentence word
- // to improve the audio cutting mechanism.
- for (const auto &word : committed) {
- if (!word.punctations.empty()) {
- lastSentenceEnd_ = word.end;
+ // Because of step 1, we know that if the last EOS exists in memory_.eos,
+ // then it must be the last entry.
+ if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) {
+ // Register last EOS entry
+ std::string preceeding =
+ lastEosIndex > 0 ? words[lastEosIndex - 1].content : "";
+ memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
}
}
- // Since Whisper does not accept waveforms longer than 30 seconds, we need
- // to cut the audio at some safe point.
- {
- std::scoped_lock lock(audioBufferMutex_);
-
- const float audioDuration =
- static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
- if (audioDuration > params::kStreamChunkThreshold) {
- // Leave some portion of audio in, to improve model behavior
- // in future iterations.
- const float erasePoint =
- hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_
- ? audioDuration
- : std::min(lastSentenceEnd_, params::kStreamChunkThreshold);
- const float minEraseDuration =
- audioDuration - params::kStreamAudioBufferMaxReserve;
- const float maxEraseDuration =
- audioDuration - params::kStreamAudioBufferMinReserve;
- const float eraseDuration = std::clamp(
- erasePoint - bufferTimeOffset_, minEraseDuration, maxEraseDuration);
- const size_t nSamplesToErase =
- static_cast<size_t>(eraseDuration * constants::kSamplingRate);
+ std::vector<Word> committed;
- audioBuffer_.erase(audioBuffer_.begin(),
- audioBuffer_.begin() + nSamplesToErase);
- bufferTimeOffset_ += eraseDuration;
- }
+ // Step 3: collect all the words which could possibly get committed
+ // between iterations.
+ if (!memory_.toCommit.empty()) {
+ committed.insert(committed.end(),
+ std::make_move_iterator(memory_.toCommit.begin()),
+ std::make_move_iterator(memory_.toCommit.end()));
+ memory_.toCommit.clear();
}
- return {.committed = move_to_vector(committed),
- .nonCommitted = move_to_vector(nonCommitted)};
+ // Step 4: clear the buffer if it is getting too large.
+ // The idea is to use the saved EOS entries and try to cut the buffer
+ // in a 'good' spot: one that removes a significant audio chunk, yet
+ // won't affect the most recent, unfinished speech samples.
+ size_t bufferSize = audioBuffer_.size();
+ if (bufferSize > static_cast<size_t>(params::kStreamSafeBufferDuration *
+ constants::kSamplingRate)) {
+ auto newCommitted = commitAndClean(words);
+
+ committed.insert(committed.end(),
+ std::make_move_iterator(newCommitted.begin()),
+ std::make_move_iterator(newCommitted.end()));
+ }
+
+ // Save the uncommitted part to the streamer's memory,
+ // since it might be needed when committing inside insertAudioChunk().
+ memory_.transcript = words;
+
+ // Note that the uncommitted part, represented by the recent transcription
+ // (words), has already been shrunk if something was committed during the
+ // cleanup phase.
+ return {.committed = std::move(committed), .nonCommitted = std::move(words)};
}
-std::vector<Word> OnlineASR::finish() {
- // We always push the last remaining hypothesis, even if it's not
- // confirmed in second iteration, to avoid ending up with broken sentences.
- std::deque<Word> remaining = hypothesisBuffer_.hypothesis_;
+std::vector<Word> OnlineASR::finish(const DecodingOptions &options) {
+ ProcessResult result = process(options);
+
+ // Last-tick committed delta + whatever never made it past the commit
+ // threshold.
+ std::vector<Word> residual = std::move(result.committed);
+ residual.insert(residual.end(),
+ std::make_move_iterator(result.nonCommitted.begin()),
+ std::make_move_iterator(result.nonCommitted.end()));
+
+ reset();
- return move_to_vector(remaining);
+ return residual;
}
void OnlineASR::reset() {
- std::scoped_lock lock(audioBufferMutex_);
-
- hypothesisBuffer_.reset();
- bufferTimeOffset_ = 0.f;
+ std::scoped_lock lock(streamingMutex_);
audioBuffer_.clear();
+
+ // Reset memory.
+ memory_.transcript.clear();
+ memory_.eos.clear();
+ memory_.toCommit.clear();
+}
+
+std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
+ const size_t bufferSize = audioBuffer_.size();
+ const float midBufferThreshold = params::kStreamMaxDuration / 2.0F;
+
+ std::vector<Word> committed;
+
+ // If we don't have any EOS entries, then we most likely have not
+ // recorded any speech. In this case we can safely cut the maximum amount of
+ // audio data.
+ if (memory_.eos.empty()) {
+ size_t cut = bufferSize -
+ static_cast<size_t>(params::kStreamSafetyThreshold *
+ constants::kSamplingRate);
+
+ audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+ }
+
+ // If we have exactly one (most recent) EOS entry in memory_.eos, then
+ // we need to be more careful.
+ // Normally we want to keep at least one sentence in, but if the sentence
+ // covers a significant amount of buffer, we have no choice.
+ else if (memory_.eos.size() == 1) {
+ const float eosTimestamp = memory_.eos[0].tmstpend;
+
+ const float upperHalfDuration =
+ std::max(0.0F, eosTimestamp - midBufferThreshold);
+ const float wordsPerSecond =
+ upperHalfDuration > 0.1F
+ ? static_cast<float>(transcript.size()) / upperHalfDuration
+ : 0.0F;
+
+ // The EOS sits early enough that cutting up to the safety margin won't
+ // touch the ongoing (post-EOS) speech.
+ const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration -
+ params::kStreamSafetyThreshold;
+
+ if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
+ // EOS lies past the midpoint, but a low word density implies the spoken
+ // audio is concentrated in the upper half. Drop the lower half and
+ // shift the EOS accordingly.
+ audioBuffer_.erase(audioBuffer_.begin(),
+ audioBuffer_.begin() +
+ static_cast<size_t>(midBufferThreshold *
+ constants::kSamplingRate));
+ memory_.eos[0].tmstpend -= midBufferThreshold;
+ } else {
+ // Cut everything up to and including the sentence, either by the
+ // safety margin (when the EOS is early) or, more aggressively, right
+ // at the EOS boundary, and commit its words.
+ const size_t cut =
+ eosSafe
+ ? bufferSize -
+ static_cast<size_t>(params::kStreamSafetyThreshold *
+ constants::kSamplingRate)
+ : static_cast<size_t>(eosTimestamp * constants::kSamplingRate);
+
+ audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+ committed.insert(committed.end(),
+ std::make_move_iterator(transcript.begin()),
+ std::make_move_iterator(transcript.end()));
+
+ transcript.clear();
+ memory_.eos.clear();
+ }
+ }
+
+ // In case of 2 or more sentences, we generally want to keep the last one
+ // intact. This provides a bit of stability to the algorithm.
+ else {
+ const auto &secondToLastEntry = memory_.eos[memory_.eos.size() - 2];
+
+ const size_t cut = static_cast<size_t>(secondToLastEntry.tmstpend *
+ constants::kSamplingRate);
+ const size_t lastCommittedPos = secondToLastEntry.position;
+
+ audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+ // Move all words up to the last committed position (inclusive) to the
+ // committed buffer.
+ committed.insert(
+ committed.end(), std::make_move_iterator(transcript.begin()),
+ std::make_move_iterator(transcript.begin() + lastCommittedPos + 1));
+ transcript.erase(transcript.begin(),
+ transcript.begin() + lastCommittedPos + 1);
+
+ // Retain only the most recent EOS entry, shifting both its timestamp
+ // and its position to match the new (truncated) transcript origin.
+ memory_.eos.erase(memory_.eos.begin(), memory_.eos.end() - 1);
+ memory_.eos[0].tmstpend -= secondToLastEntry.tmstpend;
+ memory_.eos[0].position -= lastCommittedPos + 1;
+ }
+
+ return committed;
}
} // namespace rnexecutorch::models::speech_to_text::whisper::stream
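The new strategy drops the two-pass agreement test entirely: words are committed once their sentence has ended, and the audio buffer is cut at the same boundary. A condensed sketch of the core idea; the types are simplified, and the single-EOS word-density heuristic and the EOS re-basing of the real commitAndClean() are elided:

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

struct Word { std::string content; float start; float end; };
struct EOSEntry { size_t position; std::string preceding; float endTs; };

constexpr size_t kSamplingRate = 16000;
constexpr float kSafetySeconds = 3.0f; // always keep this much recent audio

// Simplified commit-and-clean: cut the audio buffer at a "good" spot and
// commit every word belonging to a sentence that has already ended.
std::vector<Word> commitAndClean(std::vector<float> &audio,
                                 std::vector<Word> &transcript,
                                 std::vector<EOSEntry> &eos) {
  std::vector<Word> committed;
  if (eos.empty()) {
    // No sentence boundary seen; assume there is no speech worth keeping
    // and blind-cut everything except the trailing safety margin.
    const size_t keep = static_cast<size_t>(kSafetySeconds * kSamplingRate);
    if (audio.size() > keep) {
      audio.erase(audio.begin(), audio.end() - keep);
    }
  } else {
    // Cut at a completed sentence. (This sketch always uses the newest EOS;
    // the real code keeps the most recent sentence intact when there are
    // two or more, and applies a word-density heuristic for a single EOS.)
    const EOSEntry anchor = eos.back();
    const size_t cut = std::min(
        static_cast<size_t>(anchor.endTs * kSamplingRate), audio.size());
    audio.erase(audio.begin(), audio.begin() + cut);

    const size_t nCommit = std::min(anchor.position + 1, transcript.size());
    committed.assign(transcript.begin(), transcript.begin() + nCommit);
    transcript.erase(transcript.begin(), transcript.begin() + nCommit);
    eos.clear(); // remaining entries would need re-basing after the cut
  }
  return committed;
}
```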
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
index df6d469e39..7547d16bd5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
@@ -1,13 +1,13 @@
#pragma once
+#include
+#include
+#include
+
#include "../common/schema/OnlineASR.h"
#include "../common/types/ProcessResult.h"
-#include "../common/types/Segment.h"
#include "../common/types/Word.h"
#include "ASR.h"
-#include "HypothesisBuffer.h"
-
-#include
namespace rnexecutorch::models::speech_to_text::whisper::stream {
@@ -21,60 +21,65 @@ class OnlineASR : public schema::OnlineASR {
OnlineASR(const ASR *asr);
/**
- * Appends new audio samples to the internal processing buffer.
- *
- * @param audio A span of PCM float samples (expected 16kHz).
+ * Checks if the buffer contains enough audio for the next processing step.
+ * @return True if ready, false otherwise.
*/
- void insertAudioChunk(std::span<float> audio) override;
+ bool isReady() const override;
/**
- * Determines whether the model is ready to process the next iteration.
- *
- * @return True if audioBuffer has enough samples, False otherwise
+ * Appends audio samples to the internal buffer.
+ * @param audio Span containing the audio data.
*/
- bool isReady() const override;
+ void insertAudioChunk(std::span<float> audio) override;
/**
- * Processes the current audio buffer and returns new transcription results.
- * Stability is managed by an internal HypothesisBuffer to ensure that
- * only confirmed (stable) text is returned as "committed".
- *
- * @param options Decoding configuration (language, etc.).
- * @return A ProcessResult containing newly committed and uncommitted
- * words.
+ * Processes the current buffered audio and returns transcription results.
+ * @param options Decoding options for the transcription.
+ * @return Transcription result containing committed and non-committed words.
*/
ProcessResult process(const DecodingOptions &options) override;
/**
- * Finalizes the current streaming session.
- * Flushes any remaining words from the hypothesis buffer.
- *
- * @return A vector of remaining transcribed words.
+ * Finalizes the current stream and returns all words.
+ * @return Vector of detected words.
*/
- std::vector<Word> finish() override;
+ std::vector<Word> finish(const DecodingOptions &options) override;
/**
- * Reset the streaming state by resetting the buffers
+ * Resets the internal state and clears buffers.
*/
void reset() override;
private:
+ // Cleans up the buffer and returns committed words based on the given
+ // transcript.
+ std::vector<Word> commitAndClean(std::vector<Word> &transcript);
+
// ASR module connection for transcribing the audio
const ASR *asr_;
- // Helper buffers - audio buffer
- // Stores the increasing amounts of streamed audio.
- // Cleared from time to time after reaching a threshold size.
+ // Audio buffer (input) - accumulates incoming audio samples.
std::vector<float> audioBuffer_ = {};
- mutable std::mutex audioBufferMutex_;
- float bufferTimeOffset_ = 0.F; // Audio buffer offset
+ mutable std::mutex streamingMutex_; // Covers both buffer & memory
- // Helper buffers - hypothesis buffer
- // Manages the whisper streaming hypothesis mechanism.
- HypothesisBuffer hypothesisBuffer_;
+ // Streaming memory.
+ // In general, it helps to track the continuous streaming state and improve
+ // the buffer-handling algorithms.
+ struct Memory {
+ // State management helper.
+ struct EOSEntry {
+ size_t position; // An absolute position (index) in the transcription
+ // (word sequence).
+ std::string preceeding; // The word preceding this EOS in the transcription.
+ float tmstpend; // Ending timestamp of the sentence.
+ };
- // State members to keep track of specyfic aspects of buffer state
- float lastSentenceEnd_ = 0.F;
+ std::vector<Word>
+ transcript; // The most recent transcription result (uncommitted only!).
+ std::vector<EOSEntry>
+ eos; // End of sentence points from the most recent transcription.
+ std::vector<Word> toCommit; // Words to be committed in the next iteration
+ // (next process() call).
+ } memory_;
};
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
index 5eb74c06cc..847a22b1e0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
@@ -1,6 +1,9 @@
#pragma once
+#include "Constants.h"
+
#include
+#include
/**
* Hyperparameters
@@ -11,90 +14,50 @@
namespace rnexecutorch::models::speech_to_text::whisper::params {
/**
- * Determines the range of buffer left when skipping an audio chunk
- * of size lower than maximum allowed chunk size.
- *
- * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
- * then instead of moving to the last returned timestamp, we jump across the
- * entire 30 seconds chunk. This resolves the issue of multiple redundant
- * segments being produced by the transcription algorithm.
+ * Maximum duration of audio that the streaming buffer keeps before forcing
+ * a cleanup. Aligned with Whisper's maximum supported input length.
*/
-constexpr static int32_t kChunkBreakBuffer = 2; // [s]
+constexpr inline float kStreamMaxDuration =
+ static_cast<float>(constants::kChunkSize);
/**
- * Determines the maximum timestamp difference available for a word to be
- * considered as fresh in streaming algorithm.
+ * The minimum amount of recent audio always kept in the buffer when a blind
+ * cut is performed. Acts as the lower bound on what survives a cleanup.
*/
-constexpr static float kStreamFreshThreshold = 3.F; // [s], originally 0.5
+constexpr inline float kStreamSafetyThreshold = 3.F; // [s]
/**
- * The size of the most recent committed suffix searched in
- * fresh words string.
- *
- * For example, if the committed buffer contains ["I", "did" "a" "very" "nasty"
- * "thing."], and kStreamCommitedSuffixSearchSize = 3, then we search for
- * ["very" "nasty" "thing."] suffix.
+ * Forced-cleanup threshold. Once the buffer grows past this duration we run
+ * the EOS-anchored cleanup routine.
*/
-constexpr static size_t kStreamCommitedSuffixSearchSize = 5;
+constexpr inline float kStreamSafeBufferDuration =
+ kStreamMaxDuration - kStreamSafetyThreshold; // [s]
/**
- * Determines the maximum expected size of overlapping fragments between
- * fresh words buffer and commited words buffer in streaming mode.
- *
- * It is a limit of maximum amount of erased repeated words from fresh buffer.
- * The bigger it gets, the less probable it is to commit the same phrase twice.
+ * An estimate of the number of words spoken per second.
+ * Used for estimating transcription progress and buffer management heuristics.
*/
-constexpr static size_t kStreamMaxOverlapSize =
- 12; // Number of overlaping words
+constexpr inline float kWordsPerSecondEstimation = 2.25F;
/**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the first, more strict threshold, used when searching for recently
- * committed entries.
+ * Upper bound for words per second estimate in fast speech.
*/
-constexpr static float kStreamMaxOverlapTimestampDiff1 = 6.F; // [s]
+constexpr inline float kWordsPerSecondHigh = 4.F;
/**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the second, more liberal threshold, used in overlap correction
- * algorithm.
+ * Lower bound for words per second estimate in slow speech.
*/
-constexpr static float kStreamMaxOverlapTimestampDiff2 = 15.F; // [s]
+constexpr inline float kWordsPerSecondLow = 1.5F;
/**
- * Number of words per 1 allowed mistake (error correction).
+ * Determines the range of buffer left when skipping an audio chunk
+ * of size smaller than the maximum allowed chunk size.
*
- * For example, if kStreamWordsPerErrorRate = 4, then we allow maximum 1 mistake
- * in a 4 word string.
- */
-constexpr static size_t kStreamWordsPerErrorRate = 5;
-
-/**
- * A threshold which exceeded causes the main streaming audio buffer to be
- * cleared.
- */
-constexpr static float kStreamChunkThreshold = 20.F; // [s]
-
-/**
- * Decides how much of recent audio waveform is always kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMinReserve = 2.F; // [s]
-
-/**
- * Decides how much of recent audio waveform can be kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMaxReserve = 6.F; // [s]
-
-/**
- * An estimate of number of words per second produced in a standard
- * human conversation speech.
+ * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
+ * then instead of moving to the last returned timestamp, we jump across the
+ * entire 30-second chunk. This resolves the issue of multiple redundant
+ * segments being produced by the transcription algorithm.
*/
-constexpr static float kStreamWordsPerSecond = 2.5F;
+constexpr inline int32_t kChunkBreakBuffer = 2; // [s]
-} // namespace rnexecutorch::models::speech_to_text::whisper::params
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::params
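Restating how the renamed parameters relate, as a standalone sanity check (values copied from Constants.h and Params.h above):

```cpp
// Standalone restatement of the relationships in Constants.h / Params.h.
constexpr float kChunkSize = 29.0f;              // Whisper input limit [s]
constexpr float kStreamMaxDuration = kChunkSize; // buffer hard cap [s]
constexpr float kStreamSafetyThreshold = 3.0f;   // recent audio kept [s]
constexpr float kStreamSafeBufferDuration =
    kStreamMaxDuration - kStreamSafetyThreshold; // cleanup trigger [s]

static_assert(kStreamSafeBufferDuration == 26.0f,
              "cleanup is forced 3 s before the 29 s model limit");
static_assert(kStreamSafetyThreshold < kStreamSafeBufferDuration,
              "a blind cut must still leave room to keep streaming");
```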
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
index 2e4e3b5076..48c84a84b7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
@@ -1,6 +1,7 @@
#pragma once
#include "../common/types/Word.h"
+#include "Constants.h"
#include
#include
#include
@@ -8,70 +9,14 @@
namespace rnexecutorch::models::speech_to_text::whisper::utils {
-// Compares two strings without case-sensitivity.
-inline bool equalsIgnoreCase(const std::string &a, const std::string &b) {
- if (a.size() != b.size()) {
- return false;
- }
- return std::equal(a.begin(), a.end(), b.begin(), [](char c1, char c2) {
- return std::tolower(static_cast<unsigned char>(c1)) ==
- std::tolower(static_cast<unsigned char>(c2));
- });
-}
-
/**
- * Finds the largest (in number of words) overlaping fragment between word
- * vectors A (suffix) and B (prefix).
+ * Checks if the given word represents an End-of-Sentence (EOS) punctuation.
*
- * An overlaping fragment is any fragment C, which can be simultaneously a
- * suffix of A and a prefix of B. Example: A = 'Jane likes food and playing
- * games', B = 'playing games and sleeping', the overlap fragment C = 'playing
- * games'.
- *
- * @param suffixVec An input vector, where only suffixes can overlap.
- * Typically the 'commited' buffer in streaming algorithm.
- * @param preffixVec An input vector, where only prefixes can overlap.
- * Typically the 'fresh' buffer in streaming algorithm.
- * @param maxCheckRange The maximum size of overlapping fragment. Determines the
- * range of search.
- * @param maxTimestampDiff The maximum allowed timestamp difference between
- * overlaping fragments. If exceeded, the fragment are not considered as
- * overlaping.
- * @return The size of the largest found overlaping fragment.
+ * @param word The word to check.
*/
-template <typename Container>
-inline size_t findLargestOverlapingFragment(const Container &suffixVec,
- const Container &prefixVec,
- size_t maxCheckRange = 10,
- float maxTimestampDiff = 100.f) {
- size_t range = std::min({suffixVec.size(), prefixVec.size(), maxCheckRange});
-
- if (range == 0) {
- return 0;
- }
-
- // i starts at the index where the suffix of length 'range' begins.
- for (size_t i = suffixVec.size() - range; i < suffixVec.size(); ++i) {
- // We look for overlap candidates by matching the first word of prefixVec.
- if (equalsIgnoreCase(suffixVec[i].content, prefixVec[0].content)) {
- size_t calculatedSize = suffixVec.size() - i;
-
- bool isEqual =
- std::equal(suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(),
- [maxTimestampDiff](const Word &sWord, const Word &pWord) {
- return equalsIgnoreCase(sWord.content, pWord.content) &&
- std::max(std::fabs(sWord.start - pWord.start),
- std::fabs(sWord.end - pWord.end)) <=
- maxTimestampDiff;
- });
-
- if (isEqual) {
- return calculatedSize;
- }
- }
- }
-
- return 0;
+constexpr inline bool isEos(const Word &word) {
+ return word.content.size() == 1 &&
+ constants::kEosPunctations.contains(word.content[0]);
}
} // namespace rnexecutorch::models::speech_to_text::whisper::utils
\ No newline at end of file
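
For reference, here is the new helper restated in TypeScript terms. The diff does not show the contents of constants::kEosPunctations, so the character set below ('.', '!', '?') is an assumption.

// TypeScript rendition of the isEos helper (punctuation set assumed).
interface Word {
  content: string;
  start: number; // [s]
  end: number; // [s]
}

const EOS_PUNCTUATION = new Set(['.', '!', '?']); // assumed contents

function isEos(word: Word): boolean {
  return word.content.length === 1 && EOS_PUNCTUATION.has(word.content[0]);
}
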
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 6fb20f9ca3..c423594213 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -773,32 +773,29 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
} as const;
// S2T
-const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_MODEL = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`;
+const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
+const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
-const WHISPER_TINY_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_quantized_xnnpack.pte`;
+const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
+const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
-const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_MODEL = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`;
+const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
+const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
-const WHISPER_BASE_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-base-quantized.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-base-quantized.en/${VERSION_TAG}/xnnpack/whisper_base_en_quantized_xnnpack.pte`;
+const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
+const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
-const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
+const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
+const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
-const WHISPER_SMALL_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-small-quantized.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-small-quantized.en/${VERSION_TAG}/xnnpack/whisper_small_en_quantized_xnnpack.pte`;
-
-const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
-
-const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_MODEL = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`;
-
-const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`;
+const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
+const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
/**
* @category Models - Speech To Text
@@ -806,18 +803,15 @@ const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/
export const WHISPER_TINY_EN = {
modelName: 'whisper-tiny-en',
isMultilingual: false,
- modelSource: WHISPER_TINY_EN_MODEL,
+ modelSource: WHISPER_TINY_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
} as const;
-/**
- * @category Models - Speech To Text
- */
-export const WHISPER_TINY_EN_QUANTIZED = {
- modelName: 'whisper-tiny-en-quantized',
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_TINY_EN_COREML = {
+ modelName: 'whisper-tiny-en',
isMultilingual: false,
- modelSource: WHISPER_TINY_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_TINY_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_TINY_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
} as const;
/**
@@ -826,18 +820,18 @@ export const WHISPER_TINY_EN_QUANTIZED = {
export const WHISPER_BASE_EN = {
modelName: 'whisper-base-en',
isMultilingual: false,
- modelSource: WHISPER_BASE_EN_MODEL,
+ modelSource: WHISPER_BASE_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
} as const;
/**
* @category Models - Speech To Text
*/
-export const WHISPER_BASE_EN_QUANTIZED = {
- modelName: 'whisper-base-en-quantized',
+export const WHISPER_BASE_EN_COREML = {
+ modelName: 'whisper-base-en',
isMultilingual: false,
- modelSource: WHISPER_BASE_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_BASE_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_BASE_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
} as const;
/**
@@ -846,18 +840,18 @@ export const WHISPER_BASE_EN_QUANTIZED = {
export const WHISPER_SMALL_EN = {
modelName: 'whisper-small-en',
isMultilingual: false,
- modelSource: WHISPER_SMALL_EN_MODEL,
+ modelSource: WHISPER_SMALL_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
} as const;
/**
* @category Models - Speech To Text
*/
-export const WHISPER_SMALL_EN_QUANTIZED = {
- modelName: 'whisper-small-en-quantized',
+export const WHISPER_SMALL_EN_COREML = {
+ modelName: 'whisper-small-en',
isMultilingual: false,
- modelSource: WHISPER_SMALL_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_SMALL_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_SMALL_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
} as const;
/**
@@ -866,7 +860,17 @@ export const WHISPER_SMALL_EN_QUANTIZED = {
export const WHISPER_TINY = {
modelName: 'whisper-tiny',
isMultilingual: true,
- modelSource: WHISPER_TINY_MODEL,
+ modelSource: WHISPER_TINY_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_TINY_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_TINY_COREML = {
+ modelName: 'whisper-tiny',
+ isMultilingual: true,
+ modelSource: WHISPER_TINY_MODEL_COREML,
tokenizerSource: WHISPER_TINY_TOKENIZER,
} as const;
@@ -876,7 +880,17 @@ export const WHISPER_TINY = {
export const WHISPER_BASE = {
modelName: 'whisper-base',
isMultilingual: true,
- modelSource: WHISPER_BASE_MODEL,
+ modelSource: WHISPER_BASE_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_BASE_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_BASE_COREML = {
+ modelName: 'whisper-base',
+ isMultilingual: true,
+ modelSource: WHISPER_BASE_MODEL_COREML,
tokenizerSource: WHISPER_BASE_TOKENIZER,
} as const;
@@ -886,7 +900,17 @@ export const WHISPER_BASE = {
export const WHISPER_SMALL = {
modelName: 'whisper-small',
isMultilingual: true,
- modelSource: WHISPER_SMALL_MODEL,
+ modelSource: WHISPER_SMALL_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_SMALL_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_SMALL_COREML = {
+ modelName: 'whisper-small',
+ isMultilingual: true,
+ modelSource: WHISPER_SMALL_MODEL_COREML,
tokenizerSource: WHISPER_SMALL_TOKENIZER,
} as const;
@@ -1314,14 +1338,17 @@ export const MODEL_REGISTRY = {
STYLE_TRANSFER_UDNIE,
STYLE_TRANSFER_UDNIE_QUANTIZED,
WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
+ WHISPER_TINY_EN_COREML,
WHISPER_BASE_EN,
- WHISPER_BASE_EN_QUANTIZED,
+ WHISPER_BASE_EN_COREML,
WHISPER_SMALL_EN,
- WHISPER_SMALL_EN_QUANTIZED,
+ WHISPER_SMALL_EN_COREML,
WHISPER_TINY,
+ WHISPER_TINY_COREML,
WHISPER_BASE,
+ WHISPER_BASE_COREML,
WHISPER_SMALL,
+ WHISPER_SMALL_COREML,
DEEPLAB_V3_RESNET50,
DEEPLAB_V3_RESNET101,
DEEPLAB_V3_MOBILENET_V3_LARGE,
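
With the quantized variants replaced by Core ML exports alongside XNNPACK, an app can pick a backend per platform. A hedged sketch using the exports defined above; whether Core ML is preferable on any given iOS device is not established by this diff.

import { Platform } from 'react-native';
import {
  useSpeechToText,
  WHISPER_TINY_EN,
  WHISPER_TINY_EN_COREML,
} from 'react-native-executorch';

function useWhisper() {
  // Core ML export on iOS, XNNPACK export everywhere else.
  const model = Platform.OS === 'ios' ? WHISPER_TINY_EN_COREML : WHISPER_TINY_EN;
  return useSpeechToText({ model });
}
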
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 9f428c98b2..5ac929a67f 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -5,6 +5,7 @@ import {
SpeechToTextType,
SpeechToTextProps,
TranscriptionResult,
+ StreamingOptions,
} from '../../types/stt';
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
@@ -104,7 +105,7 @@ export const useSpeechToText = ({
);
const stream = useCallback(
- async function* (options: DecodingOptions = {}): AsyncGenerator<
+ async function* (options: StreamingOptions = {}): AsyncGenerator<
{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
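
A usage sketch of the retyped generator, assumed to run inside a component that already called the hook; the TranscriptionResult fields are not shown in this diff, so the results are logged as-is.

const speechToText = useSpeechToText({ model: WHISPER_TINY_EN });

async function transcribeLive() {
  // StreamingOptions accepts `timeout` in addition to the DecodingOptions fields.
  for await (const { committed, nonCommitted } of speechToText.stream({
    timeout: 150, // [ms] wait between model inferences
  })) {
    console.log('committed:', committed);
    console.log('non-committed:', nonCommitted);
  }
}
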
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
index a1bf6231ad..36464ee964 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
@@ -2,6 +2,7 @@ import {
DecodingOptions,
SpeechToTextModelConfig,
SpeechToTextModelName,
+ StreamingOptions,
TranscriptionResult,
} from '../../types/stt';
import { ResourceFetcher } from '../../utils/ResourceFetcher';
@@ -177,7 +178,7 @@ export class SpeechToTextModule {
* @yields An object containing `committed` and `nonCommitted` transcription results.
* @returns An async generator yielding transcription updates.
*/
- public async *stream(options: DecodingOptions = {}): AsyncGenerator<{
+ public async *stream(options: StreamingOptions = {}): AsyncGenerator<{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
}> {
@@ -185,6 +186,7 @@ export class SpeechToTextModule {
const verbose = !!options.verbose;
const language = options.language || '';
+ const timeout = options.timeout || 100;
const queue: {
committed: TranscriptionResult;
@@ -219,7 +221,8 @@ export class SpeechToTextModule {
wake();
},
language,
- verbose
+ verbose,
+ timeout
);
finished = true;
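
The timeout is forwarded to the native stream call above; the sketch below is a simplified guess at the pacing loop it controls. The actual scheduling happens on the native side, so both function names here are hypothetical.

const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

// Hypothetical pacing loop: run one inference, then back off for `timeout`
// milliseconds so the recorder can accumulate more audio.
async function paceInferences(
  runInference: () => Promise<void>,
  timeout: number,
  isDone: () => boolean
): Promise<void> {
  while (!isDone()) {
    await runInference();
    await sleep(timeout);
  }
}
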
diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts
index 0a6ed11f70..20f1013ef0 100644
--- a/packages/react-native-executorch/src/types/stt.ts
+++ b/packages/react-native-executorch/src/types/stt.ts
@@ -94,7 +94,7 @@ export interface SpeechToTextType {
* @returns Asynchronous generator that returns `committed` and `nonCommitted` transcription.
* Both `committed` and `nonCommitted` are of type `TranscriptionResult`
*/
- stream(options?: DecodingOptions | undefined): AsyncGenerator<
+ stream(options?: StreamingOptions | undefined): AsyncGenerator<
{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
@@ -208,6 +208,15 @@ export interface DecodingOptions {
verbose?: boolean;
}
+/**
+ * Configuration options for the speech-to-text streaming process.
+ * @category Types
+ * @property {number} [timeout] - Specifies (in milliseconds) how long the streamer waits between model inferences.
+ */
+export interface StreamingOptions extends DecodingOptions {
+ timeout?: number;
+}
+
/**
* Structure that represents a single token with timestamp information.
* @category Types
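
Since StreamingOptions extends DecodingOptions, all decoding fields remain valid alongside the new one. A minimal example:

const options: StreamingOptions = {
  language: 'en', // inherited from DecodingOptions
  verbose: false, // inherited from DecodingOptions
  timeout: 200, // [ms] streaming-only field; defaults to 100 when omitted
};
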
diff --git a/yarn.lock b/yarn.lock
index 256469db22..9584660eb7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -15249,6 +15249,24 @@ __metadata:
languageName: node
linkType: hard
+"react-native-audio-api@npm:0.12.2":
+ version: 0.12.2
+ resolution: "react-native-audio-api@npm:0.12.2"
+ dependencies:
+ semver: "npm:^7.7.3"
+ peerDependencies:
+ react: "*"
+ react-native: "*"
+ react-native-worklets: ">= 0.6.0"
+ peerDependenciesMeta:
+ react-native-worklets:
+ optional: true
+ bin:
+ setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+ checksum: 10/ed495058382188c8beb51ce89f2ef14d846dc0c0a07c65a7b4c71aa106fb7ea14aa8660b05fb33941c038d1a7ab2ba4ab3eb039fe481841938c45396903c6060
+ languageName: node
+ linkType: hard
+
"react-native-builder-bob@npm:^0.40.12":
version: 0.40.18
resolution: "react-native-builder-bob@npm:0.40.18"
@@ -16627,7 +16645,7 @@ __metadata:
metro-config: "npm:^0.83.0"
react: "npm:19.2.5"
react-native: "npm:0.83.4"
- react-native-audio-api: "npm:0.12.0"
+ react-native-audio-api: "npm:0.12.2"
react-native-device-info: "npm:^15.0.2"
react-native-executorch: "workspace:*"
react-native-executorch-expo-resource-fetcher: "workspace:*"