diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx
index 72358ae72c..b67b3fa7ce 100644
--- a/apps/llm/app/index.tsx
+++ b/apps/llm/app/index.tsx
@@ -29,12 +29,6 @@ export default function Home() {
>
LLM Structured Output
- router.navigate('voice_chat/')}
- >
- Voice Chat
-
router.navigate('multimodal_llm/')}
diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx
deleted file mode 100644
index 23ab70bff4..0000000000
--- a/apps/llm/app/voice_chat/index.tsx
+++ /dev/null
@@ -1,311 +0,0 @@
-import { useContext, useEffect, useState } from 'react';
-import {
- Keyboard,
- KeyboardAvoidingView,
- Platform,
- StyleSheet,
- Text,
- TouchableOpacity,
- TouchableWithoutFeedback,
- View,
-} from 'react-native';
-import SWMIcon from '../../assets/icons/swm_icon.svg';
-import Spinner from '../../components/Spinner';
-import ErrorBanner from '../../components/ErrorBanner';
-import {
- useSpeechToText,
- useLLM,
- QWEN3_0_6B_QUANTIZED,
- QWEN3_1_7B_QUANTIZED,
- LLAMA3_2_1B_SPINQUANT,
- WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
- WHISPER_BASE_EN,
- WHISPER_SMALL_EN,
- LLMProps,
- SpeechToTextProps,
-} from 'react-native-executorch';
-import { ModelPicker, ModelOption } from '../../components/ModelPicker';
-import PauseIcon from '../../assets/icons/pause_icon.svg';
-import MicIcon from '../../assets/icons/mic_icon.svg';
-import StopIcon from '../../assets/icons/stop_icon.svg';
-import ColorPalette from '../../colors';
-import Messages from '../../components/Messages';
-import { AudioManager, AudioRecorder } from 'react-native-audio-api';
-import DeviceInfo from 'react-native-device-info';
-import { useIsFocused } from '@react-navigation/native';
-import { useSafeAreaInsets } from 'react-native-safe-area-context';
-import { GeneratingContext } from '../../context';
-
-type LLMModelSources = LLMProps['model'];
-type STTModelSources = SpeechToTextProps['model'];
-
-const LLM_MODELS: ModelOption[] = [
- { label: 'Qwen3 0.6B', value: QWEN3_0_6B_QUANTIZED },
- { label: 'Qwen3 1.7B', value: QWEN3_1_7B_QUANTIZED },
- { label: 'Llama 1B', value: LLAMA3_2_1B_SPINQUANT },
-];
-
-const STT_MODELS: ModelOption[] = [
- { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
- { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
- { label: 'Whisper Base', value: WHISPER_BASE_EN },
- { label: 'Whisper Small', value: WHISPER_SMALL_EN },
-];
-
-export default function VoiceChatScreenWrapper() {
- const isFocused = useIsFocused();
-
- return isFocused ? <VoiceChatScreen /> : null;
-}
-
-function VoiceChatScreen() {
- const { bottom } = useSafeAreaInsets();
- const [isRecording, setIsRecording] = useState(false);
- const [liveTranscription, setLiveTranscription] = useState('');
- const [selectedLLM, setSelectedLLM] =
- useState<LLMModelSources>(QWEN3_0_6B_QUANTIZED);
- const [selectedSTT, setSelectedSTT] =
- useState<STTModelSources>(WHISPER_TINY_EN);
- const [error, setError] = useState<string | null>(null);
-
- const [recorder] = useState(() => new AudioRecorder());
-
- const { setGlobalGenerating } = useContext(GeneratingContext);
-
- const llm = useLLM({ model: selectedLLM });
- const speechToText = useSpeechToText({
- model: selectedSTT,
- });
-
- useEffect(() => {
- setGlobalGenerating(llm.isGenerating || speechToText.isGenerating);
- }, [llm.isGenerating, speechToText.isGenerating, setGlobalGenerating]);
-
- useEffect(() => {
- AudioManager.setAudioSessionOptions({
- iosCategory: 'playAndRecord',
- iosMode: 'spokenAudio',
- iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'],
- });
- AudioManager.requestRecordingPermissions();
- }, []);
-
- const handleRecordPress = async () => {
- if (isRecording) {
- setIsRecording(false);
- recorder.stop();
- speechToText.streamStop();
- } else {
- setIsRecording(true);
- setLiveTranscription('');
-
- const sampleRate = 16000;
- recorder.onAudioReady(
- {
- sampleRate,
- bufferLength: 0.1 * sampleRate,
- channelCount: 1,
- },
- ({ buffer }) => {
- speechToText.streamInsert(buffer.getChannelData(0));
- }
- );
- recorder.start();
-
- let finalResult = '';
-
- try {
- for await (const result of speechToText.stream()) {
- const text = result.committed.text + result.nonCommitted.text;
- setLiveTranscription(text);
- finalResult = text;
- }
- } catch (e) {
- setError(e instanceof Error ? e.message : String(e));
- } finally {
- if (finalResult.trim().length > 0) {
- await llm.sendMessage(finalResult);
- setLiveTranscription('');
- }
- }
- }
- };
-
- useEffect(() => {
- if (llm.error) setError(String(llm.error));
- }, [llm.error]);
-
- useEffect(() => {
- if (speechToText.error) setError(String(speechToText.error));
- }, [speechToText.error]);
-
- return (!llm.isReady || !speechToText.isReady) &&
- !llm.error &&
- !speechToText.error ? (
-
- ) : (
-
-
-
-
- Qwen 3 x Whisper
-
- setError(null)} />
- {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? (
-
- 0
- ? [
- ...llm.messageHistory,
- {
- role: 'user',
- content: liveTranscription,
- },
- ]
- : llm.messageHistory
- }
- llmResponse={llm.response}
- isGenerating={llm.isGenerating}
- deleteMessage={llm.deleteMessage}
- />
-
- ) : (
-
- Hello! 👋
-
- Tap the mic and speak to me. I'll transcribe your voice and
- respond using a language model — all on-device.
-
-
- )}
-
- setSelectedLLM(m)}
- />
- setSelectedSTT(m)}
- />
-
-
- {DeviceInfo.isEmulatorSync() ? (
-
-
- recording disabled on emulator
-
-
- ) : (
- <>
- {llm.isGenerating ? (
-
-
-
- ) : (
-
- {isRecording ? (
-
- ) : (
-
- )}
-
- )}
- >
- )}
-
-
-
- );
-}
-
-const styles = StyleSheet.create({
- keyboardAvoidingView: {
- flex: 1,
- },
- topContainer: {
- height: 68,
- width: '100%',
- alignItems: 'center',
- justifyContent: 'center',
- },
- chatContainer: {
- flex: 10,
- width: '100%',
- },
- textModelName: {
- color: ColorPalette.primary,
- },
- helloMessageContainer: {
- flex: 10,
- width: '100%',
- alignItems: 'center',
- justifyContent: 'center',
- },
- helloText: {
- fontFamily: 'medium',
- fontSize: 30,
- color: ColorPalette.primary,
- },
- bottomHelloText: {
- fontFamily: 'regular',
- fontSize: 20,
- lineHeight: 28,
- textAlign: 'center',
- color: ColorPalette.primary,
- },
- bottomContainer: {
- height: 100,
- width: '100%',
- justifyContent: 'center',
- alignItems: 'center',
- paddingHorizontal: 16,
- },
- recordTouchable: {
- height: '100%',
- justifyContent: 'center',
- alignItems: 'center',
- },
- recordingInfo: {
- width: '100%',
- display: 'flex',
- justifyContent: 'center',
- alignItems: 'center',
- },
- emulatorBox: {
- padding: 10,
- margin: 10,
- borderWidth: 1,
- borderRadius: 8,
- borderColor: 'gray',
- justifyContent: 'center',
- alignItems: 'center',
- },
- emulatorWarning: {
- color: 'gray',
- fontSize: 16,
- },
-});
diff --git a/apps/speech/package.json b/apps/speech/package.json
index 93e07755dd..47de1396a3 100644
--- a/apps/speech/package.json
+++ b/apps/speech/package.json
@@ -20,7 +20,7 @@
"metro-config": "^0.83.0",
"react": "19.2.5",
"react-native": "0.83.4",
- "react-native-audio-api": "0.12.0",
+ "react-native-audio-api": "0.12.2",
"react-native-device-info": "^15.0.2",
"react-native-executorch": "workspace:*",
"react-native-executorch-expo-resource-fetcher": "workspace:*",
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
index dfd39c15b4..ad4f6505c8 100644
--- a/apps/speech/screens/SpeechToTextScreen.tsx
+++ b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -14,21 +14,25 @@ import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
import {
useSpeechToText,
WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
+ WHISPER_TINY_EN_COREML,
WHISPER_BASE_EN,
+ WHISPER_BASE_EN_COREML,
WHISPER_SMALL_EN,
TranscriptionResult,
SpeechToTextProps,
+ WHISPER_SMALL_EN_COREML,
} from 'react-native-executorch';
import { ModelPicker, ModelOption } from '../components/ModelPicker';
type STTModelSources = SpeechToTextProps['model'];
const MODELS: ModelOption[] = [
- { label: 'Whisper Tiny', value: WHISPER_TINY_EN },
- { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED },
- { label: 'Whisper Base', value: WHISPER_BASE_EN },
- { label: 'Whisper Small', value: WHISPER_SMALL_EN },
+ { label: 'Whisper Tiny EN (XNNPACK)', value: WHISPER_TINY_EN },
+ { label: 'Whisper Tiny EN (CoreML)', value: WHISPER_TINY_EN_COREML },
+ { label: 'Whisper Base EN (XNNPACK)', value: WHISPER_BASE_EN },
+ { label: 'Whisper Base EN (CoreML)', value: WHISPER_BASE_EN_COREML },
+ { label: 'Whisper Small EN (XNNPACK)', value: WHISPER_SMALL_EN },
+ { label: 'Whisper Small EN (CoreML)', value: WHISPER_SMALL_EN_COREML },
];
import FontAwesome from '@expo/vector-icons/FontAwesome';
import {
@@ -45,9 +49,12 @@ import ErrorBanner from '../components/ErrorBanner';
const isSimulator = DeviceInfo.isEmulatorSync();
+const DEFAULT_MODEL =
+ Platform.OS === 'ios' ? WHISPER_BASE_EN_COREML : WHISPER_TINY_EN;
+
export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
const [selectedModel, setSelectedModel] =
- useState<STTModelSources>(WHISPER_TINY_EN);
+ useState<STTModelSources>(DEFAULT_MODEL);
const model = useSpeechToText({
model: selectedModel,
@@ -148,7 +155,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
recorder.current.onAudioReady(
{
sampleRate,
- bufferLength: 0.1 * sampleRate,
+ bufferLength: 0.1 * sampleRate, // 100 ms
channelCount: 1,
},
({ buffer }) => {
@@ -178,6 +185,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
try {
const streamIter = model.stream({
verbose: enableTimestamps,
+ timeout: 100,
});
for await (const { committed, nonCommitted } of streamIter) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
index a20fd7b1bc..ec47586266 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h
@@ -599,8 +599,7 @@ inline jsi::Value getJsiValue(const Segment &seg, jsi::Runtime &runtime) {
jsi::Object wordObj(runtime);
wordObj.setProperty(
runtime, "word",
- jsi::String::createFromUtf8(runtime, seg.words[i].content +
- seg.words[i].punctations));
+ jsi::String::createFromUtf8(runtime, seg.words[i].content));
wordObj.setProperty(runtime, "start",
static_cast<double>(seg.words[i].start));
wordObj.setProperty(runtime, "end", static_cast<double>(seg.words[i].end));
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
index 4b58c5039b..9537642d58 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp
@@ -94,7 +94,7 @@ TranscriptionResult wordsToResult(const std::vector<Word> &words,
std::string fullText;
for (const auto &w : words) {
- fullText += w.content + w.punctations;
+ fullText += w.content;
}
res.text = fullText;
@@ -115,7 +115,8 @@ TranscriptionResult wordsToResult(const std::vector<Word> &words,
} // namespace
void SpeechToText::stream(std::shared_ptr callback,
- std::string languageOption, bool verbose) {
+ std::string languageOption, bool verbose,
+ uint32_t timeout) {
if (isStreaming_) {
throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress,
"Streaming is already in progress!");
@@ -158,10 +159,10 @@ void SpeechToText::stream(std::shared_ptr callback,
// running transcriptions too rapidly (before the audio buffer is filled
// with a significant amount of new data) can cause the streamer to commit
// wrong phrases.
- std::this_thread::sleep_for(std::chrono::milliseconds(100));
+ std::this_thread::sleep_for(std::chrono::milliseconds(timeout));
}
- std::vector<Word> finalWords = streamer_->finish();
+ std::vector<Word> finalWords = streamer_->finish(options);
TranscriptionResult finalRes =
wordsToResult(finalWords, languageOption, verbose);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index ade835869c..ec51862793 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -42,7 +42,8 @@ class SpeechToText {
// Stream
void stream(std::shared_ptr callback,
- std::string languageOption, bool enableTimestamps);
+ std::string languageOption, bool enableTimestamps,
+ uint32_t timeout);
void streamStop();
void streamInsert(std::span<float> waveform);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
index 357309391d..efe6cc2819 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
@@ -36,7 +36,7 @@ class OnlineASR {
virtual ProcessResult process(const DecodingOptions &options) = 0;
- virtual std::vector<Word> finish() = 0;
+ virtual std::vector<Word> finish(const DecodingOptions &options) = 0;
virtual void reset() = 0;
};
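For orientation, SpeechToText::stream() (above) drives this interface in a poll loop. Below is a minimal sketch of the contract under the new signature; the stand-in types and the runStream() helper are illustrations, not the real definitions:

```cpp
#include <chrono>
#include <cstdint>
#include <string>
#include <thread>
#include <vector>

// Simplified stand-ins for the real Word / DecodingOptions / ProcessResult.
struct Word { std::string content; float start; float end; };
struct DecodingOptions { std::string language = "en"; bool verbose = false; };
struct ProcessResult { std::vector<Word> committed; std::vector<Word> nonCommitted; };

// Shape of the schema::OnlineASR contract after this change: finish() now
// takes DecodingOptions so it can run one final process() pass internally.
struct OnlineASR {
  virtual bool isReady() const = 0;
  virtual ProcessResult process(const DecodingOptions &options) = 0;
  virtual std::vector<Word> finish(const DecodingOptions &options) = 0;
  virtual void reset() = 0;
  virtual ~OnlineASR() = default;
};

// Poll loop mirroring SpeechToText::stream(): sleep `timeout` ms between
// iterations so the audio buffer can accumulate enough new samples for a
// meaningful transcription pass (see the comment in SpeechToText.cpp).
void runStream(OnlineASR &streamer, const DecodingOptions &options,
               const bool &stopRequested, uint32_t timeout) {
  while (!stopRequested) {
    if (streamer.isReady()) {
      ProcessResult res = streamer.process(options);
      // ...emit res.committed / res.nonCommitted to the callback here...
    }
    std::this_thread::sleep_for(std::chrono::milliseconds(timeout));
  }
  std::vector<Word> tail = streamer.finish(options); // flush remaining words
  // ...emit `tail`, then prepare for the next session...
  streamer.reset();
}
```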
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
index e7319f95b5..2343d1faab 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
@@ -4,13 +4,14 @@
namespace rnexecutorch::models::speech_to_text {
+/**
+ * Essentially an alternative representation of a token,
+ * with word-level timestamps attached.
+ */
struct Word {
std::string content;
float start;
float end;
-
- std::string
- punctations; // Trailing punctations which appear after the main content
};
} // namespace rnexecutorch::models::speech_to_text
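With the punctations field removed, a trailing punctuation mark now travels as its own zero-duration Word (see the ASR.cpp change below, which emits it with start == end at the end of the preceding word). A small illustration under that assumption, with made-up timestamps:

```cpp
#include <string>
#include <vector>

// Mirrors common/types/Word.h after this change.
struct Word {
  std::string content;
  float start;
  float end;
};

// "Hello, world." as it would now be segmented: punctuation marks become
// separate, instantaneous words anchored to the end of the word they follow.
std::vector<Word> exampleWords() {
  return {
      {"Hello", 0.00f, 0.42f},
      {",", 0.42f, 0.42f}, // start == end: zero-duration punctuation "word"
      {"world", 0.55f, 0.98f},
      {".", 0.98f, 0.98f},
  };
}
```

Since wordsToResult() simply concatenates content, the final transcription text comes out the same as with the old content + punctations pairing.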
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
index d1debeb0f0..d2555a79fa 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp
@@ -138,8 +138,9 @@ executorch::aten::Tensor ASR::decode(std::span tokens,
positionShape, cachePositions.data(), ScalarType::Long);
const auto encoderOutputSize = static_cast<int32_t>(encoderOutput.size());
- std::vector<int32_t> encShape = {1, constants::kNumFrames,
- encoderOutputSize / constants::kNumFrames};
+ std::vector<int32_t> encShape = {
+ 1, static_cast<int32_t>(constants::kNumFrames),
+ encoderOutputSize / static_cast<int32_t>(constants::kNumFrames)};
auto encoderTensor = executorch::extension::make_tensor_ptr(
std::move(encShape), const_cast(encoderOutput.data()),
ScalarType::Float);
@@ -262,11 +263,21 @@ ASR::generate(std::span<float> waveform, const DecodingOptions &options,
std::vector<float> scores;
uint64_t startPos = 0;
- while (std::cmp_less_equal(startPos + sequenceIds.size(),
- constants::kMaxDecodeLength)) {
- executorch::aten::Tensor logitsTensor =
- this->decode(sequenceIds, encoderFeatures, startPos);
+ // Prefill: feed each initial token individually so decode() always sees 1
+ // token
+ std::span firstToken(sequenceIds.data(), 1);
+ executorch::aten::Tensor logitsTensor =
+ this->decode(firstToken, encoderFeatures, startPos);
+ ++startPos;
+ for (size_t i = 1; i < sequenceIds.size(); ++i) {
+ std::span single(sequenceIds.data() + i, 1);
+ logitsTensor = this->decode(single, encoderFeatures, startPos);
+ ++startPos;
+ }
+
+ // Autoregressive decoding: always 1 token at a time
+ while (std::cmp_less(startPos, constants::kMaxDecodeLength)) {
const size_t logitsInnerDim = logitsTensor.size(1);
const size_t logitsDictSize = logitsTensor.size(2);
const float *logitsData = logitsTensor.const_data_ptr<float>() +
@@ -302,15 +313,16 @@ ASR::generate(std::span waveform, const DecodingOptions &options,
nextProb = probs[nextId];
}
- // Move the startPos pointer by the amount of tokens we processed
- startPos += sequenceIds.size();
- sequenceIds = {nextId};
cachedTokens.push_back(nextId);
scores.push_back(nextProb);
if (nextId == endOfTranscriptionToken_) {
break;
}
+
+ std::span single(&cachedTokens.back(), 1);
+ logitsTensor = this->decode(single, encoderFeatures, startPos);
+ ++startPos;
}
return {.tokens = std::vector(cachedTokens.cbegin() +
@@ -437,7 +449,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens,
const float wEnd = wStart + timePerChar * wSize;
prevCharCount += wSize;
- // We store punctations separately to other characters.
+ // Detect and extract trailing punctuations.
std::string puncts = "";
while (!w.empty() && constants::kPunctations.contains(w.back())) {
puncts += w.back();
@@ -445,7 +457,14 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens,
}
std::reverse(puncts.begin(), puncts.end());
- wordObjs.emplace_back(std::move(w), wStart, wEnd, std::move(puncts));
+ // Add the core word.
+ wordObjs.emplace_back(std::move(w), wStart, wEnd);
+
+ // If punctuation was present, add it as a separate "word" with an
+ // instantaneous timestamp at the end of the original word.
+ if (!puncts.empty()) {
+ wordObjs.emplace_back(std::move(puncts), wEnd, wEnd);
+ }
}
return wordObjs;
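The reworked generate() loop is a standard prefill-then-decode pattern: the prompt is fed one token at a time so decode() always sees exactly one token, and generation then proceeds strictly one token per step. A condensed sketch of that control flow; decode(), argmax(), the vocabulary size, and the int64_t token type below are stand-ins rather than the real ExecuTorch calls:

```cpp
#include <algorithm>
#include <cstdint>
#include <span>
#include <vector>

constexpr size_t kMaxDecodeLength = 128;

// Stub standing in for the ExecuTorch decoder call in ASR::decode():
// returns vocab-sized logits for the single token at cache position startPos.
std::vector<float> decode(std::span<const int64_t> /*tokens*/,
                          size_t /*startPos*/) {
  return std::vector<float>(51864, 0.0f); // vocab size is illustrative
}

int64_t argmax(const std::vector<float> &logits) {
  return std::max_element(logits.begin(), logits.end()) - logits.begin();
}

// promptIds is assumed non-empty (Whisper always has <|startoftranscript|>).
std::vector<int64_t> generate(std::vector<int64_t> promptIds, int64_t eotId) {
  size_t startPos = 0;

  // Prefill: feed each prompt token individually so decode() always sees
  // exactly one token.
  std::vector<float> logits;
  for (size_t i = 0; i < promptIds.size(); ++i) {
    logits = decode(std::span(promptIds.data() + i, 1), startPos);
    ++startPos;
  }

  // Autoregressive phase: sample from the last logits, then decode the newly
  // sampled token at the next cache position, one token per iteration.
  std::vector<int64_t> out = std::move(promptIds);
  while (startPos < kMaxDecodeLength) {
    const int64_t next = argmax(logits);
    out.push_back(next);
    if (next == eotId) {
      break;
    }
    logits = decode(std::span(&out.back(), 1), startPos);
    ++startPos;
  }
  return out; // the real code returns only the tokens past the prompt
}
```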
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
index 0b284345ec..62a9f968f7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h
@@ -9,34 +9,37 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants {
// Maximum duration of each audio chunk to process (in seconds)
// It is intentionally set to 29 since otherwise only the last chunk would be
// correctly transcribed due to the model's positional encoding limit
-constexpr static int32_t kChunkSize = 29;
+inline constexpr size_t kChunkSize = 29;
// Sampling rate expected by Whisper and the model's audio pipeline (16 kHz)
-constexpr static int32_t kSamplingRate = 16000;
-constexpr static int32_t kSamplesPerMilisecond = kSamplingRate / 1000;
+inline constexpr size_t kSamplingRate = 16000;
+inline constexpr size_t kSamplesPerMilisecond = kSamplingRate / 1000;
+
+inline constexpr size_t kMaxSamples = kChunkSize * kSamplingRate;
// The maximum number of tokens the decoder can generate per chunk
-constexpr static int32_t kMaxDecodeLength = 128;
+inline constexpr size_t kMaxDecodeLength = 128;
// Minimum allowed chunk length before processing (in audio samples)
-constexpr static int32_t kMinChunkSamples = 1 * kSamplingRate;
+inline constexpr size_t kMinChunkSamples = 1 * kSamplingRate;
// Number of mel frames output by the encoder (derived from input spectrogram)
-constexpr static int32_t kNumFrames = 1500;
+inline constexpr size_t kNumFrames = 1500;
// Time precision used by Whisper timestamps: each token spans 0.02 seconds
-constexpr static float kTimePrecision = 0.02f;
+inline constexpr float kTimePrecision = 0.02f;
// Special characters serving as pause / end of sentence
-static const std::unordered_set<char> kPunctations = {',', '.', '?',
+inline const std::unordered_set<char> kPunctations = {',', '.', '?',
'!', ':', ';'};
+inline const std::unordered_set<char> kEosPunctations = {'.', '?', '!', ';'};
// Special token constants
namespace tokens {
-static const std::string kStartOfTranscript = "<|startoftranscript|>";
-static const std::string kEndOfTranscript = "<|endoftext|>";
-static const std::string kBeginTimestamp = "<|0.00|>";
-static const std::string kBlankAudio = "[BLANK_AUDIO]";
+inline const std::string kStartOfTranscript = "<|startoftranscript|>";
+inline const std::string kEndOfTranscript = "<|endoftext|>";
+inline const std::string kBeginTimestamp = "<|0.00|>";
+inline const std::string kBlankAudio = "[BLANK_AUDIO]";
} // namespace tokens
} // namespace rnexecutorch::models::speech_to_text::whisper::constants
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
deleted file mode 100644
index ce365e4e44..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp
+++ /dev/null
@@ -1,199 +0,0 @@
-#include "HypothesisBuffer.h"
-#include "Params.h"
-#include "Utils.h"
-
-#include
-#include
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-void HypothesisBuffer::insert(std::span<Word> words, float offset) {
- // Step 1 - decide which words should be considered as fresh.
- fresh_.clear();
-
- // We try to find the last committed word in a transcription string.
- // Everything beyond that word will be considered as fresh.
- // To make the algorithm more resilient to repeated strings of words,
- // we check also the preceeding words as well as timestamps (with liberal
- // range).
- size_t firstFreshWordIdx = 0;
- if (!committed_.empty()) {
- std::optional<size_t> lastMatchingWordIdx =
- findCommittedSuffix(words, params::kStreamCommitedSuffixSearchSize,
- params::kStreamMaxOverlapTimestampDiff1,
- params::kStreamWordsPerErrorRate);
- firstFreshWordIdx = lastMatchingWordIdx.value_or(0);
- }
-
- bool isCompletelyFresh = firstFreshWordIdx == 0;
- for (size_t i = firstFreshWordIdx; i < words.size(); i++) {
- const auto &word = words[i];
-
- // Global start is a beginning timestamp relative only to the beginning of
- // the current streaming process.
- const float startGlobal = word.start + offset;
- const float endGlobal = word.end + offset;
-
- if (!isCompletelyFresh ||
- startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) {
- fresh_.emplace_back(word.content, startGlobal, endGlobal,
- word.punctations);
- }
- }
-
- // Step 2 - we have already selected the fresh words. Now it's time to
- // correct any mistakes and remove the words which overlap with already
- // commited segments - to avoid duplicates.
- if (!fresh_.empty() && !committed_.empty()) {
- // Calculate the largest overlapping fragment size.
- // Note that we use size limit (kStreamMaxOverlapSize) for efficiency of the
- // algorithm, and timestamp difference limit
- // (kStreamMaxOverlapTimestampDiff) to avoid removing correct fragments
- // which were just repeated after some time.
- size_t overlapSize = utils::findLargestOverlapingFragment(
- committed_, fresh_, params::kStreamMaxOverlapSize,
- params::kStreamMaxOverlapTimestampDiff2);
-
- if (overlapSize > 0) {
- fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize);
- }
- }
-}
-
-std::deque<Word> HypothesisBuffer::commit() {
- std::deque<Word> toCommit = {};
-
- // Find a stable prefix: words that haven't changed between last and current
- // iteration.
- while (!fresh_.empty() && !hypothesis_.empty() &&
- fresh_.front().content == hypothesis_.front().content) {
- // The last word from the fresh_ buffer must also match punctations with the
- // hypothesis. This is done in order to ensure correct punctation marks in
- // the resulting transcription.
- if (fresh_.size() == 1 &&
- fresh_.front().punctations != hypothesis_.front().punctations) {
- break;
- }
-
- // Take timestamps from the hypothesis, but actual content from the fresh
- // buffer.
- toCommit.emplace_back(std::move(fresh_.front().content),
- hypothesis_.front().start, hypothesis_.front().end,
- std::move(fresh_.front().punctations));
- fresh_.pop_front();
- hypothesis_.pop_front();
- }
-
- // Save the last committed word timestamp.
- // This will mark the end of the entire committed sequence.
- if (!toCommit.empty()) {
- lastCommittedTime_ = toCommit.back().end;
- }
-
- // The remaining words from the fresh buffer (uncommitted phrase)
- // become a hypothesis for the next iteration.
- hypothesis_ = std::move(fresh_);
- fresh_.clear();
-
- // The last step is to commit the selected words.
- committed_.insert(committed_.end(), toCommit.cbegin(), toCommit.cend());
-
- return toCommit;
-}
-
-void HypothesisBuffer::releaseCommits(size_t wordsToKeep) {
- if (committed_.size() > wordsToKeep) {
- size_t nWordsToErase = committed_.size() - wordsToKeep;
- committed_.erase(committed_.begin(), committed_.begin() + nWordsToErase);
- }
-}
-
-void HypothesisBuffer::reset() {
- fresh_.clear();
- hypothesis_.clear();
- committed_.clear();
-
- lastCommittedTime_ = 0.f;
-}
-
-std::optional<size_t> HypothesisBuffer::findCommittedSuffix(
- std::span<Word> words, size_t nCommitted,
- float timestampDiffTolerance, size_t wordsPerMistake) {
- if (words.empty() || committed_.empty() || nCommitted == 0) {
- return std::nullopt;
- }
-
- // Determine the subset size of committed words to check against.
- size_t committedToMatchSize = std::min(nCommitted, committed_.size());
-
- // Iterate backwards through 'words' to find the most recent occurrence of a
- // suffix of 'committed_' (or the full 'committed_' sequence).
- for (int32_t i = static_cast<int32_t>(words.size()) - 1; i >= 0; --i) {
- bool match = true;
- size_t matchedCount = 0;
- size_t contentMistakeCount = 0;
-
- // Linearly interpolate tolerance if we are at the beginning and can't check
- // all committed words.
- float effectiveTolerance = timestampDiffTolerance;
- if (i < static_cast<int32_t>(committedToMatchSize) - 1) {
- effectiveTolerance *=
- static_cast<float>(i + 1) / static_cast<float>(committedToMatchSize);
- }
-
- // Try to match backwards from words[i] and committed_.back()
- for (size_t j = 0; j < committedToMatchSize; ++j) {
- int32_t wordsIdx = i - static_cast<int32_t>(j);
- int32_t committedIdx =
- static_cast<int32_t>(committed_.size()) - 1 - static_cast<int32_t>(j);
-
- if (wordsIdx < 0) {
- // We reached the beginning of the words span.
- // The algorithm allows matching a partial prefix if it's at the start.
- break;
- }
-
- const Word &w1 = words[wordsIdx];
- const Word &w2 = committed_[committedIdx];
-
- // Check timestamps within tolerance
- if (std::max(std::abs(w1.start - w2.start), std::abs(w1.end - w2.end)) >
- effectiveTolerance) {
- match = false;
- break;
- }
-
- // Allow sparse content mismatches while still treating the overall
- // sequence as matching.
- if (utils::equalsIgnoreCase(w1.content, w2.content)) {
- matchedCount++;
- } else {
- contentMistakeCount++;
- }
-
- // Early exit if mistake count already exceeds what we can recover from
- // given the remaining words to check.
- if (wordsPerMistake > 0) {
- size_t remainingToMatch = committedToMatchSize - 1 - j;
- size_t maxPossibleMatched = matchedCount + remainingToMatch;
- if (contentMistakeCount > (maxPossibleMatched / wordsPerMistake)) {
- match = false;
- break;
- }
- }
- }
-
- // One content mistake is allowed per M matched words.
- size_t maxAllowedMistakes =
- (wordsPerMistake == 0) ? 0 : (matchedCount / wordsPerMistake);
-
- if (match && matchedCount > 0 &&
- contentMistakeCount <= maxAllowedMistakes) {
- return static_cast<size_t>(i);
- }
- }
-
- return std::nullopt;
-}
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
deleted file mode 100644
index 25833ec01b..0000000000
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h
+++ /dev/null
@@ -1,82 +0,0 @@
-#pragma once
-
-#include
-#include
-#include
-
-#include "../common/types/Word.h"
-
-namespace rnexecutorch::models::speech_to_text::whisper::stream {
-
-/**
- * A buffer for managing streaming transcription hypotheses.
- * This class handles stabilization of the transcription result by tracking
- * "fresh" hypotheses and "committing" them once they are stable across updates.
- */
-class HypothesisBuffer {
-public:
- /**
- * Inserts new words into the fresh_ buffer.
- * Words are filtered based on the last committed time and checked for
- * overlaps with existing committed words to prevent duplicates.
- *
- * @param newWords A span of recently generated words.
- * @param offset Time offset to adjust the word timestamps.
- */
- void insert(std::span<Word> words, float offset);
-
- /**
- * Attempts to commit words present in the fresh_ buffer.
- * A phrase from fresh_ buffer can only be committed if it also appears
- * in the hypothesis_ buffer (uncommitted words from previous iteration).
- *
- * Uncommitted words become a 'hypothesis' and are moved into the hypothesis_
- * buffer.
- *
- * @return A sequence of words committed in the current iteration.
- */
- std::deque commit();
-
- /**
- * Shrinks the committed_ buffer by erasing all words except N latest ones.
- *
- * Used primarily to relieve increasing memory usage during very
- * long streaming sessions.
- *
- * @param wordsToKeep - number of trailing words to be kept in.
- */
- void releaseCommits(size_t wordsToKeep);
-
- /**
- * Resets all the stored buffers and state variables to the initial state
- */
- void reset();
-
- // Declare a friendship with OnlineASR to allow it to access the internal
- // state of stored buffers.
- friend class OnlineASR;
-
-private:
- // Finds the most recent occurance of given committed string of words
- // in a custom span of words.
- // Returns the index of the last matching word (or nullopt if not present).
- std::optional<size_t> findCommittedSuffix(std::span<Word> words,
- size_t nCommitted,
- float timestampDiffTolerance = 1.F,
- size_t wordsPerMistake = 4);
-
- // Stored buffers
- // The lifecycle of a correct result word looks as following:
- // fresh buffer -> hypothesis buffer -> commited
- std::deque<Word>
- fresh_; // 'New' words from current iterations, which require some checks
- // before they go into hypothesis_ buffer.
- std::deque<Word>
- hypothesis_; // Words potentially to be commited, stored between
- // iterations (obtained from fresh_ buffer).
- std::deque<Word> committed_; // A history of already commited words.
-
- float lastCommittedTime_ = 0.0f;
-};
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
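For reference, the mechanism deleted here was an agreement-based stabilizer: a word was only committed once two consecutive transcription passes produced it at the head of the pending sequence. A stripped-down illustration of that stable-prefix commit, using bare strings instead of Word:

```cpp
#include <deque>
#include <string>
#include <vector>

// A word is committed once it appears at the front of both the previous
// hypothesis and the fresh transcription, i.e. the stable common prefix.
std::vector<std::string>
commitStablePrefix(std::deque<std::string> &hypothesis,
                   std::deque<std::string> &fresh) {
  std::vector<std::string> committed;
  while (!fresh.empty() && !hypothesis.empty() &&
         fresh.front() == hypothesis.front()) {
    committed.push_back(fresh.front());
    fresh.pop_front();
    hypothesis.pop_front();
  }
  // Whatever was not confirmed becomes the hypothesis for the next pass.
  hypothesis.assign(fresh.begin(), fresh.end());
  fresh.clear();
  return committed;
}
```

The cost of this scheme is latency (every word must survive two passes) plus the overlap and suffix bookkeeping above; the replacement in OnlineASR.cpp commits at sentence boundaries instead.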
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index ded2183201..188c77d80d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -1,35 +1,43 @@
+#include "OnlineASR.h"
+
#include
#include
-#include
-#include
+#include
#include "Constants.h"
-#include "OnlineASR.h"
#include "Params.h"
#include "Utils.h"
namespace rnexecutorch::models::speech_to_text::whisper::stream {
-namespace {
-std::vector<Word> move_to_vector(std::deque<Word> &container) {
- return std::vector<Word>(std::make_move_iterator(container.begin()),
- std::make_move_iterator(container.end()));
+OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
+ // Reserve an expected amount of memory for audio buffer.
+ audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate);
}
-} // namespace
-OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
- // Reserve a minimal expected amount of memory for audio buffer.
- audioBuffer_.reserve(static_cast<size_t>(2 * params::kStreamChunkThreshold *
- constants::kSamplingRate));
+bool OnlineASR::isReady() const {
+ std::scoped_lock lock(streamingMutex_);
+
+ return audioBuffer_.size() >= constants::kMinChunkSamples;
}
void OnlineASR::insertAudioChunk(std::span<float> audio) {
- std::scoped_lock lock(audioBufferMutex_);
+ std::scoped_lock lock(streamingMutex_);
+
audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
-}
-bool OnlineASR::isReady() const {
- return audioBuffer_.size() >= constants::kMinChunkSamples;
+ // Automatic buffer cleanup.
+ //
+ // This prevents the audio buffer from growing indefinitely during continuous
+ // streaming. It is particularly useful when VAD (Voice Activity Detection)
+ // is used and elements are inserted but not processed for a long time.
+ // The condition should not trigger during normal streaming, that is, when
+ // the process() method is called regularly at reasonable intervals.
+ if (audioBuffer_.size() > constants::kMaxSamples) {
+ // Note that results are not actually committed now, but saved for
+ // a later call of process().
+ memory_.toCommit = commitAndClean(memory_.transcript);
+ }
}
ProcessResult OnlineASR::process(const DecodingOptions &options) {
@@ -38,126 +46,213 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
// Copy the audio buffer to avoid keeping the lock during the entire
// transcription process.
{
- std::scoped_lock lock(audioBufferMutex_);
+ std::scoped_lock lock(streamingMutex_);
audioCopy = audioBuffer_;
}
- std::vector<Segment> transcriptions = asr_->transcribe(audioBuffer_, options);
+ // Obtain a transcription for the current audio buffer state.
+ // It's very unlikely that the buffer will exceed Whisper's maximum capacity,
+ // but for absolute safety we additionally clip it.
+ std::span input(
+ audioCopy.begin(),
+ audioCopy.begin() + std::min(constants::kMaxSamples, audioCopy.size()));
- if (transcriptions.empty()) {
- return {.committed = {}, .nonCommitted = {}};
- }
+ std::vector<Segment> transcriptions = asr_->transcribe(input, options);
// Flatten segments into a single word sequence.
+ // This is basically our 'nonCommitted' part for now.
std::vector<Word> words;
- words.reserve(transcriptions.front().words.size());
-
for (auto &segment : transcriptions) {
- words.insert(words.end(), std::make_move_iterator(segment.words.begin()),
- std::make_move_iterator(segment.words.end()));
+ std::move(segment.words.begin(), segment.words.end(),
+ std::back_inserter(words));
}
- hypothesisBuffer_.insert(words, bufferTimeOffset_);
-
- // Apply fix for timestamps.
- if (!hypothesisBuffer_.fresh_.empty()) {
- size_t noNewWords = hypothesisBuffer_.fresh_.size();
- float establishedEnd = hypothesisBuffer_.lastCommittedTime_;
- float newBegin = hypothesisBuffer_.fresh_.front().start;
- const float newEnd = hypothesisBuffer_.fresh_.back().end;
- float shift = 0.F;
- for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) {
- const float originalEnd = hypothesisBuffer_.fresh_[i].end;
-
- if (i < hypothesisBuffer_.hypothesis_.size() &&
- utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content,
- hypothesisBuffer_.hypothesis_[i].content)) {
- hypothesisBuffer_.fresh_[i].start =
- hypothesisBuffer_.hypothesis_[i].start;
- hypothesisBuffer_.fresh_[i].end = hypothesisBuffer_.hypothesis_[i].end;
- shift = hypothesisBuffer_.fresh_[i].end - originalEnd;
-
- establishedEnd = hypothesisBuffer_.hypothesis_[i].end;
- newBegin = hypothesisBuffer_.fresh_[i].end;
- noNewWords--;
- continue;
- }
-
- // In case of a new word, we apply timestamp range scaling
- // based on timestamps established in previous iterations.
- const float freshDuration = newEnd - establishedEnd;
- const float epsilon = std::max(
- 0.F, 0.85F * (freshDuration -
- static_cast(noNewWords /
- params::kStreamWordsPerSecond)));
- float scale =
- (freshDuration - epsilon) / std::max(newEnd - newBegin, 0.2F);
- hypothesisBuffer_.fresh_[i].start =
- shift + (hypothesisBuffer_.fresh_[i].start - newEnd) * scale + newEnd;
- hypothesisBuffer_.fresh_[i].end =
- shift + (hypothesisBuffer_.fresh_[i].end - newEnd) * scale + newEnd;
+ // Acquire the lock for the rest of the method (extensive use of audioBuffer_).
+ std::scoped_lock lock(streamingMutex_);
+
+ // Step 1: examine all previously saved EOS points.
+ // The idea is to remove entries which have changed or no longer exist
+ // due to the model correcting its output.
+ for (size_t i = 0; i < memory_.eos.size(); i++) {
+ const auto &eos = memory_.eos[i];
+ if (eos.position >= words.size() || !utils::isEos(words[eos.position]) ||
+ (eos.position > 0 &&
+ eos.preceeding != words[eos.position - 1].content)) {
+ memory_.eos.erase(memory_.eos.begin() + i, memory_.eos.end());
+ break;
}
}
- auto committed = hypothesisBuffer_.commit();
- auto nonCommitted = hypothesisBuffer_.hypothesis_;
+ // Step 2: check if the newest EOS character from the transcript should be
+ // saved to the memory_.eos vector.
+ auto lastEosIt = std::find_if(words.rbegin(), words.rend(), utils::isEos);
+ if (lastEosIt != words.rend()) {
+ size_t lastEosIndex = std::distance(words.begin(), lastEosIt.base()) - 1;
- // We want to save the most recent end of sentence word
- // to improve the audio cutting mechanism.
- for (const auto &word : committed) {
- if (!word.punctations.empty()) {
- lastSentenceEnd_ = word.end;
+ // Because of step 1, we know that if the last EOS exists in memory_.eos,
+ // then it must be the last entry.
+ if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) {
+ // Register last EOS entry
+ std::string preceeding =
+ lastEosIndex > 0 ? words[lastEosIndex - 1].content : "";
+ memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
}
}
- // Since Whisper does not accept waveforms longer than 30 seconds, we need
- // to cut the audio at some safe point.
- {
- std::scoped_lock lock(audioBufferMutex_);
-
- const float audioDuration =
- static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
- if (audioDuration > params::kStreamChunkThreshold) {
- // Leave some portion of audio in, to improve model behavior
- // in future iterations.
- const float erasePoint =
- hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_
- ? audioDuration
- : std::min(lastSentenceEnd_, params::kStreamChunkThreshold);
- const float minEraseDuration =
- audioDuration - params::kStreamAudioBufferMaxReserve;
- const float maxEraseDuration =
- audioDuration - params::kStreamAudioBufferMinReserve;
- const float eraseDuration = std::clamp(
- erasePoint - bufferTimeOffset_, minEraseDuration, maxEraseDuration);
- const size_t nSamplesToErase =
- static_cast<size_t>(eraseDuration * constants::kSamplingRate);
+ std::vector<Word> committed;
- audioBuffer_.erase(audioBuffer_.begin(),
- audioBuffer_.begin() + nSamplesToErase);
- bufferTimeOffset_ += eraseDuration;
- }
+ // Step 3: collect all the words which could possibly get committed
+ // between iterations.
+ if (!memory_.toCommit.empty()) {
+ committed.insert(committed.end(),
+ std::make_move_iterator(memory_.toCommit.begin()),
+ std::make_move_iterator(memory_.toCommit.end()));
+ memory_.toCommit.clear();
}
- return {.committed = move_to_vector(committed),
- .nonCommitted = move_to_vector(nonCommitted)};
+ // Step 4: clear the buffer if it is getting too large.
+ // The idea is to use the saved EOS entries and try to cut the buffer
+ // in a 'good' spot: one that removes a significant audio chunk, yet
+ // won't affect the most recent, unfinished speech samples.
+ size_t bufferSize = audioBuffer_.size();
+ if (bufferSize > static_cast<size_t>(params::kStreamSafeBufferDuration *
+ constants::kSamplingRate)) {
+ auto newCommitted = commitAndClean(words);
+
+ committed.insert(committed.end(),
+ std::make_move_iterator(newCommitted.begin()),
+ std::make_move_iterator(newCommitted.end()));
+ }
+
+ // Save the uncommitted part to the streamer's memory,
+ // since it might be needed when committing inside insertAudioChunk().
+ memory_.transcript = words;
+
+ // Note that the uncommitted part, represented by the recent transcription
+ // (words), has already been shrunk if something was committed during the
+ // cleanup phase.
+ return {.committed = std::move(committed), .nonCommitted = std::move(words)};
}
-std::vector<Word> OnlineASR::finish() {
- // We always push the last remaining hypothesis, even if it's not
- // confirmed in second iteration, to avoid ending up with broken sentences.
- std::deque<Word> remaining = hypothesisBuffer_.hypothesis_;
+std::vector<Word> OnlineASR::finish(const DecodingOptions &options) {
+ ProcessResult result = process(options);
+
+ // Last-tick committed delta + whatever never made it past the commit
+ // threshold.
+ std::vector<Word> residual = std::move(result.committed);
+ residual.insert(residual.end(),
+ std::make_move_iterator(result.nonCommitted.begin()),
+ std::make_move_iterator(result.nonCommitted.end()));
+
+ reset();
- return move_to_vector(remaining);
+ return residual;
}
void OnlineASR::reset() {
- std::scoped_lock lock(audioBufferMutex_);
-
- hypothesisBuffer_.reset();
- bufferTimeOffset_ = 0.f;
+ std::scoped_lock lock(streamingMutex_);
audioBuffer_.clear();
+
+ // Reset memory.
+ memory_.transcript.clear();
+ memory_.eos.clear();
+ memory_.toCommit.clear();
+}
+
+std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
+ const size_t bufferSize = audioBuffer_.size();
+ const float midBufferThreshold = params::kStreamMaxDuration / 2.0F;
+
+ std::vector<Word> committed;
+
+ // If we don't have any EOS entries, then we most likely have not
+ // recorded any speech. In this case we can safely cut the maximum amount of
+ // audio data.
+ if (memory_.eos.empty()) {
+ size_t cut = bufferSize -
+ static_cast<size_t>(params::kStreamSafetyThreshold *
+ constants::kSamplingRate);
+
+ audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+ }
+
+ // If we have exactly one (most recent) EOS entry in memory_.eos, then
+ // we need to be more careful.
+ // Normally we want to keep at least one sentence in, but if the sentence
+ // covers a significant amount of buffer, we have no choice.
+ else if (memory_.eos.size() == 1) {
+ const float eosTimestamp = memory_.eos[0].tmstpend;
+
+ const float upperHalfDuration =
+ std::max(0.0F, eosTimestamp - midBufferThreshold);
+ const float wordsPerSecond =
+ upperHalfDuration > 0.1F
+ ? static_cast<float>(transcript.size()) / upperHalfDuration
+ : 0.0F;
+
+ // The EOS sits early enough that cutting up to the safety margin won't
+ // touch the ongoing (post-EOS) speech.
+ const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration -
+ params::kStreamSafetyThreshold;
+
+ if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
+ // EOS lies past the midpoint, but a low word density implies the spoken
+ // audio is concentrated in the upper half. Drop the lower half and
+ // shift the EOS accordingly.
+ audioBuffer_.erase(audioBuffer_.begin(),
+ audioBuffer_.begin() +
+ static_cast<size_t>(midBufferThreshold *
+ constants::kSamplingRate));
+ memory_.eos[0].tmstpend -= midBufferThreshold;
+ } else {
+ // Cut everything up to and including the sentence, either by the
+ // safety margin (when the EOS is early) or, more aggressively, right
+ // at the EOS boundary, and commit its words.
+ const size_t cut =
+ eosSafe
+ ? bufferSize -
+ static_cast<size_t>(params::kStreamSafetyThreshold *
+ constants::kSamplingRate)
+ : static_cast<size_t>(eosTimestamp * constants::kSamplingRate);
+
+ audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+ committed.insert(committed.end(),
+ std::make_move_iterator(transcript.begin()),
+ std::make_move_iterator(transcript.end()));
+
+ transcript.clear();
+ memory_.eos.clear();
+ }
+ }
+
+ // In case of 2 or more sentences, we generally want to keep the last one
+ // intact. This provides a bit of stability to the algorithm.
+ else {
+ const auto &secondToLastEntry = memory_.eos[memory_.eos.size() - 2];
+
+ const size_t cut = static_cast<size_t>(secondToLastEntry.tmstpend *
+ constants::kSamplingRate);
+ const size_t lastCommittedPos = secondToLastEntry.position;
+
+ audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+ // Move all words up to the last committed position (inclusive) to the
+ // committed buffer.
+ committed.insert(
+ committed.end(), std::make_move_iterator(transcript.begin()),
+ std::make_move_iterator(transcript.begin() + lastCommittedPos + 1));
+ transcript.erase(transcript.begin(),
+ transcript.begin() + lastCommittedPos + 1);
+
+ // Retain only the most recent EOS entry, shifting both its timestamp
+ // and its position to match the new (truncated) transcript origin.
+ memory_.eos.erase(memory_.eos.begin(), memory_.eos.end() - 1);
+ memory_.eos[0].tmstpend -= secondToLastEntry.tmstpend;
+ memory_.eos[0].position -= lastCommittedPos + 1;
+ }
+
+ return committed;
}
} // namespace rnexecutorch::models::speech_to_text::whisper::stream
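The new strategy drops the two-pass agreement test entirely: words are committed once their sentence has ended, and the audio buffer is cut at the same boundary. A condensed sketch of the core idea; the types are simplified, and the single-EOS word-density heuristic and the EOS re-basing of the real commitAndClean() are elided:

```cpp
#include <algorithm>
#include <cstddef>
#include <string>
#include <vector>

struct Word { std::string content; float start; float end; };
struct EOSEntry { size_t position; std::string preceding; float endTs; };

constexpr size_t kSamplingRate = 16000;
constexpr float kSafetySeconds = 3.0f; // always keep this much recent audio

// Simplified commit-and-clean: cut the audio buffer at a "good" spot and
// commit every word belonging to a sentence that has already ended.
std::vector<Word> commitAndClean(std::vector<float> &audio,
                                 std::vector<Word> &transcript,
                                 std::vector<EOSEntry> &eos) {
  std::vector<Word> committed;
  if (eos.empty()) {
    // No sentence boundary seen; assume there is no speech worth keeping
    // and blind-cut everything except the trailing safety margin.
    const size_t keep = static_cast<size_t>(kSafetySeconds * kSamplingRate);
    if (audio.size() > keep) {
      audio.erase(audio.begin(), audio.end() - keep);
    }
  } else {
    // Cut at a completed sentence. (This sketch always uses the newest EOS;
    // the real code keeps the most recent sentence intact when there are
    // two or more, and applies a word-density heuristic for a single EOS.)
    const EOSEntry anchor = eos.back();
    const size_t cut = std::min(
        static_cast<size_t>(anchor.endTs * kSamplingRate), audio.size());
    audio.erase(audio.begin(), audio.begin() + cut);

    const size_t nCommit = std::min(anchor.position + 1, transcript.size());
    committed.assign(transcript.begin(), transcript.begin() + nCommit);
    transcript.erase(transcript.begin(), transcript.begin() + nCommit);
    eos.clear(); // remaining entries would need re-basing after the cut
  }
  return committed;
}
```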
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
index df6d469e39..7547d16bd5 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h
@@ -1,13 +1,13 @@
#pragma once
+#include
+#include
+#include
+
#include "../common/schema/OnlineASR.h"
#include "../common/types/ProcessResult.h"
-#include "../common/types/Segment.h"
#include "../common/types/Word.h"
#include "ASR.h"
-#include "HypothesisBuffer.h"
-
-#include
namespace rnexecutorch::models::speech_to_text::whisper::stream {
@@ -21,60 +21,65 @@ class OnlineASR : public schema::OnlineASR {
OnlineASR(const ASR *asr);
/**
- * Appends new audio samples to the internal processing buffer.
- *
- * @param audio A span of PCM float samples (expected 16kHz).
+ * Checks if the buffer contains enough audio for the next processing step.
+ * @return True if ready, false otherwise.
*/
- void insertAudioChunk(std::span<float> audio) override;
+ bool isReady() const override;
/**
- * Determines whether the model is ready to process the next iteration.
- *
- * @return True if audioBuffer has enough samples, False otherwise
+ * Appends audio samples to the internal buffer.
+ * @param audio Span containing the audio data.
*/
- bool isReady() const override;
+ void insertAudioChunk(std::span<float> audio) override;
/**
- * Processes the current audio buffer and returns new transcription results.
- * Stability is managed by an internal HypothesisBuffer to ensure that
- * only confirmed (stable) text is returned as "committed".
- *
- * @param options Decoding configuration (language, etc.).
- * @return A ProcessResult containing newly committed and uncommitted
- * words.
+ * Processes the current buffered audio and returns transcription results.
+ * @param options Decoding options for the transcription.
+ * @return Transcription result containing committed and non-committed words.
*/
ProcessResult process(const DecodingOptions &options) override;
/**
- * Finalizes the current streaming session.
- * Flushes any remaining words from the hypothesis buffer.
- *
- * @return A vector of remaining transcribed words.
+ * Finalizes the current stream and returns all words.
+ * @return Vector of detected words.
*/
- std::vector<Word> finish() override;
+ std::vector<Word> finish(const DecodingOptions &options) override;
/**
- * Reset the streaming state by resetting the buffers
+ * Resets the internal state and clears buffers.
*/
void reset() override;
private:
+ // Cleans up the buffer and returns committed words based on the given
+ // transcript.
+ std::vector<Word> commitAndClean(std::vector<Word> &transcript);
+
// ASR module connection for transcribing the audio
const ASR *asr_;
- // Helper buffers - audio buffer
- // Stores the increasing amounts of streamed audio.
- // Cleared from time to time after reaching a threshold size.
+ // Audio buffer (input) - accumulates incoming audio samples.
std::vector<float> audioBuffer_ = {};
- mutable std::mutex audioBufferMutex_;
- float bufferTimeOffset_ = 0.F; // Audio buffer offset
+ mutable std::mutex streamingMutex_; // Covers both buffer & memory
- // Helper buffers - hypothesis buffer
- // Manages the whisper streaming hypothesis mechanism.
- HypothesisBuffer hypothesisBuffer_;
+ // Streaming memory.
+ // In general, it helps to track the continuous streaming state and improve
+ // the buffer-handling algorithms.
+ struct Memory {
+ // State management helper.
+ struct EOSEntry {
+ size_t position; // An absolute position (index) in the transcription
+ // (word sequence).
+ std::string preceeding; // The word preceding this EOS in the transcription.
+ float tmstpend; // Ending timestamp of the sentence.
+ };
- // State members to keep track of specyfic aspects of buffer state
- float lastSentenceEnd_ = 0.F;
+ std::vector<Word>
+ transcript; // The most recent transcription result (uncommitted only!).
+ std::vector<EOSEntry>
+ eos; // End of sentence points from the most recent transcription.
+ std::vector<Word> toCommit; // Words to be committed in the next iteration
+ // (next process() call).
+ } memory_;
};
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
index 5eb74c06cc..847a22b1e0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
@@ -1,6 +1,9 @@
#pragma once
+#include "Constants.h"
+
#include
+#include
/**
* Hyperparameters
@@ -11,90 +14,50 @@
namespace rnexecutorch::models::speech_to_text::whisper::params {
/**
- * Determines the range of buffer left when skipping an audio chunk
- * of size lower than maximum allowed chunk size.
- *
- * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
- * then instead of moving to the last returned timestamp, we jump across the
- * entire 30 seconds chunk. This resolves the issue of multiple redundant
- * segments being produced by the transcription algorithm.
+ * Maximum duration of audio that the streaming buffer keeps before forcing
+ * a cleanup. Aligned with Whisper's maximum supported input length.
*/
-constexpr static int32_t kChunkBreakBuffer = 2; // [s]
+constexpr inline float kStreamMaxDuration =
+ static_cast<float>(constants::kChunkSize);
/**
- * Determines the maximum timestamp difference available for a word to be
- * considered as fresh in streaming algorithm.
+ * The minimum amount of recent audio always kept in the buffer when a blind
+ * cut is performed. Acts as the lower bound on what survives a cleanup.
*/
-constexpr static float kStreamFreshThreshold = 3.F; // [s], originally 0.5
+constexpr inline float kStreamSafetyThreshold = 3.F; // [s]
/**
- * The size of the most recent committed suffix searched in
- * fresh words string.
- *
- * For example, if the committed buffer contains ["I", "did" "a" "very" "nasty"
- * "thing."], and kStreamCommitedSuffixSearchSize = 3, then we search for
- * ["very" "nasty" "thing."] suffix.
+ * Forced-cleanup threshold. Once the buffer grows past this duration we run
+ * the EOS-anchored cleanup routine.
*/
-constexpr static size_t kStreamCommitedSuffixSearchSize = 5;
+constexpr inline float kStreamSafeBufferDuration =
+ kStreamMaxDuration - kStreamSafetyThreshold; // [s]
/**
- * Determines the maximum expected size of overlapping fragments between
- * fresh words buffer and commited words buffer in streaming mode.
- *
- * It is a limit of maximum amount of erased repeated words from fresh buffer.
- * The bigger it gets, the less probable it is to commit the same phrase twice.
+ * An estimate of the number of words spoken per second.
+ * Used for estimating transcription progress and buffer management heuristics.
*/
-constexpr static size_t kStreamMaxOverlapSize =
- 12; // Number of overlaping words
+constexpr inline float kWordsPerSecondEstimation = 2.25F;
/**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the first, more strict threshold, used when searching for recently
- * committed entries.
+ * Upper bound for words per second estimate in fast speech.
*/
-constexpr static float kStreamMaxOverlapTimestampDiff1 = 6.F; // [s]
+constexpr inline float kWordsPerSecondHigh = 4.F;
/**
- * Similar to kMaxStreamOverlapSize, but this one determines
- * the maximum allowed timestamp difference between the overlaping fragments.
- *
- * It's the second, more liberal threshold, used in overlap correction
- * algorithm.
+ * Lower bound for words per second estimate in slow speech.
*/
-constexpr static float kStreamMaxOverlapTimestampDiff2 = 15.F; // [s]
+constexpr inline float kWordsPerSecondLow = 1.5F;
/**
- * Number of words per 1 allowed mistake (error correction).
+ * Determines the range of buffer left when skipping an audio chunk
+ * of size smaller than the maximum allowed chunk size.
*
- * For example, if kStreamWordsPerErrorRate = 4, then we allow maximum 1 mistake
- * in a 4 word string.
- */
-constexpr static size_t kStreamWordsPerErrorRate = 5;
-
-/**
- * A threshold which exceeded causes the main streaming audio buffer to be
- * cleared.
- */
-constexpr static float kStreamChunkThreshold = 20.F; // [s]
-
-/**
- * Decides how much of recent audio waveform is always kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMinReserve = 2.F; // [s]
-
-/**
- * Decides how much of recent audio waveform can be kept in when
- * clearing the audio buffer in streaming algorithm.
- */
-constexpr static float kStreamAudioBufferMaxReserve = 6.F; // [s]
-
-/**
- * An estimate of number of words per second produced in a standard
- * human conversation speech.
+ * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
+ * then instead of moving to the last returned timestamp, we jump across the
+ * entire 30-second chunk. This resolves the issue of multiple redundant
+ * segments being produced by the transcription algorithm.
*/
-constexpr static float kStreamWordsPerSecond = 2.5F;
+constexpr inline int32_t kChunkBreakBuffer = 2; // [s]
-} // namespace rnexecutorch::models::speech_to_text::whisper::params
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::params
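Restating how the renamed parameters relate, as a standalone sanity check (values copied from Constants.h and Params.h above):

```cpp
// Standalone restatement of the relationships in Constants.h / Params.h.
constexpr float kChunkSize = 29.0f;              // Whisper input limit [s]
constexpr float kStreamMaxDuration = kChunkSize; // buffer hard cap [s]
constexpr float kStreamSafetyThreshold = 3.0f;   // recent audio kept [s]
constexpr float kStreamSafeBufferDuration =
    kStreamMaxDuration - kStreamSafetyThreshold; // cleanup trigger [s]

static_assert(kStreamSafeBufferDuration == 26.0f,
              "cleanup is forced 3 s before the 29 s model limit");
static_assert(kStreamSafetyThreshold < kStreamSafeBufferDuration,
              "a blind cut must still leave room to keep streaming");
```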
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
index 2e4e3b5076..48c84a84b7 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h
@@ -1,6 +1,7 @@
#pragma once
#include "../common/types/Word.h"
+#include "Constants.h"
#include
#include
#include
@@ -8,70 +9,14 @@
namespace rnexecutorch::models::speech_to_text::whisper::utils {
-// Compares two strings without case-sensitivity.
-inline bool equalsIgnoreCase(const std::string &a, const std::string &b) {
- if (a.size() != b.size()) {
- return false;
- }
- return std::equal(a.begin(), a.end(), b.begin(), [](char c1, char c2) {
- return std::tolower(static_cast<unsigned char>(c1)) ==
- std::tolower(static_cast<unsigned char>(c2));
- });
-}
-
/**
- * Finds the largest (in number of words) overlaping fragment between word
- * vectors A (suffix) and B (prefix).
+ * Checks if the given word represents an End-of-Sentence (EOS) punctuation.
*
- * An overlaping fragment is any fragment C, which can be simultaneously a
- * suffix of A and a prefix of B. Example: A = 'Jane likes food and playing
- * games', B = 'playing games and sleeping', the overlap fragment C = 'playing
- * games'.
- *
- * @param suffixVec An input vector, where only suffixes can overlap.
- * Typically the 'commited' buffer in streaming algorithm.
- * @param preffixVec An input vector, where only prefixes can overlap.
- * Typically the 'fresh' buffer in streaming algorithm.
- * @param maxCheckRange The maximum size of overlapping fragment. Determines the
- * range of search.
- * @param maxTimestampDiff The maximum allowed timestamp difference between
- * overlaping fragments. If exceeded, the fragment are not considered as
- * overlaping.
- * @return The size of the largest found overlaping fragment.
+ * @param word The word to check.
*/
-template <typename Container>
-inline size_t findLargestOverlapingFragment(const Container &suffixVec,
- const Container &prefixVec,
- size_t maxCheckRange = 10,
- float maxTimestampDiff = 100.f) {
- size_t range = std::min({suffixVec.size(), prefixVec.size(), maxCheckRange});
-
- if (range == 0) {
- return 0;
- }
-
- // i starts at the index where the suffix of length 'range' begins.
- for (size_t i = suffixVec.size() - range; i < suffixVec.size(); ++i) {
- // We look for overlap candidates by matching the first word of prefixVec.
- if (equalsIgnoreCase(suffixVec[i].content, prefixVec[0].content)) {
- size_t calculatedSize = suffixVec.size() - i;
-
- bool isEqual =
- std::equal(suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(),
- [maxTimestampDiff](const Word &sWord, const Word &pWord) {
- return equalsIgnoreCase(sWord.content, pWord.content) &&
- std::max(std::fabs(sWord.start - pWord.start),
- std::fabs(sWord.end - pWord.end)) <=
- maxTimestampDiff;
- });
-
- if (isEqual) {
- return calculatedSize;
- }
- }
- }
-
- return 0;
+constexpr inline bool isEos(const Word &word) {
+ return word.content.size() == 1 &&
+ constants::kEosPunctations.contains(word.content[0]);
}
} // namespace rnexecutorch::models::speech_to_text::whisper::utils
\ No newline at end of file
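
For reference, here is the new helper restated in TypeScript terms. The diff does not show the contents of constants::kEosPunctations, so the character set below ('.', '!', '?') is an assumption.

// TypeScript rendition of the isEos helper (punctuation set assumed).
interface Word {
  content: string;
  start: number; // [s]
  end: number; // [s]
}

const EOS_PUNCTUATION = new Set(['.', '!', '?']); // assumed contents

function isEos(word: Word): boolean {
  return word.content.length === 1 && EOS_PUNCTUATION.has(word.content[0]);
}
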
diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts
index 6fb20f9ca3..c423594213 100644
--- a/packages/react-native-executorch/src/constants/modelUrls.ts
+++ b/packages/react-native-executorch/src/constants/modelUrls.ts
@@ -773,32 +773,29 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = {
} as const;
// S2T
-const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_MODEL = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`;
+const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`;
+const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`;
-const WHISPER_TINY_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_quantized_xnnpack.pte`;
+const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`;
+const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`;
-const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_MODEL = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`;
+const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`;
+const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`;
-const WHISPER_BASE_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-base-quantized.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-base-quantized.en/${VERSION_TAG}/xnnpack/whisper_base_en_quantized_xnnpack.pte`;
+const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
+const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
-const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
+const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
+const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
-const WHISPER_SMALL_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-small-quantized.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-small-quantized.en/${VERSION_TAG}/xnnpack/whisper_small_en_quantized_xnnpack.pte`;
-
-const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
-
-const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_MODEL = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`;
-
-const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`;
+const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
+const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
/**
* @category Models - Speech To Text
@@ -806,18 +803,15 @@ const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/
export const WHISPER_TINY_EN = {
modelName: 'whisper-tiny-en',
isMultilingual: false,
- modelSource: WHISPER_TINY_EN_MODEL,
+ modelSource: WHISPER_TINY_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
} as const;
-/**
- * @category Models - Speech To Text
- */
-export const WHISPER_TINY_EN_QUANTIZED = {
- modelName: 'whisper-tiny-en-quantized',
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_TINY_EN_COREML = {
+ modelName: 'whisper-tiny-en',
isMultilingual: false,
- modelSource: WHISPER_TINY_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_TINY_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_TINY_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
} as const;
/**
@@ -826,18 +820,18 @@ export const WHISPER_TINY_EN_QUANTIZED = {
export const WHISPER_BASE_EN = {
modelName: 'whisper-base-en',
isMultilingual: false,
- modelSource: WHISPER_BASE_EN_MODEL,
+ modelSource: WHISPER_BASE_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
} as const;
/**
* @category Models - Speech To Text
*/
-export const WHISPER_BASE_EN_QUANTIZED = {
- modelName: 'whisper-base-en-quantized',
+export const WHISPER_BASE_EN_COREML = {
+ modelName: 'whisper-base-en',
isMultilingual: false,
- modelSource: WHISPER_BASE_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_BASE_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_BASE_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
} as const;
/**
@@ -846,18 +840,18 @@ export const WHISPER_BASE_EN_QUANTIZED = {
export const WHISPER_SMALL_EN = {
modelName: 'whisper-small-en',
isMultilingual: false,
- modelSource: WHISPER_SMALL_EN_MODEL,
+ modelSource: WHISPER_SMALL_EN_MODEL_XNNPACK,
tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
} as const;
/**
* @category Models - Speech To Text
*/
-export const WHISPER_SMALL_EN_QUANTIZED = {
- modelName: 'whisper-small-en-quantized',
+export const WHISPER_SMALL_EN_COREML = {
+ modelName: 'whisper-small-en',
isMultilingual: false,
- modelSource: WHISPER_SMALL_EN_QUANTIZED_MODEL,
- tokenizerSource: WHISPER_SMALL_EN_QUANTIZED_TOKENIZER,
+ modelSource: WHISPER_SMALL_EN_MODEL_COREML,
+ tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
} as const;
/**
@@ -866,7 +860,17 @@ export const WHISPER_SMALL_EN_QUANTIZED = {
export const WHISPER_TINY = {
modelName: 'whisper-tiny',
isMultilingual: true,
- modelSource: WHISPER_TINY_MODEL,
+ modelSource: WHISPER_TINY_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_TINY_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_TINY_COREML = {
+ modelName: 'whisper-tiny',
+ isMultilingual: true,
+ modelSource: WHISPER_TINY_MODEL_COREML,
tokenizerSource: WHISPER_TINY_TOKENIZER,
} as const;
@@ -876,7 +880,17 @@ export const WHISPER_TINY = {
export const WHISPER_BASE = {
modelName: 'whisper-base',
isMultilingual: true,
- modelSource: WHISPER_BASE_MODEL,
+ modelSource: WHISPER_BASE_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_BASE_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_BASE_COREML = {
+ modelName: 'whisper-base',
+ isMultilingual: true,
+ modelSource: WHISPER_BASE_MODEL_COREML,
tokenizerSource: WHISPER_BASE_TOKENIZER,
} as const;
@@ -886,7 +900,17 @@ export const WHISPER_BASE = {
export const WHISPER_SMALL = {
modelName: 'whisper-small',
isMultilingual: true,
- modelSource: WHISPER_SMALL_MODEL,
+ modelSource: WHISPER_SMALL_MODEL_XNNPACK,
+ tokenizerSource: WHISPER_SMALL_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_SMALL_COREML = {
+ modelName: 'whisper-small',
+ isMultilingual: true,
+ modelSource: WHISPER_SMALL_MODEL_COREML,
tokenizerSource: WHISPER_SMALL_TOKENIZER,
} as const;
@@ -1314,14 +1338,17 @@ export const MODEL_REGISTRY = {
STYLE_TRANSFER_UDNIE,
STYLE_TRANSFER_UDNIE_QUANTIZED,
WHISPER_TINY_EN,
- WHISPER_TINY_EN_QUANTIZED,
+ WHISPER_TINY_EN_COREML,
WHISPER_BASE_EN,
- WHISPER_BASE_EN_QUANTIZED,
+ WHISPER_BASE_EN_COREML,
WHISPER_SMALL_EN,
- WHISPER_SMALL_EN_QUANTIZED,
+ WHISPER_SMALL_EN_COREML,
WHISPER_TINY,
+ WHISPER_TINY_COREML,
WHISPER_BASE,
+ WHISPER_BASE_COREML,
WHISPER_SMALL,
+ WHISPER_SMALL_COREML,
DEEPLAB_V3_RESNET50,
DEEPLAB_V3_RESNET101,
DEEPLAB_V3_MOBILENET_V3_LARGE,
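
With the quantized variants replaced by Core ML exports alongside XNNPACK, an app can pick a backend per platform. A hedged sketch using the exports defined above; whether Core ML is preferable on any given iOS device is not established by this diff.

import { Platform } from 'react-native';
import {
  useSpeechToText,
  WHISPER_TINY_EN,
  WHISPER_TINY_EN_COREML,
} from 'react-native-executorch';

function useWhisper() {
  // Core ML export on iOS, XNNPACK export everywhere else.
  const model = Platform.OS === 'ios' ? WHISPER_TINY_EN_COREML : WHISPER_TINY_EN;
  return useSpeechToText({ model });
}
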
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
index 9f428c98b2..5ac929a67f 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useSpeechToText.ts
@@ -5,6 +5,7 @@ import {
SpeechToTextType,
SpeechToTextProps,
TranscriptionResult,
+ StreamingOptions,
} from '../../types/stt';
import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
@@ -104,7 +105,7 @@ export const useSpeechToText = ({
);
const stream = useCallback(
- async function* (options: DecodingOptions = {}): AsyncGenerator<
+ async function* (options: StreamingOptions = {}): AsyncGenerator<
{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
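
A usage sketch of the retyped generator, assumed to run inside a component that already called the hook; the TranscriptionResult fields are not shown in this diff, so the results are logged as-is.

const speechToText = useSpeechToText({ model: WHISPER_TINY_EN });

async function transcribeLive() {
  // StreamingOptions accepts `timeout` in addition to the DecodingOptions fields.
  for await (const { committed, nonCommitted } of speechToText.stream({
    timeout: 150, // [ms] wait between model inferences
  })) {
    console.log('committed:', committed);
    console.log('non-committed:', nonCommitted);
  }
}
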
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
index a1bf6231ad..36464ee964 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts
@@ -2,6 +2,7 @@ import {
DecodingOptions,
SpeechToTextModelConfig,
SpeechToTextModelName,
+ StreamingOptions,
TranscriptionResult,
} from '../../types/stt';
import { ResourceFetcher } from '../../utils/ResourceFetcher';
@@ -177,7 +178,7 @@ export class SpeechToTextModule {
* @yields An object containing `committed` and `nonCommitted` transcription results.
* @returns An async generator yielding transcription updates.
*/
- public async *stream(options: DecodingOptions = {}): AsyncGenerator<{
+ public async *stream(options: StreamingOptions = {}): AsyncGenerator<{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
}> {
@@ -185,6 +186,7 @@ export class SpeechToTextModule {
const verbose = !!options.verbose;
const language = options.language || '';
+ const timeout = options.timeout || 100;
const queue: {
committed: TranscriptionResult;
@@ -219,7 +221,8 @@ export class SpeechToTextModule {
wake();
},
language,
- verbose
+ verbose,
+ timeout
);
finished = true;
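
The timeout is forwarded to the native stream call above; the sketch below is a simplified guess at the pacing loop it controls. The actual scheduling happens on the native side, so both function names here are hypothetical.

const sleep = (ms: number) => new Promise<void>((resolve) => setTimeout(resolve, ms));

// Hypothetical pacing loop: run one inference, then back off for `timeout`
// milliseconds so the recorder can accumulate more audio.
async function paceInferences(
  runInference: () => Promise<void>,
  timeout: number,
  isDone: () => boolean
): Promise<void> {
  while (!isDone()) {
    await runInference();
    await sleep(timeout);
  }
}
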
diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts
index 0a6ed11f70..20f1013ef0 100644
--- a/packages/react-native-executorch/src/types/stt.ts
+++ b/packages/react-native-executorch/src/types/stt.ts
@@ -94,7 +94,7 @@ export interface SpeechToTextType {
* @returns Asynchronous generator that returns `committed` and `nonCommitted` transcription.
* Both `committed` and `nonCommitted` are of type `TranscriptionResult`
*/
- stream(options?: DecodingOptions | undefined): AsyncGenerator<
+ stream(options?: StreamingOptions | undefined): AsyncGenerator<
{
committed: TranscriptionResult;
nonCommitted: TranscriptionResult;
@@ -208,6 +208,15 @@ export interface DecodingOptions {
verbose?: boolean;
}
+/**
+ * Configuration options for the speech-to-text streaming process.
+ * @category Types
+ * @property {number} [timeout] - Specifies (in milliseconds) how long the streamer waits between model inferences.
+ */
+export interface StreamingOptions extends DecodingOptions {
+ timeout?: number;
+}
+
/**
* Structure that represents a single token with timestamp information.
* @category Types
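
Since StreamingOptions extends DecodingOptions, all decoding fields remain valid alongside the new one. A minimal example:

const options: StreamingOptions = {
  language: 'en', // inherited from DecodingOptions
  verbose: false, // inherited from DecodingOptions
  timeout: 200, // [ms] streaming-only field; defaults to 100 when omitted
};
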
diff --git a/yarn.lock b/yarn.lock
index 256469db22..9584660eb7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -15249,6 +15249,24 @@ __metadata:
languageName: node
linkType: hard
+"react-native-audio-api@npm:0.12.2":
+ version: 0.12.2
+ resolution: "react-native-audio-api@npm:0.12.2"
+ dependencies:
+ semver: "npm:^7.7.3"
+ peerDependencies:
+ react: "*"
+ react-native: "*"
+ react-native-worklets: ">= 0.6.0"
+ peerDependenciesMeta:
+ react-native-worklets:
+ optional: true
+ bin:
+ setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+ checksum: 10/ed495058382188c8beb51ce89f2ef14d846dc0c0a07c65a7b4c71aa106fb7ea14aa8660b05fb33941c038d1a7ab2ba4ab3eb039fe481841938c45396903c6060
+ languageName: node
+ linkType: hard
+
"react-native-builder-bob@npm:^0.40.12":
version: 0.40.18
resolution: "react-native-builder-bob@npm:0.40.18"
@@ -16627,7 +16645,7 @@ __metadata:
metro-config: "npm:^0.83.0"
react: "npm:19.2.5"
react-native: "npm:0.83.4"
- react-native-audio-api: "npm:0.12.0"
+ react-native-audio-api: "npm:0.12.2"
react-native-device-info: "npm:^15.0.2"
react-native-executorch: "workspace:*"
react-native-executorch-expo-resource-fetcher: "workspace:*"