diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt
index 84d006eefe..55eda9bfd5 100644
--- a/.cspell-wordlist.txt
+++ b/.cspell-wordlist.txt
@@ -203,3 +203,5 @@ fishjam
 Fishjam
 deinitialize
 Deinitialize
+phonemize
+phonemization
diff --git a/.gitmodules b/.gitmodules
index 31a5d6e4b4..290a297a4c 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -4,3 +4,7 @@
 [submodule "third-party/googletest"]
 	path = third-party/googletest
 	url = https://github.com/google/googletest.git
+[submodule "packages/react-native-executorch/third-party/common/phonemis"]
+	path = packages/react-native-executorch/third-party/common/phonemis
+	url = https://github.com/IgorSwat/Phonemis
+	branch = main
diff --git a/apps/speech/components/ModelPicker.tsx b/apps/speech/components/ModelPicker.tsx
index 5e8284ee9a..9fee51ff34 100644
--- a/apps/speech/components/ModelPicker.tsx
+++ b/apps/speech/components/ModelPicker.tsx
@@ -1,10 +1,12 @@
 import React, { useEffect, useRef, useState } from 'react';
 import {
   Dimensions,
+  Modal,
   ScrollView,
   StyleSheet,
   Text,
   TouchableOpacity,
+  TouchableWithoutFeedback,
   View,
 } from 'react-native';
 
@@ -21,7 +23,7 @@ type Props<T> = {
   disabled?: boolean;
 };
 
-const DROPDOWN_MAX_HEIGHT = 200;
+const DROPDOWN_MAX_HEIGHT = 300;
 
 export function ModelPicker<T>({
   models,
@@ -31,8 +33,11 @@ export function ModelPicker<T>({
   disabled,
 }: Props<T>) {
   const [open, setOpen] = useState(false);
-  const [triggerHeight, setTriggerHeight] = useState(0);
-  const [expandUp, setExpandUp] = useState(false);
+  const [dropdownLayout, setDropdownLayout] = useState({
+    x: 0,
+    y: 0,
+    width: 0,
+  });
   const triggerRef = useRef<React.ComponentRef<typeof TouchableOpacity>>(null);
   const selected = models.find((m) => m.value === selectedModel);
 
@@ -50,23 +55,22 @@ export function ModelPicker<T>({
       (
         _x: number,
         _y: number,
-        _width: number,
+        width: number,
         height: number,
-        _pageX: number,
+        pageX: number,
         pageY: number
       ) => {
-        setTriggerHeight(height);
         const spaceBelow = Dimensions.get('window').height - (pageY + height);
-        setExpandUp(spaceBelow < DROPDOWN_MAX_HEIGHT);
+        // Estimated list height; 42 is presumably one option row's height.
+        const listHeight = Math.min(DROPDOWN_MAX_HEIGHT, models.length * 42);
+        const aboveTop = Math.max(0, pageY - listHeight - 2); // keep on screen
+        const y = spaceBelow >= listHeight ? pageY + height + 2 : aboveTop;
+        setDropdownLayout({ x: pageX, y, width });
         setOpen(true);
       }
     );
   };
 
-  const dropdownPosition = expandUp
-    ? { bottom: triggerHeight + 2 }
-    : { top: triggerHeight + 2 };
-
   return (
     <View style={styles.container}>
       <TouchableOpacity
@@ -80,36 +84,52 @@ export function ModelPicker<T>({
         <Text style={styles.chevron}>{open ? '▲' : '▼'}</Text>
       </TouchableOpacity>
 
-      {open && (
-        <ScrollView
-          style={[styles.dropdown, dropdownPosition]}
-          nestedScrollEnabled
-          keyboardShouldPersistTaps="handled"
-        >
-          {models.map((item) => {
-            const isSelected = item.value === selectedModel;
-            return (
-              <TouchableOpacity
-                key={item.label}
-                style={[styles.option, isSelected && styles.optionSelected]}
-                onPress={() => {
-                  onSelect(item.value);
-                  setOpen(false);
-                }}
-              >
-                <Text
-                  style={[
-                    styles.optionText,
-                    isSelected && styles.optionTextSelected,
-                  ]}
-                >
-                  {item.label}
-                </Text>
-              </TouchableOpacity>
-            );
-          })}
-        </ScrollView>
-      )}
+      <Modal
+        visible={open}
+        transparent
+        animationType="none"
+        onRequestClose={() => setOpen(false)}
+      >
+        <TouchableWithoutFeedback onPress={() => setOpen(false)}>
+          <View style={StyleSheet.absoluteFill}>
+            <ScrollView
+              style={[
+                styles.dropdown,
+                {
+                  position: 'absolute',
+                  top: dropdownLayout.y,
+                  left: dropdownLayout.x,
+                  width: dropdownLayout.width,
+                },
+              ]}
+              keyboardShouldPersistTaps="handled"
+            >
+              {models.map((item) => {
+                const isSelected = item.value === selectedModel;
+                return (
+                  <TouchableOpacity
+                    key={item.label}
+                    style={[styles.option, isSelected && styles.optionSelected]}
+                    onPress={() => {
+                      onSelect(item.value);
+                      setOpen(false);
+                    }}
+                  >
+                    <Text
+                      style={[
+                        styles.optionText,
+                        isSelected && styles.optionTextSelected,
+                      ]}
+                    >
+                      {item.label}
+                    </Text>
+                  </TouchableOpacity>
+                );
+              })}
+            </ScrollView>
+          </View>
+        </TouchableWithoutFeedback>
+      </Modal>
     </View>
   );
 }
@@ -119,7 +139,6 @@ const styles = StyleSheet.create({
     marginHorizontal: 12,
     marginVertical: 4,
     alignSelf: 'stretch',
-    zIndex: 100,
   },
   trigger: {
     flexDirection: 'row',
@@ -151,19 +170,15 @@ const styles = StyleSheet.create({
     marginLeft: 6,
   },
   dropdown: {
-    position: 'absolute',
-    left: 0,
-    right: 0,
     borderWidth: 1,
     borderColor: '#C1C6E5',
     borderRadius: 8,
     backgroundColor: '#fff',
     maxHeight: DROPDOWN_MAX_HEIGHT,
-    zIndex: 100,
-    elevation: 4,
+    elevation: 8,
     shadowColor: '#000',
     shadowOffset: { width: 0, height: 2 },
-    shadowOpacity: 0.1,
+    shadowOpacity: 0.15,
     shadowRadius: 4,
   },
   option: {
diff --git a/apps/speech/package.json b/apps/speech/package.json
index 2beb2cc41d..377e2e5700 100644
--- a/apps/speech/package.json
+++ b/apps/speech/package.json
@@ -20,7 +20,7 @@
     "metro-config": "^0.83.0",
     "react": "19.2.5",
     "react-native": "0.83.4",
-    "react-native-audio-api": "0.12.0",
+    "react-native-audio-api": "0.11.5",
     "react-native-device-info": "^15.0.2",
     "react-native-executorch": "workspace:*",
     "react-native-executorch-expo-resource-fetcher": "workspace:*",
diff --git a/apps/speech/screens/Quiz.tsx b/apps/speech/screens/Quiz.tsx
index 8f03f1ae6d..ae7cf69998 100644
--- a/apps/speech/screens/Quiz.tsx
+++ b/apps/speech/screens/Quiz.tsx
@@ -18,8 +18,7 @@ import Animated, {
 } from 'react-native-reanimated';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
 import {
-  KOKORO_MEDIUM,
-  KOKORO_VOICE_AM_SANTA,
+  KOKORO_AMERICAN_ENGLISH_MALE_SANTA,
   useTextToSpeech,
 } from 'react-native-executorch';
 import {
@@ -60,10 +59,7 @@ const createAudioBufferFromVector = (
 
 export const Quiz = ({ onBack }: { onBack: () => void }) => {
   // --- Hooks & State ---
-  const model = useTextToSpeech({
-    model: KOKORO_MEDIUM,
-    voice: KOKORO_VOICE_AM_SANTA,
-  });
+  const model = useTextToSpeech(KOKORO_AMERICAN_ENGLISH_MALE_SANTA);
 
   const [shuffledQuestions] = useState(() => shuffleArray(QUESTIONS));
   const [currentIndex, setCurrentIndex] = useState(0);
diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx
index e99072869b..d94180096d 100644
--- a/apps/speech/screens/TextToSpeechLLMScreen.tsx
+++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx
@@ -12,8 +12,7 @@ import SWMIcon from '../assets/swm_icon.svg';
 import {
   useLLM,
   useTextToSpeech,
-  KOKORO_MEDIUM,
-  KOKORO_VOICE_AF_HEART,
+  KOKORO_AMERICAN_ENGLISH_FEMALE_HEART,
   LLAMA3_2_1B_QLORA,
 } from 'react-native-executorch';
 import {
@@ -54,10 +53,7 @@ export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => {
   const [displayText, setDisplayText] = useState('');
   const [isTtsStreaming, setIsTtsStreaming] = useState(false);
   const llm = useLLM({ model: LLAMA3_2_1B_QLORA });
-  const tts = useTextToSpeech({
-    model: KOKORO_MEDIUM,
-    voice: KOKORO_VOICE_AF_HEART,
-  });
+  const tts = useTextToSpeech(KOKORO_AMERICAN_ENGLISH_FEMALE_HEART);
 
   const processedLengthRef = useRef(0);
   const audioContextRef = useRef<AudioContext | null>(null);
diff --git a/apps/speech/screens/TextToSpeechScreen.tsx b/apps/speech/screens/TextToSpeechScreen.tsx
index 0cb64bfae7..65b3ca7506 100644
--- a/apps/speech/screens/TextToSpeechScreen.tsx
+++ b/apps/speech/screens/TextToSpeechScreen.tsx
@@ -10,37 +10,52 @@ import {
 } from 'react-native';
 import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context';
 import {
-  KOKORO_SMALL,
-  KOKORO_MEDIUM,
-  KOKORO_VOICE_AF_HEART,
-  KOKORO_VOICE_AF_RIVER,
-  KOKORO_VOICE_AF_SARAH,
-  KOKORO_VOICE_AM_ADAM,
-  KOKORO_VOICE_AM_MICHAEL,
-  KOKORO_VOICE_AM_SANTA,
-  KOKORO_VOICE_BF_EMMA,
-  KOKORO_VOICE_BM_DANIEL,
   useTextToSpeech,
-  KokoroConfig,
-  VoiceConfig,
+  TextToSpeechModelConfig,
+  KOKORO_AMERICAN_ENGLISH_FEMALE_HEART,
+  KOKORO_AMERICAN_ENGLISH_FEMALE_RIVER,
+  KOKORO_AMERICAN_ENGLISH_FEMALE_SARAH,
+  KOKORO_AMERICAN_ENGLISH_MALE_ADAM,
+  KOKORO_AMERICAN_ENGLISH_MALE_MICHAEL,
+  KOKORO_AMERICAN_ENGLISH_MALE_SANTA,
+  KOKORO_BRITISH_ENGLISH_FEMALE_EMMA,
+  KOKORO_BRITISH_ENGLISH_MALE_DANIEL,
+  KOKORO_FRENCH_FEMALE_SIWIS,
+  KOKORO_SPANISH_FEMALE_DORA,
+  KOKORO_SPANISH_MALE_ALEX,
+  KOKORO_ITALIAN_FEMALE_SARA,
+  KOKORO_ITALIAN_MALE_NICOLA,
+  KOKORO_PORTUGUESE_FEMALE_DORA,
+  KOKORO_PORTUGUESE_MALE_SANTA,
+  KOKORO_POLISH_MALE_MATEUSZ,
+  KOKORO_HINDI_FEMALE_ALPHA,
+  KOKORO_HINDI_MALE_OMEGA,
+  KOKORO_HINDI_MALE_PSI,
 } from 'react-native-executorch';
 import { ModelPicker, ModelOption } from '../components/ModelPicker';
 
-const TTS_MODELS: ModelOption<KokoroConfig>[] = [
-  { label: 'Kokoro Small', value: KOKORO_SMALL },
-  { label: 'Kokoro Medium', value: KOKORO_MEDIUM },
+const VOICES: ModelOption<TextToSpeechModelConfig>[] = [
+  { label: '🇺🇸 AF Heart', value: KOKORO_AMERICAN_ENGLISH_FEMALE_HEART },
+  { label: '🇺🇸 AF River', value: KOKORO_AMERICAN_ENGLISH_FEMALE_RIVER },
+  { label: '🇺🇸 AF Sarah', value: KOKORO_AMERICAN_ENGLISH_FEMALE_SARAH },
+  { label: '🇺🇸 AM Adam', value: KOKORO_AMERICAN_ENGLISH_MALE_ADAM },
+  { label: '🇺🇸 AM Michael', value: KOKORO_AMERICAN_ENGLISH_MALE_MICHAEL },
+  { label: '🇺🇸 AM Santa', value: KOKORO_AMERICAN_ENGLISH_MALE_SANTA },
+  { label: '🇬🇧 BF Emma', value: KOKORO_BRITISH_ENGLISH_FEMALE_EMMA },
+  { label: '🇬🇧 BM Daniel', value: KOKORO_BRITISH_ENGLISH_MALE_DANIEL },
+  { label: '🇫🇷 FF Siwis', value: KOKORO_FRENCH_FEMALE_SIWIS },
+  { label: '🇪🇸 EF Dora', value: KOKORO_SPANISH_FEMALE_DORA },
+  { label: '🇪🇸 EM Alex', value: KOKORO_SPANISH_MALE_ALEX },
+  { label: '🇮🇹 IF Sara', value: KOKORO_ITALIAN_FEMALE_SARA },
+  { label: '🇮🇹 IM Nicola', value: KOKORO_ITALIAN_MALE_NICOLA },
+  { label: '🇵🇹 PF Dora', value: KOKORO_PORTUGUESE_FEMALE_DORA },
+  { label: '🇵🇹 PM Santa', value: KOKORO_PORTUGUESE_MALE_SANTA },
+  { label: '🇵🇱 PM Mateusz', value: KOKORO_POLISH_MALE_MATEUSZ },
+  { label: '🇮🇳 HF Alpha', value: KOKORO_HINDI_FEMALE_ALPHA },
+  { label: '🇮🇳 HM Omega', value: KOKORO_HINDI_MALE_OMEGA },
+  { label: '🇮🇳 HM Psi', value: KOKORO_HINDI_MALE_PSI },
 ];
 
-const VOICES: ModelOption<VoiceConfig>[] = [
-  { label: 'AF Heart', value: KOKORO_VOICE_AF_HEART },
-  { label: 'AF River', value: KOKORO_VOICE_AF_RIVER },
-  { label: 'AF Sarah', value: KOKORO_VOICE_AF_SARAH },
-  { label: 'AM Adam', value: KOKORO_VOICE_AM_ADAM },
-  { label: 'AM Michael', value: KOKORO_VOICE_AM_MICHAEL },
-  { label: 'AM Santa', value: KOKORO_VOICE_AM_SANTA },
-  { label: 'BF Emma', value: KOKORO_VOICE_BF_EMMA },
-  { label: 'BM Daniel', value: KOKORO_VOICE_BM_DANIEL },
-];
 import FontAwesome from '@expo/vector-icons/FontAwesome';
 import {
   AudioManager,
@@ -77,16 +92,11 @@ const createAudioBufferFromVector = (
 };
 
 export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => {
-  const [selectedModel, setSelectedModel] =
-    useState<KokoroConfig>(KOKORO_MEDIUM);
-  const [selectedVoice, setSelectedVoice] = useState<VoiceConfig>(
-    KOKORO_VOICE_AF_HEART
+  const [selectedVoice, setSelectedVoice] = useState<TextToSpeechModelConfig>(
+    KOKORO_AMERICAN_ENGLISH_FEMALE_HEART
   );
 
-  const model = useTextToSpeech({
-    model: selectedModel,
-    voice: selectedVoice,
-  });
+  const model = useTextToSpeech(selectedVoice);
 
   const [inputText, setInputText] = useState('');
   const [isPlaying, setIsPlaying] = useState(false);
@@ -94,6 +104,7 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => {
   const [error, setError] = useState<string | null>(null);
 
   const audioContextRef = useRef<AudioContext | null>(null);
+  const gainNodeRef = useRef<any>(null);
   const sourceRef = useRef<AudioBufferSourceNode>(null);
 
   useEffect(() => {
@@ -103,12 +114,20 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => {
       iosOptions: ['defaultToSpeaker'],
     });
 
-    audioContextRef.current = new AudioContext({ sampleRate: 24000 });
-    audioContextRef.current.suspend();
+    const context = new AudioContext({ sampleRate: 24000 });
+    audioContextRef.current = context;
+    context.suspend();
+
+    // Increase the audio volume
+    const gainNode = context.createGain();
+    gainNode.gain.value = 2.0; // Increase volume by 2x
+    gainNode.connect(context.destination);
+    gainNodeRef.current = gainNode;
 
     return () => {
       audioContextRef.current?.close();
       audioContextRef.current = null;
+      gainNodeRef.current = null;
     };
   }, []);
 
@@ -142,7 +161,12 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => {
           const source = (sourceRef.current =
             audioContext.createBufferSource());
           source.buffer = audioBuffer;
-          source.connect(audioContext.destination);
+
+          if (gainNodeRef.current) {
+            source.connect(gainNodeRef.current);
+          } else {
+            source.connect(audioContext.destination);
+          }
 
           source.onEnded = () => resolve();
 
@@ -157,6 +181,7 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => {
 
       await model.stream({
         text: inputText,
+        phonemize: true,
         onNext,
         onEnd,
       });
@@ -197,13 +222,6 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => {
           </View>
           <ErrorBanner message={error} onDismiss={() => setError(null)} />
 
-          <ModelPicker
-            label="Model"
-            models={TTS_MODELS}
-            selectedModel={selectedModel}
-            disabled={model.isGenerating}
-            onSelect={(m) => setSelectedModel(m)}
-          />
           <ModelPicker
             label="Voice"
             models={VOICES}
diff --git a/packages/react-native-executorch/android/CMakeLists.txt b/packages/react-native-executorch/android/CMakeLists.txt
index e7fae6e632..038335f7e2 100644
--- a/packages/react-native-executorch/android/CMakeLists.txt
+++ b/packages/react-native-executorch/android/CMakeLists.txt
@@ -21,6 +21,7 @@ string(APPEND CMAKE_CXX_FLAGS " -DRCT_NEW_ARCH_ENABLED")
 set(ANDROID_CPP_DIR "${CMAKE_SOURCE_DIR}/src/main/cpp")
 set(COMMON_CPP_DIR "${CMAKE_SOURCE_DIR}/../common")
 set(LIBS_DIR "${CMAKE_SOURCE_DIR}/../third-party/android/libs")
+set(COMMON_THIRD_PARTY_DIR "${CMAKE_SOURCE_DIR}/../third-party/common")
 set(TOKENIZERS_DIR "${CMAKE_SOURCE_DIR}/../third-party/include/executorch/extension/llm/tokenizers/include")
 set(INCLUDE_DIR "${CMAKE_SOURCE_DIR}/../third-party/include")
 
diff --git a/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt b/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt
index d7bd1fa870..66a3f19e89 100644
--- a/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt
+++ b/packages/react-native-executorch/android/src/main/cpp/CMakeLists.txt
@@ -88,9 +88,15 @@ endif()
 
 # ------- phonemis -------
 
-set(PHONEMIS_LIBS
-  "${LIBS_DIR}/phonemis/${ANDROID_ABI}/libphonemis.a"
-)
+set(PHONEMIS_DIR "${COMMON_THIRD_PARTY_DIR}/phonemis")
+
+set(BUILD_RUNNER OFF CACHE BOOL "" FORCE)
+set(BUILD_TESTS OFF CACHE BOOL "" FORCE)
+add_subdirectory(${PHONEMIS_DIR} ${CMAKE_BINARY_DIR}/phonemis)
+
+target_compile_definitions(phonemis PRIVATE ET_ON)  # Phonemis uses ET_ON flag to detect available ExecuTorch build (NeuralPhonemizer)
+target_include_directories(phonemis PRIVATE "${INCLUDE_DIR}")  # ExecuTorch headers
+target_include_directories(react-native-executorch PUBLIC "${PHONEMIS_DIR}/src")
 
 # --------------
 
@@ -102,7 +108,7 @@ target_link_libraries(
   ${RN_VERSION_LINK_LIBRARIES}
   ${OPENCV_LIBS}
   ${OPENCV_THIRD_PARTY_LIBS}
-  ${PHONEMIS_LIBS}
+  phonemis
   executorch
   ${EXECUTORCH_LIBS}
   z
diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
index 7fb1387d49..1c29091228 100644
--- a/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
+++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/ModelHostObject.h
@@ -202,15 +202,6 @@ template <typename Model> class ModelHostObject : public JsiHostObject {
       addFunctions(JSI_EXPORT_FUNCTION(
           ModelHostObject<Model>, synchronousHostFunction<&Model::streamInsert>,
           "streamInsert"));
-      addFunctions(
-          JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
-                              promiseHostFunction<&Model::generateFromPhonemes>,
-                              "generateFromPhonemes"));
-
-      addFunctions(
-          JSI_EXPORT_FUNCTION(ModelHostObject<Model>,
-                              promiseHostFunction<&Model::streamFromPhonemes>,
-                              "streamFromPhonemes"));
     }
 
     if constexpr (meta::HasGenerateFromString<Model>) {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp
index ff71d2b536..4603cf6656 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp
@@ -48,10 +48,10 @@ DurationPredictor::DurationPredictor(
       [](const auto &a, const auto &b) { return a.second < b.second; });
 }
 
-std::tuple<Tensor, std::vector<int64_t>, int32_t>
+std::tuple<Tensor, std::vector<int64_t>, int32_t, std::vector<Timestamp>>
 DurationPredictor::generate(std::span<const Token> tokens,
-                            std::span<bool> textMask, std::span<float> ref_hs,
-                            float speed) {
+                            std::span<const bool> textMask,
+                            std::span<const float> ref_hs, float speed) {
   size_t inputSize = tokens.size();
 
   // Perform input shape checks
@@ -78,11 +78,15 @@ DurationPredictor::generate(std::span<const Token> tokens,
   auto tokensTensor =
       make_tensor_ptr({1, static_cast<int32_t>(tokens.size())},
                       const_cast<Token *>(tokens.data()), ScalarType::Long);
+
   auto textMaskTensor =
       make_tensor_ptr({1, static_cast<int32_t>(textMask.size())},
-                      textMask.data(), ScalarType::Bool);
-  auto voiceRefTensor = make_tensor_ptr({1, constants::kVoiceRefHalfSize},
-                                        ref_hs.data(), ScalarType::Float);
+                      const_cast<bool *>(textMask.data()), ScalarType::Bool);
+
+  auto voiceRefTensor =
+      make_tensor_ptr({1, constants::kVoiceRefHalfSize},
+                      const_cast<float *>(ref_hs.data()), ScalarType::Float);
+
   auto speedTensor = make_tensor_ptr({1}, &speed, ScalarType::Float);
 
   // Execute the appropriate "forward_xyz" method, based on given method name
@@ -126,6 +130,10 @@ DurationPredictor::generate(std::span<const Token> tokens,
       indices.begin(),
       std::lower_bound(indices.begin(), indices.end(), originalLength));
 
+  // Calculate timestamps - based on predicted durations.
+  std::vector<Timestamp> timestamps =
+      calculateTimestamps(predDurPtr, inputSize);
+
   /**
    * Returns:
    *   - d: tensor containing the predicted durations for each token.
@@ -133,13 +141,30 @@ DurationPredictor::generate(std::span<const Token> tokens,
    *   - effDuration: an effective duration after post-processing.
    */
   return std::make_tuple(std::move(dTensor), std::move(indices),
-                         std::move(effDuration));
+                         std::move(effDuration), std::move(timestamps));
 }
 
 size_t DurationPredictor::getTokensLimit() const {
   return forwardMethods_.empty() ? 0 : forwardMethods_.back().second;
 }
 
+std::vector<Timestamp>
+DurationPredictor::calculateTimestamps(const int64_t *predDurPtr,
+                                       size_t inputSize) const {
+  // Every token starts exactly where the previous one ended.
+  std::vector<Timestamp> marks;
+  marks.reserve(inputSize);
+
+  size_t cursor = 0; // running offset, in audio samples
+  for (const int64_t predicted : std::span(predDurPtr, inputSize)) {
+    // Scale the predicted duration to audio samples.
+    const int64_t length = predicted * constants::kTicksPerDuration;
+    marks.emplace_back(cursor, cursor + length);
+    cursor += length;
+  }
+  return marks;
+}
+
 void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
                                        int32_t targetDuration) const {
   // We expect durations tensor to be a Long tensor of a shape [1, n_tokens]
@@ -175,7 +200,7 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens,
         shrinking ? std::ceil(scaled) - scaled : scaled - std::floor(scaled);
 
     durationsPtr[i] = static_cast<int64_t>(shrinking ? std::ceil(scaled)
-                                                      : std::floor(scaled));
+                                                     : std::floor(scaled));
     scaledSum += durationsPtr[i];
 
     // Keeps the entries sorted by the remainders
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h
index 0921fd17ac..b932aa07c4 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h
@@ -35,23 +35,28 @@ class DurationPredictor : public BaseModel {
    *                    d - Tensor: predicted durations for each token,
    *                    indices  - std::vector<int64_t>: repeated token indices,
    *                    effDuration  - int32_t: effective duration after
-   *                    post-processing.
+   *                                            post-processing.
+   *                    timestamps - timestamp marks for each token (phoneme)
    */
-  std::tuple<Tensor, std::vector<int64_t>, int32_t>
-  generate(std::span<const Token> tokens, std::span<bool> textMask,
-           std::span<float> ref_hs, float speed = 1.F);
+  std::tuple<Tensor, std::vector<int64_t>, int32_t, std::vector<Timestamp>>
+  generate(std::span<const Token> tokens, std::span<const bool> textMask,
+           std::span<const float> ref_hs, float speed = 1.F);
 
   // Returns maximum supported amount of input tokens.
   size_t getTokensLimit() const;
 
 private:
+  // Helper function - calculating timestamps based on predicted durations
+  std::vector<Timestamp> calculateTimestamps(const int64_t *predDurPtr,
+                                             size_t inputSize) const;
+
   // Helper function - duration scalling
   // Performs integer scaling on the durations tensor to ensure the sum of
   // durations matches the given target duration
   void scaleDurations(Tensor &durations, size_t nTokens,
                       int32_t targetDuration) const;
 
   // Helper function - calculating effective duration based on duration tensor
   // Since we apply padding to the input, the effective duration is
   // usually a little bit lower than the max duration defined by static input
   // size.
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
index ea43f09d47..06366a095c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp
@@ -4,25 +4,39 @@
 
 #include <algorithm>
 #include <fstream>
-#include <phonemis/utilities/string_utils.h>
+#include <phonemis/utils/strings.h>
+#include <phonemis/utils/unicode.h>
 #include <rnexecutorch/Error.h>
+#include <rnexecutorch/Log.h>
 #include <rnexecutorch/data_processing/Sequential.h>
 #include <thread>
 
 namespace rnexecutorch::models::text_to_speech::kokoro {
 
 Kokoro::Kokoro(const std::string &lang, const std::string &taggerDataSource,
-               const std::string &phonemizerDataSource,
+               const std::string &lexiconSource,
+               const std::string &neuralModelSource,
                const std::string &durationPredictorSource,
                const std::string &synthesizerSource,
                const std::string &voiceSource,
                std::shared_ptr<react::CallInvoker> callInvoker)
     : callInvoker_(std::move(callInvoker)),
-      phonemizer_(lang == "en-us"   ? phonemis::Lang::EN_US
-                  : lang == "en-gb" ? phonemis::Lang::EN_GB
-                                    : phonemis::Lang::DEFAULT,
-                  taggerDataSource, phonemizerDataSource),
-      partitioner_(context_),
+      phonemizer_(phonemis::Config{
+          .lang = lang,
+          .tagger = taggerDataSource.empty()
+                        ? std::optional<phonemis::tagger::Config>{}
+                        : std::make_optional(phonemis::tagger::Config{
+                              .data_filepath = taggerDataSource}),
+          .phonemizer =
+              phonemis::phonemizer::Config{
+                  .lang = lang,
+                  .lexicon_filepath = lexiconSource.empty()
+                                          ? std::nullopt
+                                          : std::make_optional(lexiconSource),
+                  .nn_model_filepath =
+                      neuralModelSource.empty()
+                          ? std::nullopt
+                          : std::make_optional(neuralModelSource)}}),
       durationPredictor_(durationPredictorSource, context_, callInvoker_),
       synthesizer_(synthesizerSource, context_, callInvoker_) {
   // Populate the voice array by reading given file
@@ -76,16 +90,29 @@ void Kokoro::loadVoice(const std::string &voiceSource) {
   }
 }
 
-std::vector<float>
-Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) {
-  // Divide the phonemes string into substrings.
-  // Affects the further calculations only in case of string size
-  // exceeding the biggest model's input.
-  auto subsentences =
-      partitioner_.divide<Partitioner::Strategy::TOTAL_TIME>(phonemes);
+std::vector<float> Kokoro::generate(std::u32string input, float speed,
+                                    bool phonemize) {
+  if (input.size() > params::kMaxTextSize) {
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            "Kokoro: maximum input text size exceeded");
+  }
+
+  if (input.empty()) {
+    return {};
+  }
+
+  // G2P (Grapheme to Phoneme) conversion
+  auto phonemes = phonemize ? phonemizer_(input) : input;
+
+  // Divide the phonemes string into substrings, minimizing the amount of
+  // breaks.
+  auto partition = partitioner_.partition(phonemes, context_.inputTokensLimit,
+                                          Partitioner::Mode::MIN_BREAKS);
 
   std::vector<float> audio = {};
-  for (const auto &subsentence : subsentences) {
+  for (const auto &[offset, length] : partition.segments) {
+    auto subsentence = partition.content.substr(offset, length);
+
     // Generate an audio vector with the Kokoro model
     auto audioPart = synthesize(subsentence, speed);
 
@@ -94,6 +121,7 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) {
     size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
                          ? params::kPauseValues.at(lastPhoneme)
                          : params::kDefaultPause;
+
     // Add audio part and silence pause to the main audio vector
     audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()),
                  std::make_move_iterator(audioPart.end()));
@@ -104,8 +132,9 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) {
   return audio;
 }
 
-void Kokoro::streamFromPhonemesImpl(const std::u32string &phonemes, float speed,
-                                    std::shared_ptr<jsi::Function> callback) {
+void Kokoro::stream(std::shared_ptr<jsi::Function> callback, float speed,
+                    bool phonemize, bool stopOnEmptyBuffer) {
+  // Create a callback
   auto nativeCallback = [this, callback](const std::vector<float> &audioVec) {
     if (this->isStreaming_) {
       this->callInvoker_->invokeAsync(
@@ -116,70 +145,6 @@ void Kokoro::streamFromPhonemesImpl(const std::u32string &phonemes, float speed,
     }
   };
 
-  // Use LATENCY strategy to minimize the time-to-first-audio for streaming
-  auto subsentences =
-      partitioner_.divide<Partitioner::Strategy::LATENCY>(phonemes);
-
-  for (size_t i = 0; i < subsentences.size(); i++) {
-    if (!isStreaming_) {
-      break;
-    }
-
-    const auto &subsentence = subsentences[i];
-
-    // Determine the silent padding duration to be stripped from the edges of
-    // the generated audio. If a chunk ends with a space or follows one that
-    // did, it indicates a word boundary split – we use a shorter padding
-    // to ensure natural speech flow. Otherwise, we use 50ms for standard
-    // pauses.
-    bool endsWithSpace = (subsentence.back() == U' ');
-    bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' ');
-    size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms]
-
-    // Generate an audio vector with the Kokoro model
-    auto audioPart = synthesize(subsentence, speed, paddingMs);
-
-    // Calculate and append a pause between the sentences
-    char32_t lastPhoneme = subsentence.back();
-    size_t pauseMs = params::kPauseValues.contains(lastPhoneme)
-                         ? params::kPauseValues.at(lastPhoneme)
-                         : params::kDefaultPause;
-    audioPart.resize(
-        audioPart.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F);
-
-    // Push the audio right away to the JS side
-    nativeCallback(std::move(audioPart));
-  }
-}
-
-std::vector<float> Kokoro::generate(std::string text, float speed) {
-  if (text.size() > params::kMaxTextSize) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
-                            "Kokoro: maximum input text size exceeded");
-  }
-
-  if (text.empty()) {
-    return {};
-  }
-
-  // G2P (Grapheme to Phoneme) conversion
-  auto phonemes = phonemizer_.process(text);
-
-  return generateFromPhonemesImpl(phonemes, speed);
-}
-
-std::vector<float> Kokoro::generateFromPhonemes(std::string phonemes,
-                                                float speed) {
-  if (phonemes.empty()) {
-    return {};
-  }
-
-  return generateFromPhonemesImpl(
-      phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed);
-}
-
-void Kokoro::stream(float speed, bool stopOnEmptyBuffer,
-                    std::shared_ptr<jsi::Function> callback) {
   isStreaming_ = true;
   stopOnEmptyBuffer_ = stopOnEmptyBuffer;
 
@@ -187,11 +152,16 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer,
   // The extracted text is then passed to the inner loop, which performs a
   // standard streaming on a fixed amount of input text.
   while (isStreaming_) {
-    std::string text;
+    std::u32string input;
 
     // Extract the code relying on input buffer for a separate mutex lock
     // section.
     {
       std::scoped_lock<std::mutex> lock(inputTextBufferMutex_);
+      // Trim trailing whitespace characters. This must happen under the lock:
+      // streamInsert() appends to this buffer while holding the same mutex.
+      inputTextBuffer_ =
+          phonemis::utils::strings::strip<std::u32string, char32_t>(
+              inputTextBuffer_);
       if (inputTextBuffer_.empty() && stopOnEmptyBuffer_) {
         break;
@@ -212,7 +182,7 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer,
       // chunks which end in the middle of a sentence.
       if (chunkSize > 0 ||
           streamSkippedIterations >= params::kStreamMaxSkippedIterations) {
-        text = inputTextBuffer_.substr(0, chunkSize);
+        input = inputTextBuffer_.substr(0, chunkSize);
         inputTextBuffer_.erase(0, chunkSize);
         streamSkippedIterations = 0;
       } else {
@@ -220,10 +190,93 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer,
       }
     }
 
-    if (!text.empty()) {
+    if (!input.empty()) {
       // Now we proceed with a standard streaming logic for fixed-size input.
-      auto phonemes = phonemizer_.process(text);
-      streamFromPhonemesImpl(phonemes, speed, callback);
+      // Start with preprocessing the input once.
+      std::u32string buffer = phonemizer_.preprocess(input);
+
+      // A variable to keep the information about phonemized (but not
+      // synthesized) tokens from the previous iteration.
+      size_t phonemizedTokens = 0;
+
+      while (!buffer.empty() && isStreaming_) {
+        // Since we do not phonemize the entire input before partitioning, there
+        // is a possibility that some segment might exceed the token limit after
+        // phonemization. This is being handled later.
+        auto partition = partitioner_.partition(
+            buffer, context_.inputTokensLimit, Partitioner::Mode::MIN_LATENCY);
+
+        for (size_t i = 0; i < partition.segments.size(); i++) {
+          if (!isStreaming_) {
+            break;
+          }
+
+          const auto &[offset, length] = partition.segments[i];
+          const auto subsentence = partition.content.substr(0, length);
+
+          std::u32string phonemes;
+
+          if (phonemize) {
+            size_t unchangedLength = std::min(length, phonemizedTokens);
+            // Include trailing space if it was already phonemized
+            if (unchangedLength < length &&
+                subsentence[unchangedLength] == U' ' &&
+                phonemizedTokens > unchangedLength) {
+              unchangedLength++;
+            }
+
+            // We phonemize on the fly - meaning no time is wasted
+            // phonemizing the entire input if we only need one segment at a
+            // time.
+            phonemes = subsentence.substr(0, unchangedLength);
+            if (unchangedLength < length) {
+              // Phonemize without preprocessing (since we already did that).
+              phonemes +=
+                  phonemizer_(subsentence.substr(unchangedLength), false);
+            }
+          } else {
+            // Simple case - no phonemization, no risk of exceeding the token
+            // limit.
+            phonemes = subsentence;
+          }
+
+          if (phonemes.size() <= context_.inputTokensLimit - 2) {
+            // Determine the silent padding duration
+            bool endsWithSpace = (subsentence.back() == U' ');
+            bool prevEndsWithSpace =
+                (offset > 0 && partition.content[offset - 1] == U' ');
+            size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50;
+
+            // Generate and push audio
+            auto audioPart = synthesize(phonemes, speed, paddingMs);
+
+            size_t pauseMs = params::kPauseValues.contains(phonemes.back())
+                                 ? params::kPauseValues.at(phonemes.back())
+                                 : params::kDefaultPause;
+
+            audioPart.resize(audioPart.size() +
+                                 pauseMs * constants::kSamplesPerMilisecond,
+                             0.F);
+
+            nativeCallback(std::move(audioPart));
+
+            // Remove processed segment from buffer.
+            // Since we process it from left to right, we expect the segment to
+            // be at the beginning of the buffer.
+            buffer.erase(0, length);
+            phonemizedTokens = std::max(phonemizedTokens, length) - length;
+          } else {
+            // Length exceeds limit. Replace the sentence in buffer with its
+            // phonemization.
+            if (phonemize) {
+              buffer.replace(0, length, phonemes);
+            }
+            phonemizedTokens = phonemes.size();
+
+            break;
+          }
+        }
+      }
     }
 
     // A little bit of pause to not overload the thread.
@@ -241,86 +294,97 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer,
   }
 }
 
-void Kokoro::streamFromPhonemes(std::string phonemes, float speed,
-                                std::shared_ptr<jsi::Function> callback) {
-  if (phonemes.empty()) {
-    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
-                            "Kokoro: phoneme string must not be empty");
-  }
-
-  isStreaming_ = true;
-  streamFromPhonemesImpl(
-      phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed,
-      callback);
-  isStreaming_ = false;
-}
-
-void Kokoro::streamInsert(std::string textChunk) noexcept {
-  std::scoped_lock<std::mutex> lock(inputTextBufferMutex_);
-  inputTextBuffer_.append(textChunk);
-}
-
-void Kokoro::streamStop(bool instant) noexcept {
-  if (instant) {
-    isStreaming_ = false;
-  } else {
-    stopOnEmptyBuffer_ = true;
-  }
-}
-
-std::vector<float> Kokoro::synthesize(const std::u32string &phonemes,
-                                      float speed, size_t paddingMs) {
+std::vector<float> Kokoro::synthesize(std::u32string_view phonemes, float speed,
+                                      size_t paddingMs) {
   if (phonemes.empty()) {
     return {};
   }
 
-  // Clamp the input to not go beyond number of input token limits
-  // Note that 2 tokens are always reserved for pre- and post-fix padding,
-  // so we effectively take at most (maxNoInputTokens_ - 2) tokens.
-  size_t noTokens = std::clamp(phonemes.size() + 2, constants::kMinInputTokens,
-                               context_.inputTokensLimit);
+  // Remove leading whitespace if exists.
+  if (phonemes.front() == U' ') {
+    phonemes = phonemes.substr(1);
+  }
 
-  // Map phonemes to tokens
+  // 1. Prepare input tokens.
+  // Clamp input to avoid exceeding model limits (2 tokens reserved for pre/post
+  // padding).
+  const size_t noTokens =
+      std::clamp(phonemes.size() + 2, constants::kMinInputTokens,
+                 context_.inputTokensLimit);
   const auto tokens = utils::tokenize(phonemes, {noTokens});
 
-  // Select the appropriate voice vector
-  size_t voiceID =
-      std::min({phonemes.size() - 1, noTokens - 1, voice_.size() - 1});
-  auto &voice = voice_[voiceID];
-
-  // Initialize text mask
-  // Exclude all the paddings apart from first and last one.
-  size_t realInputLength = std::min(phonemes.size() + 2, noTokens);
+  // 2. Initialize text mask.
+  // Exclude all paddings except the first and last ones.
+  // We use uint8_t instead of bool to avoid boolean span issues.
   std::vector<uint8_t> textMask(noTokens, false);
-  std::fill(textMask.begin(), textMask.begin() + realInputLength, true);
-
-  // Inference 1 - DurationPredictor
-  // The resulting duration vector is already scalled at this point
-  auto [d, indices, effectiveDuration] = durationPredictor_.generate(
-      std::span(tokens),
-      std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
-      std::span(voice).last(constants::kVoiceRefHalfSize), speed);
+  std::fill(textMask.begin(),
+            textMask.begin() + std::min(phonemes.size() + 2, noTokens), true);
 
-  // Inference 2 - Synthesizer
+  // 3. Select the appropriate voice vector.
+  // Each number of input tokens corresponds to a different voice embedding
+  // vector.
+  const size_t voiceID =
+      std::min({phonemes.size() - 1, noTokens - 1, voice_.size() - 1});
+  const auto &voice = voice_[voiceID];
+
+  // 4. Inference Phase 1: DurationPredictor (submodule).
+  auto [d, indices, effectiveDuration, timestamps] =
+      durationPredictor_.generate(
+          std::span(tokens),
+          std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
+          std::span(voice).last(constants::kVoiceRefHalfSize), speed);
+
+  // 5. Inference Phase 2: Synthesizer.
+  // Note that we reduce the size of the duration tensor to match the number of
+  // tokens.
   auto decoding = synthesizer_.generate(
       std::span(tokens),
       std::span(reinterpret_cast<bool *>(textMask.data()), textMask.size()),
       std::span(indices),
-      // Note that we reduce the size of d tensor to match the initial number of
-      // input tokens
       std::span<float>(d.mutable_data_ptr<float>(),
                        noTokens * d.sizes().back()),
       std::span(voice));
-  auto audioTensor = decoding->at(0).toTensor();
 
-  // Cut the resulting audio vector according to the effective duration
-  int32_t effLength = constants::kTicksPerDuration * effectiveDuration;
+  // 6. Post-processing: Finalize audio.
+  const auto audioTensor = decoding->at(0).toTensor();
+  const int32_t audioLength = constants::kTicksPerDuration * effectiveDuration;
+
   auto audio =
-      std::span<const float>(audioTensor.const_data_ptr<float>(), effLength);
-  auto croppedAudio =
+      std::span<const float>(audioTensor.const_data_ptr<float>(), audioLength);
+
+  // To counter any potential trailing voice artifacts (which can occur due to
+  // slight mismatch of .pte model results) we cut it according to the predicted
+  // duration ticks.
+  if (noTokens > 2) {
+    // We want to skip both the last PAD token, as well as any potential EOS
+    // token just before it.
+    auto lastTokenTimestamp =
+        !phonemis::utils::unicode::isalpha(phonemes.back())
+            ? timestamps[noTokens - 3].end
+            : timestamps[noTokens - 2].end;
+
+    audio = audio.subspan(0, lastTokenTimestamp);
+  }
+
+  // Now additionally strip the (hopefully) pure silence.
+  audio =
       utils::stripAudio(audio, paddingMs * constants::kSamplesPerMilisecond);
 
-  return {croppedAudio.begin(), croppedAudio.end()};
+  return {audio.begin(), audio.end()};
+}
+
+void Kokoro::streamInsert(std::u32string chunk) noexcept {
+  rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "Inserting data");
+  std::scoped_lock<std::mutex> lock(inputTextBufferMutex_);
+  inputTextBuffer_.append(chunk);
+}
+
+void Kokoro::streamStop(bool instant) noexcept {
+  if (instant) {
+    isStreaming_ = false;
+  } else {
+    stopOnEmptyBuffer_ = true;
+  }
 }
 
 std::size_t Kokoro::getMemoryLowerBound() const noexcept {
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
index e33631af61..adf736bd28 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h
@@ -11,7 +11,7 @@
 #include "Partitioner.h"
 #include "Synthesizer.h"
 #include "Types.h"
-#include <phonemis/pipeline.h>
+#include <phonemis/base/pipeline.h>
 #include <rnexecutorch/metaprogramming/ConstructorHelpers.h>
 
 namespace rnexecutorch {
@@ -20,49 +20,51 @@ namespace models::text_to_speech::kokoro {
 class Kokoro {
 public:
   Kokoro(const std::string &lang, const std::string &taggerDataSource,
-         const std::string &phonemizerDataSource,
+         const std::string &lexiconSource, const std::string &neuralModelSource,
          const std::string &durationPredictorSource,
          const std::string &synthesizerSource, const std::string &voiceSource,
          std::shared_ptr<react::CallInvoker> callInvoker);
 
   /**
-   * Processes the entire text at once, before sending back to the JS side.
-   */
-  std::vector<float> generate(std::string text, float speed = 1.F);
-
-  /**
-   * Similar to generate(), but accepts pre-computed phonemes (as a UTF-8 IPA
-   * string) and synthesizes audio, bypassing the built-in phonemizer.
+   * Generates complete audio for the provided text.
+   *
+   * @param text The input to be synthesized - either a raw text or IPA
+   * phonemes.
+   * @param speed Playback speed multiplier (default: 1.0).
+   * @param phonemize Optional, if set to false disables the phonemization and
+   * operates on raw input.
+   * @return A vector of PCM float samples representing the synthesized speech.
    */
-  std::vector<float> generateFromPhonemes(std::string phonemes,
-                                          float speed = 1.F);
+  std::vector<float> generate(std::u32string input, float speed = 1.F,
+                              bool phonemize = true);
 
   /**
-   * Processes text from inputTextBuffer_ in chunks, sending each chunk
-   * individualy to the JS side with asynchronous callbacks.
+   * Starts an asynchronous streaming process that processes text in chunks.
+   * The internal buffer can be expanded during streaming using `streamInsert`.
    *
-   * Allows an incrementally expanded input by using an input text buffer.
+   * @param callback A JSI function called with each generated audio chunk
+   * (std::vector<float>).
+   * @param speed Playback speed multiplier.
+   * @param phonemize Optional, if set to false disables the phonemization and
+   * operates on raw input.
+   * @param stopOnEmptyBuffer If true, streaming terminates automatically when
+   * the buffer is exhausted.
    */
-  void stream(float speed, bool stopOnEmptyBuffer,
-              std::shared_ptr<jsi::Function> callback);
-
-  // Streaming variant that accepts pre-computed phonemes instead of text.
-  void streamFromPhonemes(std::string phonemes, float speed,
-                          std::shared_ptr<jsi::Function> callback);
+  void stream(std::shared_ptr<jsi::Function> callback, float speed = 1.F,
+              bool phonemize = true, bool stopOnEmptyBuffer = false);
 
   /**
-   * Updates the input streaming buffer by adding more text to be processed.
+   * Appends new input data (either text or phonemes) to the buffer.
    *
-   * @param text A new chunk of text, appended to the end of the input buffer.
+   * @param chunk A text/phonemes chunk to be added to the streaming buffer.
    */
-  void streamInsert(std::string textChunk) noexcept;
+  void streamInsert(std::u32string chunk) noexcept;
 
   /**
-   * Stops the streaming process.
+   * Signals the streaming process to stop.
    *
-   * @param instant If true, stops the streaming as soon as possible by
-   * switching the isStreaming_ flag. Otherwise allows to process the rest of
-   * the buffer first, by switching the stopOnEmptyBuffer_ flag.
+   * @param instant If true, stops immediately, discarding remaining buffered
+   * text. If false, finishes processing the current buffer before stopping.
    */
   void streamStop(bool instant) noexcept;
 
@@ -70,38 +72,32 @@ class Kokoro {
   void unload() noexcept;
 
 private:
-  // Helper function - loading voice array
+  // --- Initialization & Core Inference ---
   void loadVoice(const std::string &voiceSource);
-
-  // Helper function - shared synthesis pipeline (partition + synthesize)
-  std::vector<float> generateFromPhonemesImpl(const std::u32string &phonemes,
-                                              float speed);
-  void streamFromPhonemesImpl(const std::u32string &phonemes, float speed,
-                              std::shared_ptr<jsi::Function> callback);
-
-  // Helper function - generate specialization for given input size
-  std::vector<float> synthesize(const std::u32string &phonemes, float speed,
+  std::vector<float> synthesize(std::u32string_view phonemes, float speed,
                                 size_t paddingMs = 50);
 
-  // JS callback handle
+  // --- External Dependencies ---
   std::shared_ptr<react::CallInvoker> callInvoker_;
 
-  // Shared model context
+  // --- Model context ---
   Context context_;
 
-  // Submodules - arranged in order of their appearence in the model's pipeline
+  // --- Model Components ---
+  // Arranged in order of appearance in the generation pipeline
   phonemis::Pipeline phonemizer_;
   Partitioner partitioner_;
   DurationPredictor durationPredictor_;
   Synthesizer synthesizer_;
 
-  // Voice array — dynamically sized to match the voice file.
-  // Each row is a style vector for a given input token count.
+  // --- Data Buffers ---
+  // Voice embeddings: Each row is a style vector for a given input token count
   std::vector<std::array<float, constants::kVoiceRefSize>> voice_;
-
-  // Streaming state control variables
-  std::string inputTextBuffer_;
+  // Streaming buffer
+  std::u32string inputTextBuffer_;
   mutable std::mutex inputTextBufferMutex_;
+
+  // --- Streaming control State ---
   std::atomic<bool> isStreaming_{false};
   std::atomic<bool> stopOnEmptyBuffer_{true};
   int32_t streamSkippedIterations = 0;
@@ -110,5 +106,7 @@ class Kokoro {
 
 REGISTER_CONSTRUCTOR(models::text_to_speech::kokoro::Kokoro, std::string,
                      std::string, std::string, std::string, std::string,
-                     std::string, std::shared_ptr<react::CallInvoker>);
+                     std::string, std::string,
+                     std::shared_ptr<react::CallInvoker>);
+
 } // namespace rnexecutorch
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
index f517db0318..d8b14f4caf 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h
@@ -39,8 +39,8 @@ inline constexpr int32_t kStreamPause = 200;
  * (ms).
  */
 inline const std::unordered_map<char32_t, int32_t> kPauseValues = {
-    {U'.', 250}, {U'?', 350}, {U'!', 180}, {U';', 300},
-    {U'…', 500}, {U',', 125}, {U':', 175}, {U'-', 175}}; // [ms]
+    {U'.', 375}, {U'?', 500}, {U'!', 250}, {U';', 400},
+    {U'…', 600}, {U',', 130}, {U':', 250}, {U'-', 200}}; // [ms]
 
 /**
  * A default pause applied after a sentence finished with a character other
@@ -54,43 +54,24 @@ namespace cropping {
  * The audio cropping algorithm is a moving average variant.
  * This value controls the number of steps in moving average.
  */
-inline constexpr uint32_t kAudioCroppingSteps = 20;
+inline constexpr uint32_t kAudioCroppingSteps = 10;
 
 /**
  * Determines silent audio fragments in audio cropping algorithm.
  * The audio fragment is considered as a silence, if the moving average with K
  * steps does not exceed this threshold.
  */
-inline constexpr float kAudioSilenceThreshold = 0.01F;
+inline constexpr float kAudioSilenceThreshold = 0.005F;
 } // namespace cropping
 
 // Partitioning related hyperparameters
 namespace partitioning {
-/**
- * A penalty for dividing text on end of sentence character (like . or !).
- */
-inline constexpr int64_t kEosPenalty = 5;
-
-/**
- * A penalty for dividing text on pause character (like , or -).
- */
-inline constexpr int64_t kPausePenalty = 18;
-
-/**
- * A penalty for dividing text in the middle of sentence -
- * in other words, on white character.
- *
- * We want to avoid splitting the text between two words with no pause
- * as much as possible, since it kills the naturalness of the speech.
- */
-inline constexpr int64_t kWhitePenalty = 1000;
-
 /**
  * Used in latency-focused partitioning variant. Decides on
  * how much more are big latencies in the beginning phase of
  * an input text penalized.
  */
-inline constexpr int32_t kTokenDiscountFactor = 1;
+inline constexpr int64_t kTokenDiscountFactor = 1;
 
 /**
  * Used in latency-focused partitioning variant. Decides on
@@ -99,7 +80,7 @@ inline constexpr int32_t kTokenDiscountFactor = 1;
  * For example, using kTokenDiscountRange = 128 means that after reaching
  * 128 tokens, the latency is completely omited and not penalized.
  */
-inline constexpr int32_t kTokenDiscountRange = 128;
+inline constexpr int64_t kTokenDiscountRange = 128;
 } // namespace partitioning
 
 } // namespace rnexecutorch::models::text_to_speech::kokoro::params
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp
index 4dc55ade12..6ff0bb3ca1 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp
@@ -1,10 +1,10 @@
 #include "Partitioner.h"
 #include "Constants.h"
 #include "Params.h"
+
 #include <algorithm>
-#include <functional>
-#include <queue>
-#include <rnexecutorch/Error.h>
+#include <deque>
+#include <limits>
 
 namespace rnexecutorch::models::text_to_speech::kokoro {
 
@@ -13,117 +13,118 @@ using namespace params::partitioning;
 // Custom infinity definition
 constexpr Partitioner::Cost INF = 1e7;
 
-template <>
-std::vector<std::u32string>
-Partitioner::divide<Partitioner::Strategy::TOTAL_TIME>(
-    const std::u32string &phonemes) {
-  return divide(phonemes,
-                [this](Cost prevCost, int32_t rangeBegin, int32_t prevBp,
-                       int32_t currBp, int32_t rangeEnd) {
-                  if (rangeEnd - currBp - 1 > context_.inputTokensLimit)
-                    return INF;
-
-                  // Simply cumulate the costs for both subranges
-                  return prevCost + static_cast<Cost>(rangeEnd - currBp - 1);
-                });
-}
+Partitioner::Partition Partitioner::partition(std::u32string_view input,
+                                              size_t limit, Mode mode) const {
+  if (mode == Mode::MIN_BREAKS) {
+    return partition(input, limit,
+                     [limit](Cost acc, size_t beg, int64_t prevBp, int64_t bp,
+                             size_t end, Separator sep) -> Cost {
+                       if (end - bp > limit) {
+                         return INF;
+                       }
+
+                       Cost sepPenalty = sep == Separator::EOS     ? 1
+                                         : sep == Separator::PAUSE ? 3
+                                         : sep == Separator::WHITE ? 1000
+                                                                   : 0;
+
+                       return acc + sepPenalty + static_cast<Cost>(end - bp);
+                     });
+  }
 
-template <>
-std::vector<std::u32string> Partitioner::divide<Partitioner::Strategy::LATENCY>(
-    const std::u32string &phonemes) {
-  return divide(phonemes, [this](Cost prevCost, int32_t rangeBegin,
-                                 int32_t prevBp, int32_t currBp,
-                                 int32_t rangeEnd) {
-    if (rangeEnd - currBp - 1 > context_.inputTokensLimit)
-      return INF;
-
-    // Estimate the latency (simple linear difference between the rightmost
-    // subranges)
-    int32_t latency = std::max(0, (rangeEnd - currBp) - (currBp - prevBp));
-
-    // Estimate the discount factor (the further we go, the less we care about
-    // the latency)
-    int32_t discount =
-        kTokenDiscountFactor * std::max(0, kTokenDiscountRange - currBp - 1);
-
-    return prevCost +
-           static_cast<Cost>(latency * discount / kTokenDiscountRange);
-  });
+  if (mode == Mode::MIN_LATENCY) {
+    return partition(
+        input, limit,
+        [limit](Cost acc, size_t beg, int64_t prevBp, int64_t bp, size_t end,
+                Separator sep) -> Cost {
+          if (end - bp > limit) {
+            return INF;
+          }
+
+          Cost sepPenalty = sep == Separator::EOS     ? 5
+                            : sep == Separator::PAUSE ? 18
+                            : sep == Separator::WHITE ? 1000
+                                                      : 0;
+
+          int64_t rightmostRangeLength = end - bp;
+          int64_t prevRangeLength = bp - prevBp;
+
+          int64_t latency = std::max(static_cast<int64_t>(0),
+                                     rightmostRangeLength - prevRangeLength);
+          int64_t discount =
+              kTokenDiscountFactor *
+              std::max(static_cast<int64_t>(0), kTokenDiscountRange - bp - 1);
+
+          return acc +
+                 static_cast<Cost>(latency * discount / kTokenDiscountRange) +
+                 sepPenalty;
+        });
+  }
+
+  return {input, {}};
 }
 
-// Helper function - partitioning
-// A template which is controled by concrete operator instead of
-// an abstract Strategy argument.
-// Utilizes dynamic programming approach for finding the
-// optimal solution.
-std::vector<std::u32string> Partitioner::divide(
-    const std::u32string &phonemes,
-    const std::function<Cost(Cost, int32_t, int32_t, int32_t, int32_t)>
-        &costFn) {
-  // DP array
-  // (cost, prev_breakpoint_idx) pairs
-  std::vector<std::pair<Cost, int32_t>> mem(phonemes.size(), {INF, -1});
-
-  // Keep the potential break point indices to speed up the calculation.
-  std::deque<int32_t> eosPoints, pausePoints, whitePoints;
-
-  for (int32_t i = 0; i < phonemes.size(); i++) {
-    auto &[estimation, prevBreakIdx] = mem[i];
-
-    // We assume that phonemes[i] is the last character of currently analyzed
-    // substring. First, estimate for the entire substring without further
-    // division.
-    estimation = costFn(0, 0, -1, -1, i + 1);
-
-    // Now, try to divide into 2 substring and utilize already calculated values
-    // for left-side substring.
+Partitioner::Partition Partitioner::partition(std::u32string_view input,
+                                              size_t limit,
+                                              CostFn costFn) const {
+  if (input.empty()) {
+    return {input, {}};
+  }
+
+  size_t n = input.size();
+  std::vector<std::pair<Cost, int64_t>> dp(n, {INF, -1});
+
+  std::deque<size_t> eosPoints, pausePoints, whitePoints;
+
+  for (size_t i = 0; i < n; ++i) {
+    auto &[bestCost, prevBpIdx] = dp[i];
+    // Baseline: no break, a single segment [0, i] (end index is inclusive).
+    bestCost = costFn(0, 0, -1, -1, i, Separator::NO_SEP);
+
     for (auto *q : {&eosPoints, &pausePoints, &whitePoints}) {
-      // First, clear the queus from useless entries (out of even largest model
-      // bounds).
-      while (!q->empty() && q->front() + context_.inputTokensLimit < i) {
+      while (!q->empty() && q->front() + limit < i) {
         q->pop_front();
       }
 
-      // Now iterate through the reimaining positions.
-      Cost penalty = q == &eosPoints     ? kEosPenalty
-                     : q == &pausePoints ? kPausePenalty
-                                         : kWhitePenalty;
-      for (int32_t breakIdx : (*q)) {
-        Cost newEstimation = costFn(mem[breakIdx].first, 0,
-                                    mem[breakIdx].second, breakIdx, i + 1) +
-                             penalty;
-        if (newEstimation < estimation && breakIdx > 0) {
-          estimation = newEstimation;
-          prevBreakIdx = breakIdx;
+      Separator sep = q == &eosPoints     ? Separator::EOS
+                      : q == &pausePoints ? Separator::PAUSE
+                                          : Separator::WHITE;
+      for (size_t breakIdx : (*q)) {
+        Cost cost = costFn(dp[breakIdx].first, 0, dp[breakIdx].second, breakIdx,
+                           i, sep);
+        if (cost < bestCost && breakIdx > 0) {
+          bestCost = cost;
+          prevBpIdx = breakIdx;
         }
       }
     }
 
-    // Add current phoneme to the appropriate queue.
-    char32_t phoneme = phonemes[i];
-    if (constants::kEndOfSentencePhonemes.contains(phoneme)) {
+    char32_t c = input[i];
+    if (constants::kEndOfSentencePhonemes.contains(c)) {
       eosPoints.push_back(i);
-    } else if (constants::kPausePhonemes.contains(phoneme)) {
+    } else if (constants::kPausePhonemes.contains(c)) {
       pausePoints.push_back(i);
-    } else if (phoneme < 256 && std::isspace(static_cast<char>(phoneme))) {
+    } else if (c < 256 && std::isspace(static_cast<unsigned char>(c))) {
       whitePoints.push_back(i);
     }
   }
 
-  std::vector<std::u32string> result = {};
+  std::vector<std::pair<size_t, size_t>> segments;
+  int64_t currBp = dp[n - 1].second;
+  size_t lastIdx = n;
 
-  // Perform backtracking to obtain all the substrings.
-  // Note that because of backtracking, the order is reversed.
-  int32_t end = phonemes.size() - 1;
-  while (end != -1) {
-    int32_t begin = mem[end].second + 1;
-    result.push_back(phonemes.substr(begin, end - begin + 1));
-    end = mem[end].second;
+  while (currBp != -1) {
+    size_t start = static_cast<size_t>(currBp + 1);
+    segments.emplace_back(start, lastIdx - start);
+    lastIdx = static_cast<size_t>(currBp + 1);
+    currBp = dp[currBp].second;
   }
+  // Add the first segment
+  segments.emplace_back(0, lastIdx);
 
-  std::ranges::reverse(result);
+  std::reverse(segments.begin(), segments.end());
 
-  return result;
+  return {input, std::move(segments)};
 }
 
-} // namespace rnexecutorch::models::text_to_speech::kokoro
\ No newline at end of file
+} // namespace rnexecutorch::models::text_to_speech::kokoro
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h
index b327ca4f9b..5ed87bce77 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h
@@ -1,58 +1,88 @@
 #pragma once
 
+#include "Types.h"
+
 #include <cstdint>
 #include <functional>
 #include <optional>
 #include <string>
 #include <vector>
 
-#include "Types.h"
-
 namespace rnexecutorch::models::text_to_speech::kokoro {
 
 class Partitioner {
 public:
-  Partitioner(const Context &modelContext) : context_(modelContext) {}
-
-  // Partition strategy
-  // Defines how to divide phoneme string into substrings, by minimizing
-  // one of the selected properties.
-  enum class Strategy {
-    TOTAL_TIME = 0, // Only minimizes the estimated total time of processing
-    LATENCY, // Minimizes the streaming latency by dividing into small and
-             // similar length parts
+  /**
+   * Partitioning strategy.
+   * Affects the cost function choice, which changes the way input text is
+   * divided.
+   */
+  enum class Mode {
+    MIN_BREAKS = 0, // Minimizes number of substrings (best quality)
+    MIN_LATENCY =
+        1, // Minimizes the processing latency (best speed - streaming mode)
   };
 
-  // Cost definition
-  using Cost = int64_t;
+  /**
+   * Represents the logical separator types.
+   */
+  enum class Separator {
+    EOS = 1, // End of sentence marker (e.g., '.', '!', '?').
+    PAUSE,   // Mid-sentence pause (e.g., ',', ';', ':').
+    WHITE,   // Whitespace or other weak separators.
 
-  // Partition function
-  // Performs a division of the input phoneme string according to
-  // given strategy.
-  template <Strategy strategy>
-  std::vector<std::u32string> divide(const std::u32string &phonemes);
+    NO_SEP // No separation
+  };
 
-private:
   /**
-   * Helper function - partitioning
+   * Represents a heuristic evaluation of given partition.
+   * The lower it is, the better partition is.
+   */
+  using Cost = uint32_t;
+
+  /**
+   * A cost function type to evaluate given partition.
    *
-   * @param phonemes phoneme string to be partitioned
-   * @param costFn a custom cost function which takes:
-   *               1. starting cost (cost of the previous range or 0 if not
-   * present)
-   *               2. range begin
-   *               3. previous breakpoint (-1 if not present)
-   *               4. current breakpoint (-1 if not present)
-   *               5. range end (exclusive)
+   * @param acc Total cost accumulated from previous segments.
+   * @param beg Start index of the current range.
+   * @param prevBp Previous breakpoint index - useful for calculating some
+   * formulas.
+   * @param bp Breakpoint index (the split point, and the last character of the
+   * left-most subrange). -1 if there are no bps.
+   * @param end End index of the current range (inclusive).
+   * @param sep The type of the breakpoint.
+   */
+  using CostFn = std::function<Cost(Cost acc, size_t beg, int64_t prevBp,
+                                    int64_t bp, size_t end, Separator sep)>;
+
+  /**
+   * Holds the result of text partitioning.
+   * The content is stored as a non-owning view to avoid copying, and
+   * `segments` holds the {offset, length} range of each part within it.
    */
-  std::vector<std::u32string>
-  divide(const std::u32string &phonemes,
-         const std::function<Cost(Cost, int32_t, int32_t, int32_t, int32_t)>
-             &costFn);
-
-  // Shared model context
-  // A const reference to singleton in Kokoro.
-  const Context &context_;
+  struct Partition {
+    std::u32string_view content;
+    std::vector<std::pair<size_t, size_t>>
+        segments; // Pairs of {offset, length} for each segment.
+  };
+
+  /**
+   * Partitions the input text into segments according to the specified
+   * strategy.
+   *
+   * @param input The source text to be partitioned.
+   * @param limit The maximum available size of a single segment.
+   * @param mode The partitioning strategy to use (defaults to MIN_LATENCY).
+   * @return A Partition object containing the original content view and
+   * the {offset, length} range of every segment.
+   */
+  Partition partition(std::u32string_view input, size_t limit,
+                      Mode mode = Mode::MIN_LATENCY) const;
+
+private:
+  // Internal partition implementation that uses a specific cost function.
+  Partition partition(std::u32string_view input, size_t limit,
+                      CostFn costFn) const;
 };
 
 } // namespace rnexecutorch::models::text_to_speech::kokoro
\ No newline at end of file
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp
index fd69c43eed..c5f44b71f8 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp
@@ -28,8 +28,9 @@ Synthesizer::Synthesizer(const std::string &modelSource,
         forwardMethods_.emplace_back(name, inputSize);
       }
     }
-    std::stable_sort(forwardMethods_.begin(), forwardMethods_.end(),
-                     [](const auto &a, const auto &b) { return a.second < b.second; });
+    std::stable_sort(
+        forwardMethods_.begin(), forwardMethods_.end(),
+        [](const auto &a, const auto &b) { return a.second < b.second; });
   }
 
   // Fallback: if no methods discovered, validate "forward" directly
@@ -43,11 +44,10 @@ Synthesizer::Synthesizer(const std::string &modelSource,
   }
 }
 
-Result<std::vector<EValue>> Synthesizer::generate(std::span<const Token> tokens,
-                                                  std::span<bool> textMask,
-                                                  std::span<int64_t> indices,
-                                                  std::span<float> dur,
-                                                  std::span<float> ref_s) {
+Result<std::vector<EValue>> Synthesizer::generate(
+    std::span<const Token> tokens, std::span<const bool> textMask,
+    std::span<const int64_t> indices, std::span<const float> dur,
+    std::span<const float> ref_s) {
   // Perform input shape checks
   // Both F0 and N vectors should be twice as long as duration
   CHECK_SIZE(tokens, textMask.size());
@@ -62,28 +62,33 @@ Result<std::vector<EValue>> Synthesizer::generate(std::span<const Token> tokens,
                       const_cast<Token *>(tokens.data()), ScalarType::Long);
   auto textMaskTensor =
       make_tensor_ptr({1, static_cast<int32_t>(textMask.size())},
-                      textMask.data(), ScalarType::Bool);
-  auto indicesTensor =
-      make_tensor_ptr({duration}, indices.data(), ScalarType::Long);
-  auto durTensor =
-      make_tensor_ptr({1, noTokens, 640}, dur.data(), ScalarType::Float);
-  auto voiceRefTensor = make_tensor_ptr({1, constants::kVoiceRefSize},
-                                        ref_s.data(), ScalarType::Float);
+                      const_cast<bool *>(textMask.data()), ScalarType::Bool);
+  auto indicesTensor = make_tensor_ptr(
+      {duration}, const_cast<int64_t *>(indices.data()), ScalarType::Long);
+  auto durTensor = make_tensor_ptr(
+      {1, noTokens, 640}, const_cast<float *>(dur.data()), ScalarType::Float);
+  auto voiceRefTensor =
+      make_tensor_ptr({1, constants::kVoiceRefSize},
+                      const_cast<float *>(ref_s.data()), ScalarType::Float);
 
   // Select appropriate forward method based on token count
-  auto it = std::ranges::find_if(forwardMethods_,
-      [noTokens](const auto &entry) { return static_cast<int32_t>(entry.second) >= noTokens; });
-  std::string selectedMethod = (it != forwardMethods_.end()) ? it->first : forwardMethods_.back().first;
+  auto it =
+      std::ranges::find_if(forwardMethods_, [noTokens](const auto &entry) {
+        return static_cast<int32_t>(entry.second) >= noTokens;
+      });
+  std::string selectedMethod =
+      (it != forwardMethods_.end()) ? it->first : forwardMethods_.back().first;
 
   // Execute the selected forward method
-  auto results = execute(selectedMethod,
-      {tokensTensor, textMaskTensor, indicesTensor, durTensor, voiceRefTensor});
+  auto results =
+      execute(selectedMethod, {tokensTensor, textMaskTensor, indicesTensor,
+                               durTensor, voiceRefTensor});
 
   if (!results.ok()) {
     throw RnExecutorchError(
         RnExecutorchErrorCode::InvalidModelOutput,
         "[Kokoro::Synthesizer] Failed to execute method " + selectedMethod +
-        ", error: " +
+            ", error: " +
             std::to_string(static_cast<uint32_t>(results.error())));
   }
 
@@ -97,7 +102,8 @@ size_t Synthesizer::getTokensLimit() const {
 }
 
 size_t Synthesizer::getDurationLimit() const {
-  if (forwardMethods_.empty()) return 0;
+  if (forwardMethods_.empty())
+    return 0;
   return getInputShape(forwardMethods_.back().first, 2)[0];
 }
 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h
index bfbbd02638..7b87e33c26 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h
@@ -40,17 +40,18 @@ class Synthesizer : public BaseModel {
    * @param ref_s a full voice array for given duration
    */
   Result<std::vector<EValue>> generate(std::span<const Token> tokens,
-                                       std::span<bool> textMask,
-                                       std::span<int64_t> indices,
-                                       std::span<float> dur,
-                                       std::span<float> ref_s);
+                                       std::span<const bool> textMask,
+                                       std::span<const int64_t> indices,
+                                       std::span<const float> dur,
+                                       std::span<const float> ref_s);
 
   // Model limits getters
   size_t getTokensLimit() const;
   size_t getDurationLimit() const;
 
 private:
-  // Forward methods discovered at construction (e.g. forward_8, forward_64, forward_128)
+  // Forward methods discovered at construction (e.g. forward_8, forward_64,
+  // forward_128)
   std::vector<std::pair<std::string, size_t>> forwardMethods_;
   // Shared model context
   // A const reference to singleton in Kokoro.
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h
index 20a0fe5f20..8a99dc09c8 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h
@@ -18,4 +18,14 @@ struct Context {
   size_t inputDurationLimit = 0;
 };
 
+/**
+ * Type definition - token timestamp.
+ *
+ * Values are expressed as a number of waveform samples.
+ */
+struct Timestamp {
+  size_t begin = 0;
+  size_t end = 0;
+};
+
 } // namespace rnexecutorch::models::text_to_speech::kokoro
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp
index a77e40a93c..37956e945c 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp
@@ -1,6 +1,7 @@
 #include "Utils.h"
 #include "Constants.h"
 #include "Params.h"
+
 #include <algorithm>
 #include <cmath>
 #include <rnexecutorch/Error.h>
@@ -9,86 +10,95 @@ namespace rnexecutorch::models::text_to_speech::kokoro::utils {
 
 using namespace params::cropping;
 
-// Helper functions
 namespace {
-// Normalizes an audio sample
+
 float normalize(float sample) {
-  float v = std::abs(sample);
-  return v >= kAudioSilenceThreshold ? v : 0.F;
+  return std::max(0.0F, std::abs(sample) - kAudioSilenceThreshold);
 }
 
-// Returns an index corresponding to the first (or last - if reverse=true)
-// non-quiet part of an audio.
-// Utilizes a moving average controled by hyperparameters from Constants.h.
 template <bool reverse> size_t findAudioBound(std::span<const float> audio) {
   if (audio.empty()) {
     return 0;
   }
 
-  size_t length = audio.size();
+  const size_t length = audio.size();
+  float windowSum = 0.0F;
+  size_t processedCount = 0;
+  size_t currentIndex = reverse ? length - 1 : 0;
 
-  float sum = 0.F;
-  size_t count = 0;
-  size_t i = reverse ? length - 1 : 0;
+  while (processedCount < length) {
+    processedCount++;
+    windowSum += normalize(audio[currentIndex]);
 
-  while (count < length) {
-    count++;
-    sum += normalize(audio[i]);
-    if (count > kAudioCroppingSteps) {
-      sum -= normalize(
-          audio[reverse ? i + kAudioCroppingSteps : i - kAudioCroppingSteps]);
+    // Maintain the sliding window sum
+    if (processedCount > kAudioCroppingSteps) {
+      const size_t oldIndex = reverse ? currentIndex + kAudioCroppingSteps
+                                      : currentIndex - kAudioCroppingSteps;
+      windowSum -= normalize(audio[oldIndex]);
     }
 
-    if (count >= kAudioCroppingSteps &&
-        sum / kAudioCroppingSteps >= kAudioSilenceThreshold) {
-      return i;
+    // Check if moving average exceeds threshold
+    if (processedCount >= kAudioCroppingSteps &&
+        (windowSum / kAudioCroppingSteps) >= kAudioSilenceThreshold) {
+      return currentIndex;
     }
 
-    i = reverse ? i - 1 : i + 1;
+    currentIndex = reverse ? currentIndex - 1 : currentIndex + 1;
   }
 
   return reverse ? 0 : length - 1;
 }
+
 } // namespace
 
 std::span<const float> stripAudio(std::span<const float> audio, size_t margin) {
-  auto lbound = findAudioBound<false>(audio);
-  auto rbound = findAudioBound<true>(audio);
+  if (audio.empty()) {
+    return {};
+  }
 
-  lbound = lbound > margin ? lbound - margin : 0;
-  rbound = std::min(rbound + margin, audio.size() > 0 ? audio.size() - 1 : 0);
+  size_t lbound = findAudioBound<false>(audio);
+  size_t rbound = findAudioBound<true>(audio);
 
-  return audio.subspan(lbound, rbound >= lbound ? rbound - lbound + 1 : 0);
+  // Apply margins
+  lbound = (lbound > margin) ? lbound - margin : 0;
+  rbound = std::min(rbound + margin, audio.size() - 1);
+
+  const size_t strippedLength = (rbound >= lbound) ? (rbound - lbound + 1) : 0;
+  return audio.subspan(lbound, strippedLength);
 }
 
-std::vector<Token> tokenize(const std::u32string &phonemes,
+std::vector<Token> tokenize(std::u32string_view phonemes,
                             std::optional<size_t> expectedSize) {
   if (expectedSize.has_value() && expectedSize.value() < 2) {
-    throw rnexecutorch::RnExecutorchError(
-        rnexecutorch::RnExecutorchErrorCode::InvalidUserInput,
-        "expected number of tokens cannot be lower than 2");
+    throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput,
+                            "[Kokoro::Utils] Expected tokens must be >= 2");
   }
 
-  // Number of tokens to populate, with and without edge pad tokens
-  size_t lengthWithPadding =
-      expectedSize.has_value() ? expectedSize.value() : phonemes.size() + 2;
-  size_t lengthWithoutPadding = lengthWithPadding - 2;
-  size_t effNoTokens = std::min(lengthWithoutPadding, phonemes.size());
-
-  // Note that we populate tokens[1:noTokens - 1], since first and last tokens
-  // are zeros (padding). Input could still contain unrecognized tokens, and
-  // that's why we use partition() at the end.
-  std::vector<Token> tokens(lengthWithPadding, constants::kPadToken);
-  std::transform(phonemes.begin(), phonemes.begin() + effNoTokens,
+  // 1. Determine lengths (2 tokens reserved for start/end padding)
+  const size_t totalLength = expectedSize.value_or(phonemes.size() + 2);
+  const size_t maxPhonemes = totalLength - 2;
+  const size_t effectivePhonemeCount = std::min(maxPhonemes, phonemes.size());
+
+  // 2. Initialize with pad tokens
+  std::vector<Token> tokens(totalLength, constants::kPadToken);
+
+  // 3. Map phonemes to vocabulary tokens
+  // Starting from index 1 to leave index 0 as start-padding
+  std::transform(phonemes.begin(), phonemes.begin() + effectivePhonemeCount,
                  tokens.begin() + 1, [](char32_t p) -> Token {
                    return constants::kVocab.contains(p)
                               ? constants::kVocab.at(p)
                               : constants::kInvalidToken;
                  });
-  auto validSeqEnd = std::stable_partition(
-      tokens.begin() + 1, tokens.begin() + effNoTokens + 1,
-      [](Token t) -> bool { return t != constants::kInvalidToken; });
-  std::fill(validSeqEnd, tokens.begin() + effNoTokens + 1,
+
+  // 4. Remove invalid tokens while preserving order (bubbling them to the end
+  // of the content segment)
+  auto validEnd = std::stable_partition(
+      tokens.begin() + 1, tokens.begin() + effectivePhonemeCount + 1,
+      [](Token t) { return t != constants::kInvalidToken; });
+
+  // 5. Overwrite the invalid tokens gathered at the tail with pad tokens
+  std::fill(validEnd, tokens.begin() + effectivePhonemeCount + 1,
             constants::kPadToken);
 
   return tokens;
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h
index 081d40c14c..c6996a3f40 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h
@@ -8,19 +8,20 @@
 
 namespace rnexecutorch::models::text_to_speech::kokoro::utils {
 
-// Removes silence from the beginning and the end of an audio (with some
-// margin).
-// Returns a [l - m, r + m] range of audio samples, where m is the margin,
-// l and r correspond to lower and upper audio bound respectively.
+/**
+ * Strips silence from audio edges using a sliding window.
+ * @param audio The input audio samples.
+ * @param margin Number of silence samples to preserve at each edge.
+ */
 std::span<const float> stripAudio(std::span<const float> audio,
                                   size_t margin = 0);
 
-// Tokenizes given phoneme string.
-// Each phoneme corresponds to exactly one token, with 2 additional pad
-// tokens added at both ends.
-// If extecped number of tokens is provided, eventually expands the token vector
-// with pad tokens to match the given length.
-std::vector<Token> tokenize(const std::u32string &phonemes,
+/**
+ * Maps phonemes to vocabulary tokens with start/end padding.
+ * @param phonemes UTF-32 phoneme sequence.
+ * @param expectedSize If set, output is padded or truncated to this length (>= 2).
+ */
+std::vector<Token> tokenize(std::u32string_view phonemes,
                             std::optional<size_t> expectedSize = std::nullopt);
 
 } // namespace rnexecutorch::models::text_to_speech::kokoro::utils
\ No newline at end of file
diff --git a/packages/react-native-executorch/react-native-executorch.podspec b/packages/react-native-executorch/react-native-executorch.podspec
index 902210d01a..5d2180cb9a 100644
--- a/packages/react-native-executorch/react-native-executorch.podspec
+++ b/packages/react-native-executorch/react-native-executorch.podspec
@@ -16,7 +16,6 @@ Pod::Spec.new do |s|
 
   pthreadpool_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/pthreadpool', __dir__)
   cpuinfo_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/cpuinfo', __dir__)
-  phonemis_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/phonemis', __dir__)
 
   s.user_target_xcconfig = {
     "HEADER_SEARCH_PATHS" =>
@@ -28,7 +27,6 @@ Pod::Spec.new do |s|
       '$(inherited)',
       "\"#{pthreadpool_binaries_path}/physical-arm64-release/libpthreadpool.a\"",
       "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"",
-      "\"#{phonemis_binaries_path}/physical-arm64-release/libphonemis.a\"",
 
     ].join(' '),
 
@@ -36,7 +34,6 @@ Pod::Spec.new do |s|
       '$(inherited)',
       "\"#{pthreadpool_binaries_path}/simulator-arm64-debug/libpthreadpool.a\"",
       "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"",
-      "\"#{phonemis_binaries_path}/simulator-arm64-debug/libphonemis.a\"",
     ].join(' '),
 
     'EXCLUDED_ARCHS[sdk=iphonesimulator*]' => 'x86_64',
@@ -50,7 +47,9 @@ Pod::Spec.new do |s|
       '"$(PODS_TARGET_SRCROOT)/third-party/include" '+
       '"$(PODS_TARGET_SRCROOT)/third-party/include/cpuinfo" '+
       '"$(PODS_TARGET_SRCROOT)/third-party/include/pthreadpool" '+
-      '"$(PODS_TARGET_SRCROOT)/common" ',
+      '"$(PODS_TARGET_SRCROOT)/common" ' +
+      '"$(PODS_TARGET_SRCROOT)/third-party/common/phonemis/src" ',
+    "GCC_PREPROCESSOR_DEFINITIONS" => '$(inherited) ET_ON=1',
     "CLANG_CXX_LANGUAGE_STANDARD" => "c++20",
     'EXCLUDED_ARCHS[sdk=iphonesimulator*]' => 'x86_64',
   }
@@ -58,6 +57,7 @@ Pod::Spec.new do |s|
   s.source_files = [
     "ios/**/*.{m,mm,h}",
     "common/**/*.{cpp,c,h,hpp}",
+    "third-party/common/phonemis/src/**/*.{cpp,hpp,h}",
   ]
 
   s.libraries = "z"
@@ -70,8 +70,9 @@ Pod::Spec.new do |s|
   # react-native-skia. The headers are preserved by preserve_paths and
   # then made available by HEADER_SEARCH_PATHS.
   s.exclude_files = [
-    "common/rnexecutorch/tests/**/*",
-    "common/rnexecutorch/jsi/*.{h,hpp}"
+    "common/rnexecutorch/tests/**/*.{cpp}",
+    "common/rnexecutorch/jsi/*.{h,hpp}",
+    "third-party/common/phonemis/src/phonemis/main.cpp" # Exclude the phonemis runner
   ]
   s.header_mappings_dir = "common/rnexecutorch"
   s.header_dir = "rnexecutorch"
diff --git a/packages/react-native-executorch/src/constants/tts/models.ts b/packages/react-native-executorch/src/constants/tts/models.ts
index 7b05a580c8..d973722720 100644
--- a/packages/react-native-executorch/src/constants/tts/models.ts
+++ b/packages/react-native-executorch/src/constants/tts/models.ts
@@ -1,28 +1,27 @@
-import { URL_PREFIX, VERSION_TAG } from '../versions';
+import { NEXT_VERSION_TAG, URL_PREFIX } from '../versions';
 
 // Text to speech (tts) - Kokoro model(s)
-const KOKORO_EN_MODELS_ROOT = `${URL_PREFIX}-kokoro/${VERSION_TAG}/xnnpack`;
-const KOKORO_EN_SMALL_MODELS_ROOT = `${KOKORO_EN_MODELS_ROOT}/small`;
-const KOKORO_EN_MEDIUM_MODELS_ROOT = `${KOKORO_EN_MODELS_ROOT}/medium`;
+const KOKORO_MODEL_ROOT = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/xnnpack`;
+const KOKORO_STANDARD_MODEL_ROOT = `${KOKORO_MODEL_ROOT}/standard`;
+const KOKORO_POLISH_MODEL_ROOT = `${KOKORO_MODEL_ROOT}/polish`;
 
 /**
- * A Kokoro model instance which processes the text in batches of maximum 64 tokens.
- * Uses significant less memory than the medium model, but could produce
- * a lower quality speech due to forced, aggressive text splitting.
+ * A standard Kokoro instance which processes the text in batches of maximum 128 tokens.
+ * Works well with built-in languages: English, Spanish, French, Italian, Portuguese and Hindi.
  * @category Models - Text to Speech
  */
-export const KOKORO_SMALL = {
-  modelName: 'kokoro-small' as const,
-  durationPredictorSource: `${KOKORO_EN_SMALL_MODELS_ROOT}/duration_predictor.pte`,
-  synthesizerSource: `${KOKORO_EN_SMALL_MODELS_ROOT}/synthesizer.pte`,
+export const KOKORO_STANDARD = {
+  modelName: 'kokoro' as const,
+  durationPredictorSource: `${KOKORO_STANDARD_MODEL_ROOT}/duration_predictor_std.pte`,
+  synthesizerSource: `${KOKORO_STANDARD_MODEL_ROOT}/synthesizer_std.pte`,
 };
 
 /**
- * A standard Kokoro instance which processes the text in batches of maximum 128 tokens.
+ * A fine-tuned Kokoro instance for Polish.
  * @category Models - Text to Speech
  */
-export const KOKORO_MEDIUM = {
-  modelName: 'kokoro-medium' as const,
-  durationPredictorSource: `${KOKORO_EN_MEDIUM_MODELS_ROOT}/duration_predictor.pte`,
-  synthesizerSource: `${KOKORO_EN_MEDIUM_MODELS_ROOT}/synthesizer.pte`,
+export const KOKORO_POLISH = {
+  modelName: 'kokoro' as const,
+  durationPredictorSource: `${KOKORO_POLISH_MODEL_ROOT}/duration_predictor_pl.pte`,
+  synthesizerSource: `${KOKORO_POLISH_MODEL_ROOT}/synthesizer_pl.pte`,
 };
diff --git a/packages/react-native-executorch/src/constants/tts/voices.ts b/packages/react-native-executorch/src/constants/tts/voices.ts
index cb98616906..c1bcb7116e 100644
--- a/packages/react-native-executorch/src/constants/tts/voices.ts
+++ b/packages/react-native-executorch/src/constants/tts/voices.ts
@@ -1,84 +1,286 @@
-import { KokoroVoiceExtras, VoiceConfig } from '../../types/tts';
-import { URL_PREFIX, VERSION_TAG } from '../versions';
-
-// Kokoro voices - phonemizers
-const KOKORO_PHONEMIZER_PREFIX = `${URL_PREFIX}-kokoro/${VERSION_TAG}/phonemizer`;
-const KOKORO_PHONEMIZER_TAGGER_DATA = `${KOKORO_PHONEMIZER_PREFIX}/tags.json`;
-const KOKORO_PHONEMIZER_LEXICON_EN_US_DATA = `${KOKORO_PHONEMIZER_PREFIX}/us_merged.json`;
-const KOKORO_PHONEMIZER_LEXICON_EN_GB_DATA = `${KOKORO_PHONEMIZER_PREFIX}/gb_merged.json`;
-
-const EN_US_RESOURCES = {
-  taggerSource: KOKORO_PHONEMIZER_TAGGER_DATA,
-  lexiconSource: KOKORO_PHONEMIZER_LEXICON_EN_US_DATA,
-} as KokoroVoiceExtras;
-const EN_GB_RESOURCES = {
-  taggerSource: KOKORO_PHONEMIZER_TAGGER_DATA,
-  lexiconSource: KOKORO_PHONEMIZER_LEXICON_EN_GB_DATA,
-} as KokoroVoiceExtras;
+import { TextToSpeechModelConfig } from '../../types/tts';
+import { NEXT_VERSION_TAG, URL_PREFIX } from '../versions';
+import { KOKORO_STANDARD, KOKORO_POLISH } from './models';
+
+// Common prefixes - voices & phonemization data
+const KOKORO_VOICE_PREFIX = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/voices`;
+const KOKORO_PHONEMIZER_PREFIX = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/phonemizer`;
+
+const KOKORO_PHONEMIZER_EN_US_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/en-us`;
+const KOKORO_PHONEMIZER_EN_US_TAGGER = `${KOKORO_PHONEMIZER_EN_US_PREFIX}/tags.json`;
+const KOKORO_PHONEMIZER_EN_US_LEXICON = `${KOKORO_PHONEMIZER_EN_US_PREFIX}/lexicon.json`;
+const KOKORO_PHONEMIZER_EN_US_MODEL = `${KOKORO_PHONEMIZER_EN_US_PREFIX}/phonemizer_en_us.pte`;
+
+const KOKORO_PHONEMIZER_EN_GB_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/en-gb`;
+const KOKORO_PHONEMIZER_EN_GB_TAGGER = `${KOKORO_PHONEMIZER_EN_GB_PREFIX}/tags.json`;
+const KOKORO_PHONEMIZER_EN_GB_LEXICON = `${KOKORO_PHONEMIZER_EN_GB_PREFIX}/lexicon.json`;
+const KOKORO_PHONEMIZER_EN_GB_MODEL = `${KOKORO_PHONEMIZER_EN_GB_PREFIX}/phonemizer_en_gb.pte`;
+
+// French
+const KOKORO_PHONEMIZER_FR_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/fr`;
+const KOKORO_PHONEMIZER_FR_MODEL = `${KOKORO_PHONEMIZER_FR_PREFIX}/phonemizer_fr.pte`;
+
+// Spanish
+const KOKORO_PHONEMIZER_ES_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/es`;
+const KOKORO_PHONEMIZER_ES_MODEL = `${KOKORO_PHONEMIZER_ES_PREFIX}/phonemizer_es.pte`;
+
+// Italian
+const KOKORO_PHONEMIZER_IT_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/it`;
+const KOKORO_PHONEMIZER_IT_MODEL = `${KOKORO_PHONEMIZER_IT_PREFIX}/phonemizer_it.pte`;
+
+// Portuguese
+const KOKORO_PHONEMIZER_PT_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/pt`;
+const KOKORO_PHONEMIZER_PT_MODEL = `${KOKORO_PHONEMIZER_PT_PREFIX}/phonemizer_pt.pte`;
+
+// Polish
+const KOKORO_PHONEMIZER_PL_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/pl`;
+const KOKORO_PHONEMIZER_PL_MODEL = `${KOKORO_PHONEMIZER_PL_PREFIX}/phonemizer_pl.pte`;
+
+// Hindi
+const KOKORO_PHONEMIZER_HI_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/hi`;
+const KOKORO_PHONEMIZER_HI_MODEL = `${KOKORO_PHONEMIZER_HI_PREFIX}/phonemizer_hi.pte`;
 
 // Kokoro voices
-const KOKORO_VOICE_PREFIX = `${URL_PREFIX}-kokoro/${VERSION_TAG}/voices`;
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_AF_HEART = {
-  lang: 'en-us' as const,
+export const KOKORO_AMERICAN_ENGLISH_FEMALE_HEART = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/af_heart.bin`,
-  extra: EN_US_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-us' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_AF_RIVER = {
-  lang: 'en-us' as const,
+export const KOKORO_AMERICAN_ENGLISH_FEMALE_RIVER = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/af_river.bin`,
-  extra: EN_US_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-us' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_AF_SARAH = {
-  lang: 'en-us' as const,
+export const KOKORO_AMERICAN_ENGLISH_FEMALE_SARAH = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/af_sarah.bin`,
-  extra: EN_US_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-us' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_AM_ADAM = {
-  lang: 'en-us' as const,
+export const KOKORO_AMERICAN_ENGLISH_MALE_ADAM = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/am_adam.bin`,
-  extra: EN_US_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-us' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_AM_MICHAEL = {
-  lang: 'en-us' as const,
+export const KOKORO_AMERICAN_ENGLISH_MALE_MICHAEL = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/am_michael.bin`,
-  extra: EN_US_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-us' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_AM_SANTA = {
-  lang: 'en-us' as const,
+export const KOKORO_AMERICAN_ENGLISH_MALE_SANTA = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/am_santa.bin`,
-  extra: EN_US_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-us' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_BF_EMMA = {
-  lang: 'en-gb' as const,
+export const KOKORO_BRITISH_ENGLISH_FEMALE_EMMA = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/bf_emma.bin`,
-  extra: EN_GB_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-gb' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_GB_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_GB_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_GB_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
 /**
  * @category TTS Supported Voices
  */
-export const KOKORO_VOICE_BM_DANIEL = {
-  lang: 'en-gb' as const,
+export const KOKORO_BRITISH_ENGLISH_MALE_DANIEL = {
+  model: KOKORO_STANDARD,
   voiceSource: `${KOKORO_VOICE_PREFIX}/bm_daniel.bin`,
-  extra: EN_GB_RESOURCES,
-} as VoiceConfig;
+  phonemizerConfig: {
+    lang: 'en-gb' as const,
+    taggerSource: KOKORO_PHONEMIZER_EN_GB_TAGGER,
+    lexiconSource: KOKORO_PHONEMIZER_EN_GB_LEXICON,
+    neuralModelSource: KOKORO_PHONEMIZER_EN_GB_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_FRENCH_FEMALE_SIWIS = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/ff_siwis.bin`,
+  phonemizerConfig: {
+    lang: 'fr' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_FR_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_SPANISH_FEMALE_DORA = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/ef_dora.bin`,
+  phonemizerConfig: {
+    lang: 'es' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_ES_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_SPANISH_MALE_ALEX = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/em_alex.bin`,
+  phonemizerConfig: {
+    lang: 'es' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_ES_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_ITALIAN_FEMALE_SARA = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/if_sara.bin`,
+  phonemizerConfig: {
+    lang: 'it' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_IT_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_ITALIAN_MALE_NICOLA = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/im_nicola.bin`,
+  phonemizerConfig: {
+    lang: 'it' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_IT_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_PORTUGUESE_FEMALE_DORA = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/pf_dora.bin`,
+  phonemizerConfig: {
+    lang: 'pt' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_PT_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_PORTUGUESE_MALE_SANTA = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/pm_santa.bin`,
+  phonemizerConfig: {
+    lang: 'pt' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_PT_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_HINDI_FEMALE_ALPHA = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/hf_alpha.bin`,
+  phonemizerConfig: {
+    lang: 'hi' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_HI_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_HINDI_MALE_OMEGA = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/hm_omega.bin`,
+  phonemizerConfig: {
+    lang: 'hi' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_HI_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_HINDI_MALE_PSI = {
+  model: KOKORO_STANDARD,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/hm_psi.bin`,
+  phonemizerConfig: {
+    lang: 'hi' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_HI_MODEL,
+  },
+} as TextToSpeechModelConfig;
+
+/**
+ * @category TTS Supported Voices
+ */
+export const KOKORO_POLISH_MALE_MATEUSZ = {
+  model: KOKORO_POLISH,
+  voiceSource: `${KOKORO_VOICE_PREFIX}/pm_mateusz.bin`,
+  phonemizerConfig: {
+    lang: 'pl' as const,
+    neuralModelSource: KOKORO_PHONEMIZER_PL_MODEL,
+  },
+} as TextToSpeechModelConfig;
diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
index 70ecc3e73f..6e034693af 100644
--- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
+++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts
@@ -1,12 +1,10 @@
 import { useCallback, useEffect, useState } from 'react';
 import { TextToSpeechModule } from '../../modules/natural_language_processing/TextToSpeechModule';
 import {
-  TextToSpeechProps,
   TextToSpeechInput,
-  TextToSpeechPhonemeInput,
-  TextToSpeechType,
+  TextToSpeechModelConfig,
   TextToSpeechStreamingInput,
-  TextToSpeechStreamingPhonemeInput,
+  TextToSpeechType,
 } from '../../types/tts';
 import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
 import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
@@ -14,14 +12,15 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils';
 /**
  * React hook for managing Text to Speech instance.
  * @category Hooks
- * @param TextToSpeechProps - Configuration object containing `model` source, `voice` and optional `preventLoad`.
+ * @param model - Text to Speech configuration: model sources, voice source and phonemizer config.
+ * @param options - Additional options for the hook.
+ * @param options.preventLoad - If true, prevents the model from loading automatically on initialization.
  * @returns Ready to use Text to Speech model.
  */
-export const useTextToSpeech = ({
-  model,
-  voice,
-  preventLoad = false,
-}: TextToSpeechProps): TextToSpeechType => {
+export const useTextToSpeech = (
+  model: TextToSpeechModelConfig,
+  { preventLoad = false }: { preventLoad?: boolean } = {}
+): TextToSpeechType => {
   const [error, setError] = useState<RnExecutorchError | null>(null);
   const [isReady, setIsReady] = useState(false);
   const [isGenerating, setIsGenerating] = useState(false);
@@ -38,7 +37,7 @@ export const useTextToSpeech = ({
     setError(null);
     setIsReady(false);
 
-    TextToSpeechModule.fromModelName({ model, voice }, setDownloadProgress)
+    TextToSpeechModule.fromModelName(model, setDownloadProgress)
       .then((mod) => {
         if (!active) {
           mod.delete();
@@ -57,21 +56,21 @@ export const useTextToSpeech = ({
     return () => {
       active = false;
       setModuleInstance((prev) => {
+        prev?.streamStop(true);
         prev?.delete();
         return null;
       });
     };
     // eslint-disable-next-line react-hooks/exhaustive-deps
   }, [
-    model.modelName,
-    model.durationPredictorSource,
-    model.synthesizerSource,
-    voice?.voiceSource,
-    voice?.extra,
+    model.model.modelName,
+    model.model.durationPredictorSource,
+    model.model.synthesizerSource,
+    model.voiceSource,
+    model.phonemizerConfig,
     preventLoad,
   ]);
 
-  // Shared guard for all generation methods
   const guardReady = useCallback(
     (methodName: string): TextToSpeechModule => {
       if (!isReady || !moduleInstance)
@@ -93,19 +92,10 @@ export const useTextToSpeech = ({
     const instance = guardReady('forward');
     try {
       setIsGenerating(true);
-      return await instance.forward(input.text ?? '', input.speed ?? 1.0);
-    } finally {
-      setIsGenerating(false);
-    }
-  };
-
-  const forwardFromPhonemes = async (input: TextToSpeechPhonemeInput) => {
-    const instance = guardReady('forwardFromPhonemes');
-    try {
-      setIsGenerating(true);
-      return await instance.forwardFromPhonemes(
-        input.phonemes ?? '',
-        input.speed ?? 1.0
+      return await instance.forward(
+        input.text ?? '',
+        input.speed ?? 1.0,
+        input.phonemize ?? true
       );
     } finally {
       setIsGenerating(false);
@@ -118,8 +108,6 @@ export const useTextToSpeech = ({
       setIsGenerating(true);
       try {
         if (input.text) {
-          // If the initial text does not end with an end of sentence character,
-          // we add an artificial dot to improve output's quality.
           instance.streamInsert(
             input.text +
               ('.?!;'.includes(input.text.trim().slice(-1)) ? '' : '.')
@@ -129,34 +117,16 @@ export const useTextToSpeech = ({
         await input.onBegin?.();
         for await (const audio of instance.stream({
           speed: input.speed ?? 1.0,
+          phonemize: input.phonemize ?? true,
           stopAutomatically: input.stopAutomatically ?? true,
         })) {
           if (input.onNext) {
             await input.onNext(audio);
           }
         }
-      } finally {
-        await input.onEnd?.();
-        setIsGenerating(false);
-      }
-    },
-    [guardReady]
-  );
-
-  const streamFromPhonemes = useCallback(
-    async (input: TextToSpeechStreamingPhonemeInput) => {
-      const instance = guardReady('streamFromPhonemes');
-      setIsGenerating(true);
-      try {
-        await input.onBegin?.();
-        for await (const audio of instance.streamFromPhonemes({
-          phonemes: input.phonemes ?? '',
-          speed: input.speed ?? 1.0,
-        })) {
-          if (input.onNext) {
-            await input.onNext(audio);
-          }
-        }
+      } catch (e) {
+        instance.streamStop(true);
+        throw e;
       } finally {
         await input.onEnd?.();
         setIsGenerating(false);
@@ -188,9 +158,7 @@ export const useTextToSpeech = ({
     isReady,
     isGenerating,
     forward,
-    forwardFromPhonemes,
     stream,
-    streamFromPhonemes,
     streamInsert,
     streamStop,
     downloadProgress,
diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts
index 96d167a7d2..911e555045 100644
--- a/packages/react-native-executorch/src/index.ts
+++ b/packages/react-native-executorch/src/index.ts
@@ -100,7 +100,8 @@ declare global {
   var loadTextToSpeechKokoro: (
     lang: string,
     taggerData: string,
-    phonemizerData: string,
+    lexiconData: string,
+    neuralPhonemizerData: string,
     durationPredictorSource: string,
     synthesizerSource: string,
     voice: string
diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
index 6ab28543c6..d59fa56e30 100644
--- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
+++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts
@@ -1,12 +1,11 @@
 import { RnExecutorchErrorCode } from '../../errors/ErrorCodes';
 import { parseUnknownError, RnExecutorchError } from '../../errors/errorUtils';
 import { ResourceFetcher } from '../../utils/ResourceFetcher';
+import { ResourceSource } from '../../types/common';
 import {
-  KokoroConfig,
-  TextToSpeechConfig,
+  TextToSpeechModelConfig,
+  TextToSpeechModelSources,
   TextToSpeechStreamingInput,
-  TextToSpeechStreamingPhonemeInput,
-  VoiceConfig,
 } from '../../types/tts';
 import { Logger } from '../../common/Logger';
 
@@ -24,26 +23,17 @@ export class TextToSpeechModule {
 
   /**
    * Creates a Text to Speech instance.
-   * @param config - Configuration object containing `model` and `voice`.
-   *   Pass one of the built-in constants (e.g. `{ model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART }`), or use require() to pass them.
+   * @param config - Configuration object containing the model sources, the voice source and the phonemizer config.
    * @param onDownloadProgress - Optional callback to monitor download progress, receiving a value between 0 and 1.
    * @returns A Promise resolving to a `TextToSpeechModule` instance.
-   * @example
-   * ```ts
-   * import { TextToSpeechModule, KOKORO_MEDIUM, KOKORO_VOICE_AF_HEART } from 'react-native-executorch';
-   * const tts = await TextToSpeechModule.fromModelName(
-   *   { model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART },
-   * );
-   * ```
    */
   static async fromModelName(
-    config: TextToSpeechConfig,
+    config: TextToSpeechModelConfig,
     onDownloadProgress: (progress: number) => void = () => {}
   ): Promise<TextToSpeechModule> {
     try {
       const nativeModule = await TextToSpeechModule.loadKokoro(
-        config.model,
-        config.voice,
+        config,
         onDownloadProgress
       );
       return new TextToSpeechModule(nativeModule);
@@ -54,48 +44,52 @@ export class TextToSpeechModule {
   }
 
   private static async loadKokoro(
-    model: KokoroConfig,
-    voice: VoiceConfig,
+    config: TextToSpeechModelConfig,
     onDownloadProgressCallback: (progress: number) => void
   ): Promise<unknown> {
-    if (
-      !voice.extra ||
-      !voice.extra.taggerSource ||
-      !voice.extra.lexiconSource
-    ) {
-      throw new RnExecutorchError(
-        RnExecutorchErrorCode.InvalidConfig,
-        'Kokoro: voice config is missing required extra fields: taggerSource and/or lexiconSource.'
-      );
-    }
+    const { model, voiceSource, phonemizerConfig } = config;
+    const kokoroModel = model as Extract<
+      TextToSpeechModelSources,
+      { modelName: 'kokoro' }
+    >;
+
+    const sources: ResourceSource[] = [
+      kokoroModel.durationPredictorSource,
+      kokoroModel.synthesizerSource,
+      voiceSource,
+    ];
+
+    // Tagger, lexicon and neural model sources are all optional: append only those provided and remember each index (-1 if absent).
+    const taggerIdx = phonemizerConfig.taggerSource
+      ? sources.push(phonemizerConfig.taggerSource) - 1
+      : -1;
+    const lexiconIdx = phonemizerConfig.lexiconSource
+      ? sources.push(phonemizerConfig.lexiconSource) - 1
+      : -1;
+    const neuralModelIdx = phonemizerConfig.neuralModelSource
+      ? sources.push(phonemizerConfig.neuralModelSource) - 1
+      : -1;
 
     const paths = await ResourceFetcher.fetch(
       onDownloadProgressCallback,
-      model.durationPredictorSource,
-      model.synthesizerSource,
-      voice.voiceSource,
-      voice.extra.taggerSource,
-      voice.extra.lexiconSource
+      ...sources
     );
 
-    if (paths === null || paths.length !== 5) {
+    if (paths === null || paths.length !== sources.length) {
       throw new RnExecutorchError(
         RnExecutorchErrorCode.DownloadInterrupted,
         'Download interrupted or missing resource.'
       );
     }
 
-    const modelPaths = paths.slice(0, 2) as [string, string];
-    const voiceDataPath = paths[2] as string;
-    const phonemizerPaths = paths.slice(3, 5) as [string, string];
-
     return await global.loadTextToSpeechKokoro(
-      voice.lang,
-      phonemizerPaths[0],
-      phonemizerPaths[1],
-      modelPaths[0],
-      modelPaths[1],
-      voiceDataPath
+      phonemizerConfig.lang,
+      taggerIdx >= 0 ? (paths[taggerIdx] as string) : '',
+      lexiconIdx >= 0 ? (paths[lexiconIdx] as string) : '',
+      neuralModelIdx >= 0 ? (paths[neuralModelIdx] as string) : '',
+      paths[0] as string, // DurationPredictor source
+      paths[1] as string, // Synthesizer source
+      paths[2] as string // Voice source
     );
   }
 
@@ -108,47 +102,33 @@ export class TextToSpeechModule {
   }
 
   /**
-   * Synthesizes the provided text into speech.
-   * Returns a promise that resolves to the full audio waveform as a `Float32Array`.
-   * @param text The input text to be synthesized.
-   * @param speed Optional speed multiplier for the speech synthesis (default is 1.0).
-   * @returns A promise resolving to the synthesized audio waveform.
+   * Synthesizes the provided input (text or IPA phonemes) into speech.
+   * @param input - The input text or phonemes to be synthesized.
+   * @param speed - Playback speed multiplier (default: 1.0).
+   * @param phonemize - If true (default), treats input as text and converts it to phonemes.
+   *                    If false, input is treated as phonemes.
+   * @returns A promise resolving to the full audio waveform as a `Float32Array`.
    */
   public async forward(
-    text: string,
-    speed: number = 1.0
+    input: string,
+    speed: number = 1.0,
+    phonemize: boolean = true
   ): Promise<Float32Array> {
     this.ensureLoaded('forward');
-    return await this.nativeModule.generate(text, speed);
-  }
-
-  /**
-   * Synthesizes pre-computed phonemes into speech, bypassing the built-in phonemizer.
-   * This allows using an external G2P system (e.g. the Python `phonemizer` library,
-   * espeak-ng, or any custom phonemizer).
-   * @param phonemes The pre-computed IPA phoneme string.
-   * @param speed Optional speed multiplier for the speech synthesis (default is 1.0).
-   * @returns A promise resolving to the synthesized audio waveform.
-   */
-  public async forwardFromPhonemes(
-    phonemes: string,
-    speed: number = 1.0
-  ): Promise<Float32Array> {
-    this.ensureLoaded('forwardFromPhonemes');
-    return await this.nativeModule.generateFromPhonemes(phonemes, speed);
+    return await this.nativeModule.generate(input, speed, phonemize);
   }
 
   /**
    * Starts a streaming synthesis session. Yields audio chunks as they are generated.
-   * @param input - Input object containing text and optional speed.
+   * @param input - Input object containing optional speed, phonemize flag and stopAutomatically flag.
    * @yields An audio chunk generated during synthesis.
    * @returns An async generator yielding Float32Array audio chunks.
    */
   public async *stream({
-    speed,
-    stopAutomatically,
+    speed = 1.0,
+    phonemize = true,
+    stopAutomatically = true,
   }: TextToSpeechStreamingInput): AsyncGenerator<Float32Array> {
-    // Stores computed audio segments
     const queue: Float32Array[] = [];
 
     let waiter: (() => void) | null = null;
@@ -165,66 +145,13 @@ export class TextToSpeechModule {
     (async () => {
       try {
         await this.nativeModule.stream(
-          speed,
-          stopAutomatically,
           (audio: number[]) => {
             queue.push(new Float32Array(audio));
             wake();
-          }
-        );
-        nativeStreamFinished = true;
-        wake();
-      } catch (e) {
-        error = e;
-        nativeStreamFinished = true;
-        wake();
-      }
-    })();
-
-    while (this.isStreaming) {
-      if (queue.length > 0) {
-        yield queue.shift()!;
-        if (nativeStreamFinished && queue.length === 0) {
-          return;
-        }
-        continue;
-      }
-      if (error) throw error;
-      await new Promise<void>((r) => (waiter = r));
-    }
-  }
-
-  /**
-   * Starts a streaming synthesis session from pre-computed phonemes.
-   * Bypasses the built-in phonemizer, allowing use of external G2P systems.
-   * @param input - Input object containing phonemes and optional speed.
-   * @yields An audio chunk generated during synthesis.
-   * @returns An async generator yielding Float32Array audio chunks.
-   */
-  public async *streamFromPhonemes({
-    phonemes,
-    speed,
-  }: TextToSpeechStreamingPhonemeInput): AsyncGenerator<Float32Array> {
-    const queue: Float32Array[] = [];
-
-    let waiter: (() => void) | null = null;
-    let error: unknown;
-    let nativeStreamFinished = false;
-
-    const wake = () => {
-      waiter?.();
-      waiter = null;
-    };
-
-    (async () => {
-      try {
-        await this.nativeModule.streamFromPhonemes(
-          phonemes,
+          },
           speed,
-          (audio: number[]) => {
-            queue.push(new Float32Array(audio));
-            wake();
-          }
+          phonemize,
+          stopAutomatically
         );
         nativeStreamFinished = true;
         wake();
@@ -244,16 +171,17 @@ export class TextToSpeechModule {
         continue;
       }
       if (error) throw error;
+      if (nativeStreamFinished && queue.length === 0) return;
       await new Promise<void>((r) => (waiter = r));
     }
   }
 
   /**
-   * Inserts new text chunk into the buffer to be processed in streaming mode.
-   * @param textChunk - The text fragment to append to the streaming buffer.
+   * Inserts new content (text or IPA phonemes) into the buffer to be processed in streaming mode.
+   * @param input - The text or phoneme fragment to append to the streaming buffer.
    */
-  public streamInsert(textChunk: string): void {
-    this.nativeModule.streamInsert(textChunk);
+  public streamInsert(input: string): void {
+    this.nativeModule.streamInsert(input);
   }
 
   /**
diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts
index 82a5a5471c..278eb18c30 100644
--- a/packages/react-native-executorch/src/types/tts.ts
+++ b/packages/react-native-executorch/src/types/tts.ts
@@ -1,11 +1,22 @@
 import { ResourceSource } from './common';
 import { RnExecutorchError } from '../errors/errorUtils';
 
+/**
+ * Per-model config for {@link TextToSpeechModule.fromModelName}.
+ * Each model name maps to its required fields.
+ * @category Types
+ */
+export type TextToSpeechModelSources = {
+  modelName: 'kokoro';
+  durationPredictorSource: ResourceSource;
+  synthesizerSource: ResourceSource;
+};
+
 /**
  * Union of all built-in Text to Speech model names.
  * @category Types
  */
-export type TextToSpeechModelName = 'kokoro-small' | 'kokoro-medium';
+export type TextToSpeechModelName = TextToSpeechModelSources['modelName'];
 
 /**
  * List all the languages available in TTS models (as lang shorthands)
@@ -13,68 +24,56 @@ export type TextToSpeechModelName = 'kokoro-small' | 'kokoro-medium';
  */
 export type TextToSpeechLanguage =
   | 'en-us' // American English
-  | 'en-gb'; // British English
+  | 'en-gb' // British English
+  | 'fr' // French
+  | 'es' // Spanish
+  | 'it' // Italian
+  | 'pt' // Portuguese
+  | 'pl' // Polish
+  | 'hi'; // Hindi
 
 /**
- * Voice configuration
- *
- * So far in Kokoro, each voice is directly associated with a language.
+ * Configuration for the Phonemizer used in Text-to-Speech models.
+ * Phonemization is the process of converting text into phonetic representations.
  * @category Types
- * @property {TextToSpeechLanguage} lang - speaker's language
- * @property {ResourceSource} voiceSource - a source to a binary file with voice embedding
- * @property {KokoroVoiceExtras} [extra] - an optional extra sources or properties related to specific voice
  */
-export interface VoiceConfig {
+export interface TextToSpeechPhonemizerConfig {
+  /**
+   * The language code for phonemization (e.g., 'en-us').
+   */
   lang: TextToSpeechLanguage;
-  voiceSource: ResourceSource;
-  extra?: KokoroVoiceExtras; // ... add more possible types
-}
 
-/**
- * Kokoro-specific voice extra props
- * @category Types
- * @property {ResourceSource} taggerSource - source to Kokoro's tagger model binary
- * @property {ResourceSource} lexiconSource - source to Kokoro's lexicon binary
- */
-export interface KokoroVoiceExtras {
-  taggerSource: ResourceSource;
-  lexiconSource: ResourceSource;
-}
+  /**
+   * Optional resource for the part-of-speech tagger.
+   * Utilized by more challenging languages, such as English.
+   */
+  taggerSource?: ResourceSource;
 
-/**
- * Kokoro model configuration.
- * Only the core Kokoro model sources, as phonemizer sources are included in voice configuration.
- * @category Types
- * @property {TextToSpeechModelName} modelName - model name identifier
- * @property {ResourceSource} durationPredictorSource - source to Kokoro's duration predictor model binary
- * @property {ResourceSource} synthesizerSource - source to Kokoro's synthesizer model binary
- */
-export interface KokoroConfig {
-  modelName: TextToSpeechModelName;
-  durationPredictorSource: ResourceSource;
-  synthesizerSource: ResourceSource;
-}
+  /**
+   * Optional resource for the pronunciation lexicon.
+   * If provided, it will be the primary phonemization mechanism.
+   */
+  lexiconSource?: ResourceSource;
 
-/**
- * General Text to Speech module configuration
- * @category Types
- * @property {KokoroConfig} model - a selected T2S model
- * @property {VoiceConfig} voice - a selected speaker's voice
- * @property {KokoroOptions} [options] - a completely optional model-specific configuration
- */
-export interface TextToSpeechConfig {
-  model: KokoroConfig; // ... add other model types in the future
-  voice: VoiceConfig;
+  /**
+   * Optional neural model resource for Grapheme-to-Phoneme conversion.
+   * Serves as a fallback for the lexicon, or as the primary phonemization mechanism if the lexicon
+   * is not defined.
+   */
+  neuralModelSource?: ResourceSource;
 }
 
 /**
- * Props for the useTextToSpeech hook.
+ * Configuration for a specific model and voice in a Text-to-Speech module.
  * @category Types
- * @augments TextToSpeechConfig
- * @property {boolean} [preventLoad] - Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook.
+ * @property {TextToSpeechModelSources} model - The model sources and identifiers.
+ * @property {ResourceSource} voiceSource - The resource containing the voice-specific tensor stored in a binary format.
+ * @property {TextToSpeechPhonemizerConfig} phonemizerConfig - The phonemizer configuration to be used with this voice.
  */
-export interface TextToSpeechProps extends TextToSpeechConfig {
-  preventLoad?: boolean;
+export interface TextToSpeechModelConfig {
+  model: TextToSpeechModelSources;
+  voiceSource: ResourceSource;
+  phonemizerConfig: TextToSpeechPhonemizerConfig;
 }
 
 /**
@@ -82,24 +81,13 @@ export interface TextToSpeechProps extends TextToSpeechConfig {
  * @category Types
  * @property {string} text - a text to be spoken
  * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes
+ * @property {boolean} [phonemize] - if true (default), the input is treated as text and converted to phonemes.
+ *                                   If false, the input should already be in IPA phonemes.
  */
 export interface TextToSpeechInput {
   text?: string;
   speed?: number;
-}
-
-/**
- * Text to Speech module input for pre-computed phonemes.
- * Use this when you have your own phonemizer (e.g. the Python `phonemizer`
- * library, espeak-ng, or any custom G2P system) and want to bypass the
- * built-in phonemizer pipeline.
- * @category Types
- * @property {string} phonemes - pre-computed IPA phoneme string
- * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes
- */
-export interface TextToSpeechPhonemeInput {
-  phonemes: string;
-  speed?: number;
+  phonemize?: boolean;
 }
 
 /**
@@ -136,17 +124,6 @@ export interface TextToSpeechType {
    */
   forward: (input: TextToSpeechInput) => Promise<Float32Array>;
 
-  /**
-   * Synthesizes pre-computed phonemes into speech audio in a single pass.
-   * Bypasses the built-in phonemizer, allowing use of external G2P systems.
-   * @param input - The `TextToSpeechPhonemeInput` object containing pre-computed `phonemes` and optional `speed`.
-   * @returns A Promise that resolves with the generated audio data.
-   * @throws {RnExecutorchError} If the model is not loaded or is currently generating.
-   */
-  forwardFromPhonemes: (
-    input: TextToSpeechPhonemeInput
-  ) => Promise<Float32Array>;
-
   /**
    * Streams the generated audio data incrementally.
    * This is optimal for real-time playback, allowing audio to start playing before the full text is synthesized.
@@ -156,16 +133,6 @@ export interface TextToSpeechType {
    */
   stream: (input: TextToSpeechStreamingInput) => Promise<void>;
 
-  /**
-   * Streams pre-computed phonemes incrementally, bypassing the built-in phonemizer.
-   * @param input - The streaming input with pre-computed `phonemes` instead of `text`.
-   * @returns A Promise that resolves when the streaming process is complete.
-   * @throws {RnExecutorchError} If the model is not loaded or is currently generating.
-   */
-  streamFromPhonemes: (
-    input: TextToSpeechStreamingPhonemeInput
-  ) => Promise<void>;
-
   /**
    * Inserts new text chunk into the buffer to be processed in streaming mode.
    */
@@ -209,11 +176,3 @@ export interface TextToSpeechStreamingInput
   extends TextToSpeechInput, TextToSpeechStreamingCallbacks {
   stopAutomatically?: boolean;
 }
-
-/**
- * Streaming input definition for pre-computed phonemes.
- * Same as `TextToSpeechStreamingInput` but accepts `phonemes` instead of `text`.
- * @category Types
- */
-export interface TextToSpeechStreamingPhonemeInput
-  extends TextToSpeechPhonemeInput, TextToSpeechStreamingCallbacks {}
diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a
deleted file mode 100644
index 5a38707580..0000000000
Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a and /dev/null differ
diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a
deleted file mode 100644
index 2306d4647a..0000000000
Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a and /dev/null differ
diff --git a/packages/react-native-executorch/third-party/common/phonemis b/packages/react-native-executorch/third-party/common/phonemis
new file mode 160000
index 0000000000..2da5ef9971
--- /dev/null
+++ b/packages/react-native-executorch/third-party/common/phonemis
@@ -0,0 +1 @@
+Subproject commit 2da5ef9971fe0e2d92ebe1424c28905a18268a7d
diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h
deleted file mode 100644
index 3af4268211..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h
+++ /dev/null
@@ -1,60 +0,0 @@
-#pragma once
-
-#include "../tagger/tag.h"
-#include "types.h"
-#include <optional>
-#include <string>
-#include <unordered_map>
-
-namespace phonemis::phonemizer {
-
-// Lexicon class
-// Provides phonemization of extracted tokens.
-// Wrapps a dictionary lookup for given word with additional
-// pre/post-processing.
-class Lexicon {
-public:
-  Lexicon(Lang language, const std::string &dict_filepath);
-
-  // Checks if given world exists in the lexicon in any form
-  bool is_known(const std::string &word) const;
-
-  // Returns the phonemization for given word, or "" if the phonemization failed
-  std::u32string get(const std::string &word, const tagger::Tag &tag,
-                     std::optional<float> base_stress = std::nullopt,
-                     std::optional<bool> vowel_next = std::nullopt);
-
-private:
-  // Helper functions - extract phonemes without stressing
-  std::u32string get_word(const std::string &word, const tagger::Tag &tag,
-                          std::optional<float> stress,
-                          std::optional<bool> vowel_next) const;
-
-  // Helper functions - word+suffix phonemization
-  // Phonemizes word ending with popular english suffixes, example: -ed, -s,
-  // -ing.
-  std::u32string stem_s(const std::string &word, const tagger::Tag &tag,
-                        std::optional<float> stress) const;
-  std::u32string stem_ed(const std::string &word, const tagger::Tag &tag,
-                         std::optional<float> stress) const;
-  std::u32string stem_ing(const std::string &word, const tagger::Tag &tag,
-                          std::optional<float> stress) const;
-
-  // Helper functions - dictionary lookup with stressing
-  // Returns an empty phoneme string if failed to extract phonemes.
-  std::u32string lookup(const std::string &word, const tagger::Tag &tag,
-                        std::optional<float> stress) const;
-  std::u32string lookup_nnp(const std::string &word) const;
-  std::u32string lookup_special(const std::string &word, const tagger::Tag &tag,
-                                std::optional<float> stress,
-                                std::optional<bool> vowel_next) const;
-
-  // Resolved language
-  Lang language_;
-
-  // Lookup dictionary: text -> phonemes
-  // Provide quick and direct phonemization for popular words.
-  std::unordered_map<std::string, std::u32string> dict_ = {};
-};
-
-} // namespace phonemis::phonemizer
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h
deleted file mode 100644
index 27f993939c..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h
+++ /dev/null
@@ -1,29 +0,0 @@
-#pragma once
-
-#include "lexicon.h"
-#include <memory>
-#include <string>
-
-namespace phonemis::phonemizer {
-
-// Phonemizer class
-// Combines lexicon lookup-style phonemization with rule-based fallback
-class Phonemizer {
-public:
-  Phonemizer(Lang language, const std::string &lexicon_filepath = "");
-
-  // Main phonemization method
-  std::u32string phonemize(const std::string &word, const tagger::Tag &tag,
-                           std::optional<float> base_stress = std::nullopt,
-                           std::optional<bool> vowel_next = std::nullopt) const;
-
-private:
-  // Helper functions - rule-based fallback methods
-  std::u32string fallback(const std::string &word,
-                          const tagger::Tag &tag) const;
-
-  // Lexicon component
-  std::unique_ptr<Lexicon> lexicon_ = nullptr;
-};
-
-} // namespace phonemis::phonemizer
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h
deleted file mode 100644
index 7e6e8b4bcb..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h
+++ /dev/null
@@ -1,13 +0,0 @@
-#pragma once
-
-namespace phonemis::phonemizer {
-
-// Available languages (english variants)
-enum class Lang {
-  EN_US,
-  EN_GB,
-
-  DEFAULT = EN_US
-};
-
-} // namespace phonemis::phonemizer
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/pipeline.h b/packages/react-native-executorch/third-party/include/phonemis/pipeline.h
deleted file mode 100644
index e8fdf35e31..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/pipeline.h
+++ /dev/null
@@ -1,35 +0,0 @@
-#pragma once
-
-#include "phonemizer/phonemizer.h"
-#include "preprocessor/tools.h"
-#include "tagger/tagger.h"
-#include "tokenizer/tokenize.h"
-#include <memory>
-
-namespace phonemis {
-
-using phonemizer::Lang;
-using phonemizer::Phonemizer;
-using tagger::Tagger;
-
-// #### Main phonemization pipeline
-// Manages all the phonemization parts, from preprocessing, through
-// tokenization and tagging to final Phonemizer call.
-// Tagger and Lexicon .json data files are theoretically optional, but
-// skipping these arguments will significantly impact the phonemization quality.
-class Pipeline {
-public:
-  Pipeline(Lang language, const std::string &tagger_data_filepath = "",
-           const std::string &lexicon_data_filepath = "");
-
-  std::u32string process(const std::string &text);
-
-private:
-  Lang language_;
-
-  // Pipeline subcomponents
-  std::unique_ptr<Phonemizer> phonemizer_ = nullptr;
-  std::unique_ptr<Tagger> tagger_ = nullptr;
-};
-
-} // namespace phonemis
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h
deleted file mode 100644
index 9f77ba43de..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h
+++ /dev/null
@@ -1,21 +0,0 @@
-#pragma once
-
-#include <string>
-#include <vector>
-
-namespace phonemis::preprocessor {
-
-// Normalizes the text by replacing all foreign characters
-// to latin-only phrases.
-std::string normalize_unicode(const std::string &text);
-
-// Divides a monolit text into multiple sentences.
-// A sentence always ends with a end of sentence character (defined in
-// constants.h).
-std::vector<std::string> split_sentences(const std::string &text);
-
-// Converts all the numbers in the text to spoken representations.
-// Usually expands the size of the text.
-std::string verbalize_numbers(const std::string &text);
-
-} // namespace phonemis::preprocessor
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h b/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h
deleted file mode 100644
index ba59af4e9b..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h
+++ /dev/null
@@ -1,49 +0,0 @@
-#pragma once
-
-#include "../utilities/string_utils.h"
-#include <string>
-
-namespace phonemis::tagger {
-
-using namespace utilities;
-
-// Tag class definition
-// An abstraction layer which wrapps a simple string-based tag definition
-// with some additional logic.
-class Tag : public std::string {
-public:
-  // Inherit constructors and assignment from std::string
-  using std::string::string;
-  using std::string::operator=;
-  Tag(std::string const &s) : std::string(s) {}
-  Tag(std::string &&s) : std::string(std::move(s)) {}
-
-  // Extra logic
-  Tag parent_tag() const {
-    auto this_tag = static_cast<const std::string &>(*this);
-    if (this_tag == "VERB" || string_utils::starts_with(this_tag, "VB"))
-      return {"VERB"};
-    if (this_tag == "NOUN" || string_utils::starts_with(this_tag, "NN"))
-      return {"NOUN"};
-    if (string_utils::starts_with(this_tag, "ADV") ||
-        string_utils::starts_with(this_tag, "RB"))
-      return {"ADV"};
-    if (string_utils::starts_with(this_tag, "ADJ") ||
-        string_utils::starts_with(this_tag, "JJ"))
-      return {"ADJ"};
-    return (*this);
-  }
-};
-
-} // namespace phonemis::tagger
-
-// Hash definition
-// Required to use Tag objects as map keys.
-namespace std {
-template <> struct hash<phonemis::tagger::Tag> {
-  size_t operator()(phonemis::tagger::Tag const &t) const noexcept {
-    // Use std::string's hash implementation
-    return std::hash<std::string>()(static_cast<std::string const &>(t));
-  }
-};
-} // namespace std
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h b/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h
deleted file mode 100644
index c5ef085b7a..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h
+++ /dev/null
@@ -1,37 +0,0 @@
-#pragma once
-
-#include "../tokenizer/tokens.h"
-#include "tag.h"
-#include <string>
-#include <unordered_map>
-#include <unordered_set>
-#include <vector>
-
-namespace phonemis::tagger {
-
-// Tagger class
-// Provides PoS (Part of Speech) tagging functionality.
-// Requires a previous tokenization of the text (tokenizer module).
-// A modification of the Viterbi algorithm for bigram HMM (Hidden Markov Model)
-// tagger.
-class Tagger {
-public:
-  explicit Tagger(const std::string &hmm_data_path);
-
-  // Main tagging method - a modified Viterbi algorithm
-  // Works in place bo modyfing the 'tag' fields.
-  void tag(std::vector<tokenizer::Token> &sentence) const;
-
-private:
-  // Set of possible tags (states)
-  std::unordered_set<Tag> tags_;
-
-  // Probability maps - loaded from the input json file.
-  std::unordered_map<Tag, double> start_probs_ = {};
-  std::unordered_map<Tag, std::unordered_map<std::string, double>>
-      emission_probs_ = {};
-  std::unordered_map<Tag, std::unordered_map<Tag, double>> transition_probs_ =
-      {};
-};
-
-} // namespace phonemis::tagger
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h
deleted file mode 100644
index ab52e6946c..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h
+++ /dev/null
@@ -1,14 +0,0 @@
-#pragma once
-
-#include "tokens.h"
-#include "types.h"
-#include <string>
-#include <vector>
-
-namespace phonemis::tokenizer {
-
-// Tokenizes the input text into a vector of strings (tokens).
-// Follows specific rules for special characters and special words.
-std::vector<Token> tokenize(const std::string &text);
-
-} // namespace phonemis::tokenizer
diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h
deleted file mode 100644
index 0f1c0d5f4e..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include "../tagger/tag.h"
-#include <optional>
-#include <string>
-
-namespace phonemis::tokenizer {
-
-// A main structure representing a single token extracted from text
-// Mandatory fields are extracted during the tokenization stage, while
-// extra fields might be processed later (for example, during the tagging stage)
-struct Token {
-  std::string text;
-  std::string whitespace = ""; // Following whitespace
-  bool is_first = false;       // Whether it is a first token in the sentence
-
-  // Extras
-  std::optional<tagger::Tag> tag =
-      std::nullopt; // A PoS (Part of Speech) tag, example: NN (noun)
-};
-
-} // namespace phonemis::tokenizer
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h
deleted file mode 100644
index 45e84a8735..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h
+++ /dev/null
@@ -1,22 +0,0 @@
-#pragma once
-
-#include <string>
-
-namespace phonemis::tokenizer {
-
-namespace rules {
-// Separation rules for special characters
-enum class Separation {
-  JOIN_LEFT,    // Join to the word on its left
-  JOIN_RIGHT,   // Join to the word on its right
-  TOTAL_DIVIDE, // Always separate from both sides
-  TOTAL_JOIN    // Always join both sides
-};
-} // namespace rules
-
-struct SpecialCharacter {
-  char character;
-  rules::Separation sep_rule;
-};
-
-} // namespace phonemis::tokenizer
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h b/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h
deleted file mode 100644
index 481212cbe4..0000000000
--- a/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h
+++ /dev/null
@@ -1,155 +0,0 @@
-#pragma once
-
-#include <algorithm>
-#include <codecvt>
-#include <functional>
-#include <optional>
-#include <string>
-#include <string_view>
-#include <vector>
-
-namespace phonemis::utilities::string_utils {
-
-// -------------------------------------
-// String utils - byte format conversion
-// -------------------------------------
-
-// TODO: deprecated, replace with something else
-
-inline std::string char32_to_utf8(char32_t c) {
-  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> convert;
-  return convert.to_bytes(&c, &c + 1);
-}
-
-inline std::u32string utf8_to_u32string(const std::string &utf8) {
-  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> convert;
-  return convert.from_bytes(utf8);
-}
-
-inline std::string u32string_to_utf8(const std::u32string &u32) {
-  std::wstring_convert<std::codecvt_utf8<char32_t>, char32_t> convert;
-  return convert.to_bytes(u32);
-}
-
-// ----------------------------------------
-// String utils - capitalizing & lowerizing
-// ----------------------------------------
-
-// Capitalization (first letter only)
-template <typename StringT> inline void capitalize__(StringT &str) {
-  if (!str.empty())
-    str[0] = std::toupper(str[0]);
-}
-
-// Capitalization (an entire string)
-template <typename StringT> inline void to_upper__(StringT &str) {
-  std::transform(str.cbegin(), str.cend(), str.begin(),
-                 [](auto c) { return std::toupper(c); });
-}
-
-// Lowerization (an entire string)
-template <typename StringT> inline void to_lower__(StringT &str) {
-  std::transform(str.cbegin(), str.cend(), str.begin(),
-                 [](auto c) { return std::tolower(c); });
-}
-
-// ------------------------------------
-// String utils - other transformations
-// ------------------------------------
-
-// Filters a given string and omits all the characters which
-// do not pass given predicate.
-template <typename StringT, typename Pred>
-inline void filter__(StringT &str, Pred pred) {
-  str.erase(std::remove_if(str.begin(), str.end(), pred), str.end());
-}
-
-// Replaces all the occurances of a character `a` with a character `b`.
-// If `b` is not specified, then it removes all occurances of `a` without
-// replacement.
-template <typename StringT, typename CharT>
-inline void replace__(StringT &str, CharT a, std::optional<CharT> b) {
-  if (b.has_value())
-    std::replace(str.begin(), str.end(), a, b.value());
-  else
-    str.erase(std::remove(str.begin(), str.end(), a), str.end());
-}
-
-// Splits the string by the given character.
-template <typename StringT, typename CharT>
-inline std::vector<StringT> split(const StringT &str, CharT bpoint) {
-  std::vector<StringT> result = {};
-
-  auto it = str.begin();
-  while (it != str.end()) {
-    auto next = std::find(it, str.end(), bpoint);
-    result.emplace_back(it, next);
-
-    it = next;
-    if (it != str.end())
-      it++;
-  }
-
-  return result;
-}
-
-// Removes the leading and trailing characters equals to given character.
-// If the character is not specified, it removes white spaces instead.
-template <typename StringT, typename CharT>
-inline StringT strip(const StringT &str,
-                     std::optional<CharT> c = std::nullopt) {
-  auto lbound = std::find_if(str.cbegin(), str.cend(), [&c](CharT a) -> bool {
-    return c.has_value() ? a != c : !std::isspace(a);
-  });
-  auto rbound = std::find_if(str.crbegin(), str.crend(), [&c](CharT a) -> bool {
-    return c.has_value() ? a != c : !std::isspace(a);
-  });
-
-  return lbound != str.end() ? StringT(lbound, std::prev(rbound.base()))
-                             : StringT();
-}
-
-// -------------------------
-// String utils - predicates
-// -------------------------
-
-// Returns true if the string contains only alphabetic characters.
-template <typename StringT> inline bool is_alpha(const StringT &str) {
-  return std::all_of(str.cbegin(), str.cend(),
-                     [](char c) -> bool { return std::isalpha(c); });
-}
-
-// Returns true if the string starts with given suffix and false otherwise
-template <typename StringT>
-inline bool starts_with(const StringT &str, std::string_view prefix) {
-  return str.size() >= prefix.size() && str.substr(0, prefix.size()) == prefix;
-}
-
-// Returns true if the string ends with given suffix and false otherwise
-template <typename StringT>
-inline bool ends_with(const StringT &str, std::string_view suffix) {
-  return str.size() >= suffix.size() &&
-         str.substr(str.size() - suffix.size()) == suffix;
-}
-
-// --------------------------------------
-// String utils - (non)in-place resolving
-// --------------------------------------
-
-// Generates non-mutating wrapper `name(...)` that calls `name__(...)`
-// Used to create a non-inplace versions of the above functions.
-#define MAKE_NON_INPLACE(name)                                                 \
-  template <typename StringT, typename... Args>                                \
-  inline StringT name(const StringT &str, Args &&...args) {                    \
-    StringT tmp = str;                                                         \
-    name##__(tmp, std::forward<Args>(args)...);                                \
-    return tmp;                                                                \
-  }
-
-MAKE_NON_INPLACE(capitalize)
-MAKE_NON_INPLACE(to_lower)
-MAKE_NON_INPLACE(to_upper)
-MAKE_NON_INPLACE(filter)
-MAKE_NON_INPLACE(replace)
-
-} // namespace phonemis::utilities::string_utils
\ No newline at end of file
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist
index bd0373672c..b2b2aa2478 100644
Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist differ
diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist
index 2372838d49..a6f2d4a5dc 100644
Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a
deleted file mode 100644
index 78f5169308..0000000000
Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a and /dev/null differ
diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a
deleted file mode 100644
index ccf1d2fa64..0000000000
Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a and /dev/null differ
diff --git a/yarn.lock b/yarn.lock
index 3ecfd274a6..88c6a20978 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -15273,6 +15273,20 @@ __metadata:
   languageName: node
   linkType: hard
 
+"react-native-audio-api@npm:0.11.5":
+  version: 0.11.5
+  resolution: "react-native-audio-api@npm:0.11.5"
+  dependencies:
+    semver: "npm:^7.7.3"
+  peerDependencies:
+    react: "*"
+    react-native: "*"
+  bin:
+    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+  checksum: 10/f8a388954c42cfd390b9adbfe6781f9d8049d43ea6ab83a8b229a0d0082df3489d9b48072d7166403ae95a33e8d741aab86ba2307d1bd4ff949fdb72e14ef42d
+  languageName: node
+  linkType: hard
+
 "react-native-audio-api@npm:0.12.0":
   version: 0.12.0
   resolution: "react-native-audio-api@npm:0.12.0"
@@ -16691,7 +16705,7 @@ __metadata:
     metro-config: "npm:^0.83.0"
     react: "npm:19.2.5"
     react-native: "npm:0.83.4"
-    react-native-audio-api: "npm:0.12.0"
+    react-native-audio-api: "npm:0.11.5"
     react-native-device-info: "npm:^15.0.2"
     react-native-executorch: "workspace:*"
     react-native-executorch-expo-resource-fetcher: "workspace:*"