diff --git a/.cspell-wordlist.txt b/.cspell-wordlist.txt index 84d006eefe..55eda9bfd5 100644 --- a/.cspell-wordlist.txt +++ b/.cspell-wordlist.txt @@ -203,3 +203,5 @@ fishjam Fishjam deinitialize Deinitialize +phonemize +phonemization \ No newline at end of file diff --git a/.gitmodules b/.gitmodules index 31a5d6e4b4..290a297a4c 100644 --- a/.gitmodules +++ b/.gitmodules @@ -4,3 +4,7 @@ [submodule "third-party/googletest"] path = third-party/googletest url = https://github.com/google/googletest.git +[submodule "packages/react-native-executorch/third-party/common/phonemis"] + path = packages/react-native-executorch/third-party/common/phonemis + url = https://github.com/IgorSwat/Phonemis + branch = main diff --git a/apps/speech/components/ModelPicker.tsx b/apps/speech/components/ModelPicker.tsx index 5e8284ee9a..9fee51ff34 100644 --- a/apps/speech/components/ModelPicker.tsx +++ b/apps/speech/components/ModelPicker.tsx @@ -1,10 +1,12 @@ import React, { useEffect, useRef, useState } from 'react'; import { Dimensions, + Modal, ScrollView, StyleSheet, Text, TouchableOpacity, + TouchableWithoutFeedback, View, } from 'react-native'; @@ -21,7 +23,7 @@ type Props = { disabled?: boolean; }; -const DROPDOWN_MAX_HEIGHT = 200; +const DROPDOWN_MAX_HEIGHT = 300; export function ModelPicker({ models, @@ -31,8 +33,11 @@ export function ModelPicker({ disabled, }: Props) { const [open, setOpen] = useState(false); - const [triggerHeight, setTriggerHeight] = useState(0); - const [expandUp, setExpandUp] = useState(false); + const [dropdownLayout, setDropdownLayout] = useState({ + x: 0, + y: 0, + width: 0, + }); const triggerRef = useRef>(null); const selected = models.find((m) => m.value === selectedModel); @@ -50,23 +55,22 @@ export function ModelPicker({ ( _x: number, _y: number, - _width: number, + width: number, height: number, - _pageX: number, + pageX: number, pageY: number ) => { - setTriggerHeight(height); const spaceBelow = Dimensions.get('window').height - (pageY + height); - 
setExpandUp(spaceBelow < DROPDOWN_MAX_HEIGHT); + const y = + spaceBelow >= DROPDOWN_MAX_HEIGHT + ? pageY + height + 2 + : pageY - Math.min(DROPDOWN_MAX_HEIGHT, models.length * 42) - 2; + setDropdownLayout({ x: pageX, y, width }); setOpen(true); } ); }; - const dropdownPosition = expandUp - ? { bottom: triggerHeight + 2 } - : { top: triggerHeight + 2 }; - return ( ({ {open ? '▲' : '▼'} - {open && ( - - {models.map((item) => { - const isSelected = item.value === selectedModel; - return ( - { - onSelect(item.value); - setOpen(false); - }} - > - - {item.label} - - - ); - })} - - )} + setOpen(false)} + > + setOpen(false)}> + + + {models.map((item) => { + const isSelected = item.value === selectedModel; + return ( + { + onSelect(item.value); + setOpen(false); + }} + > + + {item.label} + + + ); + })} + + + + ); } @@ -119,7 +139,6 @@ const styles = StyleSheet.create({ marginHorizontal: 12, marginVertical: 4, alignSelf: 'stretch', - zIndex: 100, }, trigger: { flexDirection: 'row', @@ -151,19 +170,15 @@ const styles = StyleSheet.create({ marginLeft: 6, }, dropdown: { - position: 'absolute', - left: 0, - right: 0, borderWidth: 1, borderColor: '#C1C6E5', borderRadius: 8, backgroundColor: '#fff', maxHeight: DROPDOWN_MAX_HEIGHT, - zIndex: 100, - elevation: 4, + elevation: 8, shadowColor: '#000', shadowOffset: { width: 0, height: 2 }, - shadowOpacity: 0.1, + shadowOpacity: 0.15, shadowRadius: 4, }, option: { diff --git a/apps/speech/package.json b/apps/speech/package.json index 2beb2cc41d..377e2e5700 100644 --- a/apps/speech/package.json +++ b/apps/speech/package.json @@ -20,7 +20,7 @@ "metro-config": "^0.83.0", "react": "19.2.5", "react-native": "0.83.4", - "react-native-audio-api": "0.12.0", + "react-native-audio-api": "0.11.5", "react-native-device-info": "^15.0.2", "react-native-executorch": "workspace:*", "react-native-executorch-expo-resource-fetcher": "workspace:*", diff --git a/apps/speech/screens/Quiz.tsx b/apps/speech/screens/Quiz.tsx index 8f03f1ae6d..ae7cf69998 100644 
--- a/apps/speech/screens/Quiz.tsx +++ b/apps/speech/screens/Quiz.tsx @@ -18,8 +18,7 @@ import Animated, { } from 'react-native-reanimated'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { - KOKORO_MEDIUM, - KOKORO_VOICE_AM_SANTA, + KOKORO_AMERICAN_ENGLISH_MALE_SANTA, useTextToSpeech, } from 'react-native-executorch'; import { @@ -60,10 +59,7 @@ const createAudioBufferFromVector = ( export const Quiz = ({ onBack }: { onBack: () => void }) => { // --- Hooks & State --- - const model = useTextToSpeech({ - model: KOKORO_MEDIUM, - voice: KOKORO_VOICE_AM_SANTA, - }); + const model = useTextToSpeech(KOKORO_AMERICAN_ENGLISH_MALE_SANTA); const [shuffledQuestions] = useState(() => shuffleArray(QUESTIONS)); const [currentIndex, setCurrentIndex] = useState(0); diff --git a/apps/speech/screens/TextToSpeechLLMScreen.tsx b/apps/speech/screens/TextToSpeechLLMScreen.tsx index e99072869b..d94180096d 100644 --- a/apps/speech/screens/TextToSpeechLLMScreen.tsx +++ b/apps/speech/screens/TextToSpeechLLMScreen.tsx @@ -12,8 +12,7 @@ import SWMIcon from '../assets/swm_icon.svg'; import { useLLM, useTextToSpeech, - KOKORO_MEDIUM, - KOKORO_VOICE_AF_HEART, + KOKORO_AMERICAN_ENGLISH_FEMALE_HEART, LLAMA3_2_1B_QLORA, } from 'react-native-executorch'; import { @@ -54,10 +53,7 @@ export const TextToSpeechLLMScreen = ({ onBack }: TextToSpeechLLMProps) => { const [displayText, setDisplayText] = useState(''); const [isTtsStreaming, setIsTtsStreaming] = useState(false); const llm = useLLM({ model: LLAMA3_2_1B_QLORA }); - const tts = useTextToSpeech({ - model: KOKORO_MEDIUM, - voice: KOKORO_VOICE_AF_HEART, - }); + const tts = useTextToSpeech(KOKORO_AMERICAN_ENGLISH_FEMALE_HEART); const processedLengthRef = useRef(0); const audioContextRef = useRef(null); diff --git a/apps/speech/screens/TextToSpeechScreen.tsx b/apps/speech/screens/TextToSpeechScreen.tsx index 0cb64bfae7..65b3ca7506 100644 --- a/apps/speech/screens/TextToSpeechScreen.tsx +++ 
b/apps/speech/screens/TextToSpeechScreen.tsx @@ -10,37 +10,52 @@ import { } from 'react-native'; import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { - KOKORO_SMALL, - KOKORO_MEDIUM, - KOKORO_VOICE_AF_HEART, - KOKORO_VOICE_AF_RIVER, - KOKORO_VOICE_AF_SARAH, - KOKORO_VOICE_AM_ADAM, - KOKORO_VOICE_AM_MICHAEL, - KOKORO_VOICE_AM_SANTA, - KOKORO_VOICE_BF_EMMA, - KOKORO_VOICE_BM_DANIEL, useTextToSpeech, - KokoroConfig, - VoiceConfig, + TextToSpeechModelConfig, + KOKORO_AMERICAN_ENGLISH_FEMALE_HEART, + KOKORO_AMERICAN_ENGLISH_FEMALE_RIVER, + KOKORO_AMERICAN_ENGLISH_FEMALE_SARAH, + KOKORO_AMERICAN_ENGLISH_MALE_ADAM, + KOKORO_AMERICAN_ENGLISH_MALE_MICHAEL, + KOKORO_AMERICAN_ENGLISH_MALE_SANTA, + KOKORO_BRITISH_ENGLISH_FEMALE_EMMA, + KOKORO_BRITISH_ENGLISH_MALE_DANIEL, + KOKORO_FRENCH_FEMALE_SIWIS, + KOKORO_SPANISH_FEMALE_DORA, + KOKORO_SPANISH_MALE_ALEX, + KOKORO_ITALIAN_FEMALE_SARA, + KOKORO_ITALIAN_MALE_NICOLA, + KOKORO_PORTUGUESE_FEMALE_DORA, + KOKORO_PORTUGUESE_MALE_SANTA, + KOKORO_POLISH_MALE_MATEUSZ, + KOKORO_HINDI_FEMALE_ALPHA, + KOKORO_HINDI_MALE_OMEGA, + KOKORO_HINDI_MALE_PSI, } from 'react-native-executorch'; import { ModelPicker, ModelOption } from '../components/ModelPicker'; -const TTS_MODELS: ModelOption[] = [ - { label: 'Kokoro Small', value: KOKORO_SMALL }, - { label: 'Kokoro Medium', value: KOKORO_MEDIUM }, +const VOICES: ModelOption[] = [ + { label: '🇺🇸 AF Heart', value: KOKORO_AMERICAN_ENGLISH_FEMALE_HEART }, + { label: '🇺🇸 AF River', value: KOKORO_AMERICAN_ENGLISH_FEMALE_RIVER }, + { label: '🇺🇸 AF Sarah', value: KOKORO_AMERICAN_ENGLISH_FEMALE_SARAH }, + { label: '🇺🇸 AM Adam', value: KOKORO_AMERICAN_ENGLISH_MALE_ADAM }, + { label: '🇺🇸 AM Michael', value: KOKORO_AMERICAN_ENGLISH_MALE_MICHAEL }, + { label: '🇺🇸 AM Santa', value: KOKORO_AMERICAN_ENGLISH_MALE_SANTA }, + { label: '🇬🇧 BF Emma', value: KOKORO_BRITISH_ENGLISH_FEMALE_EMMA }, + { label: '🇬🇧 BM Daniel', value: KOKORO_BRITISH_ENGLISH_MALE_DANIEL }, + { label: '🇫🇷 
FF Siwis', value: KOKORO_FRENCH_FEMALE_SIWIS }, + { label: '🇪🇸 EF Dora', value: KOKORO_SPANISH_FEMALE_DORA }, + { label: '🇪🇸 EM Alex', value: KOKORO_SPANISH_MALE_ALEX }, + { label: '🇮🇹 IF Sara', value: KOKORO_ITALIAN_FEMALE_SARA }, + { label: '🇮🇹 IM Nicola', value: KOKORO_ITALIAN_MALE_NICOLA }, + { label: '🇵🇹 PF Dora', value: KOKORO_PORTUGUESE_FEMALE_DORA }, + { label: '🇵🇹 PM Santa', value: KOKORO_PORTUGUESE_MALE_SANTA }, + { label: '🇵🇱 PM Mateusz', value: KOKORO_POLISH_MALE_MATEUSZ }, + { label: '🇮🇳 HF Alpha', value: KOKORO_HINDI_FEMALE_ALPHA }, + { label: '🇮🇳 HM Omega', value: KOKORO_HINDI_MALE_OMEGA }, + { label: '🇮🇳 HM Psi', value: KOKORO_HINDI_MALE_PSI }, ]; -const VOICES: ModelOption[] = [ - { label: 'AF Heart', value: KOKORO_VOICE_AF_HEART }, - { label: 'AF River', value: KOKORO_VOICE_AF_RIVER }, - { label: 'AF Sarah', value: KOKORO_VOICE_AF_SARAH }, - { label: 'AM Adam', value: KOKORO_VOICE_AM_ADAM }, - { label: 'AM Michael', value: KOKORO_VOICE_AM_MICHAEL }, - { label: 'AM Santa', value: KOKORO_VOICE_AM_SANTA }, - { label: 'BF Emma', value: KOKORO_VOICE_BF_EMMA }, - { label: 'BM Daniel', value: KOKORO_VOICE_BM_DANIEL }, -]; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { AudioManager, @@ -77,16 +92,11 @@ const createAudioBufferFromVector = ( }; export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => { - const [selectedModel, setSelectedModel] = - useState(KOKORO_MEDIUM); - const [selectedVoice, setSelectedVoice] = useState( - KOKORO_VOICE_AF_HEART + const [selectedVoice, setSelectedVoice] = useState( + KOKORO_AMERICAN_ENGLISH_FEMALE_HEART ); - const model = useTextToSpeech({ - model: selectedModel, - voice: selectedVoice, - }); + const model = useTextToSpeech(selectedVoice); const [inputText, setInputText] = useState(''); const [isPlaying, setIsPlaying] = useState(false); @@ -94,6 +104,7 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => { const [error, setError] = useState(null); const 
audioContextRef = useRef(null); + const gainNodeRef = useRef(null); const sourceRef = useRef(null); useEffect(() => { @@ -103,12 +114,20 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => { iosOptions: ['defaultToSpeaker'], }); - audioContextRef.current = new AudioContext({ sampleRate: 24000 }); - audioContextRef.current.suspend(); + const context = new AudioContext({ sampleRate: 24000 }); + audioContextRef.current = context; + context.suspend(); + + // Increase the audio volume + const gainNode = context.createGain(); + gainNode.gain.value = 2.0; // Increase volume by 2x + gainNode.connect(context.destination); + gainNodeRef.current = gainNode; return () => { audioContextRef.current?.close(); audioContextRef.current = null; + gainNodeRef.current = null; }; }, []); @@ -142,7 +161,12 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => { const source = (sourceRef.current = audioContext.createBufferSource()); source.buffer = audioBuffer; - source.connect(audioContext.destination); + + if (gainNodeRef.current) { + source.connect(gainNodeRef.current); + } else { + source.connect(audioContext.destination); + } source.onEnded = () => resolve(); @@ -157,6 +181,7 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => { await model.stream({ text: inputText, + phonemize: true, onNext, onEnd, }); @@ -197,13 +222,6 @@ export const TextToSpeechScreen = ({ onBack }: { onBack: () => void }) => { setError(null)} /> - setSelectedModel(m)} - /> class ModelHostObject : public JsiHostObject { addFunctions(JSI_EXPORT_FUNCTION( ModelHostObject, synchronousHostFunction<&Model::streamInsert>, "streamInsert")); - addFunctions( - JSI_EXPORT_FUNCTION(ModelHostObject, - promiseHostFunction<&Model::generateFromPhonemes>, - "generateFromPhonemes")); - - addFunctions( - JSI_EXPORT_FUNCTION(ModelHostObject, - promiseHostFunction<&Model::streamFromPhonemes>, - "streamFromPhonemes")); } if constexpr 
(meta::HasGenerateFromString) { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp index ff71d2b536..4603cf6656 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.cpp @@ -48,10 +48,10 @@ DurationPredictor::DurationPredictor( [](const auto &a, const auto &b) { return a.second < b.second; }); } -std::tuple, int32_t> +std::tuple, int32_t, std::vector> DurationPredictor::generate(std::span tokens, - std::span textMask, std::span ref_hs, - float speed) { + std::span textMask, + std::span ref_hs, float speed) { size_t inputSize = tokens.size(); // Perform input shape checks @@ -78,11 +78,15 @@ DurationPredictor::generate(std::span tokens, auto tokensTensor = make_tensor_ptr({1, static_cast(tokens.size())}, const_cast(tokens.data()), ScalarType::Long); + auto textMaskTensor = make_tensor_ptr({1, static_cast(textMask.size())}, - textMask.data(), ScalarType::Bool); - auto voiceRefTensor = make_tensor_ptr({1, constants::kVoiceRefHalfSize}, - ref_hs.data(), ScalarType::Float); + const_cast(textMask.data()), ScalarType::Bool); + + auto voiceRefTensor = + make_tensor_ptr({1, constants::kVoiceRefHalfSize}, + const_cast(ref_hs.data()), ScalarType::Float); + auto speedTensor = make_tensor_ptr({1}, &speed, ScalarType::Float); // Execute the appropriate "forward_xyz" method, based on given method name @@ -126,6 +130,10 @@ DurationPredictor::generate(std::span tokens, indices.begin(), std::lower_bound(indices.begin(), indices.end(), originalLength)); + // Calculate timestamps - based on predicted durations. 
+ std::vector timestamps = + calculateTimestamps(predDurPtr, inputSize); + /** * Returns: * - d: tensor containing the predicted durations for each token. @@ -133,13 +141,30 @@ DurationPredictor::generate(std::span tokens, * - effDuration: an effective duration after post-processing. */ return std::make_tuple(std::move(dTensor), std::move(indices), - std::move(effDuration)); + std::move(effDuration), std::move(timestamps)); } size_t DurationPredictor::getTokensLimit() const { return forwardMethods_.empty() ? 0 : forwardMethods_.back().second; } +std::vector +DurationPredictor::calculateTimestamps(const int64_t *predDurPtr, + size_t inputSize) const { + std::vector timestamps; + timestamps.reserve(inputSize); + + size_t accDur = 0; + for (size_t i = 0; i < inputSize; i++) { + int64_t dur = predDurPtr[i] * + constants::kTicksPerDuration; // Convert to audio samples + timestamps.emplace_back(accDur, accDur + dur); + accDur += dur; + } + + return timestamps; +} + void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens, int32_t targetDuration) const { // We expect durations tensor to be a Long tensor of a shape [1, n_tokens] @@ -175,7 +200,7 @@ void DurationPredictor::scaleDurations(Tensor &durations, size_t nTokens, shrinking ? std::ceil(scaled) - scaled : scaled - std::floor(scaled); durationsPtr[i] = static_cast(shrinking ? 
std::ceil(scaled) - : std::floor(scaled)); + : std::floor(scaled)); scaledSum += durationsPtr[i]; // Keeps the entries sorted by the remainders diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h index 0921fd17ac..b932aa07c4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/DurationPredictor.h @@ -35,23 +35,29 @@ class DurationPredictor : public BaseModel { * d - Tensor: predicted durations for each token, * indices - std::vector: repeated token indices, * effDuration - int32_t: effective duration after - * post-processing. + * post-processing. + * timestamps - timestamp marks for each token (phoneme) */ - std::tuple, int32_t> - generate(std::span tokens, std::span textMask, - std::span ref_hs, float speed = 1.F); + std::tuple, int32_t, std::vector> + generate(std::span tokens, std::span textMask, + std::span ref_hs, float speed = 1.F); // Returns maximum supported amount of input tokens. 
size_t getTokensLimit() const; private: + // Helper function - calculating timestamps based on predicted durations + std::vector calculateTimestamps(const int64_t *predDurPtr, + size_t inputSize) const; + // Helper function - duration scalling // Performs integer scaling on the durations tensor to ensure the sum of // durations matches the given target duration - void scaleDurations(Tensor &durations, size_t nTokens, - int32_t targetDuration) const; + void scaleDurations( + Tensor &durations, size_t nTokens, + int32_t targetDuration) const; // Helper function - calculating effective + // duration based on duration tensor - // Helper function - calculating effective duration based on duration tensor // Since we apply padding to the input, the effective duration is // usually a little bit lower than the max duration defined by static input // size. diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp index ea43f09d47..06366a095c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.cpp @@ -4,25 +4,39 @@ #include #include -#include +#include +#include #include +#include #include #include namespace rnexecutorch::models::text_to_speech::kokoro { Kokoro::Kokoro(const std::string &lang, const std::string &taggerDataSource, - const std::string &phonemizerDataSource, + const std::string &lexiconSource, + const std::string &neuralModelSource, const std::string &durationPredictorSource, const std::string &synthesizerSource, const std::string &voiceSource, std::shared_ptr callInvoker) : callInvoker_(std::move(callInvoker)), - phonemizer_(lang == "en-us" ? phonemis::Lang::EN_US - : lang == "en-gb" ? 
phonemis::Lang::EN_GB - : phonemis::Lang::DEFAULT, - taggerDataSource, phonemizerDataSource), - partitioner_(context_), + phonemizer_(phonemis::Config{ + .lang = lang, + .tagger = taggerDataSource.empty() + ? std::optional{} + : std::make_optional(phonemis::tagger::Config{ + .data_filepath = taggerDataSource}), + .phonemizer = + phonemis::phonemizer::Config{ + .lang = lang, + .lexicon_filepath = lexiconSource.empty() + ? std::nullopt + : std::make_optional(lexiconSource), + .nn_model_filepath = + neuralModelSource.empty() + ? std::nullopt + : std::make_optional(neuralModelSource)}}), durationPredictor_(durationPredictorSource, context_, callInvoker_), synthesizer_(synthesizerSource, context_, callInvoker_) { // Populate the voice array by reading given file @@ -76,16 +90,29 @@ void Kokoro::loadVoice(const std::string &voiceSource) { } } -std::vector -Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { - // Divide the phonemes string into substrings. - // Affects the further calculations only in case of string size - // exceeding the biggest model's input. - auto subsentences = - partitioner_.divide(phonemes); +std::vector Kokoro::generate(std::u32string input, float speed, + bool phonemize) { + if (input.size() > params::kMaxTextSize) { + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "Kokoro: maximum input text size exceeded"); + } + + if (input.empty()) { + return {}; + } + + // G2P (Grapheme to Phoneme) conversion + auto phonemes = phonemize ? phonemizer_(input) : input; + + // Divide the phonemes string into substrings, minimizing the amount of + // breaks. 
+ auto partition = partitioner_.partition(phonemes, context_.inputTokensLimit, + Partitioner::Mode::MIN_BREAKS); std::vector audio = {}; - for (const auto &subsentence : subsentences) { + for (const auto &[offset, length] : partition.segments) { + auto subsentence = partition.content.substr(offset, length); + // Generate an audio vector with the Kokoro model auto audioPart = synthesize(subsentence, speed); @@ -94,6 +121,7 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { size_t pauseMs = params::kPauseValues.contains(lastPhoneme) ? params::kPauseValues.at(lastPhoneme) : params::kDefaultPause; + // Add audio part and silence pause to the main audio vector audio.insert(audio.end(), std::make_move_iterator(audioPart.begin()), std::make_move_iterator(audioPart.end())); @@ -104,8 +132,9 @@ Kokoro::generateFromPhonemesImpl(const std::u32string &phonemes, float speed) { return audio; } -void Kokoro::streamFromPhonemesImpl(const std::u32string &phonemes, float speed, - std::shared_ptr callback) { +void Kokoro::stream(std::shared_ptr callback, float speed, + bool phonemize, bool stopOnEmptyBuffer) { + // Create a callback auto nativeCallback = [this, callback](const std::vector &audioVec) { if (this->isStreaming_) { this->callInvoker_->invokeAsync( @@ -116,70 +145,6 @@ void Kokoro::streamFromPhonemesImpl(const std::u32string &phonemes, float speed, } }; - // Use LATENCY strategy to minimize the time-to-first-audio for streaming - auto subsentences = - partitioner_.divide(phonemes); - - for (size_t i = 0; i < subsentences.size(); i++) { - if (!isStreaming_) { - break; - } - - const auto &subsentence = subsentences[i]; - - // Determine the silent padding duration to be stripped from the edges of - // the generated audio. If a chunk ends with a space or follows one that - // did, it indicates a word boundary split – we use a shorter padding - // to ensure natural speech flow. Otherwise, we use 50ms for standard - // pauses. 
- bool endsWithSpace = (subsentence.back() == U' '); - bool prevEndsWithSpace = (i > 0 && subsentences[i - 1].back() == U' '); - size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; // [ms] - - // Generate an audio vector with the Kokoro model - auto audioPart = synthesize(subsentence, speed, paddingMs); - - // Calculate and append a pause between the sentences - char32_t lastPhoneme = subsentence.back(); - size_t pauseMs = params::kPauseValues.contains(lastPhoneme) - ? params::kPauseValues.at(lastPhoneme) - : params::kDefaultPause; - audioPart.resize( - audioPart.size() + pauseMs * constants::kSamplesPerMilisecond, 0.F); - - // Push the audio right away to the JS side - nativeCallback(std::move(audioPart)); - } -} - -std::vector Kokoro::generate(std::string text, float speed) { - if (text.size() > params::kMaxTextSize) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: maximum input text size exceeded"); - } - - if (text.empty()) { - return {}; - } - - // G2P (Grapheme to Phoneme) conversion - auto phonemes = phonemizer_.process(text); - - return generateFromPhonemesImpl(phonemes, speed); -} - -std::vector Kokoro::generateFromPhonemes(std::string phonemes, - float speed) { - if (phonemes.empty()) { - return {}; - } - - return generateFromPhonemesImpl( - phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed); -} - -void Kokoro::stream(float speed, bool stopOnEmptyBuffer, - std::shared_ptr callback) { isStreaming_ = true; stopOnEmptyBuffer_ = stopOnEmptyBuffer; @@ -187,11 +152,16 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer, // The extracted text is then passed to the inner loop, which performs a // standard streaming on a fixed amount of input text. while (isStreaming_) { - std::string text; + std::u32string input; // Extract the code relying on input buffer for a separate mutex lock // section. 
{ std::scoped_lock lock(inputTextBufferMutex_); + // Trim trailing whitespace; the lock guards against streamInsert(). + inputTextBuffer_ = + phonemis::utils::strings::strip( + inputTextBuffer_); + if (inputTextBuffer_.empty() && stopOnEmptyBuffer_) { break; @@ -212,7 +182,7 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer, // chunks which end in the middle of a sentence. if (chunkSize > 0 || streamSkippedIterations >= params::kStreamMaxSkippedIterations) { - text = inputTextBuffer_.substr(0, chunkSize); + input = inputTextBuffer_.substr(0, chunkSize); inputTextBuffer_.erase(0, chunkSize); streamSkippedIterations = 0; } else { @@ -220,10 +190,93 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer, } } - if (!text.empty()) { + if (!input.empty()) { // Now we proceed with a standard streaming logic for fixed-size input. - auto phonemes = phonemizer_.process(text); - streamFromPhonemesImpl(phonemes, speed, callback); + // Start with preprocessing the input once. + std::u32string buffer = phonemizer_.preprocess(input); + + // A variable to keep the information about phonemized (but not + // synthesized) tokens from the previous iteration. + size_t phonemizedTokens = 0; + + while (!buffer.empty() && isStreaming_) { + // Since we do not phonemize the entire input before partitioning, there + // is a possibility that some segment might exceed the token limit after + // phonemization. This is being handled later.
+ auto partition = partitioner_.partition( + buffer, context_.inputTokensLimit, Partitioner::Mode::MIN_LATENCY); + + for (size_t i = 0; i < partition.segments.size(); i++) { + if (!isStreaming_) { + break; + } + + const auto &[offset, length] = partition.segments[i]; + const auto subsentence = partition.content.substr(0, length); + // NOTE(review): substr() ignores `offset`, unlike generate() - confirm. + std::u32string phonemes; + + if (phonemize) { + size_t unchangedLength = std::min(length, phonemizedTokens); + // Include trailing space if it was already phonemized + if (unchangedLength < length && + subsentence[unchangedLength] == U' ' && + phonemizedTokens > unchangedLength) { + unchangedLength++; + } + + // We phonemize on the fly - meaning no time is wasted on + // phonemizing the entire input if we only need one segment at a + // time. + phonemes = subsentence.substr(0, unchangedLength); + if (unchangedLength < length) { + // Phonemize without preprocessing (since we already did that). + phonemes += + phonemizer_(subsentence.substr(unchangedLength), false); + } + } else { + // Simple case - no phonemization, no risk of exceeding the token + // limit. + phonemes = subsentence; + } + + if (phonemes.size() <= context_.inputTokensLimit - 2) { + // Determine the silent padding duration + bool endsWithSpace = (subsentence.back() == U' '); + bool prevEndsWithSpace = + (offset > 0 && partition.content[offset - 1] == U' '); + size_t paddingMs = endsWithSpace || prevEndsWithSpace ? 15 : 50; + + // Generate and push audio + auto audioPart = synthesize(phonemes, speed, paddingMs); + + size_t pauseMs = params::kPauseValues.contains(phonemes.back()) + ? params::kPauseValues.at(phonemes.back()) + : params::kDefaultPause; + + audioPart.resize(audioPart.size() + + pauseMs * constants::kSamplesPerMilisecond, + 0.F); + + nativeCallback(std::move(audioPart)); + + // Remove processed segment from buffer. + // Since we process it from left to right, we expect the segment to + // be at the beginning of the buffer.
+ buffer.erase(0, length); + phonemizedTokens = std::max(phonemizedTokens, length) - length; + } else { + // Length exceeds limit. Replace the sentence in buffer with its + // phonemization. + if (phonemize) { + buffer.replace(0, length, phonemes); + } + phonemizedTokens = phonemes.size(); + + break; + } + } + } } // A little bit of pause to not overload the thread. @@ -241,86 +294,97 @@ void Kokoro::stream(float speed, bool stopOnEmptyBuffer, } } -void Kokoro::streamFromPhonemes(std::string phonemes, float speed, - std::shared_ptr callback) { - if (phonemes.empty()) { - throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, - "Kokoro: phoneme string must not be empty"); - } - - isStreaming_ = true; - streamFromPhonemesImpl( - phonemis::utilities::string_utils::utf8_to_u32string(phonemes), speed, - callback); - isStreaming_ = false; -} - -void Kokoro::streamInsert(std::string textChunk) noexcept { - std::scoped_lock lock(inputTextBufferMutex_); - inputTextBuffer_.append(textChunk); -} - -void Kokoro::streamStop(bool instant) noexcept { - if (instant) { - isStreaming_ = false; - } else { - stopOnEmptyBuffer_ = true; - } -} - -std::vector Kokoro::synthesize(const std::u32string &phonemes, - float speed, size_t paddingMs) { +std::vector Kokoro::synthesize(std::u32string_view phonemes, float speed, + size_t paddingMs) { + // Drop one leading space first so whitespace-only input also returns early. + if (!phonemes.empty() && phonemes.front() == U' ') { + phonemes = phonemes.substr(1); + } if (phonemes.empty()) { return {}; } - // Clamp the input to not go beyond number of input token limits - // Note that 2 tokens are always reserved for pre- and post-fix padding, - // so we effectively take at most (maxNoInputTokens_ - 2) tokens. - size_t noTokens = std::clamp(phonemes.size() + 2, constants::kMinInputTokens, - context_.inputTokensLimit); - // Map phonemes to tokens + // 1. Prepare input tokens. + // Clamp input to avoid exceeding model limits (2 tokens reserved for pre/post + // padding).
+ const size_t noTokens = + std::clamp(phonemes.size() + 2, constants::kMinInputTokens, + context_.inputTokensLimit); const auto tokens = utils::tokenize(phonemes, {noTokens}); - // Select the appropriate voice vector - size_t voiceID = - std::min({phonemes.size() - 1, noTokens - 1, voice_.size() - 1}); - auto &voice = voice_[voiceID]; - - // Initialize text mask - // Exclude all the paddings apart from first and last one. - size_t realInputLength = std::min(phonemes.size() + 2, noTokens); + // 2. Initialize text mask. + // Exclude all paddings except the first and last ones. + // We use uint8_t instead of bool to avoid boolean span issues. std::vector textMask(noTokens, false); - std::fill(textMask.begin(), textMask.begin() + realInputLength, true); - - // Inference 1 - DurationPredictor - // The resulting duration vector is already scalled at this point - auto [d, indices, effectiveDuration] = durationPredictor_.generate( - std::span(tokens), - std::span(reinterpret_cast(textMask.data()), textMask.size()), - std::span(voice).last(constants::kVoiceRefHalfSize), speed); + std::fill(textMask.begin(), + textMask.begin() + std::min(phonemes.size() + 2, noTokens), true); - // Inference 2 - Synthesizer + // 3. Select the appropriate voice vector. + // Each number of input tokens corresponds to a different voice embedding + // vector. + const size_t voiceID = + std::min({phonemes.size() - 1, noTokens - 1, voice_.size() - 1}); + const auto &voice = voice_[voiceID]; + + // 4. Inference Phase 1: DurationPredictor (submodule). + auto [d, indices, effectiveDuration, timestamps] = + durationPredictor_.generate( + std::span(tokens), + std::span(reinterpret_cast(textMask.data()), textMask.size()), + std::span(voice).last(constants::kVoiceRefHalfSize), speed); + + // 5. Inference Phase 2: Synthesizer. + // Note that we reduce the size of the duration tensor to match the number of + // tokens. 
auto decoding = synthesizer_.generate( std::span(tokens), std::span(reinterpret_cast(textMask.data()), textMask.size()), std::span(indices), - // Note that we reduce the size of d tensor to match the initial number of - // input tokens std::span(d.mutable_data_ptr(), noTokens * d.sizes().back()), std::span(voice)); - auto audioTensor = decoding->at(0).toTensor(); - // Cut the resulting audio vector according to the effective duration - int32_t effLength = constants::kTicksPerDuration * effectiveDuration; + // 6. Post-processing: Finalize audio. + const auto audioTensor = decoding->at(0).toTensor(); + const int32_t audioLength = constants::kTicksPerDuration * effectiveDuration; + auto audio = - std::span(audioTensor.const_data_ptr(), effLength); - auto croppedAudio = + std::span(audioTensor.const_data_ptr(), audioLength); + + // To counter any potential trailing voice artifacts (which can occur due to + // slight mismatch of .pte model results) we cut it according to the predicted + // duration ticks. + if (noTokens > 2) { + // We want to skip both the last PAD token, as well as any potential EOS + // token just before it. + auto lastTokenTimestamp = + !phonemis::utils::unicode::isalpha(phonemes.back()) + ? timestamps[noTokens - 3].end + : timestamps[noTokens - 2].end; + + audio = audio.subspan(0, lastTokenTimestamp); + } + + // Now additional stripping of a (hopefully) pure silence. 
+ audio = utils::stripAudio(audio, paddingMs * constants::kSamplesPerMilisecond); - return {croppedAudio.begin(), croppedAudio.end()}; + return {audio.begin(), audio.end()}; +} + +void Kokoro::streamInsert(std::u32string chunk) noexcept { + rnexecutorch::log(rnexecutorch::LOG_LEVEL::Info, "Inserting data"); + std::scoped_lock lock(inputTextBufferMutex_); + inputTextBuffer_.append(chunk); +} + +void Kokoro::streamStop(bool instant) noexcept { + if (instant) { + isStreaming_ = false; + } else { + stopOnEmptyBuffer_ = true; + } } std::size_t Kokoro::getMemoryLowerBound() const noexcept { diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h index e33631af61..adf736bd28 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Kokoro.h @@ -11,7 +11,7 @@ #include "Partitioner.h" #include "Synthesizer.h" #include "Types.h" -#include +#include #include namespace rnexecutorch { @@ -20,49 +20,51 @@ namespace models::text_to_speech::kokoro { class Kokoro { public: Kokoro(const std::string &lang, const std::string &taggerDataSource, - const std::string &phonemizerDataSource, + const std::string &lexiconSource, const std::string &neuralModelSource, const std::string &durationPredictorSource, const std::string &synthesizerSource, const std::string &voiceSource, std::shared_ptr callInvoker); /** - * Processes the entire text at once, before sending back to the JS side. - */ - std::vector generate(std::string text, float speed = 1.F); - - /** - * Similar to generate(), but accepts pre-computed phonemes (as a UTF-8 IPA - * string) and synthesizes audio, bypassing the built-in phonemizer. + * Generates complete audio for the provided text. 
+ * + * @param text The input to be synthesized - either a raw text or IPA + * phonemes. + * @param speed Playback speed multiplier (default: 1.0). + * @param phonemize Optional, if set to false disables the phonemization and + * operates on raw input. + * @return A vector of PCM float samples representing the synthesized speech. */ - std::vector generateFromPhonemes(std::string phonemes, - float speed = 1.F); + std::vector generate(std::u32string input, float speed = 1.F, + bool phonemize = true); /** - * Processes text from inputTextBuffer_ in chunks, sending each chunk - * individualy to the JS side with asynchronous callbacks. + * Starts an asynchronous streaming process that processes text in chunks. + * The internal buffer can be expanded during streaming using `streamInsert`. * - * Allows an incrementally expanded input by using an input text buffer. + * @param callback A JSI function called with each generated audio chunk + * (std::vector). + * @param speed Playback speed multiplier. + * @param phonemize Optional, if set to false disables the phonemization and + * operates on raw input. + * @param stopOnEmptyBuffer If true, streaming terminates automatically when + * the buffer is exhausted. */ - void stream(float speed, bool stopOnEmptyBuffer, - std::shared_ptr callback); - - // Streaming variant that accepts pre-computed phonemes instead of text. - void streamFromPhonemes(std::string phonemes, float speed, - std::shared_ptr callback); + void stream(std::shared_ptr callback, float speed = 1.F, + bool phonemize = true, bool stopOnEmptyBuffer = false); /** - * Updates the input streaming buffer by adding more text to be processed. + * Appends new input data (either text or phonemes) to the buffer. * - * @param text A new chunk of text, appended to the end of the input buffer. + * @param chunk A text/phonemes chunk to be added to the streaming buffer. 
*/ - void streamInsert(std::string textChunk) noexcept; + void streamInsert(std::u32string chunk) noexcept; /** - * Stops the streaming process. + * Signals the streaming process to stop. * - * @param instant If true, stops the streaming as soon as possible by - * switching the isStreaming_ flag. Otherwise allows to process the rest of - * the buffer first, by switching the stopOnEmptyBuffer_ flag. + * @param instant If true, stops immediately, discarding remaining buffered + * text. If false, finishes processing the current buffer before stopping. */ void streamStop(bool instant) noexcept; @@ -70,38 +72,32 @@ class Kokoro { void unload() noexcept; private: - // Helper function - loading voice array + // --- Initialization & Core Inference --- void loadVoice(const std::string &voiceSource); - - // Helper function - shared synthesis pipeline (partition + synthesize) - std::vector generateFromPhonemesImpl(const std::u32string &phonemes, - float speed); - void streamFromPhonemesImpl(const std::u32string &phonemes, float speed, - std::shared_ptr callback); - - // Helper function - generate specialization for given input size - std::vector synthesize(const std::u32string &phonemes, float speed, + std::vector synthesize(std::u32string_view phonemes, float speed, size_t paddingMs = 50); - // JS callback handle + // --- External Dependencies --- std::shared_ptr callInvoker_; - // Shared model context + // --- Model context --- Context context_; - // Submodules - arranged in order of their appearence in the model's pipeline + // --- Model Components --- + // Arranged in order of appearance in the generation pipeline phonemis::Pipeline phonemizer_; Partitioner partitioner_; DurationPredictor durationPredictor_; Synthesizer synthesizer_; - // Voice array — dynamically sized to match the voice file. - // Each row is a style vector for a given input token count. 
+ // --- Data Buffers --- + // Voice embeddings: Each row is a style vector for a given input token count std::vector> voice_; - - // Streaming state control variables - std::string inputTextBuffer_; + // Streaming buffer + std::u32string inputTextBuffer_; mutable std::mutex inputTextBufferMutex_; + + // --- Streaming control State --- std::atomic isStreaming_{false}; std::atomic stopOnEmptyBuffer_{true}; int32_t streamSkippedIterations = 0; @@ -110,5 +106,7 @@ class Kokoro { REGISTER_CONSTRUCTOR(models::text_to_speech::kokoro::Kokoro, std::string, std::string, std::string, std::string, std::string, - std::string, std::shared_ptr); + std::string, std::string, + std::shared_ptr); + } // namespace rnexecutorch diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h index f517db0318..d8b14f4caf 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Params.h @@ -39,8 +39,8 @@ inline constexpr int32_t kStreamPause = 200; * (ms). */ inline const std::unordered_map kPauseValues = { - {U'.', 250}, {U'?', 350}, {U'!', 180}, {U';', 300}, - {U'…', 500}, {U',', 125}, {U':', 175}, {U'-', 175}}; // [ms] + {U'.', 375}, {U'?', 500}, {U'!', 250}, {U';', 400}, + {U'…', 600}, {U',', 130}, {U':', 250}, {U'-', 200}}; // [ms] /** * A default pause applied after a sentence finished with a character other @@ -54,43 +54,24 @@ namespace cropping { * The audio cropping algorithm is a moving average variant. * This value controls the number of steps in moving average. */ -inline constexpr uint32_t kAudioCroppingSteps = 20; +inline constexpr uint32_t kAudioCroppingSteps = 10; /** * Determines silent audio fragments in audio cropping algorithm. 
* The audio fragment is considered as a silence, if the moving average with K * steps does not exceed this threshold. */ -inline constexpr float kAudioSilenceThreshold = 0.01F; +inline constexpr float kAudioSilenceThreshold = 0.005F; } // namespace cropping // Partitioning related hyperparameters namespace partitioning { -/** - * A penalty for dividing text on end of sentence character (like . or !). - */ -inline constexpr int64_t kEosPenalty = 5; - -/** - * A penalty for dividing text on pause character (like , or -). - */ -inline constexpr int64_t kPausePenalty = 18; - -/** - * A penalty for dividing text in the middle of sentence - - * in other words, on white character. - * - * We want to avoid splitting the text between two words with no pause - * as much as possible, since it kills the naturalness of the speech. - */ -inline constexpr int64_t kWhitePenalty = 1000; - /** * Used in latency-focused partitioning variant. Decides on * how much more are big latencies in the beginning phase of * an input text penalized. */ -inline constexpr int32_t kTokenDiscountFactor = 1; +inline constexpr int64_t kTokenDiscountFactor = 1; /** * Used in latency-focused partitioning variant. Decides on @@ -99,7 +80,7 @@ inline constexpr int32_t kTokenDiscountFactor = 1; * For example, using kTokenDiscountRange = 128 means that after reaching * 128 tokens, the latency is completely omited and not penalized. 
*/ -inline constexpr int32_t kTokenDiscountRange = 128; +inline constexpr int64_t kTokenDiscountRange = 128; } // namespace partitioning } // namespace rnexecutorch::models::text_to_speech::kokoro::params \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp index 4dc55ade12..6ff0bb3ca1 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.cpp @@ -1,10 +1,10 @@ #include "Partitioner.h" #include "Constants.h" #include "Params.h" + #include -#include -#include -#include +#include +#include namespace rnexecutorch::models::text_to_speech::kokoro { @@ -13,117 +13,118 @@ using namespace params::partitioning; // Custom infinity definition constexpr Partitioner::Cost INF = 1e7; -template <> -std::vector -Partitioner::divide( - const std::u32string &phonemes) { - return divide(phonemes, - [this](Cost prevCost, int32_t rangeBegin, int32_t prevBp, - int32_t currBp, int32_t rangeEnd) { - if (rangeEnd - currBp - 1 > context_.inputTokensLimit) - return INF; - - // Simply cumulate the costs for both subranges - return prevCost + static_cast(rangeEnd - currBp - 1); - }); -} +Partitioner::Partition Partitioner::partition(std::u32string_view input, + size_t limit, Mode mode) const { + if (mode == Mode::MIN_BREAKS) { + return partition(input, limit, + [limit](Cost acc, size_t beg, int64_t prevBp, int64_t bp, + size_t end, Separator sep) -> Cost { + if (end - bp > limit) { + return INF; + } + + Cost sepPenalty = sep == Separator::EOS ? 1 + : sep == Separator::PAUSE ? 3 + : sep == Separator::WHITE ? 
1000 + : 0; + + return acc + sepPenalty + static_cast(end - bp); + }); + } -template <> -std::vector Partitioner::divide( - const std::u32string &phonemes) { - return divide(phonemes, [this](Cost prevCost, int32_t rangeBegin, - int32_t prevBp, int32_t currBp, - int32_t rangeEnd) { - if (rangeEnd - currBp - 1 > context_.inputTokensLimit) - return INF; - - // Estimate the latency (simple linear difference between the rightmost - // subranges) - int32_t latency = std::max(0, (rangeEnd - currBp) - (currBp - prevBp)); - - // Estimate the discount factor (the further we go, the less we care about - // the latency) - int32_t discount = - kTokenDiscountFactor * std::max(0, kTokenDiscountRange - currBp - 1); - - return prevCost + - static_cast(latency * discount / kTokenDiscountRange); - }); + if (mode == Mode::MIN_LATENCY) { + return partition( + input, limit, + [limit](Cost acc, size_t beg, int64_t prevBp, int64_t bp, size_t end, + Separator sep) -> Cost { + if (end - bp > limit) { + return INF; + } + + Cost sepPenalty = sep == Separator::EOS ? 5 + : sep == Separator::PAUSE ? 18 + : sep == Separator::WHITE ? 1000 + : 0; + + int64_t rightmostRangeLength = end - bp; + int64_t prevRangeLength = bp - prevBp; + + int64_t latency = std::max(static_cast(0), + rightmostRangeLength - prevRangeLength); + int64_t discount = + kTokenDiscountFactor * + std::max(static_cast(0), kTokenDiscountRange - bp - 1); + + return acc + + static_cast(latency * discount / kTokenDiscountRange) + + sepPenalty; + }); + } + + return {input, {}}; } -// Helper function - partitioning -// A template which is controled by concrete operator instead of -// an abstract Strategy argument. -// Utilizes dynamic programming approach for finding the -// optimal solution. 
-std::vector Partitioner::divide( - const std::u32string &phonemes, - const std::function - &costFn) { - // DP array - // (cost, prev_breakpoint_idx) pairs - std::vector> mem(phonemes.size(), {INF, -1}); - - // Keep the potential break point indices to speed up the calculation. - std::deque eosPoints, pausePoints, whitePoints; - - for (int32_t i = 0; i < phonemes.size(); i++) { - auto &[estimation, prevBreakIdx] = mem[i]; - - // We assume that phonemes[i] is the last character of currently analyzed - // substring. First, estimate for the entire substring without further - // division. - estimation = costFn(0, 0, -1, -1, i + 1); - - // Now, try to divide into 2 substring and utilize already calculated values - // for left-side substring. +Partitioner::Partition Partitioner::partition(std::u32string_view input, + size_t limit, + CostFn costFn) const { + if (input.empty()) { + return {input, {}}; + } + + size_t n = input.size(); + std::vector> dp(n, {INF, -1}); + + std::deque eosPoints, pausePoints, whitePoints; + + for (size_t i = 0; i < n; ++i) { + auto &[bestCost, prevBpIdx] = dp[i]; + + bestCost = costFn(0, 0, -1, -1, i + 1, Separator::NO_SEP); + for (auto *q : {&eosPoints, &pausePoints, &whitePoints}) { - // First, clear the queus from useless entries (out of even largest model - // bounds). - while (!q->empty() && q->front() + context_.inputTokensLimit < i) { + while (!q->empty() && q->front() + limit < i) { q->pop_front(); } - // Now iterate through the reimaining positions. - Cost penalty = q == &eosPoints ? kEosPenalty - : q == &pausePoints ? kPausePenalty - : kWhitePenalty; - for (int32_t breakIdx : (*q)) { - Cost newEstimation = costFn(mem[breakIdx].first, 0, - mem[breakIdx].second, breakIdx, i + 1) + - penalty; - if (newEstimation < estimation && breakIdx > 0) { - estimation = newEstimation; - prevBreakIdx = breakIdx; + Separator sep = q == &eosPoints ? Separator::EOS + : q == &pausePoints ? 
Separator::PAUSE + : Separator::WHITE; + for (size_t breakIdx : (*q)) { + Cost cost = costFn(dp[breakIdx].first, 0, dp[breakIdx].second, breakIdx, + i, sep); + if (cost < bestCost && breakIdx > 0) { + bestCost = cost; + prevBpIdx = breakIdx; } } } - // Add current phoneme to the appropriate queue. - char32_t phoneme = phonemes[i]; - if (constants::kEndOfSentencePhonemes.contains(phoneme)) { + char32_t c = input[i]; + if (constants::kEndOfSentencePhonemes.contains(c)) { eosPoints.push_back(i); - } else if (constants::kPausePhonemes.contains(phoneme)) { + } else if (constants::kPausePhonemes.contains(c)) { pausePoints.push_back(i); - } else if (phoneme < 256 && std::isspace(static_cast(phoneme))) { + } else if (c < 256 && std::isspace(static_cast(c))) { whitePoints.push_back(i); } } - std::vector result = {}; + std::vector> segments; + int64_t currBp = dp[n - 1].second; + size_t lastIdx = n; - // Perform backtracking to obtain all the substrings. - // Note that because of backtracking, the order is reversed. 
- int32_t end = phonemes.size() - 1; - while (end != -1) { - int32_t begin = mem[end].second + 1; - result.push_back(phonemes.substr(begin, end - begin + 1)); - end = mem[end].second; + while (currBp != -1) { + size_t start = static_cast(currBp + 1); + segments.emplace_back(start, lastIdx - start); + lastIdx = static_cast(currBp + 1); + currBp = dp[currBp].second; } + // Add the first segment + segments.emplace_back(0, lastIdx); - std::ranges::reverse(result); + std::reverse(segments.begin(), segments.end()); - return result; + return {input, std::move(segments)}; } -} // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file +} // namespace rnexecutorch::models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h index b327ca4f9b..5ed87bce77 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Partitioner.h @@ -1,58 +1,88 @@ #pragma once +#include "Types.h" + #include #include #include #include #include -#include "Types.h" - namespace rnexecutorch::models::text_to_speech::kokoro { class Partitioner { public: - Partitioner(const Context &modelContext) : context_(modelContext) {} - - // Partition strategy - // Defines how to divide phoneme string into substrings, by minimizing - // one of the selected properties. - enum class Strategy { - TOTAL_TIME = 0, // Only minimizes the estimated total time of processing - LATENCY, // Minimizes the streaming latency by dividing into small and - // similar length parts + /** + * Partitioning strategy. + * Affects the cost function choice, which changes the way input text is + * divided. 
+ */ + enum class Mode { + MIN_BREAKS = 0, // Minimizes number of substrings (best quality) + MIN_LATENCY = + 1, // Minimizes the processing latency (best speed - streaming mode) }; - // Cost definition - using Cost = int64_t; + /** + * Represents the logical separator types. + */ + enum class Separator { + EOS = 1, // End of sentence marker (e.g., '.', '!', '?'). + PAUSE, // Mid-sentence pause (e.g., ',', ';', ':'). + WHITE, // Whitespace or other weak separators. - // Partition function - // Performs a division of the input phoneme string according to - // given strategy. - template - std::vector divide(const std::u32string &phonemes); + NO_SEP // No separation + }; -private: /** - * Helper function - partitioning + * Represents a heuristic evaluation of given partition. + * The lower it is, the better partition is. + */ + using Cost = uint32_t; + + /** + * A cost function type to evaluate given partition. * - * @param phonemes phoneme string to be partitioned - * @param costFn a custom cost function which takes: - * 1. starting cost (cost of the previous range or 0 if not - * present) - * 2. range begin - * 3. previous breakpoint (-1 if not present) - * 4. current breakpoint (-1 if not present) - * 5. range end (exclusive) + * @param acc Total cost accumulated from previous segments. + * @param beg Start index of the current range. + * @param prevBp Previous breakpoint index - useful for calculating some + * formulas. + * @param bp Breakpoint index (the split point, and the last character of the + * left-most subrange). -1 if there are no bps. + * @param end End index of the current range (inclusive). + * @param sep The type of the breakpoint. + */ + using CostFn = std::function; + + /** + * Holds the result of text partitioning. + * The content is stored as logical views to avoid copying. Segments + * defines ranges of the content views for smaller segments. 
*/ - std::vector - divide(const std::u32string &phonemes, - const std::function - &costFn); - - // Shared model context - // A const reference to singleton in Kokoro. - const Context &context_; + struct Partition { + std::u32string_view content; + std::vector> + segments; // Pairs of {offset, length} for each segment. + }; + + /** + * Partitions the input text into segments according to the specified + * strategy. + * + * @param input The source text to be partitioned. + * @param limit The maximum available size of a single segment. + * @param mode The partitioning strategy to use (defaults to MIN_LATENCY). + * @return A Partition object containing the original content view and + * breakpoints. + */ + Partition partition(std::u32string_view input, size_t limit, + Mode mode = Mode::MIN_LATENCY) const; + +private: + // Internal partition implementation that uses a specific cost function. + Partition partition(std::u32string_view input, size_t limit, + CostFn costFn) const; }; } // namespace rnexecutorch::models::text_to_speech::kokoro \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp index fd69c43eed..c5f44b71f8 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.cpp @@ -28,8 +28,9 @@ Synthesizer::Synthesizer(const std::string &modelSource, forwardMethods_.emplace_back(name, inputSize); } } - std::stable_sort(forwardMethods_.begin(), forwardMethods_.end(), - [](const auto &a, const auto &b) { return a.second < b.second; }); + std::stable_sort( + forwardMethods_.begin(), forwardMethods_.end(), + [](const auto &a, const auto &b) { return a.second < b.second; }); } // Fallback: if no methods discovered, validate "forward" 
directly @@ -43,11 +44,10 @@ Synthesizer::Synthesizer(const std::string &modelSource, } } -Result> Synthesizer::generate(std::span tokens, - std::span textMask, - std::span indices, - std::span dur, - std::span ref_s) { +Result> Synthesizer::generate( + std::span tokens, std::span textMask, + std::span indices, std::span dur, + std::span ref_s) { // Perform input shape checks // Both F0 and N vectors should be twice as long as duration CHECK_SIZE(tokens, textMask.size()); @@ -62,28 +62,33 @@ Result> Synthesizer::generate(std::span tokens, const_cast(tokens.data()), ScalarType::Long); auto textMaskTensor = make_tensor_ptr({1, static_cast(textMask.size())}, - textMask.data(), ScalarType::Bool); - auto indicesTensor = - make_tensor_ptr({duration}, indices.data(), ScalarType::Long); - auto durTensor = - make_tensor_ptr({1, noTokens, 640}, dur.data(), ScalarType::Float); - auto voiceRefTensor = make_tensor_ptr({1, constants::kVoiceRefSize}, - ref_s.data(), ScalarType::Float); + const_cast(textMask.data()), ScalarType::Bool); + auto indicesTensor = make_tensor_ptr( + {duration}, const_cast(indices.data()), ScalarType::Long); + auto durTensor = make_tensor_ptr( + {1, noTokens, 640}, const_cast(dur.data()), ScalarType::Float); + auto voiceRefTensor = + make_tensor_ptr({1, constants::kVoiceRefSize}, + const_cast(ref_s.data()), ScalarType::Float); // Select appropriate forward method based on token count - auto it = std::ranges::find_if(forwardMethods_, - [noTokens](const auto &entry) { return static_cast(entry.second) >= noTokens; }); - std::string selectedMethod = (it != forwardMethods_.end()) ? it->first : forwardMethods_.back().first; + auto it = + std::ranges::find_if(forwardMethods_, [noTokens](const auto &entry) { + return static_cast(entry.second) >= noTokens; + }); + std::string selectedMethod = + (it != forwardMethods_.end()) ? 
it->first : forwardMethods_.back().first; // Execute the selected forward method - auto results = execute(selectedMethod, - {tokensTensor, textMaskTensor, indicesTensor, durTensor, voiceRefTensor}); + auto results = + execute(selectedMethod, {tokensTensor, textMaskTensor, indicesTensor, + durTensor, voiceRefTensor}); if (!results.ok()) { throw RnExecutorchError( RnExecutorchErrorCode::InvalidModelOutput, "[Kokoro::Synthesizer] Failed to execute method " + selectedMethod + - ", error: " + + ", error: " + std::to_string(static_cast(results.error()))); } @@ -97,7 +102,8 @@ size_t Synthesizer::getTokensLimit() const { } size_t Synthesizer::getDurationLimit() const { - if (forwardMethods_.empty()) return 0; + if (forwardMethods_.empty()) + return 0; return getInputShape(forwardMethods_.back().first, 2)[0]; } diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h index bfbbd02638..7b87e33c26 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Synthesizer.h @@ -40,17 +40,18 @@ class Synthesizer : public BaseModel { * @param ref_s a full voice array for given duration */ Result> generate(std::span tokens, - std::span textMask, - std::span indices, - std::span dur, - std::span ref_s); + std::span textMask, + std::span indices, + std::span dur, + std::span ref_s); // Model limits getters size_t getTokensLimit() const; size_t getDurationLimit() const; private: - // Forward methods discovered at construction (e.g. forward_8, forward_64, forward_128) + // Forward methods discovered at construction (e.g. forward_8, forward_64, + // forward_128) std::vector> forwardMethods_; // Shared model context // A const reference to singleton in Kokoro. 
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h index 20a0fe5f20..8a99dc09c8 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Types.h @@ -18,4 +18,14 @@ struct Context { size_t inputDurationLimit = 0; }; +/** + * Type definition - token timestamp. + * + * Values correspond to the amount of waveform samples. + */ +struct Timestamp { + size_t begin = 0; + size_t end = 0; +}; + } // namespace rnexecutorch::models::text_to_speech::kokoro diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp index a77e40a93c..37956e945c 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.cpp @@ -1,6 +1,7 @@ #include "Utils.h" #include "Constants.h" #include "Params.h" + #include #include #include @@ -9,86 +10,95 @@ namespace rnexecutorch::models::text_to_speech::kokoro::utils { using namespace params::cropping; -// Helper functions namespace { -// Normalizes an audio sample + float normalize(float sample) { - float v = std::abs(sample); - return v >= kAudioSilenceThreshold ? v : 0.F; + return std::max(0.0F, std::abs(sample) - kAudioSilenceThreshold); } -// Returns an index corresponding to the first (or last - if reverse=true) -// non-quiet part of an audio. -// Utilizes a moving average controled by hyperparameters from Constants.h. 
template size_t findAudioBound(std::span audio) { if (audio.empty()) { return 0; } - size_t length = audio.size(); + const size_t length = audio.size(); + float windowSum = 0.0F; + size_t processedCount = 0; + size_t currentIndex = reverse ? length - 1 : 0; - float sum = 0.F; - size_t count = 0; - size_t i = reverse ? length - 1 : 0; + while (processedCount < length) { + processedCount++; + windowSum += normalize(audio[currentIndex]); - while (count < length) { - count++; - sum += normalize(audio[i]); - if (count > kAudioCroppingSteps) { - sum -= normalize( - audio[reverse ? i + kAudioCroppingSteps : i - kAudioCroppingSteps]); + // Maintain the sliding window sum + if (processedCount > kAudioCroppingSteps) { + const size_t oldIndex = reverse ? currentIndex + kAudioCroppingSteps + : currentIndex - kAudioCroppingSteps; + windowSum -= normalize(audio[oldIndex]); } - if (count >= kAudioCroppingSteps && - sum / kAudioCroppingSteps >= kAudioSilenceThreshold) { - return i; + // Check if moving average exceeds threshold + if (processedCount >= kAudioCroppingSteps && + (windowSum / kAudioCroppingSteps) >= kAudioSilenceThreshold) { + return currentIndex; } - i = reverse ? i - 1 : i + 1; + currentIndex = reverse ? currentIndex - 1 : currentIndex + 1; } return reverse ? 0 : length - 1; } + } // namespace std::span stripAudio(std::span audio, size_t margin) { - auto lbound = findAudioBound(audio); - auto rbound = findAudioBound(audio); + if (audio.empty()) { + return {}; + } - lbound = lbound > margin ? lbound - margin : 0; - rbound = std::min(rbound + margin, audio.size() > 0 ? audio.size() - 1 : 0); + size_t lbound = findAudioBound(audio); + size_t rbound = findAudioBound(audio); - return audio.subspan(lbound, rbound >= lbound ? rbound - lbound + 1 : 0); + // Apply margins + lbound = (lbound > margin) ? lbound - margin : 0; + rbound = std::min(rbound + margin, audio.size() - 1); + + const size_t strippedLength = (rbound >= lbound) ? 
(rbound - lbound + 1) : 0; + return audio.subspan(lbound, strippedLength); } -std::vector tokenize(const std::u32string &phonemes, +std::vector tokenize(std::u32string_view phonemes, std::optional expectedSize) { if (expectedSize.has_value() && expectedSize.value() < 2) { - throw rnexecutorch::RnExecutorchError( - rnexecutorch::RnExecutorchErrorCode::InvalidUserInput, - "expected number of tokens cannot be lower than 2"); + throw RnExecutorchError(RnExecutorchErrorCode::InvalidUserInput, + "[Kokoro::Utils] Expected tokens must be >= 2"); } - // Number of tokens to populate, with and without edge pad tokens - size_t lengthWithPadding = - expectedSize.has_value() ? expectedSize.value() : phonemes.size() + 2; - size_t lengthWithoutPadding = lengthWithPadding - 2; - size_t effNoTokens = std::min(lengthWithoutPadding, phonemes.size()); - - // Note that we populate tokens[1:noTokens - 1], since first and last tokens - // are zeros (padding). Input could still contain unrecognized tokens, and - // that's why we use partition() at the end. - std::vector tokens(lengthWithPadding, constants::kPadToken); - std::transform(phonemes.begin(), phonemes.begin() + effNoTokens, + // 1. Determine lengths (2 tokens reserved for start/end padding) + const size_t totalLength = expectedSize.value_or(phonemes.size() + 2); + const size_t maxPhonemes = totalLength - 2; + const size_t effectivePhonemeCount = std::min(maxPhonemes, phonemes.size()); + + // 2. Initialize with pad tokens + std::vector tokens(totalLength, constants::kPadToken); + + // 3. Map phonemes to vocabulary tokens + // Starting from index 1 to leave index 0 as start-padding + std::transform(phonemes.begin(), phonemes.begin() + effectivePhonemeCount, tokens.begin() + 1, [](char32_t p) -> Token { return constants::kVocab.contains(p) ? 
constants::kVocab.at(p) : constants::kInvalidToken; }); - auto validSeqEnd = std::stable_partition( - tokens.begin() + 1, tokens.begin() + effNoTokens + 1, - [](Token t) -> bool { return t != constants::kInvalidToken; }); - std::fill(validSeqEnd, tokens.begin() + effNoTokens + 1, + + // 4. Remove invalid tokens while preserving order (bubbling them to the end + // of the content segment) + auto validEnd = std::stable_partition( + tokens.begin() + 1, tokens.begin() + effectivePhonemeCount + 1, + [](Token t) { return t != constants::kInvalidToken; }); + + // 5. Fill any gaps created by partitioning or sizing with pad tokens + std::fill(validEnd, tokens.begin() + effectivePhonemeCount + 1, constants::kPadToken); return tokens; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h index 081d40c14c..c6996a3f40 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/text_to_speech/kokoro/Utils.h @@ -8,19 +8,20 @@ namespace rnexecutorch::models::text_to_speech::kokoro::utils { -// Removes silence from the beginning and the end of an audio (with some -// margin). -// Returns a [l - m, r + m] range of audio samples, where m is the margin, -// l and r correspond to lower and upper audio bound respectively. +/** + * Strips silence from audio edges using a sliding window. + * @param audio The input audio samples. + * @param margin Number of silence samples to preserve at each edge. + */ std::span stripAudio(std::span audio, size_t margin = 0); -// Tokenizes given phoneme string. -// Each phoneme corresponds to exactly one token, with 2 additional pad -// tokens added at both ends. -// If extecped number of tokens is provided, eventually expands the token vector -// with pad tokens to match the given length. 
-std::vector tokenize(const std::u32string &phonemes, +/** + * Maps phonemes to vocabulary tokens with start/end padding. + * @param phonemes UTF-32 phoneme sequence. + * @param expectedSize If set, pads the output to this exact length. + */ +std::vector tokenize(std::u32string_view phonemes, std::optional expectedSize = std::nullopt); } // namespace rnexecutorch::models::text_to_speech::kokoro::utils \ No newline at end of file diff --git a/packages/react-native-executorch/react-native-executorch.podspec b/packages/react-native-executorch/react-native-executorch.podspec index 902210d01a..5d2180cb9a 100644 --- a/packages/react-native-executorch/react-native-executorch.podspec +++ b/packages/react-native-executorch/react-native-executorch.podspec @@ -16,7 +16,6 @@ Pod::Spec.new do |s| pthreadpool_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/pthreadpool', __dir__) cpuinfo_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/cpuinfo', __dir__) - phonemis_binaries_path = File.expand_path('$(PODS_TARGET_SRCROOT)/third-party/ios/libs/phonemis', __dir__) s.user_target_xcconfig = { "HEADER_SEARCH_PATHS" => @@ -28,7 +27,6 @@ Pod::Spec.new do |s| '$(inherited)', "\"#{pthreadpool_binaries_path}/physical-arm64-release/libpthreadpool.a\"", "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"", - "\"#{phonemis_binaries_path}/physical-arm64-release/libphonemis.a\"", ].join(' '), @@ -36,7 +34,6 @@ Pod::Spec.new do |s| '$(inherited)', "\"#{pthreadpool_binaries_path}/simulator-arm64-debug/libpthreadpool.a\"", "\"#{cpuinfo_binaries_path}/libcpuinfo.a\"", - "\"#{phonemis_binaries_path}/simulator-arm64-debug/libphonemis.a\"", ].join(' '), 'EXCLUDED_ARCHS[sdk=iphonesimulator*]' => 'x86_64', @@ -50,7 +47,9 @@ Pod::Spec.new do |s| '"$(PODS_TARGET_SRCROOT)/third-party/include" '+ '"$(PODS_TARGET_SRCROOT)/third-party/include/cpuinfo" '+ '"$(PODS_TARGET_SRCROOT)/third-party/include/pthreadpool" '+ - '"$(PODS_TARGET_SRCROOT)/common" ', + 
'"$(PODS_TARGET_SRCROOT)/common" ' + + '"$(PODS_TARGET_SRCROOT)/third-party/common/phonemis/src" ', + "GCC_PREPROCESSOR_DEFINITIONS" => '$(inherited) ET_ON=1', "CLANG_CXX_LANGUAGE_STANDARD" => "c++20", 'EXCLUDED_ARCHS[sdk=iphonesimulator*]' => 'x86_64', } @@ -58,6 +57,7 @@ Pod::Spec.new do |s| s.source_files = [ "ios/**/*.{m,mm,h}", "common/**/*.{cpp,c,h,hpp}", + "third-party/common/phonemis/src/**/*.{cpp,hpp,h}", ] s.libraries = "z" @@ -70,8 +70,9 @@ Pod::Spec.new do |s| # react-native-skia. The headers are preserved by preserve_paths and # then made available by HEADER_SEARCH_PATHS. s.exclude_files = [ - "common/rnexecutorch/tests/**/*", - "common/rnexecutorch/jsi/*.{h,hpp}" + "common/rnexecutorch/tests/**/*.{cpp}", + "common/rnexecutorch/jsi/*.{h,hpp}", + "third-party/common/phonemis/src/phonemis/main.cpp" # Exclude the phonemis runner ] s.header_mappings_dir = "common/rnexecutorch" s.header_dir = "rnexecutorch" diff --git a/packages/react-native-executorch/src/constants/tts/models.ts b/packages/react-native-executorch/src/constants/tts/models.ts index 7b05a580c8..d973722720 100644 --- a/packages/react-native-executorch/src/constants/tts/models.ts +++ b/packages/react-native-executorch/src/constants/tts/models.ts @@ -1,28 +1,27 @@ -import { URL_PREFIX, VERSION_TAG } from '../versions'; +import { NEXT_VERSION_TAG, URL_PREFIX } from '../versions'; // Text to speech (tts) - Kokoro model(s) -const KOKORO_EN_MODELS_ROOT = `${URL_PREFIX}-kokoro/${VERSION_TAG}/xnnpack`; -const KOKORO_EN_SMALL_MODELS_ROOT = `${KOKORO_EN_MODELS_ROOT}/small`; -const KOKORO_EN_MEDIUM_MODELS_ROOT = `${KOKORO_EN_MODELS_ROOT}/medium`; +const KOKORO_MODEL_ROOT = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/xnnpack`; +const KOKORO_STANDARD_MODEL_ROOT = `${KOKORO_MODEL_ROOT}/standard`; +const KOKORO_POLISH_MODEL_ROOT = `${KOKORO_MODEL_ROOT}/polish`; /** - * A Kokoro model instance which processes the text in batches of maximum 64 tokens. 
- * Uses significant less memory than the medium model, but could produce - * a lower quality speech due to forced, aggressive text splitting. + * A standard Kokoro instance which processes the text in batches of maximum 128 tokens. + * Works well with built-in languages: english, spanish, french, italian, portugese and hindi. * @category Models - Text to Speech */ -export const KOKORO_SMALL = { - modelName: 'kokoro-small' as const, - durationPredictorSource: `${KOKORO_EN_SMALL_MODELS_ROOT}/duration_predictor.pte`, - synthesizerSource: `${KOKORO_EN_SMALL_MODELS_ROOT}/synthesizer.pte`, +export const KOKORO_STANDARD = { + modelName: 'kokoro' as const, + durationPredictorSource: `${KOKORO_STANDARD_MODEL_ROOT}/duration_predictor_std.pte`, + synthesizerSource: `${KOKORO_STANDARD_MODEL_ROOT}/synthesizer_std.pte`, }; /** - * A standard Kokoro instance which processes the text in batches of maximum 128 tokens. + * A fine-tuned Kokoro instance for Polish. * @category Models - Text to Speech */ -export const KOKORO_MEDIUM = { - modelName: 'kokoro-medium' as const, - durationPredictorSource: `${KOKORO_EN_MEDIUM_MODELS_ROOT}/duration_predictor.pte`, - synthesizerSource: `${KOKORO_EN_MEDIUM_MODELS_ROOT}/synthesizer.pte`, +export const KOKORO_POLISH = { + modelName: 'kokoro' as const, + durationPredictorSource: `${KOKORO_POLISH_MODEL_ROOT}/duration_predictor_pl.pte`, + synthesizerSource: `${KOKORO_POLISH_MODEL_ROOT}/synthesizer_pl.pte`, }; diff --git a/packages/react-native-executorch/src/constants/tts/voices.ts b/packages/react-native-executorch/src/constants/tts/voices.ts index cb98616906..c1bcb7116e 100644 --- a/packages/react-native-executorch/src/constants/tts/voices.ts +++ b/packages/react-native-executorch/src/constants/tts/voices.ts @@ -1,84 +1,286 @@ -import { KokoroVoiceExtras, VoiceConfig } from '../../types/tts'; -import { URL_PREFIX, VERSION_TAG } from '../versions'; - -// Kokoro voices - phonemizers -const KOKORO_PHONEMIZER_PREFIX = 
`${URL_PREFIX}-kokoro/${VERSION_TAG}/phonemizer`; -const KOKORO_PHONEMIZER_TAGGER_DATA = `${KOKORO_PHONEMIZER_PREFIX}/tags.json`; -const KOKORO_PHONEMIZER_LEXICON_EN_US_DATA = `${KOKORO_PHONEMIZER_PREFIX}/us_merged.json`; -const KOKORO_PHONEMIZER_LEXICON_EN_GB_DATA = `${KOKORO_PHONEMIZER_PREFIX}/gb_merged.json`; - -const EN_US_RESOURCES = { - taggerSource: KOKORO_PHONEMIZER_TAGGER_DATA, - lexiconSource: KOKORO_PHONEMIZER_LEXICON_EN_US_DATA, -} as KokoroVoiceExtras; -const EN_GB_RESOURCES = { - taggerSource: KOKORO_PHONEMIZER_TAGGER_DATA, - lexiconSource: KOKORO_PHONEMIZER_LEXICON_EN_GB_DATA, -} as KokoroVoiceExtras; +import { TextToSpeechModelConfig } from '../../types/tts'; +import { NEXT_VERSION_TAG, URL_PREFIX } from '../versions'; +import { KOKORO_STANDARD, KOKORO_POLISH } from './models'; + +// Common prefixes - voices & phonemization data +const KOKORO_VOICE_PREFIX = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/voices`; +const KOKORO_PHONEMIZER_PREFIX = `${URL_PREFIX}-kokoro/${NEXT_VERSION_TAG}/phonemizer`; + +const KOKORO_PHONEMIZER_EN_US_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/en-us`; +const KOKORO_PHONEMIZER_EN_US_TAGGER = `${KOKORO_PHONEMIZER_EN_US_PREFIX}/tags.json`; +const KOKORO_PHONEMIZER_EN_US_LEXICON = `${KOKORO_PHONEMIZER_EN_US_PREFIX}/lexicon.json`; +const KOKORO_PHONEMIZER_EN_US_MODEL = `${KOKORO_PHONEMIZER_EN_US_PREFIX}/phonemizer_en_us.pte`; + +const KOKORO_PHONEMIZER_EN_GB_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/en-gb`; +const KOKORO_PHONEMIZER_EN_GB_TAGGER = `${KOKORO_PHONEMIZER_EN_GB_PREFIX}/tags.json`; +const KOKORO_PHONEMIZER_EN_GB_LEXICON = `${KOKORO_PHONEMIZER_EN_GB_PREFIX}/lexicon.json`; +const KOKORO_PHONEMIZER_EN_GB_MODEL = `${KOKORO_PHONEMIZER_EN_GB_PREFIX}/phonemizer_en_gb.pte`; + +// French +const KOKORO_PHONEMIZER_FR_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/fr`; +const KOKORO_PHONEMIZER_FR_MODEL = `${KOKORO_PHONEMIZER_FR_PREFIX}/phonemizer_fr.pte`; + +// Spanish +const KOKORO_PHONEMIZER_ES_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/es`; +const 
KOKORO_PHONEMIZER_ES_MODEL = `${KOKORO_PHONEMIZER_ES_PREFIX}/phonemizer_es.pte`; + +// Italian +const KOKORO_PHONEMIZER_IT_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/it`; +const KOKORO_PHONEMIZER_IT_MODEL = `${KOKORO_PHONEMIZER_IT_PREFIX}/phonemizer_it.pte`; + +// Portuguese +const KOKORO_PHONEMIZER_PT_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/pt`; +const KOKORO_PHONEMIZER_PT_MODEL = `${KOKORO_PHONEMIZER_PT_PREFIX}/phonemizer_pt.pte`; + +// Polish +const KOKORO_PHONEMIZER_PL_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/pl`; +const KOKORO_PHONEMIZER_PL_MODEL = `${KOKORO_PHONEMIZER_PL_PREFIX}/phonemizer_pl.pte`; + +// Hindi +const KOKORO_PHONEMIZER_HI_PREFIX = `${KOKORO_PHONEMIZER_PREFIX}/hi`; +const KOKORO_PHONEMIZER_HI_MODEL = `${KOKORO_PHONEMIZER_HI_PREFIX}/phonemizer_hi.pte`; // Kokoro voices -const KOKORO_VOICE_PREFIX = `${URL_PREFIX}-kokoro/${VERSION_TAG}/voices`; /** * @category TTS Supported Voices */ -export const KOKORO_VOICE_AF_HEART = { - lang: 'en-us' as const, +export const KOKORO_AMERICAN_ENGLISH_FEMALE_HEART = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/af_heart.bin`, - extra: EN_US_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-us' as const, + taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL, + }, +} as TextToSpeechModelConfig; + /** * @category TTS Supported Voices */ -export const KOKORO_VOICE_AF_RIVER = { - lang: 'en-us' as const, +export const KOKORO_AMERICAN_ENGLISH_FEMALE_RIVER = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/af_river.bin`, - extra: EN_US_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-us' as const, + taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL, + }, +} as TextToSpeechModelConfig; + /** * @category TTS Supported Voices */ -export const KOKORO_VOICE_AF_SARAH = { - lang: 
'en-us' as const, +export const KOKORO_AMERICAN_ENGLISH_FEMALE_SARAH = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/af_sarah.bin`, - extra: EN_US_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-us' as const, + taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL, + }, +} as TextToSpeechModelConfig; + /** * @category TTS Supported Voices */ -export const KOKORO_VOICE_AM_ADAM = { - lang: 'en-us' as const, +export const KOKORO_AMERICAN_ENGLISH_MALE_ADAM = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/am_adam.bin`, - extra: EN_US_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-us' as const, + taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL, + }, +} as TextToSpeechModelConfig; + /** * @category TTS Supported Voices */ -export const KOKORO_VOICE_AM_MICHAEL = { - lang: 'en-us' as const, +export const KOKORO_AMERICAN_ENGLISH_MALE_MICHAEL = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/am_michael.bin`, - extra: EN_US_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-us' as const, + taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL, + }, +} as TextToSpeechModelConfig; + /** * @category TTS Supported Voices */ -export const KOKORO_VOICE_AM_SANTA = { - lang: 'en-us' as const, +export const KOKORO_AMERICAN_ENGLISH_MALE_SANTA = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/am_santa.bin`, - extra: EN_US_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-us' as const, + taggerSource: KOKORO_PHONEMIZER_EN_US_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_US_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_US_MODEL, + }, +} as TextToSpeechModelConfig; + /** * 
@category TTS Supported Voices */ -export const KOKORO_VOICE_BF_EMMA = { - lang: 'en-gb' as const, +export const KOKORO_BRITISH_ENGLISH_FEMALE_EMMA = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/bf_emma.bin`, - extra: EN_GB_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-gb' as const, + taggerSource: KOKORO_PHONEMIZER_EN_GB_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_GB_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_GB_MODEL, + }, +} as TextToSpeechModelConfig; + /** * @category TTS Supported Voices */ -export const KOKORO_VOICE_BM_DANIEL = { - lang: 'en-gb' as const, +export const KOKORO_BRITISH_ENGLISH_MALE_DANIEL = { + model: KOKORO_STANDARD, voiceSource: `${KOKORO_VOICE_PREFIX}/bm_daniel.bin`, - extra: EN_GB_RESOURCES, -} as VoiceConfig; + phonemizerConfig: { + lang: 'en-gb' as const, + taggerSource: KOKORO_PHONEMIZER_EN_GB_TAGGER, + lexiconSource: KOKORO_PHONEMIZER_EN_GB_LEXICON, + neuralModelSource: KOKORO_PHONEMIZER_EN_GB_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_FRENCH_FEMALE_SIWIS = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/ff_siwis.bin`, + phonemizerConfig: { + lang: 'fr' as const, + neuralModelSource: KOKORO_PHONEMIZER_FR_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_SPANISH_FEMALE_DORA = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/ef_dora.bin`, + phonemizerConfig: { + lang: 'es' as const, + neuralModelSource: KOKORO_PHONEMIZER_ES_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_SPANISH_MALE_ALEX = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/em_alex.bin`, + phonemizerConfig: { + lang: 'es' as const, + neuralModelSource: KOKORO_PHONEMIZER_ES_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const 
KOKORO_ITALIAN_FEMALE_SARA = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/if_sara.bin`, + phonemizerConfig: { + lang: 'it' as const, + neuralModelSource: KOKORO_PHONEMIZER_IT_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_ITALIAN_MALE_NICOLA = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/im_nicola.bin`, + phonemizerConfig: { + lang: 'it' as const, + neuralModelSource: KOKORO_PHONEMIZER_IT_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_PORTUGUESE_FEMALE_DORA = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/pf_dora.bin`, + phonemizerConfig: { + lang: 'pt' as const, + neuralModelSource: KOKORO_PHONEMIZER_PT_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_PORTUGUESE_MALE_SANTA = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/pm_santa.bin`, + phonemizerConfig: { + lang: 'pt' as const, + neuralModelSource: KOKORO_PHONEMIZER_PT_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_HINDI_FEMALE_ALPHA = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/hf_alpha.bin`, + phonemizerConfig: { + lang: 'hi' as const, + neuralModelSource: KOKORO_PHONEMIZER_HI_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_HINDI_MALE_OMEGA = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/hm_omega.bin`, + phonemizerConfig: { + lang: 'hi' as const, + neuralModelSource: KOKORO_PHONEMIZER_HI_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_HINDI_MALE_PSI = { + model: KOKORO_STANDARD, + voiceSource: `${KOKORO_VOICE_PREFIX}/hm_psi.bin`, + phonemizerConfig: { + lang: 'hi' as const, + neuralModelSource: 
KOKORO_PHONEMIZER_HI_MODEL, + }, +} as TextToSpeechModelConfig; + +/** + * @category TTS Supported Voices + */ +export const KOKORO_POLISH_MALE_MATEUSZ = { + model: KOKORO_POLISH, + voiceSource: `${KOKORO_VOICE_PREFIX}/pm_mateusz.bin`, + phonemizerConfig: { + lang: 'pl' as const, + neuralModelSource: KOKORO_PHONEMIZER_PL_MODEL, + }, +} as TextToSpeechModelConfig; diff --git a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts index 70ecc3e73f..6e034693af 100644 --- a/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts +++ b/packages/react-native-executorch/src/hooks/natural_language_processing/useTextToSpeech.ts @@ -1,12 +1,10 @@ import { useCallback, useEffect, useState } from 'react'; import { TextToSpeechModule } from '../../modules/natural_language_processing/TextToSpeechModule'; import { - TextToSpeechProps, TextToSpeechInput, - TextToSpeechPhonemeInput, - TextToSpeechType, + TextToSpeechModelConfig, TextToSpeechStreamingInput, - TextToSpeechStreamingPhonemeInput, + TextToSpeechType, } from '../../types/tts'; import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; @@ -14,14 +12,15 @@ import { RnExecutorchError, parseUnknownError } from '../../errors/errorUtils'; /** * React hook for managing Text to Speech instance. * @category Hooks - * @param TextToSpeechProps - Configuration object containing `model` source, `voice` and optional `preventLoad`. + * @param model - Configuration object containing model config. + * @param options - Additional options for the hook. + * @param options.preventLoad - If true, prevents the model from loading automatically on initialization. * @returns Ready to use Text to Speech model. 
*/ -export const useTextToSpeech = ({ - model, - voice, - preventLoad = false, -}: TextToSpeechProps): TextToSpeechType => { +export const useTextToSpeech = ( + model: TextToSpeechModelConfig, + { preventLoad = false }: { preventLoad?: boolean } = {} +): TextToSpeechType => { const [error, setError] = useState(null); const [isReady, setIsReady] = useState(false); const [isGenerating, setIsGenerating] = useState(false); @@ -38,7 +37,7 @@ export const useTextToSpeech = ({ setError(null); setIsReady(false); - TextToSpeechModule.fromModelName({ model, voice }, setDownloadProgress) + TextToSpeechModule.fromModelName(model, setDownloadProgress) .then((mod) => { if (!active) { mod.delete(); @@ -57,21 +56,21 @@ export const useTextToSpeech = ({ return () => { active = false; setModuleInstance((prev) => { + prev?.streamStop(true); prev?.delete(); return null; }); }; // eslint-disable-next-line react-hooks/exhaustive-deps }, [ - model.modelName, - model.durationPredictorSource, - model.synthesizerSource, - voice?.voiceSource, - voice?.extra, + model.model.modelName, + model.model.durationPredictorSource, + model.model.synthesizerSource, + model.voiceSource, + model.phonemizerConfig, preventLoad, ]); - // Shared guard for all generation methods const guardReady = useCallback( (methodName: string): TextToSpeechModule => { if (!isReady || !moduleInstance) @@ -93,19 +92,10 @@ export const useTextToSpeech = ({ const instance = guardReady('forward'); try { setIsGenerating(true); - return await instance.forward(input.text ?? '', input.speed ?? 1.0); - } finally { - setIsGenerating(false); - } - }; - - const forwardFromPhonemes = async (input: TextToSpeechPhonemeInput) => { - const instance = guardReady('forwardFromPhonemes'); - try { - setIsGenerating(true); - return await instance.forwardFromPhonemes( - input.phonemes ?? '', - input.speed ?? 1.0 + return await instance.forward( + input.text ?? '', + input.speed ?? 1.0, + input.phonemize ?? 
true ); } finally { setIsGenerating(false); @@ -118,8 +108,6 @@ export const useTextToSpeech = ({ setIsGenerating(true); try { if (input.text) { - // If the initial text does not end with an end of sentence character, - // we add an artificial dot to improve output's quality. instance.streamInsert( input.text + ('.?!;'.includes(input.text.trim().slice(-1)) ? '' : '.') @@ -129,34 +117,16 @@ export const useTextToSpeech = ({ await input.onBegin?.(); for await (const audio of instance.stream({ speed: input.speed ?? 1.0, + phonemize: input.phonemize ?? true, stopAutomatically: input.stopAutomatically ?? true, })) { if (input.onNext) { await input.onNext(audio); } } - } finally { - await input.onEnd?.(); - setIsGenerating(false); - } - }, - [guardReady] - ); - - const streamFromPhonemes = useCallback( - async (input: TextToSpeechStreamingPhonemeInput) => { - const instance = guardReady('streamFromPhonemes'); - setIsGenerating(true); - try { - await input.onBegin?.(); - for await (const audio of instance.streamFromPhonemes({ - phonemes: input.phonemes ?? '', - speed: input.speed ?? 
1.0, - })) { - if (input.onNext) { - await input.onNext(audio); - } - } + } catch (e) { + instance.streamStop(true); + throw e; } finally { await input.onEnd?.(); setIsGenerating(false); @@ -188,9 +158,7 @@ export const useTextToSpeech = ({ isReady, isGenerating, forward, - forwardFromPhonemes, stream, - streamFromPhonemes, streamInsert, streamStop, downloadProgress, diff --git a/packages/react-native-executorch/src/index.ts b/packages/react-native-executorch/src/index.ts index 96d167a7d2..911e555045 100644 --- a/packages/react-native-executorch/src/index.ts +++ b/packages/react-native-executorch/src/index.ts @@ -100,7 +100,8 @@ declare global { var loadTextToSpeechKokoro: ( lang: string, taggerData: string, - phonemizerData: string, + lexiconData: string, + neuralPhonemizerData: string, durationPredictorSource: string, synthesizerSource: string, voice: string diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts index 6ab28543c6..d59fa56e30 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/TextToSpeechModule.ts @@ -1,12 +1,11 @@ import { RnExecutorchErrorCode } from '../../errors/ErrorCodes'; import { parseUnknownError, RnExecutorchError } from '../../errors/errorUtils'; import { ResourceFetcher } from '../../utils/ResourceFetcher'; +import { ResourceSource } from '../../types/common'; import { - KokoroConfig, - TextToSpeechConfig, + TextToSpeechModelConfig, + TextToSpeechModelSources, TextToSpeechStreamingInput, - TextToSpeechStreamingPhonemeInput, - VoiceConfig, } from '../../types/tts'; import { Logger } from '../../common/Logger'; @@ -24,26 +23,17 @@ export class TextToSpeechModule { /** * Creates a Text to Speech instance. 
- * @param config - Configuration object containing `model` and `voice`. - * Pass one of the built-in constants (e.g. `{ model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART }`), or use require() to pass them. + * @param config - Configuration object containing model and voice sources. * @param onDownloadProgress - Optional callback to monitor download progress, receiving a value between 0 and 1. * @returns A Promise resolving to a `TextToSpeechModule` instance. - * @example - * ```ts - * import { TextToSpeechModule, KOKORO_MEDIUM, KOKORO_VOICE_AF_HEART } from 'react-native-executorch'; - * const tts = await TextToSpeechModule.fromModelName( - * { model: KOKORO_MEDIUM, voice: KOKORO_VOICE_AF_HEART }, - * ); - * ``` */ static async fromModelName( - config: TextToSpeechConfig, + config: TextToSpeechModelConfig, onDownloadProgress: (progress: number) => void = () => {} ): Promise { try { const nativeModule = await TextToSpeechModule.loadKokoro( - config.model, - config.voice, + config, onDownloadProgress ); return new TextToSpeechModule(nativeModule); @@ -54,48 +44,52 @@ export class TextToSpeechModule { } private static async loadKokoro( - model: KokoroConfig, - voice: VoiceConfig, + config: TextToSpeechModelConfig, onDownloadProgressCallback: (progress: number) => void ): Promise { - if ( - !voice.extra || - !voice.extra.taggerSource || - !voice.extra.lexiconSource - ) { - throw new RnExecutorchError( - RnExecutorchErrorCode.InvalidConfig, - 'Kokoro: voice config is missing required extra fields: taggerSource and/or lexiconSource.' - ); - } + const { model, voiceSource, phonemizerConfig } = config; + const kokoroModel = model as Extract< + TextToSpeechModelSources, + { modelName: 'kokoro' } + >; + + const sources: ResourceSource[] = [ + kokoroModel.durationPredictorSource, + kokoroModel.synthesizerSource, + voiceSource, + ]; + + // Since each of these args is optional, we need to handle the sources array in a dynamic way. 
+ const taggerIdx = phonemizerConfig.taggerSource + ? sources.push(phonemizerConfig.taggerSource) - 1 + : -1; + const lexiconIdx = phonemizerConfig.lexiconSource + ? sources.push(phonemizerConfig.lexiconSource) - 1 + : -1; + const neuralModelIdx = phonemizerConfig.neuralModelSource + ? sources.push(phonemizerConfig.neuralModelSource) - 1 + : -1; const paths = await ResourceFetcher.fetch( onDownloadProgressCallback, - model.durationPredictorSource, - model.synthesizerSource, - voice.voiceSource, - voice.extra.taggerSource, - voice.extra.lexiconSource + ...sources ); - if (paths === null || paths.length !== 5) { + if (paths === null || paths.length !== sources.length) { throw new RnExecutorchError( RnExecutorchErrorCode.DownloadInterrupted, 'Download interrupted or missing resource.' ); } - const modelPaths = paths.slice(0, 2) as [string, string]; - const voiceDataPath = paths[2] as string; - const phonemizerPaths = paths.slice(3, 5) as [string, string]; - return await global.loadTextToSpeechKokoro( - voice.lang, - phonemizerPaths[0], - phonemizerPaths[1], - modelPaths[0], - modelPaths[1], - voiceDataPath + phonemizerConfig.lang, + taggerIdx >= 0 ? (paths[taggerIdx] as string) : '', + lexiconIdx >= 0 ? (paths[lexiconIdx] as string) : '', + neuralModelIdx >= 0 ? (paths[neuralModelIdx] as string) : '', + paths[0] as string, // DurationPredictor source + paths[1] as string, // Synthesizer source + paths[2] as string // Voice source ); } @@ -108,47 +102,33 @@ export class TextToSpeechModule { } /** - * Synthesizes the provided text into speech. - * Returns a promise that resolves to the full audio waveform as a `Float32Array`. - * @param text The input text to be synthesized. - * @param speed Optional speed multiplier for the speech synthesis (default is 1.0). - * @returns A promise resolving to the synthesized audio waveform. + * Synthesizes the provided input (text or IPA phonemes) into speech. + * @param input - The input text or phonemes to be synthesized. 
+ * @param speed - Playback speed multiplier (default: 1.0). + * @param phonemize - If true (default), treats input as text and converts it to phonemes. + * If false, input is treated as phonemes. + * @returns A promise resolving to the full audio waveform as a `Float32Array`. */ public async forward( - text: string, - speed: number = 1.0 + input: string, + speed: number = 1.0, + phonemize: boolean = true ): Promise { this.ensureLoaded('forward'); - return await this.nativeModule.generate(text, speed); - } - - /** - * Synthesizes pre-computed phonemes into speech, bypassing the built-in phonemizer. - * This allows using an external G2P system (e.g. the Python `phonemizer` library, - * espeak-ng, or any custom phonemizer). - * @param phonemes The pre-computed IPA phoneme string. - * @param speed Optional speed multiplier for the speech synthesis (default is 1.0). - * @returns A promise resolving to the synthesized audio waveform. - */ - public async forwardFromPhonemes( - phonemes: string, - speed: number = 1.0 - ): Promise { - this.ensureLoaded('forwardFromPhonemes'); - return await this.nativeModule.generateFromPhonemes(phonemes, speed); + return await this.nativeModule.generate(input, speed, phonemize); } /** * Starts a streaming synthesis session. Yields audio chunks as they are generated. - * @param input - Input object containing text and optional speed. + * @param input - Input object containing optional speed, phonemize flag and stopAutomatically flag. * @yields An audio chunk generated during synthesis. * @returns An async generator yielding Float32Array audio chunks. 
*/ public async *stream({ - speed, - stopAutomatically, + speed = 1.0, + phonemize = true, + stopAutomatically = true, }: TextToSpeechStreamingInput): AsyncGenerator { - // Stores computed audio segments const queue: Float32Array[] = []; let waiter: (() => void) | null = null; @@ -165,66 +145,13 @@ export class TextToSpeechModule { (async () => { try { await this.nativeModule.stream( - speed, - stopAutomatically, (audio: number[]) => { queue.push(new Float32Array(audio)); wake(); - } - ); - nativeStreamFinished = true; - wake(); - } catch (e) { - error = e; - nativeStreamFinished = true; - wake(); - } - })(); - - while (this.isStreaming) { - if (queue.length > 0) { - yield queue.shift()!; - if (nativeStreamFinished && queue.length === 0) { - return; - } - continue; - } - if (error) throw error; - await new Promise((r) => (waiter = r)); - } - } - - /** - * Starts a streaming synthesis session from pre-computed phonemes. - * Bypasses the built-in phonemizer, allowing use of external G2P systems. - * @param input - Input object containing phonemes and optional speed. - * @yields An audio chunk generated during synthesis. - * @returns An async generator yielding Float32Array audio chunks. 
- */ - public async *streamFromPhonemes({ - phonemes, - speed, - }: TextToSpeechStreamingPhonemeInput): AsyncGenerator { - const queue: Float32Array[] = []; - - let waiter: (() => void) | null = null; - let error: unknown; - let nativeStreamFinished = false; - - const wake = () => { - waiter?.(); - waiter = null; - }; - - (async () => { - try { - await this.nativeModule.streamFromPhonemes( - phonemes, + }, speed, - (audio: number[]) => { - queue.push(new Float32Array(audio)); - wake(); - } + phonemize, + stopAutomatically ); nativeStreamFinished = true; wake(); @@ -244,16 +171,17 @@ export class TextToSpeechModule { continue; } if (error) throw error; + if (nativeStreamFinished && queue.length === 0) return; await new Promise((r) => (waiter = r)); } } /** - * Inserts new text chunk into the buffer to be processed in streaming mode. - * @param textChunk - The text fragment to append to the streaming buffer. + * Inserts new content (text or IPA phonemes) into the buffer to be processed in streaming mode. + * @param input - The text or phoneme fragment to append to the streaming buffer. */ - public streamInsert(textChunk: string): void { - this.nativeModule.streamInsert(textChunk); + public streamInsert(input: string): void { + this.nativeModule.streamInsert(input); } /** diff --git a/packages/react-native-executorch/src/types/tts.ts b/packages/react-native-executorch/src/types/tts.ts index 82a5a5471c..278eb18c30 100644 --- a/packages/react-native-executorch/src/types/tts.ts +++ b/packages/react-native-executorch/src/types/tts.ts @@ -1,11 +1,22 @@ import { ResourceSource } from './common'; import { RnExecutorchError } from '../errors/errorUtils'; +/** + * Per-model config for {@link TextToSpeechModule.fromModelName}. + * Each model name maps to its required fields. 
+ * @category Types + */ +export type TextToSpeechModelSources = { + modelName: 'kokoro'; + durationPredictorSource: ResourceSource; + synthesizerSource: ResourceSource; +}; + /** * Union of all built-in Text to Speech model names. * @category Types */ -export type TextToSpeechModelName = 'kokoro-small' | 'kokoro-medium'; +export type TextToSpeechModelName = TextToSpeechModelSources['modelName']; /** * List all the languages available in TTS models (as lang shorthands) @@ -13,68 +24,56 @@ export type TextToSpeechModelName = 'kokoro-small' | 'kokoro-medium'; */ export type TextToSpeechLanguage = | 'en-us' // American English - | 'en-gb'; // British English + | 'en-gb' // British English + | 'fr' // French + | 'es' // Spanish + | 'it' // Italian + | 'pt' // Portuguese + | 'pl' // Polish + | 'hi'; // Hindi /** - * Voice configuration - * - * So far in Kokoro, each voice is directly associated with a language. + * Configuration for the Phonemizer used in Text-to-Speech models. + * Phonemization is the process of converting text into phonetic representations. * @category Types - * @property {TextToSpeechLanguage} lang - speaker's language - * @property {ResourceSource} voiceSource - a source to a binary file with voice embedding - * @property {KokoroVoiceExtras} [extra] - an optional extra sources or properties related to specific voice */ -export interface VoiceConfig { +export interface TextToSpeechPhonemizerConfig { + /** + * The language code for phonemization (e.g., 'en-us'). + */ lang: TextToSpeechLanguage; - voiceSource: ResourceSource; - extra?: KokoroVoiceExtras; // ... 
add more possible types -} -/** - * Kokoro-specific voice extra props - * @category Types - * @property {ResourceSource} taggerSource - source to Kokoro's tagger model binary - * @property {ResourceSource} lexiconSource - source to Kokoro's lexicon binary - */ -export interface KokoroVoiceExtras { - taggerSource: ResourceSource; - lexiconSource: ResourceSource; -} + /** + * Optional resource for the part-of-speech tagger. + * Utilized by more challenging languages, such as English. + */ + taggerSource?: ResourceSource; -/** - * Kokoro model configuration. - * Only the core Kokoro model sources, as phonemizer sources are included in voice configuration. - * @category Types - * @property {TextToSpeechModelName} modelName - model name identifier - * @property {ResourceSource} durationPredictorSource - source to Kokoro's duration predictor model binary - * @property {ResourceSource} synthesizerSource - source to Kokoro's synthesizer model binary - */ -export interface KokoroConfig { - modelName: TextToSpeechModelName; - durationPredictorSource: ResourceSource; - synthesizerSource: ResourceSource; -} + /** + * Optional resource for the pronunciation lexicon. + * If provided, it will be a primary phonemization mechanism. + */ + lexiconSource?: ResourceSource; -/** - * General Text to Speech module configuration - * @category Types - * @property {KokoroConfig} model - a selected T2S model - * @property {VoiceConfig} voice - a selected speaker's voice - * @property {KokoroOptions} [options] - a completely optional model-specific configuration - */ -export interface TextToSpeechConfig { - model: KokoroConfig; // ... add other model types in the future - voice: VoiceConfig; + /** + * Optional neural model resource for Grapheme-to-Phoneme conversion. + * Serves as a fallback for lexicon or a primary phonemization mechanism if lexicon + * is not defined. + */ + neuralModelSource?: ResourceSource; } /** - * Props for the useTextToSpeech hook. 
+ * Configuration for a specific model and voice in a Text-to-Speech module. * @category Types - * @augments TextToSpeechConfig - * @property {boolean} [preventLoad] - Boolean that can prevent automatic model loading (and downloading the data if you load it for the first time) after running the hook. + * @property {TextToSpeechModelSources} model - The model sources and identifiers. + * @property {ResourceSource} voiceSource - The resource containing the voice-specific tensor stored in a binary format. + * @property {TextToSpeechPhonemizerConfig} phonemizerConfig - The phonemizer configuration to be used with this voice. */ -export interface TextToSpeechProps extends TextToSpeechConfig { - preventLoad?: boolean; +export interface TextToSpeechModelConfig { + model: TextToSpeechModelSources; + voiceSource: ResourceSource; + phonemizerConfig: TextToSpeechPhonemizerConfig; } /** @@ -82,24 +81,13 @@ export interface TextToSpeechProps extends TextToSpeechConfig { * @category Types * @property {string} text - a text to be spoken * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes + * @property {boolean} [phonemize] - if true (default), the input is treated as text and converted to phonemes. + * If false, the input should already be in IPA phonemes. */ export interface TextToSpeechInput { text?: string; speed?: number; -} - -/** - * Text to Speech module input for pre-computed phonemes. - * Use this when you have your own phonemizer (e.g. the Python `phonemizer` - * library, espeak-ng, or any custom G2P system) and want to bypass the - * built-in phonemizer pipeline. 
- * @category Types - * @property {string} phonemes - pre-computed IPA phoneme string - * @property {number} [speed] - optional speed argument - the higher it is, the faster the speech becomes - */ -export interface TextToSpeechPhonemeInput { - phonemes: string; - speed?: number; + phonemize?: boolean; } /** @@ -136,17 +124,6 @@ export interface TextToSpeechType { */ forward: (input: TextToSpeechInput) => Promise; - /** - * Synthesizes pre-computed phonemes into speech audio in a single pass. - * Bypasses the built-in phonemizer, allowing use of external G2P systems. - * @param input - The `TextToSpeechPhonemeInput` object containing pre-computed `phonemes` and optional `speed`. - * @returns A Promise that resolves with the generated audio data. - * @throws {RnExecutorchError} If the model is not loaded or is currently generating. - */ - forwardFromPhonemes: ( - input: TextToSpeechPhonemeInput - ) => Promise; - /** * Streams the generated audio data incrementally. * This is optimal for real-time playback, allowing audio to start playing before the full text is synthesized. @@ -156,16 +133,6 @@ export interface TextToSpeechType { */ stream: (input: TextToSpeechStreamingInput) => Promise; - /** - * Streams pre-computed phonemes incrementally, bypassing the built-in phonemizer. - * @param input - The streaming input with pre-computed `phonemes` instead of `text`. - * @returns A Promise that resolves when the streaming process is complete. - * @throws {RnExecutorchError} If the model is not loaded or is currently generating. - */ - streamFromPhonemes: ( - input: TextToSpeechStreamingPhonemeInput - ) => Promise; - /** * Inserts new text chunk into the buffer to be processed in streaming mode. */ @@ -209,11 +176,3 @@ export interface TextToSpeechStreamingInput extends TextToSpeechInput, TextToSpeechStreamingCallbacks { stopAutomatically?: boolean; } - -/** - * Streaming input definition for pre-computed phonemes. 
- * Same as `TextToSpeechStreamingInput` but accepts `phonemes` instead of `text`. - * @category Types - */ -export interface TextToSpeechStreamingPhonemeInput - extends TextToSpeechPhonemeInput, TextToSpeechStreamingCallbacks {} diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a deleted file mode 100644 index 5a38707580..0000000000 Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/arm64-v8a/libphonemis.a and /dev/null differ diff --git a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a b/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a deleted file mode 100644 index 2306d4647a..0000000000 Binary files a/packages/react-native-executorch/third-party/android/libs/phonemis/x86_64/libphonemis.a and /dev/null differ diff --git a/packages/react-native-executorch/third-party/common/phonemis b/packages/react-native-executorch/third-party/common/phonemis new file mode 160000 index 0000000000..2da5ef9971 --- /dev/null +++ b/packages/react-native-executorch/third-party/common/phonemis @@ -0,0 +1 @@ +Subproject commit 2da5ef9971fe0e2d92ebe1424c28905a18268a7d diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h deleted file mode 100644 index 3af4268211..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/lexicon.h +++ /dev/null @@ -1,60 +0,0 @@ -#pragma once - -#include "../tagger/tag.h" -#include "types.h" -#include -#include -#include - -namespace phonemis::phonemizer { - -// Lexicon class -// Provides phonemization of extracted tokens. -// Wrapps a dictionary lookup for given word with additional -// pre/post-processing. 
-class Lexicon { -public: - Lexicon(Lang language, const std::string &dict_filepath); - - // Checks if given world exists in the lexicon in any form - bool is_known(const std::string &word) const; - - // Returns the phonemization for given word, or "" if the phonemization failed - std::u32string get(const std::string &word, const tagger::Tag &tag, - std::optional base_stress = std::nullopt, - std::optional vowel_next = std::nullopt); - -private: - // Helper functions - extract phonemes without stressing - std::u32string get_word(const std::string &word, const tagger::Tag &tag, - std::optional stress, - std::optional vowel_next) const; - - // Helper functions - word+suffix phonemization - // Phonemizes word ending with popular english suffixes, example: -ed, -s, - // -ing. - std::u32string stem_s(const std::string &word, const tagger::Tag &tag, - std::optional stress) const; - std::u32string stem_ed(const std::string &word, const tagger::Tag &tag, - std::optional stress) const; - std::u32string stem_ing(const std::string &word, const tagger::Tag &tag, - std::optional stress) const; - - // Helper functions - dictionary lookup with stressing - // Returns an empty phoneme string if failed to extract phonemes. - std::u32string lookup(const std::string &word, const tagger::Tag &tag, - std::optional stress) const; - std::u32string lookup_nnp(const std::string &word) const; - std::u32string lookup_special(const std::string &word, const tagger::Tag &tag, - std::optional stress, - std::optional vowel_next) const; - - // Resolved language - Lang language_; - - // Lookup dictionary: text -> phonemes - // Provide quick and direct phonemization for popular words. 
- std::unordered_map dict_ = {}; -}; - -} // namespace phonemis::phonemizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h deleted file mode 100644 index 27f993939c..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/phonemizer.h +++ /dev/null @@ -1,29 +0,0 @@ -#pragma once - -#include "lexicon.h" -#include -#include - -namespace phonemis::phonemizer { - -// Phonemizer class -// Combines lexicon lookup-style phonemization with rule-based fallback -class Phonemizer { -public: - Phonemizer(Lang language, const std::string &lexicon_filepath = ""); - - // Main phonemization method - std::u32string phonemize(const std::string &word, const tagger::Tag &tag, - std::optional base_stress = std::nullopt, - std::optional vowel_next = std::nullopt) const; - -private: - // Helper functions - rule-based fallback methods - std::u32string fallback(const std::string &word, - const tagger::Tag &tag) const; - - // Lexicon component - std::unique_ptr lexicon_ = nullptr; -}; - -} // namespace phonemis::phonemizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h b/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h deleted file mode 100644 index 7e6e8b4bcb..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/phonemizer/types.h +++ /dev/null @@ -1,13 +0,0 @@ -#pragma once - -namespace phonemis::phonemizer { - -// Available languages (english variants) -enum class Lang { - EN_US, - EN_GB, - - DEFAULT = EN_US -}; - -} // namespace phonemis::phonemizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/pipeline.h b/packages/react-native-executorch/third-party/include/phonemis/pipeline.h deleted file mode 
100644 index e8fdf35e31..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/pipeline.h +++ /dev/null @@ -1,35 +0,0 @@ -#pragma once - -#include "phonemizer/phonemizer.h" -#include "preprocessor/tools.h" -#include "tagger/tagger.h" -#include "tokenizer/tokenize.h" -#include - -namespace phonemis { - -using phonemizer::Lang; -using phonemizer::Phonemizer; -using tagger::Tagger; - -// #### Main phonemization pipeline -// Manages all the phonemization parts, from preprocessing, through -// tokenization and tagging to final Phonemizer call. -// Tagger and Lexicon .json data files are theoretically optional, but -// skipping these arguments will significantly impact the phonemization quality. -class Pipeline { -public: - Pipeline(Lang language, const std::string &tagger_data_filepath = "", - const std::string &lexicon_data_filepath = ""); - - std::u32string process(const std::string &text); - -private: - Lang language_; - - // Pipeline subcomponents - std::unique_ptr phonemizer_ = nullptr; - std::unique_ptr tagger_ = nullptr; -}; - -} // namespace phonemis \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h b/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h deleted file mode 100644 index 9f77ba43de..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/preprocessor/tools.h +++ /dev/null @@ -1,21 +0,0 @@ -#pragma once - -#include -#include - -namespace phonemis::preprocessor { - -// Normalizes the text by replacing all foreign characters -// to latin-only phrases. -std::string normalize_unicode(const std::string &text); - -// Divides a monolit text into multiple sentences. -// A sentence always ends with a end of sentence character (defined in -// constants.h). -std::vector split_sentences(const std::string &text); - -// Converts all the numbers in the text to spoken representations. 
-// Usually expands the size of the text. -std::string verbalize_numbers(const std::string &text); - -} // namespace phonemis::preprocessor \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h b/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h deleted file mode 100644 index ba59af4e9b..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/tagger/tag.h +++ /dev/null @@ -1,49 +0,0 @@ -#pragma once - -#include "../utilities/string_utils.h" -#include - -namespace phonemis::tagger { - -using namespace utilities; - -// Tag class definition -// An abstraction layer which wrapps a simple string-based tag definition -// with some additional logic. -class Tag : public std::string { -public: - // Inherit constructors and assignment from std::string - using std::string::string; - using std::string::operator=; - Tag(std::string const &s) : std::string(s) {} - Tag(std::string &&s) : std::string(std::move(s)) {} - - // Extra logic - Tag parent_tag() const { - auto this_tag = static_cast(*this); - if (this_tag == "VERB" || string_utils::starts_with(this_tag, "VB")) - return {"VERB"}; - if (this_tag == "NOUN" || string_utils::starts_with(this_tag, "NN")) - return {"NOUN"}; - if (string_utils::starts_with(this_tag, "ADV") || - string_utils::starts_with(this_tag, "RB")) - return {"ADV"}; - if (string_utils::starts_with(this_tag, "ADJ") || - string_utils::starts_with(this_tag, "JJ")) - return {"ADJ"}; - return (*this); - } -}; - -} // namespace phonemis::tagger - -// Hash definition -// Required to use Tag objects as map keys. 
-namespace std { -template <> struct hash { - size_t operator()(phonemis::tagger::Tag const &t) const noexcept { - // Use std::string's hash implementation - return std::hash()(static_cast(t)); - } -}; -} // namespace std \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h b/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h deleted file mode 100644 index c5ef085b7a..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/tagger/tagger.h +++ /dev/null @@ -1,37 +0,0 @@ -#pragma once - -#include "../tokenizer/tokens.h" -#include "tag.h" -#include -#include -#include -#include - -namespace phonemis::tagger { - -// Tagger class -// Provides PoS (Part of Speech) tagging functionality. -// Requires a previous tokenization of the text (tokenizer module). -// A modification of the Viterbi algorithm for bigram HMM (Hidden Markov Model) -// tagger. -class Tagger { -public: - explicit Tagger(const std::string &hmm_data_path); - - // Main tagging method - a modified Viterbi algorithm - // Works in place bo modyfing the 'tag' fields. - void tag(std::vector &sentence) const; - -private: - // Set of possible tags (states) - std::unordered_set tags_; - - // Probability maps - loaded from the input json file. 
- std::unordered_map start_probs_ = {}; - std::unordered_map> - emission_probs_ = {}; - std::unordered_map> transition_probs_ = - {}; -}; - -} // namespace phonemis::tagger \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h deleted file mode 100644 index ab52e6946c..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokenize.h +++ /dev/null @@ -1,14 +0,0 @@ -#pragma once - -#include "tokens.h" -#include "types.h" -#include -#include - -namespace phonemis::tokenizer { - -// Tokenizes the input text into a vector of strings (tokens). -// Follows specific rules for special characters and special words. -std::vector tokenize(const std::string &text); - -} // namespace phonemis::tokenizer diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h deleted file mode 100644 index 0f1c0d5f4e..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/tokens.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include "../tagger/tag.h" -#include -#include - -namespace phonemis::tokenizer { - -// A main structure representing a single token extracted from text -// Mandatory fields are extracted during the tokenization stage, while -// extra fields might be processed later (for example, during the tagging stage) -struct Token { - std::string text; - std::string whitespace = ""; // Following whitespace - bool is_first = false; // Whether it is a first token in the sentence - - // Extras - std::optional tag = - std::nullopt; // A PoS (Part of Speech) tag, example: NN (noun) -}; - -} // namespace phonemis::tokenizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h 
b/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h deleted file mode 100644 index 45e84a8735..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/tokenizer/types.h +++ /dev/null @@ -1,22 +0,0 @@ -#pragma once - -#include - -namespace phonemis::tokenizer { - -namespace rules { -// Separation rules for special characters -enum class Separation { - JOIN_LEFT, // Join to the word on its left - JOIN_RIGHT, // Join to the word on its right - TOTAL_DIVIDE, // Always separate from both sides - TOTAL_JOIN // Always join both sides -}; -} // namespace rules - -struct SpecialCharacter { - char character; - rules::Separation sep_rule; -}; - -} // namespace phonemis::tokenizer \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h b/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h deleted file mode 100644 index 481212cbe4..0000000000 --- a/packages/react-native-executorch/third-party/include/phonemis/utilities/string_utils.h +++ /dev/null @@ -1,155 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include - -namespace phonemis::utilities::string_utils { - -// ------------------------------------- -// String utils - byte format conversion -// ------------------------------------- - -// TODO: deprecated, replace with something else - -inline std::string char32_to_utf8(char32_t c) { - std::wstring_convert, char32_t> convert; - return convert.to_bytes(&c, &c + 1); -} - -inline std::u32string utf8_to_u32string(const std::string &utf8) { - std::wstring_convert, char32_t> convert; - return convert.from_bytes(utf8); -} - -inline std::string u32string_to_utf8(const std::u32string &u32) { - std::wstring_convert, char32_t> convert; - return convert.to_bytes(u32); -} - -// ---------------------------------------- -// String utils - capitalizing & lowerizing -// 
---------------------------------------- - -// Capitalization (first letter only) -template inline void capitalize__(StringT &str) { - if (!str.empty()) - str[0] = std::toupper(str[0]); -} - -// Capitalization (an entire string) -template inline void to_upper__(StringT &str) { - std::transform(str.cbegin(), str.cend(), str.begin(), - [](auto c) { return std::toupper(c); }); -} - -// Lowerization (an entire string) -template inline void to_lower__(StringT &str) { - std::transform(str.cbegin(), str.cend(), str.begin(), - [](auto c) { return std::tolower(c); }); -} - -// ------------------------------------ -// String utils - other transformations -// ------------------------------------ - -// Filters a given string and omits all the characters which -// do not pass given predicate. -template -inline void filter__(StringT &str, Pred pred) { - str.erase(std::remove_if(str.begin(), str.end(), pred), str.end()); -} - -// Replaces all the occurances of a character `a` with a character `b`. -// If `b` is not specified, then it removes all occurances of `a` without -// replacement. -template -inline void replace__(StringT &str, CharT a, std::optional b) { - if (b.has_value()) - std::replace(str.begin(), str.end(), a, b.value()); - else - str.erase(std::remove(str.begin(), str.end(), a), str.end()); -} - -// Splits the string by the given character. -template -inline std::vector split(const StringT &str, CharT bpoint) { - std::vector result = {}; - - auto it = str.begin(); - while (it != str.end()) { - auto next = std::find(it, str.end(), bpoint); - result.emplace_back(it, next); - - it = next; - if (it != str.end()) - it++; - } - - return result; -} - -// Removes the leading and trailing characters equals to given character. -// If the character is not specified, it removes white spaces instead. 
-template -inline StringT strip(const StringT &str, - std::optional c = std::nullopt) { - auto lbound = std::find_if(str.cbegin(), str.cend(), [&c](CharT a) -> bool { - return c.has_value() ? a != c : !std::isspace(a); - }); - auto rbound = std::find_if(str.crbegin(), str.crend(), [&c](CharT a) -> bool { - return c.has_value() ? a != c : !std::isspace(a); - }); - - return lbound != str.end() ? StringT(lbound, std::prev(rbound.base())) - : StringT(); -} - -// ------------------------- -// String utils - predicates -// ------------------------- - -// Returns true if the string contains only alphabetic characters. -template inline bool is_alpha(const StringT &str) { - return std::all_of(str.cbegin(), str.cend(), - [](char c) -> bool { return std::isalpha(c); }); -} - -// Returns true if the string starts with given suffix and false otherwise -template -inline bool starts_with(const StringT &str, std::string_view prefix) { - return str.size() >= prefix.size() && str.substr(0, prefix.size()) == prefix; -} - -// Returns true if the string ends with given suffix and false otherwise -template -inline bool ends_with(const StringT &str, std::string_view suffix) { - return str.size() >= suffix.size() && - str.substr(str.size() - suffix.size()) == suffix; -} - -// -------------------------------------- -// String utils - (non)in-place resolving -// -------------------------------------- - -// Generates non-mutating wrapper `name(...)` that calls `name__(...)` -// Used to create a non-inplace versions of the above functions. 
-#define MAKE_NON_INPLACE(name) \ - template \ - inline StringT name(const StringT &str, Args &&...args) { \ - StringT tmp = str; \ - name##__(tmp, std::forward(args)...); \ - return tmp; \ - } - -MAKE_NON_INPLACE(capitalize) -MAKE_NON_INPLACE(to_lower) -MAKE_NON_INPLACE(to_upper) -MAKE_NON_INPLACE(filter) -MAKE_NON_INPLACE(replace) - -} // namespace phonemis::utilities::string_utils \ No newline at end of file diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist index bd0373672c..b2b2aa2478 100644 Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64-simulator/ExecutorchLib.framework/Info.plist differ diff --git a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist index 2372838d49..a6f2d4a5dc 100644 Binary files a/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist and b/packages/react-native-executorch/third-party/ios/ExecutorchLib.xcframework/ios-arm64/ExecutorchLib.framework/Info.plist differ diff --git a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a deleted file mode 100644 index 78f5169308..0000000000 Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/physical-arm64-release/libphonemis.a and /dev/null differ diff --git 
a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a b/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a deleted file mode 100644 index ccf1d2fa64..0000000000 Binary files a/packages/react-native-executorch/third-party/ios/libs/phonemis/simulator-arm64-debug/libphonemis.a and /dev/null differ diff --git a/yarn.lock b/yarn.lock index 3ecfd274a6..88c6a20978 100644 --- a/yarn.lock +++ b/yarn.lock @@ -15273,6 +15273,20 @@ __metadata: languageName: node linkType: hard +"react-native-audio-api@npm:0.11.5": + version: 0.11.5 + resolution: "react-native-audio-api@npm:0.11.5" + dependencies: + semver: "npm:^7.7.3" + peerDependencies: + react: "*" + react-native: "*" + bin: + setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js + checksum: 10/f8a388954c42cfd390b9adbfe6781f9d8049d43ea6ab83a8b229a0d0082df3489d9b48072d7166403ae95a33e8d741aab86ba2307d1bd4ff949fdb72e14ef42d + languageName: node + linkType: hard + "react-native-audio-api@npm:0.12.0": version: 0.12.0 resolution: "react-native-audio-api@npm:0.12.0" @@ -16691,7 +16705,7 @@ __metadata: metro-config: "npm:^0.83.0" react: "npm:19.2.5" react-native: "npm:0.83.4" - react-native-audio-api: "npm:0.12.0" + react-native-audio-api: "npm:0.11.5" react-native-device-info: "npm:^15.0.2" react-native-executorch: "workspace:*" react-native-executorch-expo-resource-fetcher: "workspace:*"