diff --git a/apps/llm/app/index.tsx b/apps/llm/app/index.tsx index 72358ae72c..b67b3fa7ce 100644 --- a/apps/llm/app/index.tsx +++ b/apps/llm/app/index.tsx @@ -29,12 +29,6 @@ export default function Home() { > LLM Structured Output - router.navigate('voice_chat/')} - > - Voice Chat - router.navigate('multimodal_llm/')} diff --git a/apps/llm/app/voice_chat/index.tsx b/apps/llm/app/voice_chat/index.tsx deleted file mode 100644 index 23ab70bff4..0000000000 --- a/apps/llm/app/voice_chat/index.tsx +++ /dev/null @@ -1,311 +0,0 @@ -import { useContext, useEffect, useState } from 'react'; -import { - Keyboard, - KeyboardAvoidingView, - Platform, - StyleSheet, - Text, - TouchableOpacity, - TouchableWithoutFeedback, - View, -} from 'react-native'; -import SWMIcon from '../../assets/icons/swm_icon.svg'; -import Spinner from '../../components/Spinner'; -import ErrorBanner from '../../components/ErrorBanner'; -import { - useSpeechToText, - useLLM, - QWEN3_0_6B_QUANTIZED, - QWEN3_1_7B_QUANTIZED, - LLAMA3_2_1B_SPINQUANT, - WHISPER_TINY_EN, - WHISPER_TINY_EN_QUANTIZED, - WHISPER_BASE_EN, - WHISPER_SMALL_EN, - LLMProps, - SpeechToTextProps, -} from 'react-native-executorch'; -import { ModelPicker, ModelOption } from '../../components/ModelPicker'; -import PauseIcon from '../../assets/icons/pause_icon.svg'; -import MicIcon from '../../assets/icons/mic_icon.svg'; -import StopIcon from '../../assets/icons/stop_icon.svg'; -import ColorPalette from '../../colors'; -import Messages from '../../components/Messages'; -import { AudioManager, AudioRecorder } from 'react-native-audio-api'; -import DeviceInfo from 'react-native-device-info'; -import { useIsFocused } from '@react-navigation/native'; -import { useSafeAreaInsets } from 'react-native-safe-area-context'; -import { GeneratingContext } from '../../context'; - -type LLMModelSources = LLMProps['model']; -type STTModelSources = SpeechToTextProps['model']; - -const LLM_MODELS: ModelOption[] = [ - { label: 'Qwen3 0.6B', value: QWEN3_0_6B_QUANTIZED }, - { label: 'Qwen3 1.7B', value: QWEN3_1_7B_QUANTIZED }, - { label: 'Llama 1B', value: LLAMA3_2_1B_SPINQUANT }, -]; - -const STT_MODELS: ModelOption[] = [ - { label: 'Whisper Tiny', value: WHISPER_TINY_EN }, - { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED }, - { label: 'Whisper Base', value: WHISPER_BASE_EN }, - { label: 'Whisper Small', value: WHISPER_SMALL_EN }, -]; - -export default function VoiceChatScreenWrapper() { - const isFocused = useIsFocused(); - - return isFocused ? 
: null; -} - -function VoiceChatScreen() { - const { bottom } = useSafeAreaInsets(); - const [isRecording, setIsRecording] = useState(false); - const [liveTranscription, setLiveTranscription] = useState(''); - const [selectedLLM, setSelectedLLM] = - useState(QWEN3_0_6B_QUANTIZED); - const [selectedSTT, setSelectedSTT] = - useState(WHISPER_TINY_EN); - const [error, setError] = useState(null); - - const [recorder] = useState(() => new AudioRecorder()); - - const { setGlobalGenerating } = useContext(GeneratingContext); - - const llm = useLLM({ model: selectedLLM }); - const speechToText = useSpeechToText({ - model: selectedSTT, - }); - - useEffect(() => { - setGlobalGenerating(llm.isGenerating || speechToText.isGenerating); - }, [llm.isGenerating, speechToText.isGenerating, setGlobalGenerating]); - - useEffect(() => { - AudioManager.setAudioSessionOptions({ - iosCategory: 'playAndRecord', - iosMode: 'spokenAudio', - iosOptions: ['allowBluetoothHFP', 'defaultToSpeaker'], - }); - AudioManager.requestRecordingPermissions(); - }, []); - - const handleRecordPress = async () => { - if (isRecording) { - setIsRecording(false); - recorder.stop(); - speechToText.streamStop(); - } else { - setIsRecording(true); - setLiveTranscription(''); - - const sampleRate = 16000; - recorder.onAudioReady( - { - sampleRate, - bufferLength: 0.1 * sampleRate, - channelCount: 1, - }, - ({ buffer }) => { - speechToText.streamInsert(buffer.getChannelData(0)); - } - ); - recorder.start(); - - let finalResult = ''; - - try { - for await (const result of speechToText.stream()) { - const text = result.committed.text + result.nonCommitted.text; - setLiveTranscription(text); - finalResult = text; - } - } catch (e) { - setError(e instanceof Error ? e.message : String(e)); - } finally { - if (finalResult.trim().length > 0) { - await llm.sendMessage(finalResult); - setLiveTranscription(''); - } - } - } - }; - - useEffect(() => { - if (llm.error) setError(String(llm.error)); - }, [llm.error]); - - useEffect(() => { - if (speechToText.error) setError(String(speechToText.error)); - }, [speechToText.error]); - - return (!llm.isReady || !speechToText.isReady) && - !llm.error && - !speechToText.error ? ( - - ) : ( - - - - - Qwen 3 x Whisper - - setError(null)} /> - {llm.messageHistory.length > 0 || liveTranscription.length > 0 ? ( - - 0 - ? [ - ...llm.messageHistory, - { - role: 'user', - content: liveTranscription, - }, - ] - : llm.messageHistory - } - llmResponse={llm.response} - isGenerating={llm.isGenerating} - deleteMessage={llm.deleteMessage} - /> - - ) : ( - - Hello! 👋 - - Tap the mic and speak to me. I'll transcribe your voice and - respond using a language model — all on-device. - - - )} - - setSelectedLLM(m)} - /> - setSelectedSTT(m)} - /> - - - {DeviceInfo.isEmulatorSync() ? ( - - - recording disabled on emulator - - - ) : ( - <> - {llm.isGenerating ? ( - - - - ) : ( - - {isRecording ? 
( - - ) : ( - - )} - - )} - - )} - - - - ); -} - -const styles = StyleSheet.create({ - keyboardAvoidingView: { - flex: 1, - }, - topContainer: { - height: 68, - width: '100%', - alignItems: 'center', - justifyContent: 'center', - }, - chatContainer: { - flex: 10, - width: '100%', - }, - textModelName: { - color: ColorPalette.primary, - }, - helloMessageContainer: { - flex: 10, - width: '100%', - alignItems: 'center', - justifyContent: 'center', - }, - helloText: { - fontFamily: 'medium', - fontSize: 30, - color: ColorPalette.primary, - }, - bottomHelloText: { - fontFamily: 'regular', - fontSize: 20, - lineHeight: 28, - textAlign: 'center', - color: ColorPalette.primary, - }, - bottomContainer: { - height: 100, - width: '100%', - justifyContent: 'center', - alignItems: 'center', - paddingHorizontal: 16, - }, - recordTouchable: { - height: '100%', - justifyContent: 'center', - alignItems: 'center', - }, - recordingInfo: { - width: '100%', - display: 'flex', - justifyContent: 'center', - alignItems: 'center', - }, - emulatorBox: { - padding: 10, - margin: 10, - borderWidth: 1, - borderRadius: 8, - borderColor: 'gray', - justifyContent: 'center', - alignItems: 'center', - }, - emulatorWarning: { - color: 'gray', - fontSize: 16, - }, -}); diff --git a/apps/speech/package.json b/apps/speech/package.json index 93e07755dd..47de1396a3 100644 --- a/apps/speech/package.json +++ b/apps/speech/package.json @@ -20,7 +20,7 @@ "metro-config": "^0.83.0", "react": "19.2.5", "react-native": "0.83.4", - "react-native-audio-api": "0.12.0", + "react-native-audio-api": "0.12.2", "react-native-device-info": "^15.0.2", "react-native-executorch": "workspace:*", "react-native-executorch-expo-resource-fetcher": "workspace:*", diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index dfd39c15b4..ad4f6505c8 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -14,21 +14,25 @@ import { SafeAreaProvider, SafeAreaView } from 'react-native-safe-area-context'; import { useSpeechToText, WHISPER_TINY_EN, - WHISPER_TINY_EN_QUANTIZED, + WHISPER_TINY_EN_COREML, WHISPER_BASE_EN, + WHISPER_BASE_EN_COREML, WHISPER_SMALL_EN, TranscriptionResult, SpeechToTextProps, + WHISPER_SMALL_EN_COREML, } from 'react-native-executorch'; import { ModelPicker, ModelOption } from '../components/ModelPicker'; type STTModelSources = SpeechToTextProps['model']; const MODELS: ModelOption[] = [ - { label: 'Whisper Tiny', value: WHISPER_TINY_EN }, - { label: 'Whisper Tiny Q', value: WHISPER_TINY_EN_QUANTIZED }, - { label: 'Whisper Base', value: WHISPER_BASE_EN }, - { label: 'Whisper Small', value: WHISPER_SMALL_EN }, + { label: 'Whisper Tiny EN (XNNPACK)', value: WHISPER_TINY_EN }, + { label: 'Whisper Tiny EN (CoreML)', value: WHISPER_TINY_EN_COREML }, + { label: 'Whisper Base EN (XNNPACK)', value: WHISPER_BASE_EN }, + { label: 'Whisper Base EN (CoreML)', value: WHISPER_BASE_EN_COREML }, + { label: 'Whisper Small EN (XNNPACK)', value: WHISPER_SMALL_EN }, + { label: 'Whisper Small EN (CoreML)', value: WHISPER_SMALL_EN_COREML }, ]; import FontAwesome from '@expo/vector-icons/FontAwesome'; import { @@ -45,9 +49,12 @@ import ErrorBanner from '../components/ErrorBanner'; const isSimulator = DeviceInfo.isEmulatorSync(); +const DEFAULT_MODEL = + Platform.OS === 'ios' ? 
WHISPER_BASE_EN_COREML : WHISPER_TINY_EN; + export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const [selectedModel, setSelectedModel] = - useState(WHISPER_TINY_EN); + useState(DEFAULT_MODEL); const model = useSpeechToText({ model: selectedModel, @@ -148,7 +155,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { recorder.current.onAudioReady( { sampleRate, - bufferLength: 0.1 * sampleRate, + bufferLength: 0.1 * sampleRate, // 100 ms channelCount: 1, }, ({ buffer }) => { @@ -178,6 +185,7 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { try { const streamIter = model.stream({ verbose: enableTimestamps, + timeout: 100, }); for await (const { committed, nonCommitted } of streamIter) { diff --git a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h index a20fd7b1bc..ec47586266 100644 --- a/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h +++ b/packages/react-native-executorch/common/rnexecutorch/host_objects/JsiConversions.h @@ -599,8 +599,7 @@ inline jsi::Value getJsiValue(const Segment &seg, jsi::Runtime &runtime) { jsi::Object wordObj(runtime); wordObj.setProperty( runtime, "word", - jsi::String::createFromUtf8(runtime, seg.words[i].content + - seg.words[i].punctations)); + jsi::String::createFromUtf8(runtime, seg.words[i].content)); wordObj.setProperty(runtime, "start", static_cast(seg.words[i].start)); wordObj.setProperty(runtime, "end", static_cast(seg.words[i].end)); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp index 4b58c5039b..9537642d58 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.cpp @@ -94,7 +94,7 @@ TranscriptionResult wordsToResult(const std::vector &words, std::string fullText; for (const auto &w : words) { - fullText += w.content + w.punctations; + fullText += w.content; } res.text = fullText; @@ -115,7 +115,8 @@ TranscriptionResult wordsToResult(const std::vector &words, } // namespace void SpeechToText::stream(std::shared_ptr callback, - std::string languageOption, bool verbose) { + std::string languageOption, bool verbose, + uint32_t timeout) { if (isStreaming_) { throw RnExecutorchError(RnExecutorchErrorCode::StreamingInProgress, "Streaming is already in progress!"); @@ -158,10 +159,10 @@ void SpeechToText::stream(std::shared_ptr callback, // running transcriptions too rapidly (before the audio buffer is filled // with significant amount of new data) can cause streamer to commit wrong // phrases. 
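That pause is now configurable from JS. For reference, a minimal sketch of how the new `timeout` option surfaces on the hook side (the hook call, option names, and loop shape mirror the screen code above; the wrapper helper and its name are ours):

```ts
import { useSpeechToText, WHISPER_TINY_EN } from 'react-native-executorch';

function useLiveTranscript() {
  const model = useSpeechToText({ model: WHISPER_TINY_EN });

  const run = async () => {
    // `timeout` (in milliseconds) is the pause between transcription passes
    // that used to be hard-coded to 100 in the native streamer below; lower
    // values give fresher partial results at a higher compute cost.
    const streamIter = model.stream({ verbose: false, timeout: 100 });
    for await (const { committed, nonCommitted } of streamIter) {
      // `committed` text is stable; `nonCommitted` may still be revised.
      console.log(committed.text + nonCommitted.text);
    }
  };

  return { model, run };
}
```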
-    std::this_thread::sleep_for(std::chrono::milliseconds(100));
+    std::this_thread::sleep_for(std::chrono::milliseconds(timeout));
  }

-  std::vector<Word> finalWords = streamer_->finish();
+  std::vector<Word> finalWords = streamer_->finish(options);

  TranscriptionResult finalRes =
      wordsToResult(finalWords, languageOption, verbose);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
index ade835869c..ec51862793 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/SpeechToText.h
@@ -42,7 +42,8 @@ class SpeechToText {
  // Stream
  void stream(std::shared_ptr callback,
-              std::string languageOption, bool enableTimestamps);
+              std::string languageOption, bool enableTimestamps,
+              uint32_t timeout);
  void streamStop();
  void streamInsert(std::span<float> waveform);
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
index 357309391d..efe6cc2819 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/schema/OnlineASR.h
@@ -36,7 +36,7 @@ class OnlineASR {
  virtual ProcessResult process(const DecodingOptions &options) = 0;

-  virtual std::vector<Word> finish() = 0;
+  virtual std::vector<Word> finish(const DecodingOptions &options) = 0;

  virtual void reset() = 0;
};
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
index e7319f95b5..2343d1faab 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/common/types/Word.h
@@ -4,13 +4,14 @@

namespace rnexecutorch::models::speech_to_text {

+/**
+ * Essentially a different representation of a token,
+ * with its timestamps already calculated.
+ */ struct Word { std::string content; float start; float end; - - std::string - punctations; // Trailing punctations which appear after the main content }; } // namespace rnexecutorch::models::speech_to_text diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp index d1debeb0f0..d2555a79fa 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/ASR.cpp @@ -138,8 +138,9 @@ executorch::aten::Tensor ASR::decode(std::span tokens, positionShape, cachePositions.data(), ScalarType::Long); const auto encoderOutputSize = static_cast(encoderOutput.size()); - std::vector encShape = {1, constants::kNumFrames, - encoderOutputSize / constants::kNumFrames}; + std::vector encShape = { + 1, static_cast(constants::kNumFrames), + encoderOutputSize / static_cast(constants::kNumFrames)}; auto encoderTensor = executorch::extension::make_tensor_ptr( std::move(encShape), const_cast(encoderOutput.data()), ScalarType::Float); @@ -262,11 +263,21 @@ ASR::generate(std::span waveform, const DecodingOptions &options, std::vector scores; uint64_t startPos = 0; - while (std::cmp_less_equal(startPos + sequenceIds.size(), - constants::kMaxDecodeLength)) { - executorch::aten::Tensor logitsTensor = - this->decode(sequenceIds, encoderFeatures, startPos); + // Prefill: feed each initial token individually so decode() always sees 1 + // token + std::span firstToken(sequenceIds.data(), 1); + executorch::aten::Tensor logitsTensor = + this->decode(firstToken, encoderFeatures, startPos); + ++startPos; + for (size_t i = 1; i < sequenceIds.size(); ++i) { + std::span single(sequenceIds.data() + i, 1); + logitsTensor = this->decode(single, encoderFeatures, startPos); + ++startPos; + } + + // Autoregressive decoding: always 1 token at a time + while (std::cmp_less(startPos, constants::kMaxDecodeLength)) { const size_t logitsInnerDim = logitsTensor.size(1); const size_t logitsDictSize = logitsTensor.size(2); const float *logitsData = logitsTensor.const_data_ptr() + @@ -302,15 +313,16 @@ ASR::generate(std::span waveform, const DecodingOptions &options, nextProb = probs[nextId]; } - // Move the startPos pointer by the amount of tokens we processed - startPos += sequenceIds.size(); - sequenceIds = {nextId}; cachedTokens.push_back(nextId); scores.push_back(nextProb); if (nextId == endOfTranscriptionToken_) { break; } + + std::span single(&cachedTokens.back(), 1); + logitsTensor = this->decode(single, encoderFeatures, startPos); + ++startPos; } return {.tokens = std::vector(cachedTokens.cbegin() + @@ -437,7 +449,7 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens, const float wEnd = wStart + timePerChar * wSize; prevCharCount += wSize; - // We store punctations separately to other characters. + // Detect and extract trailing punctuations. std::string puncts = ""; while (!w.empty() && constants::kPunctations.contains(w.back())) { puncts += w.back(); @@ -445,7 +457,14 @@ ASR::estimateWordLevelTimestampsLinear(std::span tokens, } std::reverse(puncts.begin(), puncts.end()); - wordObjs.emplace_back(std::move(w), wStart, wEnd, std::move(puncts)); + // Add the core word. + wordObjs.emplace_back(std::move(w), wStart, wEnd); + + // If punctuation was present, add it as a separate "word" with an + // instantaneous timestamp at the end of the original word. 
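With this change (the code completing the step follows just below), word-level consumers see trailing punctuation as its own zero-duration entry rather than as a separate `punctations` field. A hedged sketch of the JS-side implications (the `{ word, start, end }` shape matches the JSI conversion above; the helpers and the leading-whitespace assumption are ours):

```ts
type TimedWord = { word: string; start: number; end: number };

// Rebuilding plain text: assuming each entry carries its own leading
// whitespace (Whisper-style tokens), simple concatenation mirrors the
// `fullText += w.content` loop in wordsToResult().
const rebuildText = (words: TimedWord[]) => words.map((w) => w.word).join('');

// Word-by-word highlighting can skip punctuation by dropping zero-duration
// entries, since punctuation is emitted with start === end.
const spokenWords = (words: TimedWord[]) =>
  words.filter((w) => w.start !== w.end);
```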
+ if (!puncts.empty()) { + wordObjs.emplace_back(std::move(puncts), wEnd, wEnd); + } } return wordObjs; diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h index 0b284345ec..62a9f968f7 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Constants.h @@ -9,34 +9,37 @@ namespace rnexecutorch::models::speech_to_text::whisper::constants { // Maximum duration of each audio chunk to process (in seconds) // It is intentionally set to 29 since otherwise only the last chunk would be // correctly transcribe due to the model's positional encoding limit -constexpr static int32_t kChunkSize = 29; +inline constexpr size_t kChunkSize = 29; // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz) -constexpr static int32_t kSamplingRate = 16000; -constexpr static int32_t kSamplesPerMilisecond = kSamplingRate / 1000; +inline constexpr size_t kSamplingRate = 16000; +inline constexpr size_t kSamplesPerMilisecond = kSamplingRate / 1000; + +inline constexpr size_t kMaxSamples = kChunkSize * kSamplingRate; // The maximum number of tokens the decoder can generate per chunk -constexpr static int32_t kMaxDecodeLength = 128; +inline constexpr size_t kMaxDecodeLength = 128; // Minimum allowed chunk length before processing (in audio samples) -constexpr static int32_t kMinChunkSamples = 1 * kSamplingRate; +inline constexpr size_t kMinChunkSamples = 1 * kSamplingRate; // Number of mel frames output by the encoder (derived from input spectrogram) -constexpr static int32_t kNumFrames = 1500; +inline constexpr size_t kNumFrames = 1500; // Time precision used by Whisper timestamps: each token spans 0.02 seconds -constexpr static float kTimePrecision = 0.02f; +inline constexpr float kTimePrecision = 0.02f; // Special characters serving as pause / end of sentence -static const std::unordered_set kPunctations = {',', '.', '?', +inline const std::unordered_set kPunctations = {',', '.', '?', '!', ':', ';'}; +inline const std::unordered_set kEosPunctations = {'.', '?', '!', ';'}; // Special token constants namespace tokens { -static const std::string kStartOfTranscript = "<|startoftranscript|>"; -static const std::string kEndOfTranscript = "<|endoftext|>"; -static const std::string kBeginTimestamp = "<|0.00|>"; -static const std::string kBlankAudio = "[BLANK_AUDIO]"; +inline const std::string kStartOfTranscript = "<|startoftranscript|>"; +inline const std::string kEndOfTranscript = "<|endoftext|>"; +inline const std::string kBeginTimestamp = "<|0.00|>"; +inline const std::string kBlankAudio = "[BLANK_AUDIO]"; } // namespace tokens } // namespace rnexecutorch::models::speech_to_text::whisper::constants \ No newline at end of file diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp deleted file mode 100644 index ce365e4e44..0000000000 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.cpp +++ /dev/null @@ -1,199 +0,0 @@ -#include "HypothesisBuffer.h" -#include "Params.h" -#include "Utils.h" - -#include -#include - -namespace rnexecutorch::models::speech_to_text::whisper::stream { - -void 
HypothesisBuffer::insert(std::span words, float offset) { - // Step 1 - decide which words should be considered as fresh. - fresh_.clear(); - - // We try to find the last committed word in a transcription string. - // Everything beyond that word will be considered as fresh. - // To make the algorithm more resilient to repeated strings of words, - // we check also the preceeding words as well as timestamps (with liberal - // range). - size_t firstFreshWordIdx = 0; - if (!committed_.empty()) { - std::optional lastMatchingWordIdx = - findCommittedSuffix(words, params::kStreamCommitedSuffixSearchSize, - params::kStreamMaxOverlapTimestampDiff1, - params::kStreamWordsPerErrorRate); - firstFreshWordIdx = lastMatchingWordIdx.value_or(0); - } - - bool isCompletelyFresh = firstFreshWordIdx == 0; - for (size_t i = firstFreshWordIdx; i < words.size(); i++) { - const auto &word = words[i]; - - // Global start is a beginning timestamp relative only to the beginning of - // the current streaming process. - const float startGlobal = word.start + offset; - const float endGlobal = word.end + offset; - - if (!isCompletelyFresh || - startGlobal > lastCommittedTime_ - params::kStreamFreshThreshold) { - fresh_.emplace_back(word.content, startGlobal, endGlobal, - word.punctations); - } - } - - // Step 2 - we have already selected the fresh words. Now it's time to - // correct any mistakes and remove the words which overlap with already - // commited segments - to avoid duplicates. - if (!fresh_.empty() && !committed_.empty()) { - // Calculate the largest overlapping fragment size. - // Note that we use size limit (kStreamMaxOverlapSize) for efficiency of the - // algorithm, and timestamp difference limit - // (kStreamMaxOverlapTimestampDiff) to avoid removing correct fragments - // which were just repeated after some time. - size_t overlapSize = utils::findLargestOverlapingFragment( - committed_, fresh_, params::kStreamMaxOverlapSize, - params::kStreamMaxOverlapTimestampDiff2); - - if (overlapSize > 0) { - fresh_.erase(fresh_.begin(), fresh_.begin() + overlapSize); - } - } -} - -std::deque HypothesisBuffer::commit() { - std::deque toCommit = {}; - - // Find a stable prefix: words that haven't changed between last and current - // iteration. - while (!fresh_.empty() && !hypothesis_.empty() && - fresh_.front().content == hypothesis_.front().content) { - // The last word from the fresh_ buffer must also match punctations with the - // hypothesis. This is done in order to ensure correct punctation marks in - // the resulting transcription. - if (fresh_.size() == 1 && - fresh_.front().punctations != hypothesis_.front().punctations) { - break; - } - - // Take timestamps from the hypothesis, but actual content from the fresh - // buffer. - toCommit.emplace_back(std::move(fresh_.front().content), - hypothesis_.front().start, hypothesis_.front().end, - std::move(fresh_.front().punctations)); - fresh_.pop_front(); - hypothesis_.pop_front(); - } - - // Save the last committed word timestamp. - // This will mark the end of the entire committed sequence. - if (!toCommit.empty()) { - lastCommittedTime_ = toCommit.back().end; - } - - // The remaining words from the fresh buffer (uncommitted phrase) - // become a hypothesis for the next iteration. - hypothesis_ = std::move(fresh_); - fresh_.clear(); - - // The last step is to commit the selected words. 
- committed_.insert(committed_.end(), toCommit.cbegin(), toCommit.cend()); - - return toCommit; -} - -void HypothesisBuffer::releaseCommits(size_t wordsToKeep) { - if (committed_.size() > wordsToKeep) { - size_t nWordsToErase = committed_.size() - wordsToKeep; - committed_.erase(committed_.begin(), committed_.begin() + nWordsToErase); - } -} - -void HypothesisBuffer::reset() { - fresh_.clear(); - hypothesis_.clear(); - committed_.clear(); - - lastCommittedTime_ = 0.f; -} - -std::optional HypothesisBuffer::findCommittedSuffix( - std::span words, size_t nCommitted, - float timestampDiffTolerance, size_t wordsPerMistake) { - if (words.empty() || committed_.empty() || nCommitted == 0) { - return std::nullopt; - } - - // Determine the subset size of committed words to check against. - size_t committedToMatchSize = std::min(nCommitted, committed_.size()); - - // Iterate backwards through 'words' to find the most recent occurrence of a - // suffix of 'committed_' (or the full 'committed_' sequence). - for (int32_t i = static_cast(words.size()) - 1; i >= 0; --i) { - bool match = true; - size_t matchedCount = 0; - size_t contentMistakeCount = 0; - - // Linearly interpolate tolerance if we are at the beginning and can't check - // all committed words. - float effectiveTolerance = timestampDiffTolerance; - if (i < static_cast(committedToMatchSize) - 1) { - effectiveTolerance *= - static_cast(i + 1) / static_cast(committedToMatchSize); - } - - // Try to match backwards from words[i] and committed_.back() - for (size_t j = 0; j < committedToMatchSize; ++j) { - int32_t wordsIdx = i - static_cast(j); - int32_t committedIdx = - static_cast(committed_.size()) - 1 - static_cast(j); - - if (wordsIdx < 0) { - // We reached the beginning of the words span. - // The algorithm allows matching a partial prefix if it's at the start. - break; - } - - const Word &w1 = words[wordsIdx]; - const Word &w2 = committed_[committedIdx]; - - // Check timestamps within tolerance - if (std::max(std::abs(w1.start - w2.start), std::abs(w1.end - w2.end)) > - effectiveTolerance) { - match = false; - break; - } - - // Allow sparse content mismatches while still treating the overall - // sequence as matching. - if (utils::equalsIgnoreCase(w1.content, w2.content)) { - matchedCount++; - } else { - contentMistakeCount++; - } - - // Early exit if mistake count already exceeds what we can recover from - // given the remaining words to check. - if (wordsPerMistake > 0) { - size_t remainingToMatch = committedToMatchSize - 1 - j; - size_t maxPossibleMatched = matchedCount + remainingToMatch; - if (contentMistakeCount > (maxPossibleMatched / wordsPerMistake)) { - match = false; - break; - } - } - } - - // One content mistake is allowed per M matched words. - size_t maxAllowedMistakes = - (wordsPerMistake == 0) ? 
0 : (matchedCount / wordsPerMistake); - - if (match && matchedCount > 0 && - contentMistakeCount <= maxAllowedMistakes) { - return static_cast(i); - } - } - - return std::nullopt; -} - -} // namespace rnexecutorch::models::speech_to_text::whisper::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h deleted file mode 100644 index 25833ec01b..0000000000 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/HypothesisBuffer.h +++ /dev/null @@ -1,82 +0,0 @@ -#pragma once - -#include -#include -#include - -#include "../common/types/Word.h" - -namespace rnexecutorch::models::speech_to_text::whisper::stream { - -/** - * A buffer for managing streaming transcription hypotheses. - * This class handles stabilization of the transcription result by tracking - * "fresh" hypotheses and "committing" them once they are stable across updates. - */ -class HypothesisBuffer { -public: - /** - * Inserts new words into the fresh_ buffer. - * Words are filtered based on the last committed time and checked for - * overlaps with existing committed words to prevent duplicates. - * - * @param newWords A span of recently generated words. - * @param offset Time offset to adjust the word timestamps. - */ - void insert(std::span words, float offset); - - /** - * Attempts to commit words present in the fresh_ buffer. - * A phrase from fresh_ buffer can only be committed if it also appears - * in the hypothesis_ buffer (uncommitted words from previous iteration). - * - * Uncommitted words become a 'hypothesis' and are moved into the hypothesis_ - * buffer. - * - * @return A sequence of words committed in the current iteration. - */ - std::deque commit(); - - /** - * Shrinks the committed_ buffer by erasing all words except N latest ones. - * - * Used primarily to relieve increasing memory usage during very - * long streaming sessions. - * - * @param wordsToKeep - number of trailing words to be kept in. - */ - void releaseCommits(size_t wordsToKeep); - - /** - * Resets all the stored buffers and state variables to the initial state - */ - void reset(); - - // Declare a friendship with OnlineASR to allow it to access the internal - // state of stored buffers. - friend class OnlineASR; - -private: - // Finds the most recent occurance of given committed string of words - // in a custom span of words. - // Returns the index of the last matching word (or nullopt if not present). - std::optional findCommittedSuffix(std::span words, - size_t nCommitted, - float timestampDiffTolerance = 1.F, - size_t wordsPerMistake = 4); - - // Stored buffers - // The lifecycle of a correct result word looks as following: - // fresh buffer -> hypothesis buffer -> commited - std::deque - fresh_; // 'New' words from current iterations, which require some checks - // before they go into hypothesis_ buffer. - std::deque - hypothesis_; // Words potentially to be commited, stored between - // iterations (obtained from fresh_ buffer). - std::deque committed_; // A history of already commited words. 
-
-  float lastCommittedTime_ = 0.0f;
-};
-
-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
index ded2183201..188c77d80d 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.cpp
@@ -1,35 +1,43 @@
+#include "OnlineASR.h"
+
 #include
 #include
-#include
-#include
+#include

 #include "Constants.h"
-#include "OnlineASR.h"
 #include "Params.h"
 #include "Utils.h"

 namespace rnexecutorch::models::speech_to_text::whisper::stream {

-namespace {
-std::vector<Word> move_to_vector(std::deque<Word> &container) {
-  return std::vector<Word>(std::make_move_iterator(container.begin()),
-                           std::make_move_iterator(container.end()));
+OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
+  // Reserve the expected amount of memory for the audio buffer.
+  audioBuffer_.reserve((constants::kChunkSize + 1) * constants::kSamplingRate);
 }
-} // namespace

-OnlineASR::OnlineASR(const ASR *asr) : asr_(asr) {
-  // Reserve a minimal expected amount of memory for audio buffer.
-  audioBuffer_.reserve(static_cast(2 * params::kStreamChunkThreshold *
-                                   constants::kSamplingRate));
+bool OnlineASR::isReady() const {
+  std::scoped_lock lock(streamingMutex);
+
+  return audioBuffer_.size() >= constants::kMinChunkSamples;
 }

 void OnlineASR::insertAudioChunk(std::span<float> audio) {
-  std::scoped_lock lock(audioBufferMutex_);
+  std::scoped_lock lock(streamingMutex);
+
   audioBuffer_.insert(audioBuffer_.end(), audio.begin(), audio.end());
-}

-bool OnlineASR::isReady() const {
-  return audioBuffer_.size() >= constants::kMinChunkSamples;
+  // Automatic buffer cleanup.
+  //
+  // This prevents the audio buffer from growing indefinitely during continuous
+  // streaming. It is particularly useful when VAD (Voice Activity Detection)
+  // is used and samples are inserted but not processed for a long time.
+  // In normal streaming, i.e. when process() is called regularly at reasonable
+  // intervals, this condition should not be hit.
+  if (audioBuffer_.size() > constants::kMaxSamples) {
+    // Note that results are not actually committed now, but saved for
+    // a later call of process().
+    memory_.toCommit = commitAndClean(memory_.transcript);
+  }
 }

 ProcessResult OnlineASR::process(const DecodingOptions &options) {
@@ -38,126 +46,213 @@ ProcessResult OnlineASR::process(const DecodingOptions &options) {
   // Copy the audio buffer to avoid keeping the lock during the entire
   // transcription process.
   {
-    std::scoped_lock lock(audioBufferMutex_);
+    std::scoped_lock lock(streamingMutex);
     audioCopy = audioBuffer_;
   }

-  std::vector<Segment> transcriptions = asr_->transcribe(audioBuffer_, options);
+  // Obtain a transcription for the current audio buffer state.
+  // It's very unlikely that the buffer will exceed Whisper's maximum capacity,
+  // but for absolute safety we additionally clip the input.
+  std::span<float> input(
+      audioCopy.begin(),
+      audioCopy.begin() + std::min(constants::kMaxSamples, audioCopy.size()));

-  if (transcriptions.empty()) {
-    return {.committed = {}, .nonCommitted = {}};
-  }
+  std::vector<Segment> transcriptions = asr_->transcribe(input, options);

   // Flatten segments into a single word sequence.
+  // This is effectively our 'nonCommitted' part for now.
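As a quick aside on the `kMaxSamples` guard introduced above (the flattening code continues below), some back-of-the-envelope numbers; the arithmetic is illustrative only, with the recorder buffer size taken from the example apps:

```ts
const kSamplingRate = 16_000; // Hz, fixed by the Whisper pipeline
const kChunkSize = 29; // s, maximum chunk duration the model accepts
const kMaxSamples = kChunkSize * kSamplingRate; // 464,000 samples, about 29 s

// With 0.1 s recorder buffers (1,600 samples each), the guard only trips
// after ~290 consecutive inserts with no process() call in between, i.e.
// roughly 29 s of unprocessed audio.
const samplesPerCallback = 0.1 * kSamplingRate; // 1,600
const callbacksUntilCleanup = kMaxSamples / samplesPerCallback; // 290
```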
   std::vector<Word> words;
-  words.reserve(transcriptions.front().words.size());
-
   for (auto &segment : transcriptions) {
-    words.insert(words.end(), std::make_move_iterator(segment.words.begin()),
-                 std::make_move_iterator(segment.words.end()));
+    std::move(segment.words.begin(), segment.words.end(),
+              std::back_inserter(words));
   }

-  hypothesisBuffer_.insert(words, bufferTimeOffset_);
-
-  // Apply fix for timestamps.
-  if (!hypothesisBuffer_.fresh_.empty()) {
-    size_t noNewWords = hypothesisBuffer_.fresh_.size();
-    float establishedEnd = hypothesisBuffer_.lastCommittedTime_;
-    float newBegin = hypothesisBuffer_.fresh_.front().start;
-    const float newEnd = hypothesisBuffer_.fresh_.back().end;
-    float shift = 0.F;
-    for (size_t i = 0; i < hypothesisBuffer_.fresh_.size(); i++) {
-      const float originalEnd = hypothesisBuffer_.fresh_[i].end;
-
-      if (i < hypothesisBuffer_.hypothesis_.size() &&
-          utils::equalsIgnoreCase(hypothesisBuffer_.fresh_[i].content,
-                                  hypothesisBuffer_.hypothesis_[i].content)) {
-        hypothesisBuffer_.fresh_[i].start =
-            hypothesisBuffer_.hypothesis_[i].start;
-        hypothesisBuffer_.fresh_[i].end = hypothesisBuffer_.hypothesis_[i].end;
-        shift = hypothesisBuffer_.fresh_[i].end - originalEnd;
-
-        establishedEnd = hypothesisBuffer_.hypothesis_[i].end;
-        newBegin = hypothesisBuffer_.fresh_[i].end;
-        noNewWords--;
-        continue;
-      }
-
-      // In case of a new word, we apply timestamp range scaling
-      // based on timestamps established in previous iterations.
-      const float freshDuration = newEnd - establishedEnd;
-      const float epsilon = std::max(
-          0.F, 0.85F * (freshDuration -
-                        static_cast(noNewWords /
-                                    params::kStreamWordsPerSecond)));
-      float scale =
-          (freshDuration - epsilon) / std::max(newEnd - newBegin, 0.2F);
-      hypothesisBuffer_.fresh_[i].start =
-          shift + (hypothesisBuffer_.fresh_[i].start - newEnd) * scale + newEnd;
-      hypothesisBuffer_.fresh_[i].end =
-          shift + (hypothesisBuffer_.fresh_[i].end - newEnd) * scale + newEnd;
-    }
-  }
+  // Acquire the lock for the rest of the method (extensive use of
+  // audioBuffer_).
+  std::scoped_lock lock(streamingMutex);
+
+  // Step 1: examine all previously saved EOS points.
+  // The idea is to remove entries which have changed or no longer exist
+  // due to the model correcting its output.
+  for (size_t i = 0; i < memory_.eos.size(); i++) {
+    const auto &eos = memory_.eos[i];
+    if (eos.position >= words.size() || !utils::isEos(words[eos.position]) ||
+        (eos.position > 0 &&
+         eos.preceeding != words[eos.position - 1].content)) {
+      memory_.eos.erase(memory_.eos.begin() + i, memory_.eos.end());
+      break;
+    }
+  }

-  auto committed = hypothesisBuffer_.commit();
-  auto nonCommitted = hypothesisBuffer_.hypothesis_;
+  // Step 2: check if the newest EOS character from the transcript should be
+  // saved to the eos_ vector.
+  auto lastEosIt = std::find_if(words.rbegin(), words.rend(), utils::isEos);
+  if (lastEosIt != words.rend()) {
+    size_t lastEosIndex = std::distance(words.begin(), lastEosIt.base()) - 1;

-  // We want to save the most recent end of sentence word
-  // to improve the audio cutting mechanism.
-  for (const auto &word : committed) {
-    if (!word.punctations.empty()) {
-      lastSentenceEnd_ = word.end;
+    // Because of step 1, we know that if the last EOS exists in eos_,
+    // then it must be the last entry.
+    if (memory_.eos.empty() || memory_.eos.back().position != lastEosIndex) {
+      // Register the last EOS entry.
+      std::string preceeding =
+          lastEosIndex > 0 ?
          words[lastEosIndex - 1].content : "";
+      memory_.eos.emplace_back(lastEosIndex, preceeding, lastEosIt->end);
     }
   }

-  // Since Whisper does not accept waveforms longer than 30 seconds, we need
-  // to cut the audio at some safe point.
-  {
-    std::scoped_lock lock(audioBufferMutex_);
-
-    const float audioDuration =
-        static_cast<float>(audioBuffer_.size()) / constants::kSamplingRate;
-    if (audioDuration > params::kStreamChunkThreshold) {
-      // Leave some portion of audio in, to improve model behavior
-      // in future iterations.
-      const float erasePoint =
-          hypothesisBuffer_.lastCommittedTime_ == lastSentenceEnd_
-              ? audioDuration
-              : std::min(lastSentenceEnd_, params::kStreamChunkThreshold);
-      const float minEraseDuration =
-          audioDuration - params::kStreamAudioBufferMaxReserve;
-      const float maxEraseDuration =
-          audioDuration - params::kStreamAudioBufferMinReserve;
-      const float eraseDuration = std::clamp(
-          erasePoint - bufferTimeOffset_, minEraseDuration, maxEraseDuration);
-      const size_t nSamplesToErase =
-          static_cast<size_t>(eraseDuration * constants::kSamplingRate);
+  std::vector<Word> committed;

-      audioBuffer_.erase(audioBuffer_.begin(),
-                         audioBuffer_.begin() + nSamplesToErase);
-      bufferTimeOffset_ += eraseDuration;
-    }
+  // Step 3: collect all the words which could possibly get committed
+  // in-between iterations.
+  if (!memory_.toCommit.empty()) {
+    committed.insert(committed.end(),
+                     std::make_move_iterator(memory_.toCommit.begin()),
+                     std::make_move_iterator(memory_.toCommit.end()));
+    memory_.toCommit.clear();
   }

-  return {.committed = move_to_vector(committed),
-          .nonCommitted = move_to_vector(nonCommitted)};
+  // Step 4: clear the buffer if it is getting too large.
+  // The idea is to use the saved EOS entries and try to cut the buffer
+  // in a 'good' spot - one where a significant audio chunk is removed, yet
+  // the most recent, unfinished speech samples are left untouched.
+  size_t bufferSize = audioBuffer_.size();
+  if (bufferSize > static_cast<size_t>(params::kStreamSafeBufferDuration *
+                                       constants::kSamplingRate)) {
+    auto newCommitted = commitAndClean(words);
+
+    committed.insert(committed.end(),
+                     std::make_move_iterator(newCommitted.begin()),
+                     std::make_move_iterator(newCommitted.end()));
+  }
+
+  // Save the uncommitted part to the streamer's memory, because it might be
+  // needed when committing inside streamInsert().
+  memory_.transcript = words;
+
+  // Note that the uncommitted part, represented by the recent transcription
+  // (words), has already been shrunk if something was committed during the
+  // cleanup phase.
+  return {.committed = std::move(committed), .nonCommitted = std::move(words)};
 }

-std::vector<Word> OnlineASR::finish() {
-  // We always push the last remaining hypothesis, even if it's not
-  // confirmed in second iteration, to avoid ending up with broken sentences.
-  std::deque<Word> remaining = hypothesisBuffer_.hypothesis_;
+std::vector<Word> OnlineASR::finish(const DecodingOptions &options) {
+  ProcessResult result = process(options);
+
+  // Last-tick committed delta + whatever never made it past the commit
+  // threshold.
+  std::vector<Word> residual = std::move(result.committed);
+  residual.insert(residual.end(),
+                  std::make_move_iterator(result.nonCommitted.begin()),
+                  std::make_move_iterator(result.nonCommitted.end()));
+
+  reset();

-  return move_to_vector(remaining);
+  return residual;
 }

 void OnlineASR::reset() {
-  std::scoped_lock lock(audioBufferMutex_);
-
-  hypothesisBuffer_.reset();
-  bufferTimeOffset_ = 0.f;
+  std::scoped_lock lock(streamingMutex);

   audioBuffer_.clear();
+
+  // Reset the memory.
+  memory_.transcript.clear();
+  memory_.eos.clear();
+  memory_.toCommit.clear();
+}
+
+std::vector<Word> OnlineASR::commitAndClean(std::vector<Word> &transcript) {
+  const size_t bufferSize = audioBuffer_.size();
+  const float midBufferThreshold = params::kStreamMaxDuration / 2.0F;
+
+  std::vector<Word> committed;
+
+  // If we don't have any EOS entries, then we most likely have not
+  // recorded any speech. In this case we can safely cut the maximum amount of
+  // audio data.
+  if (memory_.eos.empty()) {
+    size_t cut =
+        bufferSize - params::kStreamSafetyThreshold * constants::kSamplingRate;
+
+    audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+  }
+
+  // If we have exactly one (most recent) EOS entry in eos_, then
+  // we need to be more careful.
+  // Normally we want to keep at least one sentence in, but if the sentence
+  // covers a significant amount of the buffer, we have no choice.
+  else if (memory_.eos.size() == 1) {
+    const float eosTimestamp = memory_.eos[0].tmstpend;
+
+    const float upperHalfDuration =
+        std::max(0.0F, eosTimestamp - midBufferThreshold);
+    const float wordsPerSecond =
+        upperHalfDuration > 0.1F
+            ? static_cast<float>(transcript.size()) / upperHalfDuration
+            : 0.0F;
+
+    // The EOS sits early enough that cutting up to the safety margin won't
+    // touch the ongoing (post-EOS) speech.
+    const bool eosSafe = eosTimestamp < params::kStreamSafeBufferDuration -
+                                            params::kStreamSafetyThreshold;
+
+    if (!eosSafe && wordsPerSecond < params::kWordsPerSecondLow) {
+      // The EOS lies past the midpoint, but a low word density implies the
+      // spoken audio is concentrated in the upper half. Drop the lower half
+      // and shift the EOS accordingly.
+      audioBuffer_.erase(audioBuffer_.begin(),
+                         audioBuffer_.begin() +
+                             static_cast<size_t>(midBufferThreshold *
+                                                 constants::kSamplingRate));
+      memory_.eos[0].tmstpend -= midBufferThreshold;
+    } else {
+      // Cut everything up to and including the sentence, either at the
+      // safety margin (when the EOS is early) or, more aggressively, right
+      // at the EOS boundary, and commit its words.
+      const size_t cut =
+          eosSafe
+              ? bufferSize -
+                    static_cast<size_t>(params::kStreamSafetyThreshold *
+                                        constants::kSamplingRate)
+              : static_cast<size_t>(eosTimestamp * constants::kSamplingRate);
+
+      audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+      committed.insert(committed.end(),
+                       std::make_move_iterator(transcript.begin()),
+                       std::make_move_iterator(transcript.end()));
+
+      transcript.clear();
+      memory_.eos.clear();
+    }
+  }
+
+  // In case of 2 or more sentences, we generally want to keep the last one
+  // intact. This provides a bit of stability to the algorithm.
+  else {
+    const auto &secondTolastEntry = memory_.eos[memory_.eos.size() - 2];
+
+    const size_t cut = static_cast<size_t>(secondTolastEntry.tmstpend *
+                                           constants::kSamplingRate);
+    const size_t lastCommittedPos = secondTolastEntry.position;
+
+    audioBuffer_.erase(audioBuffer_.begin(), audioBuffer_.begin() + cut);
+
+    // Move all words up to the last committed position (inclusive) to the
+    // committed buffer.
+    committed.insert(
+        committed.end(), std::make_move_iterator(transcript.begin()),
+        std::make_move_iterator(transcript.begin() + lastCommittedPos + 1));
+    transcript.erase(transcript.begin(),
+                     transcript.begin() + lastCommittedPos + 1);
+
+    // Retain only the most recent EOS entry, shifting both its timestamp
+    // and its position to match the new (truncated) transcript origin.
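(The erase/shift code completing this last branch follows below.) Since the branching is dense, here is a simplified TypeScript sketch of the three-way decision; names and second-based units are illustrative, and it omits the word-density branch of the single-EOS case as well as the sample-level bookkeeping:

```ts
type EOSEntry = { position: number; tmstpend: number };

// Assumed values from Params.h: 29 s max buffer duration, 3 s safety margin.
const MAX_DURATION = 29;
const SAFETY = 3;
const SAFE_BUFFER = MAX_DURATION - SAFETY; // kStreamSafeBufferDuration, 26 s

function planCleanup(
  eos: EOSEntry[],
  bufferDuration: number, // current buffer length in seconds
  transcriptLen: number
) {
  if (eos.length === 0) {
    // No finished sentence: blind cut, keep only the last SAFETY seconds.
    return { eraseSeconds: bufferDuration - SAFETY, commitWordsUpTo: 0 };
  }
  if (eos.length === 1) {
    // One sentence: cut at the safety margin when the EOS is early enough,
    // otherwise right at the EOS boundary; the whole transcript is committed.
    const eosAt = eos[0].tmstpend;
    const eosSafe = eosAt < SAFE_BUFFER - SAFETY;
    return {
      eraseSeconds: eosSafe ? bufferDuration - SAFETY : eosAt,
      commitWordsUpTo: transcriptLen,
    };
  }
  // Two or more sentences: cut at the second-to-last EOS, commit every word
  // up to and including it, and keep the last sentence intact for stability.
  const anchor = eos[eos.length - 2];
  return { eraseSeconds: anchor.tmstpend, commitWordsUpTo: anchor.position + 1 };
}
```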
+ memory_.eos.erase(memory_.eos.begin(), memory_.eos.end() - 1); + memory_.eos[0].tmstpend -= secondTolastEntry.tmstpend; + memory_.eos[0].position -= lastCommittedPos + 1; + } + + return committed; } } // namespace rnexecutorch::models::speech_to_text::whisper::stream diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h index df6d469e39..7547d16bd5 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/OnlineASR.h @@ -1,13 +1,13 @@ #pragma once +#include +#include +#include + #include "../common/schema/OnlineASR.h" #include "../common/types/ProcessResult.h" -#include "../common/types/Segment.h" #include "../common/types/Word.h" #include "ASR.h" -#include "HypothesisBuffer.h" - -#include namespace rnexecutorch::models::speech_to_text::whisper::stream { @@ -21,60 +21,65 @@ class OnlineASR : public schema::OnlineASR { OnlineASR(const ASR *asr); /** - * Appends new audio samples to the internal processing buffer. - * - * @param audio A span of PCM float samples (expected 16kHz). + * Checks if the buffer contains enough audio for the next processing step. + * @return True if ready, false otherwise. */ - void insertAudioChunk(std::span audio) override; + bool isReady() const override; /** - * Determines whether the model is ready to process the next iteration. - * - * @return True if audioBuffer has enough samples, False otherwise + * Appends audio samples to the internal buffer. + * @param audio Span containing the audio data. */ - bool isReady() const override; + void insertAudioChunk(std::span audio) override; /** - * Processes the current audio buffer and returns new transcription results. - * Stability is managed by an internal HypothesisBuffer to ensure that - * only confirmed (stable) text is returned as "committed". - * - * @param options Decoding configuration (language, etc.). - * @return A ProcessResult containing newly committed and uncommitted - * words. + * Processes the current buffered audio and returns transcription results. + * @param options Decoding options for the transcription. + * @return Transcription result containing committed and volatile tokens. */ ProcessResult process(const DecodingOptions &options) override; /** - * Finalizes the current streaming session. - * Flushes any remaining words from the hypothesis buffer. - * - * @return A vector of remaining transcribed words. + * Finalizes the current stream and returns all words. + * @return Vector of detected words. */ - std::vector finish() override; + std::vector finish(const DecodingOptions &options) override; /** - * Reset the streaming state by resetting the buffers + * Resets the internal state and clears buffers. */ void reset() override; private: + // Cleans up the buffer and returns committed words based on given transcript. + std::vector commitAndClean(std::vector &transcript); + // ASR module connection for transcribing the audio const ASR *asr_; - // Helper buffers - audio buffer - // Stores the increasing amounts of streamed audio. - // Cleared from time to time after reaching a threshold size. + // Audio buffer (input) - accumulates obtained audio samples. 
  std::vector<float> audioBuffer_ = {};
-  mutable std::mutex audioBufferMutex_;
-  float bufferTimeOffset_ = 0.F; // Audio buffer offset
+  mutable std::mutex streamingMutex; // Covers both the buffer & the memory

-  // Helper buffers - hypothesis buffer
-  // Manages the whisper streaming hypothesis mechanism.
-  HypothesisBuffer hypothesisBuffer_;
+  // Streaming memory.
+  // In general, helps to navigate the continuous streaming state and improve
+  // the buffer handling algorithms.
+  struct Memory {
+    // State management helper.
+    struct EOSEntry {
+      size_t position;        // An absolute position (index) in the
+                              // transcription (word sequence).
+      std::string preceeding; // The word preceding the EOS in the
+                              // transcription.
+      float tmstpend;         // Ending timestamp of the sentence.
+    };

-  // State members to keep track of specyfic aspects of buffer state
-  float lastSentenceEnd_ = 0.F;
+    std::vector<Word>
+        transcript; // The most recent transcription result (uncommitted only!).
+    std::vector<EOSEntry>
+        eos; // End-of-sentence points from the most recent transcription.
+    std::vector<Word> toCommit; // Words to be committed in the next iteration
+                                // (next process() call).
+  } memory_;
 };

-} // namespace rnexecutorch::models::speech_to_text::whisper::stream
\ No newline at end of file
+} // namespace rnexecutorch::models::speech_to_text::whisper::stream
diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
index 5eb74c06cc..847a22b1e0 100644
--- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
+++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Params.h
@@ -1,6 +1,9 @@
 #pragma once

+#include "Constants.h"
+
 #include
+#include

 /**
  * Hyperparameters
@@ -11,90 +14,50 @@
 namespace rnexecutorch::models::speech_to_text::whisper::params {

 /**
- * Determines the range of buffer left when skipping an audio chunk
- * of size lower than maximum allowed chunk size.
- *
- * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer],
- * then instead of moving to the last returned timestamp, we jump across the
- * entire 30 seconds chunk. This resolves the issue of multiple redundant
- * segments being produced by the transcription algorithm.
+ * Maximum duration of audio that the streaming buffer keeps before forcing
+ * a cleanup. Aligned with Whisper's maximum supported input length.
  */
-constexpr static int32_t kChunkBreakBuffer = 2; // [s]
+constexpr inline float kStreamMaxDuration =
+    static_cast<float>(constants::kChunkSize);

 /**
- * Determines the maximum timestamp difference available for a word to be
- * considered as fresh in streaming algorithm.
+ * The minimum amount of recent audio always kept in the buffer when a blind
+ * cut is performed. Acts as the lower bound on what survives a cleanup.
  */
-constexpr static float kStreamFreshThreshold = 3.F; // [s], originally 0.5
+constexpr inline float kStreamSafetyThreshold = 3.F; // [s]

 /**
- * The size of the most recent committed suffix searched in
- * fresh words string.
- *
- * For example, if the committed buffer contains ["I", "did" "a" "very" "nasty"
- * "thing."], and kStreamCommitedSuffixSearchSize = 3, then we search for
- * ["very" "nasty" "thing."] suffix.
+ * Forced-cleanup threshold. Once the buffer grows past this duration we run
+ * the EOS-anchored cleanup routine.
*/ -constexpr static size_t kStreamCommitedSuffixSearchSize = 5; +constexpr inline float kStreamSafeBufferDuration = + kStreamMaxDuration - kStreamSafetyThreshold; // [s] /** - * Determines the maximum expected size of overlapping fragments between - * fresh words buffer and commited words buffer in streaming mode. - * - * It is a limit of maximum amount of erased repeated words from fresh buffer. - * The bigger it gets, the less probable it is to commit the same phrase twice. + * An estimate of the number of words spoken per second. + * Used for estimating transcription progress and buffer management heuristics. */ -constexpr static size_t kStreamMaxOverlapSize = - 12; // Number of overlaping words +constexpr inline float kWordsPerSecondEstimation = 2.25F; /** - * Similar to kMaxStreamOverlapSize, but this one determines - * the maximum allowed timestamp difference between the overlaping fragments. - * - * It's the first, more strict threshold, used when searching for recently - * committed entries. + * Upper bound for words per second estimate in fast speech. */ -constexpr static float kStreamMaxOverlapTimestampDiff1 = 6.F; // [s] +constexpr inline float kWordsPerSecondHigh = 4.F; /** - * Similar to kMaxStreamOverlapSize, but this one determines - * the maximum allowed timestamp difference between the overlaping fragments. - * - * It's the second, more liberal threshold, used in overlap correction - * algorithm. + * Lower bound for words per second estimate in slow speech. */ -constexpr static float kStreamMaxOverlapTimestampDiff2 = 15.F; // [s] +constexpr inline float kWordsPerSecondLow = 1.5F; /** - * Number of words per 1 allowed mistake (error correction). + * Determines the range of buffer left when skipping an audio chunk + * of size lower than maximum allowed chunk size. * - * For example, if kStreamWordsPerErrorRate = 4, then we allow maximum 1 mistake - * in a 4 word string. - */ -constexpr static size_t kStreamWordsPerErrorRate = 5; - -/** - * A threshold which exceeded causes the main streaming audio buffer to be - * cleared. - */ -constexpr static float kStreamChunkThreshold = 20.F; // [s] - -/** - * Decides how much of recent audio waveform is always kept in when - * clearing the audio buffer in streaming algorithm. - */ -constexpr static float kStreamAudioBufferMinReserve = 2.F; // [s] - -/** - * Decides how much of recent audio waveform can be kept in when - * clearing the audio buffer in streaming algorithm. - */ -constexpr static float kStreamAudioBufferMaxReserve = 6.F; // [s] - -/** - * An estimate of number of words per second produced in a standard - * human conversation speech. + * If the audio length does not exceed [kChunkSize * kSamplingRate] - [buffer], + * then instead of moving to the last returned timestamp, we jump across the + * entire 30 seconds chunk. This resolves the issue of multiple redundant + * segments being produced by the transcription algorithm. 
*/ -constexpr static float kStreamWordsPerSecond = 2.5F; +constexpr inline int32_t kChunkBreakBuffer = 2; // [s] -} // namespace rnexecutorch::models::speech_to_text::whisper::params \ No newline at end of file +} // namespace rnexecutorch::models::speech_to_text::whisper::params diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h index 2e4e3b5076..48c84a84b7 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/whisper/Utils.h @@ -1,6 +1,7 @@ #pragma once #include "../common/types/Word.h" +#include "Constants.h" #include #include #include @@ -8,70 +9,14 @@ namespace rnexecutorch::models::speech_to_text::whisper::utils { -// Compares two strings without case-sensitivity. -inline bool equalsIgnoreCase(const std::string &a, const std::string &b) { - if (a.size() != b.size()) { - return false; - } - return std::equal(a.begin(), a.end(), b.begin(), [](char c1, char c2) { - return std::tolower(static_cast(c1)) == - std::tolower(static_cast(c2)); - }); -} - /** - * Finds the largest (in number of words) overlaping fragment between word - * vectors A (suffix) and B (prefix). + * Checks if the given word represents an End-of-Sentence (EOS) punctuation. * - * An overlaping fragment is any fragment C, which can be simultaneously a - * suffix of A and a prefix of B. Example: A = 'Jane likes food and playing - * games', B = 'playing games and sleeping', the overlap fragment C = 'playing - * games'. - * - * @param suffixVec An input vector, where only suffixes can overlap. - * Typically the 'commited' buffer in streaming algorithm. - * @param preffixVec An input vector, where only prefixes can overlap. - * Typically the 'fresh' buffer in streaming algorithm. - * @param maxCheckRange The maximum size of overlapping fragment. Determines the - * range of search. - * @param maxTimestampDiff The maximum allowed timestamp difference between - * overlaping fragments. If exceeded, the fragment are not considered as - * overlaping. - * @return The size of the largest found overlaping fragment. + * @param word The word to check. */ -template -inline size_t findLargestOverlapingFragment(const Container &suffixVec, - const Container &prefixVec, - size_t maxCheckRange = 10, - float maxTimestampDiff = 100.f) { - size_t range = std::min({suffixVec.size(), prefixVec.size(), maxCheckRange}); - - if (range == 0) { - return 0; - } - - // i starts at the index where the suffix of length 'range' begins. 
- for (size_t i = suffixVec.size() - range; i < suffixVec.size(); ++i) { - // We search for overlaps by searching for the first word of prefixVec - if (equalsIgnoreCase(suffixVec[i].content, prefixVec[0].content)) { - size_t calculatedSize = suffixVec.size() - i; - - bool isEqual = - std::equal(suffixVec.begin() + i, suffixVec.end(), prefixVec.begin(), - [maxTimestampDiff](const Word &sWord, const Word &pWord) { - return equalsIgnoreCase(sWord.content, pWord.content) && - std::max(std::fabs(sWord.start - pWord.start), - std::fabs(sWord.end - pWord.end)) <= - maxTimestampDiff; - }); - - if (isEqual) { - return calculatedSize; - } - } - } - - return 0; +constexpr inline bool isEos(const Word &word) { + return word.content.size() == 1 && + constants::kEosPunctations.contains(word.content[0]); } } // namespace rnexecutorch::models::speech_to_text::whisper::utils \ No newline at end of file diff --git a/packages/react-native-executorch/src/constants/modelUrls.ts b/packages/react-native-executorch/src/constants/modelUrls.ts index 6fb20f9ca3..c423594213 100644 --- a/packages/react-native-executorch/src/constants/modelUrls.ts +++ b/packages/react-native-executorch/src/constants/modelUrls.ts @@ -773,32 +773,29 @@ export const STYLE_TRANSFER_UDNIE_QUANTIZED = { } as const; // S2T -const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_MODEL = `${URL_PREFIX}-whisper-tiny.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack.pte`; +const WHISPER_TINY_EN_TOKENIZER = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_en_xnnpack_fp32.pte`; +const WHISPER_TINY_EN_MODEL_COREML = `${URL_PREFIX}-whisper-tiny.en/${NEXT_VERSION_TAG}/coreml/whisper_tiny_en_coreml_fp32.pte`; -const WHISPER_TINY_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_TINY_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-tiny-quantized.en/${VERSION_TAG}/xnnpack/whisper_tiny_en_quantized_xnnpack.pte`; +const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_BASE_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/xnnpack/whisper_base_en_xnnpack_fp32.pte`; +const WHISPER_BASE_EN_MODEL_COREML = `${URL_PREFIX}-whisper-base.en/${NEXT_VERSION_TAG}/coreml/whisper_base_en_coreml_fp32.pte`; -const WHISPER_BASE_EN_TOKENIZER = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_MODEL = `${URL_PREFIX}-whisper-base.en/${VERSION_TAG}/xnnpack/whisper_base_en_xnnpack.pte`; +const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_SMALL_EN_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/xnnpack/whisper_small_en_xnnpack_fp32.pte`; +const WHISPER_SMALL_EN_MODEL_COREML = `${URL_PREFIX}-whisper-small.en/${NEXT_VERSION_TAG}/coreml/whisper_small_en_coreml_fp32.pte`; -const WHISPER_BASE_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-base-quantized.en/${VERSION_TAG}/tokenizer.json`; -const WHISPER_BASE_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-base-quantized.en/${VERSION_TAG}/xnnpack/whisper_base_en_quantized_xnnpack.pte`; +const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/tokenizer.json`; +const WHISPER_TINY_MODEL_XNNPACK = 
+const WHISPER_TINY_MODEL_XNNPACK = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/xnnpack/whisper_tiny_xnnpack_fp32.pte`;
+const WHISPER_TINY_MODEL_COREML = `${URL_PREFIX}-whisper-tiny/${NEXT_VERSION_TAG}/coreml/whisper_tiny_coreml_fp32.pte`;
 
-const WHISPER_SMALL_EN_TOKENIZER = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_MODEL = `${URL_PREFIX}-whisper-small.en/${VERSION_TAG}/xnnpack/whisper_small_en_xnnpack.pte`;
+const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_BASE_MODEL_XNNPACK = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/xnnpack/whisper_base_xnnpack_fp32.pte`;
+const WHISPER_BASE_MODEL_COREML = `${URL_PREFIX}-whisper-base/${NEXT_VERSION_TAG}/coreml/whisper_base_coreml_fp32.pte`;
 
-const WHISPER_SMALL_EN_QUANTIZED_TOKENIZER = `${URL_PREFIX}-whisper-small-quantized.en/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_EN_QUANTIZED_MODEL = `${URL_PREFIX}-whisper-small-quantized.en/${VERSION_TAG}/xnnpack/whisper_small_en_quantized_xnnpack.pte`;
-
-const WHISPER_TINY_TOKENIZER = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_TINY_MODEL = `${URL_PREFIX}-whisper-tiny/${VERSION_TAG}/xnnpack/whisper_tiny_xnnpack.pte`;
-
-const WHISPER_BASE_TOKENIZER = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_BASE_MODEL = `${URL_PREFIX}-whisper-base/${VERSION_TAG}/xnnpack/whisper_base_xnnpack.pte`;
-
-const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/tokenizer.json`;
-const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/whisper_small_xnnpack.pte`;
+const WHISPER_SMALL_TOKENIZER = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/tokenizer.json`;
+const WHISPER_SMALL_MODEL_XNNPACK = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/xnnpack/whisper_small_xnnpack_fp32.pte`;
+const WHISPER_SMALL_MODEL_COREML = `${URL_PREFIX}-whisper-small/${NEXT_VERSION_TAG}/coreml/whisper_small_coreml_fp32.pte`;
 
 /**
  * @category Models - Speech To Text
@@ -806,18 +803,18 @@ const WHISPER_SMALL_MODEL = `${URL_PREFIX}-whisper-small/${VERSION_TAG}/xnnpack/
 export const WHISPER_TINY_EN = {
   modelName: 'whisper-tiny-en',
   isMultilingual: false,
-  modelSource: WHISPER_TINY_EN_MODEL,
+  modelSource: WHISPER_TINY_EN_MODEL_XNNPACK,
   tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
 } as const;
 
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_TINY_EN_QUANTIZED = {
-  modelName: 'whisper-tiny-en-quantized',
+export const WHISPER_TINY_EN_COREML = {
+  modelName: 'whisper-tiny-en',
   isMultilingual: false,
-  modelSource: WHISPER_TINY_EN_QUANTIZED_MODEL,
-  tokenizerSource: WHISPER_TINY_EN_QUANTIZED_TOKENIZER,
+  modelSource: WHISPER_TINY_EN_MODEL_COREML,
+  tokenizerSource: WHISPER_TINY_EN_TOKENIZER,
 } as const;
 
 /**
@@ -826,18 +823,18 @@ export const WHISPER_TINY_EN_QUANTIZED = {
 export const WHISPER_BASE_EN = {
   modelName: 'whisper-base-en',
   isMultilingual: false,
-  modelSource: WHISPER_BASE_EN_MODEL,
+  modelSource: WHISPER_BASE_EN_MODEL_XNNPACK,
   tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
 } as const;
 
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_BASE_EN_QUANTIZED = {
-  modelName: 'whisper-base-en-quantized',
+export const WHISPER_BASE_EN_COREML = {
+  modelName: 'whisper-base-en',
   isMultilingual: false,
-  modelSource: WHISPER_BASE_EN_QUANTIZED_MODEL,
-  tokenizerSource: WHISPER_BASE_EN_QUANTIZED_TOKENIZER,
+  modelSource: WHISPER_BASE_EN_MODEL_COREML,
+  tokenizerSource: WHISPER_BASE_EN_TOKENIZER,
 } as const;
 
 /**
@@ -846,18 +843,18 @@ export const WHISPER_BASE_EN_QUANTIZED = {
 export const WHISPER_SMALL_EN = {
   modelName: 'whisper-small-en',
   isMultilingual: false,
-  modelSource: WHISPER_SMALL_EN_MODEL,
+  modelSource: WHISPER_SMALL_EN_MODEL_XNNPACK,
   tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
 } as const;
 
 /**
  * @category Models - Speech To Text
  */
-export const WHISPER_SMALL_EN_QUANTIZED = {
-  modelName: 'whisper-small-en-quantized',
+export const WHISPER_SMALL_EN_COREML = {
+  modelName: 'whisper-small-en',
   isMultilingual: false,
-  modelSource: WHISPER_SMALL_EN_QUANTIZED_MODEL,
-  tokenizerSource: WHISPER_SMALL_EN_QUANTIZED_TOKENIZER,
+  modelSource: WHISPER_SMALL_EN_MODEL_COREML,
+  tokenizerSource: WHISPER_SMALL_EN_TOKENIZER,
 } as const;
 
 /**
@@ -866,7 +863,17 @@ export const WHISPER_SMALL_EN_QUANTIZED = {
 export const WHISPER_TINY = {
   modelName: 'whisper-tiny',
   isMultilingual: true,
-  modelSource: WHISPER_TINY_MODEL,
+  modelSource: WHISPER_TINY_MODEL_XNNPACK,
+  tokenizerSource: WHISPER_TINY_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_TINY_COREML = {
+  modelName: 'whisper-tiny',
+  isMultilingual: true,
+  modelSource: WHISPER_TINY_MODEL_COREML,
   tokenizerSource: WHISPER_TINY_TOKENIZER,
 } as const;
 
@@ -876,7 +883,17 @@ export const WHISPER_TINY = {
 export const WHISPER_BASE = {
   modelName: 'whisper-base',
   isMultilingual: true,
-  modelSource: WHISPER_BASE_MODEL,
+  modelSource: WHISPER_BASE_MODEL_XNNPACK,
+  tokenizerSource: WHISPER_BASE_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_BASE_COREML = {
+  modelName: 'whisper-base',
+  isMultilingual: true,
+  modelSource: WHISPER_BASE_MODEL_COREML,
   tokenizerSource: WHISPER_BASE_TOKENIZER,
 } as const;
 
@@ -886,7 +903,17 @@ export const WHISPER_BASE = {
 export const WHISPER_SMALL = {
   modelName: 'whisper-small',
   isMultilingual: true,
-  modelSource: WHISPER_SMALL_MODEL,
+  modelSource: WHISPER_SMALL_MODEL_XNNPACK,
+  tokenizerSource: WHISPER_SMALL_TOKENIZER,
+} as const;
+
+/**
+ * @category Models - Speech To Text
+ */
+export const WHISPER_SMALL_COREML = {
+  modelName: 'whisper-small',
+  isMultilingual: true,
+  modelSource: WHISPER_SMALL_MODEL_COREML,
   tokenizerSource: WHISPER_SMALL_TOKENIZER,
 } as const;
 
@@ -1314,14 +1341,17 @@ export const MODEL_REGISTRY = {
   STYLE_TRANSFER_UDNIE,
   STYLE_TRANSFER_UDNIE_QUANTIZED,
   WHISPER_TINY_EN,
-  WHISPER_TINY_EN_QUANTIZED,
+  WHISPER_TINY_EN_COREML,
   WHISPER_BASE_EN,
-  WHISPER_BASE_EN_QUANTIZED,
+  WHISPER_BASE_EN_COREML,
   WHISPER_SMALL_EN,
-  WHISPER_SMALL_EN_QUANTIZED,
+  WHISPER_SMALL_EN_COREML,
   WHISPER_TINY,
+  WHISPER_TINY_COREML,
   WHISPER_BASE,
+  WHISPER_BASE_COREML,
   WHISPER_SMALL,
+  WHISPER_SMALL_COREML,
   DEEPLAB_V3_RESNET50,
   DEEPLAB_V3_RESNET101,
   DEEPLAB_V3_MOBILENET_V3_LARGE,
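Each Whisper checkpoint now has two exports: the default constant points at an fp32 XNNPACK `.pte`, and a sibling `*_COREML` constant points at a CoreML `.pte` with the same `modelName` and tokenizer, while the quantized variants are dropped entirely. A sketch of per-platform selection; the prefer-CoreML-on-iOS heuristic is an illustration, not something this change mandates:

```typescript
import { Platform } from 'react-native';
import {
  useSpeechToText,
  WHISPER_TINY_EN,
  WHISPER_TINY_EN_COREML,
} from 'react-native-executorch';

// Both constants share modelName 'whisper-tiny-en' and the same
// tokenizer; only modelSource differs (xnnpack vs coreml export).
const model =
  Platform.OS === 'ios' ? WHISPER_TINY_EN_COREML : WHISPER_TINY_EN;

export function useWhisperTinyEn() {
  return useSpeechToText({ model });
}
```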
function* (options: DecodingOptions = {}): AsyncGenerator< + async function* (options: StreamingOptions = {}): AsyncGenerator< { committed: TranscriptionResult; nonCommitted: TranscriptionResult; diff --git a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts index a1bf6231ad..36464ee964 100644 --- a/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts +++ b/packages/react-native-executorch/src/modules/natural_language_processing/SpeechToTextModule.ts @@ -2,6 +2,7 @@ import { DecodingOptions, SpeechToTextModelConfig, SpeechToTextModelName, + StreamingOptions, TranscriptionResult, } from '../../types/stt'; import { ResourceFetcher } from '../../utils/ResourceFetcher'; @@ -177,7 +178,7 @@ export class SpeechToTextModule { * @yields An object containing `committed` and `nonCommitted` transcription results. * @returns An async generator yielding transcription updates. */ - public async *stream(options: DecodingOptions = {}): AsyncGenerator<{ + public async *stream(options: StreamingOptions = {}): AsyncGenerator<{ committed: TranscriptionResult; nonCommitted: TranscriptionResult; }> { @@ -185,6 +186,7 @@ export class SpeechToTextModule { const verbose = !!options.verbose; const language = options.language || ''; + const timeout = options.timeout || 100; const queue: { committed: TranscriptionResult; @@ -219,7 +221,8 @@ export class SpeechToTextModule { wake(); }, language, - verbose + verbose, + timeout ); finished = true; diff --git a/packages/react-native-executorch/src/types/stt.ts b/packages/react-native-executorch/src/types/stt.ts index 0a6ed11f70..20f1013ef0 100644 --- a/packages/react-native-executorch/src/types/stt.ts +++ b/packages/react-native-executorch/src/types/stt.ts @@ -94,7 +94,7 @@ export interface SpeechToTextType { * @returns Asynchronous generator that returns `committed` and `nonCommitted` transcription. * Both `committed` and `nonCommitted` are of type `TranscriptionResult` */ - stream(options?: DecodingOptions | undefined): AsyncGenerator< + stream(options?: StreamingOptions | undefined): AsyncGenerator< { committed: TranscriptionResult; nonCommitted: TranscriptionResult; @@ -208,6 +208,15 @@ export interface DecodingOptions { verbose?: boolean; } +/** + * Configuration options for the speech-to-text streaming process. + * @category Types + * @property {number} [timeout] - Specifies (in milliseconds) how much does streamer wait between model inferences. + */ +export interface StreamingOptions extends DecodingOptions { + timeout?: number; +} + /** * Structure that represent single token with timestamp information. 
diff --git a/yarn.lock b/yarn.lock
index 256469db22..9584660eb7 100644
--- a/yarn.lock
+++ b/yarn.lock
@@ -15249,6 +15249,24 @@ __metadata:
   languageName: node
   linkType: hard
 
+"react-native-audio-api@npm:0.12.2":
+  version: 0.12.2
+  resolution: "react-native-audio-api@npm:0.12.2"
+  dependencies:
+    semver: "npm:^7.7.3"
+  peerDependencies:
+    react: "*"
+    react-native: "*"
+    react-native-worklets: ">= 0.6.0"
+  peerDependenciesMeta:
+    react-native-worklets:
+      optional: true
+  bin:
+    setup-rn-audio-api-web: scripts/setup-rn-audio-api-web.js
+  checksum: 10/ed495058382188c8beb51ce89f2ef14d846dc0c0a07c65a7b4c71aa106fb7ea14aa8660b05fb33941c038d1a7ab2ba4ab3eb039fe481841938c45396903c6060
+  languageName: node
+  linkType: hard
+
 "react-native-builder-bob@npm:^0.40.12":
   version: 0.40.18
   resolution: "react-native-builder-bob@npm:0.40.18"
@@ -16627,7 +16645,7 @@
   metro-config: "npm:^0.83.0"
   react: "npm:19.2.5"
   react-native: "npm:0.83.4"
-  react-native-audio-api: "npm:0.12.0"
+  react-native-audio-api: "npm:0.12.2"
   react-native-device-info: "npm:^15.0.2"
   react-native-executorch: "workspace:*"
   react-native-executorch-expo-resource-fetcher: "workspace:*"