Skip to content
Draft
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
29 changes: 18 additions & 11 deletions livekit-agents/livekit/agents/voice/audio_recognition.py
Original file line number Diff line number Diff line change
Expand Up @@ -807,6 +807,16 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
await self._flush_held_transcripts(cooldown=end_cooldown)
# no return here to allow the new event to be processed normally

has_stt_end_time = bool(
ev.alternatives
and ev.alternatives[0].end_time > 0
and self._input_started_at is not None
)
stt_last_speaking_time = (
ev.alternatives[0].end_time + self._input_started_at
if has_stt_end_time
else time.time()
)
if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
transcript = ev.alternatives[0].text
language = ev.alternatives[0].language
Expand Down Expand Up @@ -843,13 +853,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
self._audio_interim_transcript = ""
self._audio_preflight_transcript = ""

if not self._vad or self._last_speaking_time is None:
if self._last_speaking_time is None or (not self._vad and has_stt_end_time):
# vad disabled, use stt timestamp
# TODO: this would screw up transcription latency metrics
# but we'll live with it for now.
# the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
# and using that timestamp for _last_speaking_time
self._last_speaking_time = time.time()
self._last_speaking_time = stt_last_speaking_time

if self._vad_base_turn_detection or self._user_turn_committed:
if transcript_changed:
Expand Down Expand Up @@ -900,9 +906,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
self._audio_preflight_transcript = (self._audio_transcript + " " + transcript).lstrip()
self._audio_interim_transcript = transcript

if not self._vad or self._last_speaking_time is None:
if self._last_speaking_time is None or (not self._vad and has_stt_end_time):
# vad disabled, use stt timestamp
self._last_speaking_time = time.time()
self._last_speaking_time = stt_last_speaking_time

if self._turn_detection_mode != "manual" or self._user_turn_committed:
confidence_vals = list(self._final_transcript_confidence) + [confidence]
Expand Down Expand Up @@ -941,8 +947,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:

self._speaking = False
self._user_turn_committed = True
if not self._vad or self._last_speaking_time is None:
self._last_speaking_time = time.time()
if self._last_speaking_time is None or (not self._vad and has_stt_end_time):
# vad disabled, use stt timestamp
self._last_speaking_time = stt_last_speaking_time

chat_ctx = self._hooks.retrieve_chat_ctx().copy()
self._run_eou_detection(chat_ctx)
Expand All @@ -957,7 +964,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
self._hooks.on_start_of_speech(None, speech_start_time=self._speech_start_time)

self._speaking = True
self._last_speaking_time = time.time()
self._last_speaking_time = stt_last_speaking_time

if self._end_of_turn_task is not None:
self._end_of_turn_task.cancel()
Expand Down
Loading