livekit · chenghao-mou · May 7, 2026
diff --git a/livekit-agents/livekit/agents/voice/audio_recognition.py b/livekit-agents/livekit/agents/voice/audio_recognition.py
@@ -807,6 +807,16 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 await self._flush_held_transcripts(cooldown=end_cooldown)
                 # no return here to allow the new event to be processed normally
 
+        has_stt_end_time = bool(
+            ev.alternatives
+            and ev.alternatives[0].end_time > 0
+            and self._input_started_at is not None
+        )
+        stt_last_speaking_time = (
+            ev.alternatives[0].end_time + self._input_started_at
+            if has_stt_end_time
+            else time.time()
+        )
         if ev.type == stt.SpeechEventType.FINAL_TRANSCRIPT:
             transcript = ev.alternatives[0].text
             language = ev.alternatives[0].language
@@ -843,13 +853,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
             self._audio_interim_transcript = ""
             self._audio_preflight_transcript = ""
 
-            if not self._vad or self._last_speaking_time is None:
+            if self._last_speaking_time is None or (not self._vad and has_stt_end_time):
                 # vad disabled, use stt timestamp
-                # TODO: this would screw up transcription latency metrics
-                # but we'll live with it for now.
-                # the correct way is to ensure STT fires SpeechEventType.END_OF_SPEECH
-                # and using that timestamp for _last_speaking_time
-                self._last_speaking_time = time.time()
+                self._last_speaking_time = stt_last_speaking_time
 
             if self._vad_base_turn_detection or self._user_turn_committed:
                 if transcript_changed:
@@ -900,9 +906,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
             self._audio_preflight_transcript = (self._audio_transcript + " " + transcript).lstrip()
             self._audio_interim_transcript = transcript
 
-            if not self._vad or self._last_speaking_time is None:
+            if self._last_speaking_time is None or (not self._vad and has_stt_end_time):
                 # vad disabled, use stt timestamp
-                self._last_speaking_time = time.time()
+                self._last_speaking_time = stt_last_speaking_time
 
             if self._turn_detection_mode != "manual" or self._user_turn_committed:
                 confidence_vals = list(self._final_transcript_confidence) + [confidence]
@@ -941,8 +947,9 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
 
             self._speaking = False
             self._user_turn_committed = True
-            if not self._vad or self._last_speaking_time is None:
-                self._last_speaking_time = time.time()
+            if self._last_speaking_time is None or (not self._vad and has_stt_end_time):
+                # no last-speaking time yet, or VAD disabled and STT gave an end time
+                self._last_speaking_time = stt_last_speaking_time
 
             chat_ctx = self._hooks.retrieve_chat_ctx().copy()
             self._run_eou_detection(chat_ctx)
@@ -957,7 +964,7 @@ async def _on_stt_event(self, ev: stt.SpeechEvent) -> None:
                 self._hooks.on_start_of_speech(None, speech_start_time=self._speech_start_time)
 
             self._speaking = True
-            self._last_speaking_time = time.time()
+            self._last_speaking_time = stt_last_speaking_time
 
             if self._end_of_turn_task is not None:
                 self._end_of_turn_task.cancel()