Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions examples/voice_agents/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ session = AgentSession(
### 🚀 Getting Started

- [`basic_agent.py`](./basic_agent.py) - A fundamental voice agent using LiveKit Inference with metrics collection
- [`mistral_realtime_stt_agent.py`](./mistral_realtime_stt_agent.py) - Comparison agent using Mistral realtime STT, OpenAI LLM, and ElevenLabs Flash v2.5 TTS

### 🛠️ Tool Integration & Function Calling

Expand Down
106 changes: 106 additions & 0 deletions examples/voice_agents/mistral_realtime_stt_agent.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
"""Comparison voice agent using Mistral STT and ElevenLabs TTS.

Requires:
    MISTRAL_API_KEY for Mistral STT
    OPENAI_API_KEY for the LLM
    ELEVEN_API_KEY for ElevenLabs TTS

This example uses Mistral's realtime streaming STT model, OpenAI for the LLM,
and ElevenLabs Flash v2.5 for TTS using an explicit voice ID already referenced
elsewhere in the repo. The Mistral STT settings are tuned to the best plugin-only
configuration we observed in comparison runs against the Deepgram example.

For the cleanest console-mode testing, use headphones to avoid the agent hearing
its own playback through the microphone.
"""

import logging

from dotenv import load_dotenv

from livekit.agents import (
Agent,
AgentServer,
AgentSession,
JobContext,
JobProcess,
TurnHandlingOptions,
cli,
room_io,
)
from livekit.plugins import elevenlabs, mistralai, openai, silero
from livekit.plugins.turn_detector.multilingual import MultilingualModel

logger = logging.getLogger("mistral-realtime-stt-agent")

load_dotenv()


class MyAgent(Agent):
    """Agent persona ("Kelly") used by this Mistral STT comparison example."""

    def __init__(self) -> None:
        """Initialise the base ``Agent`` with the persona's standing instructions."""
        persona = (
            "Your name is Kelly, built by LiveKit. "
            "Keep responses concise, natural, and voice-friendly. "
            "Reply in one or two short sentences."
        )
        super().__init__(instructions=persona)

    async def on_enter(self) -> None:
        """Ask the session to generate a short greeting naming the STT and TTS providers."""
        greeting_prompt = (
            "Greet the user and mention that you are using Mistral for speech-to-text "
            "and ElevenLabs for text-to-speech. Keep the greeting short."
        )
        # The call's result is intentionally not awaited or stored, as in the original.
        self.session.generate_reply(instructions=greeting_prompt)


# Module-level server instance; the entrypoint below is registered on it.
server = AgentServer()


def prewarm(proc: JobProcess) -> None:
    """Load the Silero VAD and stash it in the process userdata under ``"vad"``.

    Args:
        proc: Job process whose ``userdata`` mapping receives the loaded VAD.
    """
    vad_model = silero.VAD.load()
    proc.userdata["vad"] = vad_model


# Register ``prewarm`` as the server's setup function.
server.setup_fnc = prewarm


@server.rtc_session()
async def entrypoint(ctx: JobContext) -> None:
    """Assemble the Mistral STT / OpenAI LLM / ElevenLabs TTS session and start it.

    Args:
        ctx: Job context providing the room and the per-process ``userdata``;
            the ``"vad"`` entry stored by ``prewarm`` is read from it.
    """
    ctx.log_context_fields = {"room": ctx.room.name}

    # Speech-to-text: the tuned Mistral realtime settings referred to in the
    # module docstring.
    speech_to_text = mistralai.STT(
        model="voxtral-mini-transcribe-realtime-2602",
        target_streaming_delay_ms=240,
        chunk_duration_ms=10,
        finalize_delay_ms=650,
    )
    language_model = openai.LLM(model="gpt-4.1-mini")
    text_to_speech = elevenlabs.TTS(
        model="eleven_flash_v2_5",
        voice_id="hpp4J3VqNfWAUOO0d1Us",
    )
    voice_activity = ctx.proc.userdata["vad"]

    # Turn handling: multilingual turn detector plus false-interruption settings.
    turn_detector = MultilingualModel()
    interruption_settings = {
        "resume_false_interruption": True,
        "false_interruption_timeout": 1.0,
    }
    turn_handling = TurnHandlingOptions(
        turn_detection=turn_detector,
        interruption=interruption_settings,
    )

    session = AgentSession(
        stt=speech_to_text,
        llm=language_model,
        tts=text_to_speech,
        vad=voice_activity,
        turn_handling=turn_handling,
        preemptive_generation=False,
        aec_warmup_duration=3.0,
    )

    agent = MyAgent()
    room = ctx.room
    room_options = room_io.RoomOptions(
        audio_input=room_io.AudioInputOptions(),
    )
    await session.start(agent=agent, room=room, room_options=room_options)


# Script entry point: hand the module-level server to the LiveKit CLI runner.
if __name__ == "__main__":
cli.run_app(server)
Original file line number Diff line number Diff line change
Expand Up @@ -4,11 +4,11 @@

from .llm import LLM
from .log import logger
from .stt import STT
from .stt import STT, SpeechStream
from .tts import TTS
from .version import __version__

__all__ = ["LLM", "STT", "TTS", "__version__"]
__all__ = ["LLM", "STT", "SpeechStream", "TTS", "__version__"]


class MistralAIPlugin(Plugin):
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,11 @@
]

STTModels = Literal[
"voxtral-small-2507", "voxtral-mini-2507", "voxtral-mini-latest", "voxtral-small-latest"
"voxtral-small-2507",
"voxtral-mini-2507",
"voxtral-mini-latest",
"voxtral-small-latest",
"voxtral-mini-transcribe-realtime-2602",
]

TTSModels = Literal["voxtral-mini-tts-2603", "voxtral-mini-tts-latest"]
Expand Down
Loading
Loading