1 change: 1 addition & 0 deletions README.md
@@ -232,6 +232,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
- **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice.
- **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice.
- **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice.
+- **Google Gemini**: Uses Google's Gemini 2.5 Flash preview TTS model with the 'Aoede' voice.
- **Local**: Placeholder for a local TTS model.

## Detailed Module Descriptions 📘
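The new README entry above maps to concrete settings elsewhere in this PR. A minimal sketch of enabling the Gemini path, using only names visible in the diff (`voice_assistant.config.Config` is the import path implied by the file layout; the key must be exported before the process starts, since `Config` reads it with `os.getenv`):

```python
# Sketch: select the new Gemini TTS branch added by this PR.
# Assumes GOOGLE_API_KEY is set in the environment (see example.env below).
from voice_assistant.config import Config

Config.TTS_MODEL = 'gemini'     # or edit the constant in config.py directly
print(Config.GEMINI_TTS_MODEL)  # "gemini-2.5-flash-preview-tts"
print(Config.GEMINI_TTS_VOICE)  # "Aoede"
```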
1 change: 1 addition & 0 deletions example.env
@@ -5,3 +5,4 @@ ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
CARTESIA_API_KEY="CARTESIA_API_KEY"
LOCAL_MODEL_PATH=path/to/local/model
PIPER_SERVER_URL=http://localhost:5000
+GOOGLE_API_KEY="GEMINI_API_KEY"
3 changes: 2 additions & 1 deletion requirements.txt
@@ -34,4 +34,5 @@ sounddevice
cartesia
soundfile
ollama
-pydub
+pydub
+google-genai
3 changes: 2 additions & 1 deletion voice_assistant/api_key_manager.py
@@ -15,7 +15,8 @@
"tts": {
"openai": Config.OPENAI_API_KEY,
"deepgram":Config.DEEPGRAM_API_KEY,
"elevenlabs": Config.ELEVENLABS_API_KEY
"elevenlabs": Config.ELEVENLABS_API_KEY,
"gemini": Config.GOOGLE_API_KEY
}
}

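For context, the mapping above is keyed by service and then by model name. A hypothetical accessor sketch (the real helper in `api_key_manager.py` is outside this diff, so `get_api_key` is an assumed name):

```python
from voice_assistant.config import Config

def get_api_key(service: str, model: str):
    """Hypothetical lookup mirroring the mapping shown above."""
    mapping = {
        "tts": {
            "openai": Config.OPENAI_API_KEY,
            "deepgram": Config.DEEPGRAM_API_KEY,
            "elevenlabs": Config.ELEVENLABS_API_KEY,
            "gemini": Config.GOOGLE_API_KEY,  # new in this PR
        }
    }
    return mapping.get(service, {}).get(model)

api_key = get_api_key("tts", "gemini")  # -> value of GOOGLE_API_KEY
```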
16 changes: 11 additions & 5 deletions voice_assistant/config.py
@@ -21,9 +21,13 @@ class Config:
LOCAL_MODEL_PATH (str): Path to the local model.
"""
# Model selection
-TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
-RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
-TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper
+TRANSCRIPTION_MODEL = 'groq' # possible values: openai, groq, deepgram, fastwhisperapi
+RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama
+TTS_MODEL = 'gemini' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper, gemini

+# Gemini TTS configuration
+GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" # https://ai.google.dev/gemini-api/docs/speech-generation#supported-models
+GEMINI_TTS_VOICE = "Aoede" # https://ai.google.dev/gemini-api/docs/speech-generation#voices

# Piper Server configuration
PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL")
@@ -34,7 +38,7 @@ class Config:

# LLM Selection
OLLAMA_LLM="llama3:8b"
GROQ_LLM="llama3-8b-8192"
GROQ_LLM="llama-3.3-70b-versatile" # Updated from decommissioned llama3-8b-8192
OPENAI_LLM="gpt-4o"

# API keys and paths
@@ -44,6 +48,7 @@
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# for serving the MeloTTS model
TTS_PORT_LOCAL = 5150
@@ -64,7 +69,7 @@ def validate_config():
Config._validate_model('RESPONSE_MODEL', [
'openai', 'groq', 'ollama', 'local'])
Config._validate_model('TTS_MODEL', [
-'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'gemini'])

Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
@@ -77,6 +82,7 @@ def validate_config():
Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
Config._validate_api_key('TTS_MODEL', 'elevenlabs', 'ELEVENLABS_API_KEY')
Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY')
+Config._validate_api_key('TTS_MODEL', 'gemini', 'GOOGLE_API_KEY')

@staticmethod
def _validate_model(attribute, valid_options):
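With the two additions above, a misconfigured Gemini setup should fail fast at startup. A sketch of the expected behavior, assuming `_validate_api_key` raises `ValueError` when the selected model's key is unset (the helper body is not part of this diff):

```python
from voice_assistant.config import Config

Config.TTS_MODEL = 'gemini'
try:
    # Checks GOOGLE_API_KEY because TTS_MODEL == 'gemini'
    Config.validate_config()
except ValueError as e:
    print(f"Configuration error: {e}")
```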
46 changes: 39 additions & 7 deletions voice_assistant/text_to_speech.py
@@ -5,9 +5,12 @@
import elevenlabs
import soundfile as sf
import requests
+import wave
+from google import genai
+from google.genai import types

from openai import OpenAI
-from deepgram import DeepgramClient, SpeakOptions
+from deepgram import DeepgramClient
from elevenlabs.client import ElevenLabs
from cartesia import Cartesia

@@ -41,13 +44,14 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca

elif model == 'deepgram':
client = DeepgramClient(api_key=api_key)
-options = SpeakOptions(
-model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
-encoding="linear16",
-container="wav"
+response = client.speak.v1.audio.generate(
+text=text,
+model="aura-arcas-en", # https://developers.deepgram.com/docs/tts-models
)
-SPEAK_OPTIONS = {"text": text}
-response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)

+# Save the audio file
+with open(output_file_path, "wb") as audio_file:
+audio_file.write(response.stream.getvalue())

elif model == 'elevenlabs':
client = ElevenLabs(api_key=api_key)
@@ -58,6 +62,34 @@
model="eleven_turbo_v2"
)
elevenlabs.save(audio, output_file_path)

+elif model == 'gemini':
+client = genai.Client(api_key=api_key)

+response = client.models.generate_content(
+model=Config.GEMINI_TTS_MODEL,
+contents=text,
+config=types.GenerateContentConfig(
+response_modalities=["AUDIO"],
+speech_config=types.SpeechConfig(
+voice_config=types.VoiceConfig(
+prebuilt_voice_config=types.PrebuiltVoiceConfig(
+voice_name=Config.GEMINI_TTS_VOICE
+)
+)
+)
+)
+)

+# Get PCM audio data and save as a proper WAV file with headers
+audio_data = response.candidates[0].content.parts[0].inline_data.data

+# Write as a WAV file with proper headers (24 kHz, 16-bit, mono)
+with wave.open(output_file_path, "wb") as wf:
+wf.setnchannels(1) # mono
+wf.setsampwidth(2) # 16-bit
+wf.setframerate(24000) # 24 kHz sample rate
+wf.writeframes(audio_data)

elif model == "cartesia":
client = Cartesia(api_key=api_key)
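A usage sketch for the new branch. Per the `wave` calls above, the file written is a standard WAV wrapping 24 kHz, 16-bit mono PCM; this assumes the trailing local-model parameter (truncated in this diff) has a default:

```python
from voice_assistant.config import Config
from voice_assistant.text_to_speech import text_to_speech

text_to_speech(
    model='gemini',
    api_key=Config.GOOGLE_API_KEY,
    text="Hello from the new Gemini TTS branch.",
    output_file_path="output.wav",
)
```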
21 changes: 10 additions & 11 deletions voice_assistant/transcription.py
@@ -8,7 +8,7 @@
from colorama import Fore, init
from openai import OpenAI
from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient

fast_url = "http://localhost:8000"
checked_fastwhisperapi = False
@@ -80,17 +80,16 @@ def _transcribe_with_groq(api_key, audio_file_path):


def _transcribe_with_deepgram(api_key, audio_file_path):
-deepgram = DeepgramClient(api_key)
+client = DeepgramClient(api_key)
try:
-with open(audio_file_path, "rb") as file:
-buffer_data = file.read()

-payload = {"buffer": buffer_data}
-options = PrerecordedOptions(model="nova-2", smart_format=True)
-response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
-data = json.loads(response.to_json())

-transcript = data['results']['channels'][0]['alternatives'][0]['transcript']
+with open(audio_file_path, "rb") as audio_file:
+response = client.listen.v1.media.transcribe_file(
+request=audio_file.read(),
+model="nova-2",
+smart_format=True
+)

+transcript = response.results.channels[0].alternatives[0].transcript
return transcript
except Exception as e:
logging.error(f"{Fore.RED}Deepgram transcription error: {e}{Fore.RESET}")
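And the reworked Deepgram transcription in use (a sketch; `_transcribe_with_deepgram` is module-private, so production code would go through the module's public entry point):

```python
from voice_assistant.config import Config
from voice_assistant.transcription import _transcribe_with_deepgram

transcript = _transcribe_with_deepgram(Config.DEEPGRAM_API_KEY, "recording.wav")
print(transcript)
```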