1 change: 1 addition & 0 deletions README.md
@@ -232,6 +232,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
- **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice.
- **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice.
- **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice.
+- **Google Gemini**: Uses Google's Gemini 2.5 Flash preview TTS model with the 'Aoede' voice.
- **Local**: Placeholder for a local TTS model.

## Detailed Module Descriptions 📘
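The new README entry above maps to concrete settings elsewhere in this PR. A minimal sketch of enabling the Gemini path, using only names visible in the diff (`voice_assistant.config.Config` is the import path implied by the file layout; the key must be exported before the process starts, since `Config` reads it with `os.getenv`):

```python
# Sketch: select the new Gemini TTS branch added by this PR.
# Assumes GOOGLE_API_KEY is set in the environment (see example.env below).
from voice_assistant.config import Config

Config.TTS_MODEL = 'gemini'     # or edit the constant in config.py directly
print(Config.GEMINI_TTS_MODEL)  # "gemini-2.5-flash-preview-tts"
print(Config.GEMINI_TTS_VOICE)  # "Aoede"
```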
1 change: 1 addition & 0 deletions example.env
@@ -5,3 +5,4 @@ ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
CARTESIA_API_KEY="CARTESIA_API_KEY"
LOCAL_MODEL_PATH=path/to/local/model
PIPER_SERVER_URL=http://localhost:5000
+GOOGLE_API_KEY="GEMINI_API_KEY"
3 changes: 2 additions & 1 deletion requirements.txt
@@ -34,4 +34,5 @@ sounddevice
cartesia
soundfile
ollama
-pydub
+pydub
+google-genai
3 changes: 2 additions & 1 deletion voice_assistant/api_key_manager.py
@@ -15,7 +15,8 @@
"tts": {
"openai": Config.OPENAI_API_KEY,
"deepgram":Config.DEEPGRAM_API_KEY,
"elevenlabs": Config.ELEVENLABS_API_KEY
"elevenlabs": Config.ELEVENLABS_API_KEY,
"gemini": Config.GOOGLE_API_KEY
}
}

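For context, the mapping above is keyed by service and then by model name. A hypothetical accessor sketch (the real helper in `api_key_manager.py` is outside this diff, so `get_api_key` is an assumed name):

```python
from voice_assistant.config import Config

def get_api_key(service: str, model: str):
    """Hypothetical lookup mirroring the mapping shown above."""
    mapping = {
        "tts": {
            "openai": Config.OPENAI_API_KEY,
            "deepgram": Config.DEEPGRAM_API_KEY,
            "elevenlabs": Config.ELEVENLABS_API_KEY,
            "gemini": Config.GOOGLE_API_KEY,  # new in this PR
        }
    }
    return mapping.get(service, {}).get(model)

api_key = get_api_key("tts", "gemini")  # -> value of GOOGLE_API_KEY
```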
16 changes: 11 additions & 5 deletions voice_assistant/config.py
@@ -21,9 +21,13 @@ class Config:
LOCAL_MODEL_PATH (str): Path to the local model.
"""
# Model selection
-TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
-RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
-TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper
+TRANSCRIPTION_MODEL = 'groq' # possible values: openai, groq, deepgram, fastwhisperapi
+RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama
+TTS_MODEL = 'gemini' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper, gemini

+# Gemini TTS configuration
+GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" # https://ai.google.dev/gemini-api/docs/speech-generation#supported-models
+GEMINI_TTS_VOICE = "Aoede" # https://ai.google.dev/gemini-api/docs/speech-generation#voices

# Piper Server configuration
PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL")
@@ -34,7 +38,7 @@ class Config:

# LLM Selection
OLLAMA_LLM="llama3:8b"
GROQ_LLM="llama3-8b-8192"
GROQ_LLM="llama-3.3-70b-versatile" # Updated from decommissioned llama3-8b-8192
OPENAI_LLM="gpt-4o"

# API keys and paths
@@ -44,6 +48,7 @@
ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")

# for serving the MeloTTS model
TTS_PORT_LOCAL = 5150
@@ -64,7 +69,7 @@ def validate_config():
Config._validate_model('RESPONSE_MODEL', [
'openai', 'groq', 'ollama', 'local'])
Config._validate_model('TTS_MODEL', [
-'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'gemini'])

Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
@@ -77,6 +82,7 @@ def validate_config():
Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
Config._validate_api_key('TTS_MODEL', 'elevenlabs', 'ELEVENLABS_API_KEY')
Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY')
+Config._validate_api_key('TTS_MODEL', 'gemini', 'GOOGLE_API_KEY')

@staticmethod
def _validate_model(attribute, valid_options):
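With the two additions above, a misconfigured Gemini setup should fail fast at startup. A sketch of the expected behavior, assuming `_validate_api_key` raises `ValueError` when the selected model's key is unset (the helper body is not part of this diff):

```python
from voice_assistant.config import Config

Config.TTS_MODEL = 'gemini'
try:
    # Checks GOOGLE_API_KEY because TTS_MODEL == 'gemini'
    Config.validate_config()
except ValueError as e:
    print(f"Configuration error: {e}")
```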
46 changes: 39 additions & 7 deletions voice_assistant/text_to_speech.py
@@ -5,9 +5,12 @@
import elevenlabs
import soundfile as sf
import requests
+import wave
+from google import genai
+from google.genai import types

from openai import OpenAI
-from deepgram import DeepgramClient, SpeakOptions
+from deepgram import DeepgramClient
from elevenlabs.client import ElevenLabs
from cartesia import Cartesia

@@ -41,13 +44,14 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca

elif model == 'deepgram':
client = DeepgramClient(api_key=api_key)
-options = SpeakOptions(
-model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
-encoding="linear16",
-container="wav"
+response = client.speak.v1.audio.generate(
+text=text,
+model="aura-arcas-en", # https://developers.deepgram.com/docs/tts-models
)
-SPEAK_OPTIONS = {"text": text}
-response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)

+# Save the audio file
+with open(output_file_path, "wb") as audio_file:
+audio_file.write(response.stream.getvalue())

elif model == 'elevenlabs':
client = ElevenLabs(api_key=api_key)
@@ -58,6 +62,34 @@
model="eleven_turbo_v2"
)
elevenlabs.save(audio, output_file_path)

+elif model == 'gemini':
+client = genai.Client(api_key=api_key)

+response = client.models.generate_content(
+model=Config.GEMINI_TTS_MODEL,
+contents=text,
+config=types.GenerateContentConfig(
+response_modalities=["AUDIO"],
+speech_config=types.SpeechConfig(
+voice_config=types.VoiceConfig(
+prebuilt_voice_config=types.PrebuiltVoiceConfig(
+voice_name=Config.GEMINI_TTS_VOICE
+)
+)
+)
+)
+)

+# Get PCM audio data and save as a proper WAV file with headers
+audio_data = response.candidates[0].content.parts[0].inline_data.data

+# Write as a WAV file with proper headers (24 kHz, 16-bit, mono)
+with wave.open(output_file_path, "wb") as wf:
+wf.setnchannels(1) # mono
+wf.setsampwidth(2) # 16-bit
+wf.setframerate(24000) # 24 kHz sample rate
+wf.writeframes(audio_data)

elif model == "cartesia":
client = Cartesia(api_key=api_key)
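A usage sketch for the new branch. Per the `wave` calls above, the file written is a standard WAV wrapping 24 kHz, 16-bit mono PCM; this assumes the trailing local-model parameter (truncated in this diff) has a default:

```python
from voice_assistant.config import Config
from voice_assistant.text_to_speech import text_to_speech

text_to_speech(
    model='gemini',
    api_key=Config.GOOGLE_API_KEY,
    text="Hello from the new Gemini TTS branch.",
    output_file_path="output.wav",
)
```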
21 changes: 10 additions & 11 deletions voice_assistant/transcription.py
@@ -8,7 +8,7 @@
from colorama import Fore, init
from openai import OpenAI
from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient

fast_url = "http://localhost:8000"
checked_fastwhisperapi = False
@@ -80,17 +80,16 @@ def _transcribe_with_groq(api_key, audio_file_path):


def _transcribe_with_deepgram(api_key, audio_file_path):
-deepgram = DeepgramClient(api_key)
+client = DeepgramClient(api_key)
try:
-with open(audio_file_path, "rb") as file:
-buffer_data = file.read()

-payload = {"buffer": buffer_data}
-options = PrerecordedOptions(model="nova-2", smart_format=True)
-response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
-data = json.loads(response.to_json())

-transcript = data['results']['channels'][0]['alternatives'][0]['transcript']
+with open(audio_file_path, "rb") as audio_file:
+response = client.listen.v1.media.transcribe_file(
+request=audio_file.read(),
+model="nova-2",
+smart_format=True
+)

+transcript = response.results.channels[0].alternatives[0].transcript
return transcript
except Exception as e:
logging.error(f"{Fore.RED}Deepgram transcription error: {e}{Fore.RESET}")
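And the reworked Deepgram transcription in use (a sketch; `_transcribe_with_deepgram` is module-private, so production code would go through the module's public entry point):

```python
from voice_assistant.config import Config
from voice_assistant.transcription import _transcribe_with_deepgram

transcript = _transcribe_with_deepgram(Config.DEEPGRAM_API_KEY, "recording.wav")
print(transcript)
```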