From 48cd583cf3be2096ee7efa82c38ce5361e05d865 Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Thu, 8 Jan 2026 00:18:07 +0530
Subject: [PATCH 1/7] Add Google Gemini TTS API support

---
 README.md                          |  1 +
 requirements.txt                   |  3 ++-
 run_voice_assistant.py             |  2 +-
 voice_assistant/api_key_manager.py |  3 ++-
 voice_assistant/config.py          |  4 +++-
 voice_assistant/text_to_speech.py  | 23 +++++++++++++++++++++++
 6 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 308d490..36b824b 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
 - **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice.
 - **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice.
 - **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice.
+- **Google Gemini**: Uses Google's Gemini 2.0 Flash model with the 'Aoede' voice for text-to-speech generation.
 - **Local**: Placeholder for a local TTS model.
 
 ## Detailed Module Descriptions 📘
diff --git a/requirements.txt b/requirements.txt
index d533fbb..ad8b4a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,4 +34,5 @@ sounddevice
 cartesia
 soundfile
 ollama
-pydub
\ No newline at end of file
+pydub
+google-generativeai
\ No newline at end of file
diff --git a/run_voice_assistant.py b/run_voice_assistant.py
index 418e1c0..1fab32c 100644
--- a/run_voice_assistant.py
+++ b/run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
             chat_history.append({"role": "assistant", "content": response_text})
 
             # Determine the output file format based on the TTS model
-            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
+            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'gemini':
                 output_file = 'output.mp3'
             else:
                 output_file = 'output.wav'
diff --git a/voice_assistant/api_key_manager.py b/voice_assistant/api_key_manager.py
index 68668e3..2f9f22d 100644
--- a/voice_assistant/api_key_manager.py
+++ b/voice_assistant/api_key_manager.py
@@ -15,7 +15,8 @@
     "tts": {
         "openai": Config.OPENAI_API_KEY,
         "deepgram":Config.DEEPGRAM_API_KEY,
-        "elevenlabs": Config.ELEVENLABS_API_KEY
+        "elevenlabs": Config.ELEVENLABS_API_KEY,
+        "gemini": Config.GOOGLE_API_KEY
     }
 }
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 73fbc87..6bb6f22 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -44,6 +44,7 @@ class Config:
     ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
     CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 
     # for serving the MeloTTS model
     TTS_PORT_LOCAL = 5150
@@ -64,7 +65,7 @@ def validate_config():
         Config._validate_model('RESPONSE_MODEL', [
                                'openai', 'groq', 'ollama', 'local'])
         Config._validate_model('TTS_MODEL', [
-                               'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+                               'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'gemini'])
 
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
@@ -77,6 +78,7 @@ def validate_config():
         Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'elevenlabs',
                                  'ELEVENLABS_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY')
+        Config._validate_api_key('TTS_MODEL', 'gemini', 'GOOGLE_API_KEY')
 
     @staticmethod
     def _validate_model(attribute, valid_options):
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index be3ee96..bf9eba2 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -5,6 +5,7 @@
 import elevenlabs
 import soundfile as sf
 import requests
+import google.generativeai as genai
 
 from openai import OpenAI
 from deepgram import DeepgramClient, SpeakOptions
@@ -58,6 +59,28 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
                 model="eleven_turbo_v2"
             )
             elevenlabs.save(audio, output_file_path)
+
+        elif model == 'gemini':
+            genai.configure(api_key=api_key)
+            model_instance = genai.GenerativeModel("gemini-2.0-flash-exp")
+
+            response = model_instance.generate_content(
+                text,
+                generation_config=genai.GenerationConfig(
+                    response_modalities=["AUDIO"],
+                    speech_config=genai.SpeechConfig(
+                        voice_config=genai.VoiceConfig(
+                            prebuilt_voice_config=genai.PrebuiltVoiceConfig(
+                                voice_name="Aoede"
+                            )
+                        )
+                    )
+                )
+            )
+
+            # Save the audio content to file
+            with open(output_file_path, "wb") as f:
+                f.write(response.candidates[0].content.parts[0].inline_data.data)
 
         elif model == "cartesia":
             client = Cartesia(api_key=api_key)

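Note: a minimal usage sketch for the branch added above (not part of the
diff). It assumes the tail of the signature truncated in the hunk header
is an optional local_model_path parameter, and that GOOGLE_API_KEY is set
in the environment:

    from voice_assistant.config import Config
    from voice_assistant.text_to_speech import text_to_speech

    # Route synthesis through the new 'gemini' branch; at this point in
    # the series the result is written to an MP3 path (revisited in
    # PATCH 6/7, since Gemini actually returns raw PCM).
    text_to_speech(
        model='gemini',
        api_key=Config.GOOGLE_API_KEY,
        text="Hello from the voice assistant.",
        output_file_path='output.mp3',
    )
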
From 33d5f05c5411a56ef3d29b1c789d1cb68e2714e1 Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:38:12 +0530
Subject: [PATCH 2/7] Add configurable Gemini TTS model and voice settings

---
 example.env                       |  1 +
 voice_assistant/config.py         | 10 +++++++---
 voice_assistant/text_to_speech.py |  4 ++--
 voice_assistant/transcription.py  |  2 +-
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/example.env b/example.env
index 05527b1..e2f9d00 100644
--- a/example.env
+++ b/example.env
@@ -5,3 +5,4 @@ ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
 CARTESIA_API_KEY="CARTESIA_API_KEY"
 LOCAL_MODEL_PATH=path/to/local/model
 PIPER_SERVER_URL=http://localhost:5000
+GOOGLE_API_KEY="GEMINI_API_KEY"
\ No newline at end of file
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 6bb6f22..4bc3267 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -21,9 +21,13 @@ class Config:
         LOCAL_MODEL_PATH (str): Path to the local model.
     """
     # Model selection
-    TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
-    TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper
+    TRANSCRIPTION_MODEL = 'groq' # possible values: openai, groq, deepgram, fastwhisperapi
+    RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama
+    TTS_MODEL = 'gemini' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper, gemini
+
+    # Gemini TTS configuration
+    GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" # https://ai.google.dev/gemini-api/docs/speech-generation#supported-models
+    GEMINI_TTS_VOICE = "Aoede" # https://ai.google.dev/gemini-api/docs/speech-generation#voices
 
     # Piper Server configuration
     PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL")
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index bf9eba2..0355334 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -62,7 +62,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
 
         elif model == 'gemini':
             genai.configure(api_key=api_key)
-            model_instance = genai.GenerativeModel("gemini-2.0-flash-exp")
+            model_instance = genai.GenerativeModel(Config.GEMINI_TTS_MODEL)
 
             response = model_instance.generate_content(
                 text,
@@ -71,7 +71,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
                     speech_config=genai.SpeechConfig(
                         voice_config=genai.VoiceConfig(
                             prebuilt_voice_config=genai.PrebuiltVoiceConfig(
-                                voice_name="Aoede"
+                                voice_name=Config.GEMINI_TTS_VOICE
                             )
                         )
                     )
diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py
index 2caa38b..6227d30 100644
--- a/voice_assistant/transcription.py
+++ b/voice_assistant/transcription.py
@@ -8,7 +8,7 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient, PrerecordedOptions, FileSource
 
 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False

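Note: with this patch the model and voice are plain class attributes, so
swapping them needs no code changes in text_to_speech.py. A sketch, where
the alternative names ("Kore", "gemini-2.5-pro-preview-tts") come from the
linked speech-generation docs and are assumptions here:

    from voice_assistant.config import Config

    # Override the defaults before synthesis; any prebuilt voice or
    # TTS-capable model from the linked docs can be substituted.
    Config.GEMINI_TTS_MODEL = "gemini-2.5-pro-preview-tts"
    Config.GEMINI_TTS_VOICE = "Kore"
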
""" # Model selection - TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi - RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama - TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper + TRANSCRIPTION_MODEL = 'groq' # possible values: openai, groq, deepgram, fastwhisperapi + RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama + TTS_MODEL = 'gemini' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper, gemini + + # Gemini TTS configuration + GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" # https://ai.google.dev/gemini-api/docs/speech-generation#supported-models + GEMINI_TTS_VOICE = "Aoede" # https://ai.google.dev/gemini-api/docs/speech-generation#voices # Piper Server configuration PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL") diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py index bf9eba2..0355334 100644 --- a/voice_assistant/text_to_speech.py +++ b/voice_assistant/text_to_speech.py @@ -62,7 +62,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca elif model == 'gemini': genai.configure(api_key=api_key) - model_instance = genai.GenerativeModel("gemini-2.0-flash-exp") + model_instance = genai.GenerativeModel(Config.GEMINI_TTS_MODEL) response = model_instance.generate_content( text, @@ -71,7 +71,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca speech_config=genai.SpeechConfig( voice_config=genai.VoiceConfig( prebuilt_voice_config=genai.PrebuiltVoiceConfig( - voice_name="Aoede" + voice_name=Config.GEMINI_TTS_VOICE ) ) ) diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py index 2caa38b..6227d30 100644 --- a/voice_assistant/transcription.py +++ b/voice_assistant/transcription.py @@ -8,7 +8,7 @@ from colorama import Fore, init from openai import OpenAI from groq import Groq -from deepgram import DeepgramClient,PrerecordedOptions,FileSource +from deepgram import DeepgramClient, PrerecordedOptions, FileSource fast_url = "http://localhost:8000" checked_fastwhisperapi = False From 9a0278bb8238bd3a270635c7147b7ad65d64be62 Mon Sep 17 00:00:00 2001 From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com> Date: Sat, 24 Jan 2026 23:44:18 +0530 Subject: [PATCH 3/7] Fix deepgram-sdk compatibility by upgrading to v5 and updating API calls --- voice_assistant/text_to_speech.py | 15 ++++++++------- voice_assistant/transcription.py | 21 ++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py index be3ee96..31a4b2b 100644 --- a/voice_assistant/text_to_speech.py +++ b/voice_assistant/text_to_speech.py @@ -7,7 +7,7 @@ import requests from openai import OpenAI -from deepgram import DeepgramClient, SpeakOptions +from deepgram import DeepgramClient from elevenlabs.client import ElevenLabs from cartesia import Cartesia @@ -41,13 +41,14 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca elif model == 'deepgram': client = DeepgramClient(api_key=api_key) - options = SpeakOptions( - model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models - encoding="linear16", - container="wav" + response = client.speak.v1.audio.generate( + text=text, + model="aura-arcas-en", # https://developers.deepgram.com/docs/tts-models ) - SPEAK_OPTIONS = {"text": text} - response = 
From 907b1aa633848854b43c475597b5e7cadb3fa8bc Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:47:30 +0530
Subject: [PATCH 4/7] Fix transcription.py import for deepgram SDK v5

---
 voice_assistant/transcription.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py
index f056535..a063c1f 100644
--- a/voice_assistant/transcription.py
+++ b/voice_assistant/transcription.py
@@ -8,7 +8,7 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient
 
 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False

From df7d41d631009bbe905f162517b8a44a148ea31e Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:52:05 +0530
Subject: [PATCH 5/7] Migrate to google-genai package and fix Groq model

---
 requirements.txt                  |  2 +-
 voice_assistant/config.py         |  2 +-
 voice_assistant/text_to_speech.py | 22 ++++++++++++----------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ad8b4a8..3126b2d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,4 +35,4 @@ cartesia
 soundfile
 ollama
 pydub
-google-generativeai
\ No newline at end of file
+google-genai
\ No newline at end of file
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 4bc3267..5450b74 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -38,7 +38,7 @@ class Config:
 
     # LLM Selection
     OLLAMA_LLM="llama3:8b"
-    GROQ_LLM="llama3-8b-8192"
+    GROQ_LLM="llama-3.3-70b-versatile" # Updated from decommissioned llama3-8b-8192
     OPENAI_LLM="gpt-4o"
 
     # API keys and paths
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index cf1c37a..3f904a9 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -5,7 +5,8 @@
 import elevenlabs
 import soundfile as sf
 import requests
-import google.generativeai as genai
+from google import genai
+from google.genai import types
 
 from openai import OpenAI
 from deepgram import DeepgramClient
@@ -62,16 +63,16 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             elevenlabs.save(audio, output_file_path)
 
         elif model == 'gemini':
-            genai.configure(api_key=api_key)
-            model_instance = genai.GenerativeModel(Config.GEMINI_TTS_MODEL)
+            client = genai.Client(api_key=api_key)
 
-            response = model_instance.generate_content(
-                text,
-                generation_config=genai.GenerationConfig(
+            response = client.models.generate_content(
+                model=Config.GEMINI_TTS_MODEL,
+                contents=text,
+                config=types.GenerateContentConfig(
                     response_modalities=["AUDIO"],
-                    speech_config=genai.SpeechConfig(
-                        voice_config=genai.VoiceConfig(
-                            prebuilt_voice_config=genai.PrebuiltVoiceConfig(
+                    speech_config=types.SpeechConfig(
+                        voice_config=types.VoiceConfig(
+                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                 voice_name=Config.GEMINI_TTS_VOICE
                             )
                         )
@@ -80,8 +81,9 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             )
 
             # Save the audio content to file
+            audio_data = response.candidates[0].content.parts[0].inline_data.data
             with open(output_file_path, "wb") as f:
-                f.write(response.candidates[0].content.parts[0].inline_data.data)
+                f.write(audio_data)
 
         elif model == "cartesia":
             client = Cartesia(api_key=api_key)

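Note: the migrated call also works outside the assistant. A standalone
sketch of the google-genai speech request (model and voice mirror the
Config defaults from PATCH 2; the API key is a placeholder):

    from google import genai
    from google.genai import types

    client = genai.Client(api_key="YOUR_GOOGLE_API_KEY")
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents="Hello from Gemini TTS.",
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name="Aoede"
                    )
                )
            ),
        ),
    )
    # inline_data.data is raw 16-bit PCM at 24 kHz, not a finished MP3;
    # that mismatch is what PATCH 6 and PATCH 7 go on to fix.
    pcm_bytes = response.candidates[0].content.parts[0].inline_data.data
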
From 56361279e6505fc233d83dbc051d1a6e774d9e2e Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:54:05 +0530
Subject: [PATCH 6/7] Fix Gemini TTS audio format to WAV

---
 run_voice_assistant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run_voice_assistant.py b/run_voice_assistant.py
index 1fab32c..418e1c0 100644
--- a/run_voice_assistant.py
+++ b/run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
             chat_history.append({"role": "assistant", "content": response_text})
 
             # Determine the output file format based on the TTS model
-            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'gemini':
+            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
                 output_file = 'output.mp3'
             else:
                 output_file = 'output.wav'

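Note: changing the extension to .wav is only half the fix, because the
bytes Gemini returns are headerless PCM that players still cannot parse.
A small sanity-check sketch (assuming the documented 24 kHz, 16-bit, mono
format) that computes the expected clip length from the byte count; the
missing header itself is added in PATCH 7 below:

    def pcm_duration_seconds(pcm_bytes: bytes) -> float:
        # 24,000 frames/s * 2 bytes per 16-bit sample * 1 channel
        return len(pcm_bytes) / (24_000 * 2 * 1)
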
From 506043775f9ae2858f2163b21bf61e0eb7e93c4b Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:55:41 +0530
Subject: [PATCH 7/7] Add proper WAV headers for Gemini TTS PCM audio output

---
 voice_assistant/text_to_speech.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index 3f904a9..05facbc 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -5,6 +5,7 @@
 import elevenlabs
 import soundfile as sf
 import requests
+import wave
 
 from google import genai
 from google.genai import types
@@ -80,10 +81,15 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
                 )
             )
 
-            # Save the audio content to file
+            # Get PCM audio data and save as proper WAV file with headers
             audio_data = response.candidates[0].content.parts[0].inline_data.data
-            with open(output_file_path, "wb") as f:
-                f.write(audio_data)
+
+            # Write as WAV file with proper headers (24kHz, 16-bit, mono)
+            with wave.open(output_file_path, "wb") as wf:
+                wf.setnchannels(1)  # mono
+                wf.setsampwidth(2)  # 16-bit
+                wf.setframerate(24000)  # 24kHz sample rate
+                wf.writeframes(audio_data)
 
         elif model == "cartesia":
             client = Cartesia(api_key=api_key)

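Note: a quick way to verify the final state of the series, assuming the
assistant has just written output.wav. The stdlib wave module reads the
header back, so the parameters can be checked against what PATCH 7 wrote:

    import wave

    with wave.open("output.wav", "rb") as wf:
        assert wf.getnchannels() == 1       # mono
        assert wf.getsampwidth() == 2       # 16-bit samples
        assert wf.getframerate() == 24000   # 24 kHz
        print(f"{wf.getnframes() / wf.getframerate():.2f}s of audio")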