From 48cd583cf3be2096ee7efa82c38ce5361e05d865 Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Thu, 8 Jan 2026 00:18:07 +0530
Subject: [PATCH 1/7] Add Google Gemini TTS API support

---
 README.md                          |  1 +
 requirements.txt                   |  3 ++-
 run_voice_assistant.py             |  2 +-
 voice_assistant/api_key_manager.py |  3 ++-
 voice_assistant/config.py          |  4 +++-
 voice_assistant/text_to_speech.py  | 23 +++++++++++++++++++++++
 6 files changed, 32 insertions(+), 4 deletions(-)

diff --git a/README.md b/README.md
index 308d490..36b824b 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
 - **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice.
 - **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice.
 - **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice.
+- **Google Gemini**: Uses Google's Gemini 2.0 Flash model with the 'Aoede' voice for text-to-speech generation.
 - **Local**: Placeholder for a local TTS model.
 
 ## Detailed Module Descriptions 📘
diff --git a/requirements.txt b/requirements.txt
index d533fbb..ad8b4a8 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,4 +34,5 @@ sounddevice
 cartesia
 soundfile
 ollama
-pydub
\ No newline at end of file
+pydub
+google-generativeai
\ No newline at end of file
diff --git a/run_voice_assistant.py b/run_voice_assistant.py
index 418e1c0..1fab32c 100644
--- a/run_voice_assistant.py
+++ b/run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
             chat_history.append({"role": "assistant", "content": response_text})
 
             # Determine the output file format based on the TTS model
-            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
+            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'gemini':
                 output_file = 'output.mp3'
             else:
                 output_file = 'output.wav'
diff --git a/voice_assistant/api_key_manager.py b/voice_assistant/api_key_manager.py
index 68668e3..2f9f22d 100644
--- a/voice_assistant/api_key_manager.py
+++ b/voice_assistant/api_key_manager.py
@@ -15,7 +15,8 @@
     "tts": {
         "openai": Config.OPENAI_API_KEY,
         "deepgram":Config.DEEPGRAM_API_KEY,
-        "elevenlabs": Config.ELEVENLABS_API_KEY
+        "elevenlabs": Config.ELEVENLABS_API_KEY,
+        "gemini": Config.GOOGLE_API_KEY
     }
 }
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 73fbc87..6bb6f22 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -44,6 +44,7 @@ class Config:
     ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
     CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 
     # for serving the MeloTTS model
     TTS_PORT_LOCAL = 5150
@@ -64,7 +65,7 @@ def validate_config():
         Config._validate_model('RESPONSE_MODEL', [
                                'openai', 'groq', 'ollama', 'local'])
         Config._validate_model('TTS_MODEL', [
-                               'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+                               'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'gemini'])
 
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
@@ -77,6 +78,7 @@ def validate_config():
         Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'elevenlabs',
                                  'ELEVENLABS_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY')
+        Config._validate_api_key('TTS_MODEL', 'gemini', 'GOOGLE_API_KEY')
 
     @staticmethod
     def _validate_model(attribute, valid_options):
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index be3ee96..bf9eba2 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -5,6 +5,7 @@
 import elevenlabs
 import soundfile as sf
 import requests
+import google.generativeai as genai
 
 from openai import OpenAI
 from deepgram import DeepgramClient, SpeakOptions
@@ -58,6 +59,28 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
                 model="eleven_turbo_v2"
             )
             elevenlabs.save(audio, output_file_path)
+
+        elif model == 'gemini':
+            genai.configure(api_key=api_key)
+            model_instance = genai.GenerativeModel("gemini-2.0-flash-exp")
+
+            response = model_instance.generate_content(
+                text,
+                generation_config=genai.GenerationConfig(
+                    response_modalities=["AUDIO"],
+                    speech_config=genai.SpeechConfig(
+                        voice_config=genai.VoiceConfig(
+                            prebuilt_voice_config=genai.PrebuiltVoiceConfig(
+                                voice_name="Aoede"
+                            )
+                        )
+                    )
+                )
+            )
+
+            # Save the audio content to file
+            with open(output_file_path, "wb") as f:
+                f.write(response.candidates[0].content.parts[0].inline_data.data)
 
         elif model == "cartesia":
             client = Cartesia(api_key=api_key)

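Note: a minimal usage sketch for the branch added above (not part of the
diff). It assumes the tail of the signature truncated in the hunk header
is an optional local_model_path parameter, and that GOOGLE_API_KEY is set
in the environment:

    from voice_assistant.config import Config
    from voice_assistant.text_to_speech import text_to_speech

    # Route synthesis through the new 'gemini' branch; at this point in
    # the series the result is written to an MP3 path (revisited in
    # PATCH 6/7, since Gemini actually returns raw PCM).
    text_to_speech(
        model='gemini',
        api_key=Config.GOOGLE_API_KEY,
        text="Hello from the voice assistant.",
        output_file_path='output.mp3',
    )
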
From 33d5f05c5411a56ef3d29b1c789d1cb68e2714e1 Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:38:12 +0530
Subject: [PATCH 2/7] Add configurable Gemini TTS model and voice settings

---
 example.env                       |  1 +
 voice_assistant/config.py         | 10 +++++++---
 voice_assistant/text_to_speech.py |  4 ++--
 voice_assistant/transcription.py  |  2 +-
 4 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/example.env b/example.env
index 05527b1..e2f9d00 100644
--- a/example.env
+++ b/example.env
@@ -5,3 +5,4 @@ ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
 CARTESIA_API_KEY="CARTESIA_API_KEY"
 LOCAL_MODEL_PATH=path/to/local/model
 PIPER_SERVER_URL=http://localhost:5000
+GOOGLE_API_KEY="GEMINI_API_KEY"
\ No newline at end of file
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 6bb6f22..4bc3267 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -21,9 +21,13 @@ class Config:
         LOCAL_MODEL_PATH (str): Path to the local model.
     """
     # Model selection
-    TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama
-    TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper
+    TRANSCRIPTION_MODEL = 'groq' # possible values: openai, groq, deepgram, fastwhisperapi
+    RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama
+    TTS_MODEL = 'gemini' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper, gemini
+
+    # Gemini TTS configuration
+    GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" # https://ai.google.dev/gemini-api/docs/speech-generation#supported-models
+    GEMINI_TTS_VOICE = "Aoede" # https://ai.google.dev/gemini-api/docs/speech-generation#voices
 
     # Piper Server configuration
     PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL")
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index bf9eba2..0355334 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -62,7 +62,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
 
         elif model == 'gemini':
             genai.configure(api_key=api_key)
-            model_instance = genai.GenerativeModel("gemini-2.0-flash-exp")
+            model_instance = genai.GenerativeModel(Config.GEMINI_TTS_MODEL)
 
             response = model_instance.generate_content(
                 text,
@@ -71,7 +71,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
                     speech_config=genai.SpeechConfig(
                         voice_config=genai.VoiceConfig(
                             prebuilt_voice_config=genai.PrebuiltVoiceConfig(
-                                voice_name="Aoede"
+                                voice_name=Config.GEMINI_TTS_VOICE
                             )
                         )
                     )
diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py
index 2caa38b..6227d30 100644
--- a/voice_assistant/transcription.py
+++ b/voice_assistant/transcription.py
@@ -8,7 +8,7 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient, PrerecordedOptions, FileSource
 
 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False

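Note: with this patch the model and voice are plain class attributes, so
swapping them needs no code changes in text_to_speech.py. A sketch, where
the alternative names ("Kore", "gemini-2.5-pro-preview-tts") come from the
linked speech-generation docs and are assumptions here:

    from voice_assistant.config import Config

    # Override the defaults before synthesis; any prebuilt voice or
    # TTS-capable model from the linked docs can be substituted.
    Config.GEMINI_TTS_MODEL = "gemini-2.5-pro-preview-tts"
    Config.GEMINI_TTS_VOICE = "Kore"
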
""" # Model selection - TRANSCRIPTION_MODEL = 'deepgram' # possible values: openai, groq, deepgram, fastwhisperapi - RESPONSE_MODEL = 'openai' # possible values: openai, groq, ollama - TTS_MODEL = 'openai' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper + TRANSCRIPTION_MODEL = 'groq' # possible values: openai, groq, deepgram, fastwhisperapi + RESPONSE_MODEL = 'groq' # possible values: openai, groq, ollama + TTS_MODEL = 'gemini' # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper, gemini + + # Gemini TTS configuration + GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts" # https://ai.google.dev/gemini-api/docs/speech-generation#supported-models + GEMINI_TTS_VOICE = "Aoede" # https://ai.google.dev/gemini-api/docs/speech-generation#voices # Piper Server configuration PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL") diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py index bf9eba2..0355334 100644 --- a/voice_assistant/text_to_speech.py +++ b/voice_assistant/text_to_speech.py @@ -62,7 +62,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca elif model == 'gemini': genai.configure(api_key=api_key) - model_instance = genai.GenerativeModel("gemini-2.0-flash-exp") + model_instance = genai.GenerativeModel(Config.GEMINI_TTS_MODEL) response = model_instance.generate_content( text, @@ -71,7 +71,7 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca speech_config=genai.SpeechConfig( voice_config=genai.VoiceConfig( prebuilt_voice_config=genai.PrebuiltVoiceConfig( - voice_name="Aoede" + voice_name=Config.GEMINI_TTS_VOICE ) ) ) diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py index 2caa38b..6227d30 100644 --- a/voice_assistant/transcription.py +++ b/voice_assistant/transcription.py @@ -8,7 +8,7 @@ from colorama import Fore, init from openai import OpenAI from groq import Groq -from deepgram import DeepgramClient,PrerecordedOptions,FileSource +from deepgram import DeepgramClient, PrerecordedOptions, FileSource fast_url = "http://localhost:8000" checked_fastwhisperapi = False From 9a0278bb8238bd3a270635c7147b7ad65d64be62 Mon Sep 17 00:00:00 2001 From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com> Date: Sat, 24 Jan 2026 23:44:18 +0530 Subject: [PATCH 3/7] Fix deepgram-sdk compatibility by upgrading to v5 and updating API calls --- voice_assistant/text_to_speech.py | 15 ++++++++------- voice_assistant/transcription.py | 21 ++++++++++----------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py index be3ee96..31a4b2b 100644 --- a/voice_assistant/text_to_speech.py +++ b/voice_assistant/text_to_speech.py @@ -7,7 +7,7 @@ import requests from openai import OpenAI -from deepgram import DeepgramClient, SpeakOptions +from deepgram import DeepgramClient from elevenlabs.client import ElevenLabs from cartesia import Cartesia @@ -41,13 +41,14 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca elif model == 'deepgram': client = DeepgramClient(api_key=api_key) - options = SpeakOptions( - model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models - encoding="linear16", - container="wav" + response = client.speak.v1.audio.generate( + text=text, + model="aura-arcas-en", # https://developers.deepgram.com/docs/tts-models ) - SPEAK_OPTIONS = {"text": text} - response = 
From 907b1aa633848854b43c475597b5e7cadb3fa8bc Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:47:30 +0530
Subject: [PATCH 4/7] Fix transcription.py import for deepgram SDK v5

---
 voice_assistant/transcription.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py
index f056535..a063c1f 100644
--- a/voice_assistant/transcription.py
+++ b/voice_assistant/transcription.py
@@ -8,7 +8,7 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient
 
 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False

From df7d41d631009bbe905f162517b8a44a148ea31e Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:52:05 +0530
Subject: [PATCH 5/7] Migrate to google-genai package and fix Groq model

---
 requirements.txt                  |  2 +-
 voice_assistant/config.py         |  2 +-
 voice_assistant/text_to_speech.py | 22 ++++++++++++----------
 3 files changed, 14 insertions(+), 12 deletions(-)

diff --git a/requirements.txt b/requirements.txt
index ad8b4a8..3126b2d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -35,4 +35,4 @@ cartesia
 soundfile
 ollama
 pydub
-google-generativeai
\ No newline at end of file
+google-genai
\ No newline at end of file
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 4bc3267..5450b74 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -38,7 +38,7 @@ class Config:
 
     # LLM Selection
     OLLAMA_LLM="llama3:8b"
-    GROQ_LLM="llama3-8b-8192"
+    GROQ_LLM="llama-3.3-70b-versatile" # Updated from decommissioned llama3-8b-8192
     OPENAI_LLM="gpt-4o"
 
     # API keys and paths
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index cf1c37a..3f904a9 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -5,7 +5,8 @@
 import elevenlabs
 import soundfile as sf
 import requests
-import google.generativeai as genai
+from google import genai
+from google.genai import types
 
 from openai import OpenAI
 from deepgram import DeepgramClient
@@ -62,16 +63,16 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             elevenlabs.save(audio, output_file_path)
 
         elif model == 'gemini':
-            genai.configure(api_key=api_key)
-            model_instance = genai.GenerativeModel(Config.GEMINI_TTS_MODEL)
+            client = genai.Client(api_key=api_key)
 
-            response = model_instance.generate_content(
-                text,
-                generation_config=genai.GenerationConfig(
+            response = client.models.generate_content(
+                model=Config.GEMINI_TTS_MODEL,
+                contents=text,
+                config=types.GenerateContentConfig(
                     response_modalities=["AUDIO"],
-                    speech_config=genai.SpeechConfig(
-                        voice_config=genai.VoiceConfig(
-                            prebuilt_voice_config=genai.PrebuiltVoiceConfig(
+                    speech_config=types.SpeechConfig(
+                        voice_config=types.VoiceConfig(
+                            prebuilt_voice_config=types.PrebuiltVoiceConfig(
                                 voice_name=Config.GEMINI_TTS_VOICE
                             )
                         )
@@ -80,8 +81,9 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             )
 
             # Save the audio content to file
+            audio_data = response.candidates[0].content.parts[0].inline_data.data
             with open(output_file_path, "wb") as f:
-                f.write(response.candidates[0].content.parts[0].inline_data.data)
+                f.write(audio_data)
 
         elif model == "cartesia":
             client = Cartesia(api_key=api_key)

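Note: the migrated call also works outside the assistant. A standalone
sketch of the google-genai speech request (model and voice mirror the
Config defaults from PATCH 2; the API key is a placeholder):

    from google import genai
    from google.genai import types

    client = genai.Client(api_key="YOUR_GOOGLE_API_KEY")
    response = client.models.generate_content(
        model="gemini-2.5-flash-preview-tts",
        contents="Hello from Gemini TTS.",
        config=types.GenerateContentConfig(
            response_modalities=["AUDIO"],
            speech_config=types.SpeechConfig(
                voice_config=types.VoiceConfig(
                    prebuilt_voice_config=types.PrebuiltVoiceConfig(
                        voice_name="Aoede"
                    )
                )
            ),
        ),
    )
    # inline_data.data is raw 16-bit PCM at 24 kHz, not a finished MP3;
    # that mismatch is what PATCH 6 and PATCH 7 go on to fix.
    pcm_bytes = response.candidates[0].content.parts[0].inline_data.data
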
From 56361279e6505fc233d83dbc051d1a6e774d9e2e Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:54:05 +0530
Subject: [PATCH 6/7] Fix Gemini TTS audio format to WAV

---
 run_voice_assistant.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/run_voice_assistant.py b/run_voice_assistant.py
index 1fab32c..418e1c0 100644
--- a/run_voice_assistant.py
+++ b/run_voice_assistant.py
@@ -65,7 +65,7 @@ def main():
             chat_history.append({"role": "assistant", "content": response_text})
 
             # Determine the output file format based on the TTS model
-            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia' or Config.TTS_MODEL == 'gemini':
+            if Config.TTS_MODEL == 'openai' or Config.TTS_MODEL == 'elevenlabs' or Config.TTS_MODEL == 'melotts' or Config.TTS_MODEL == 'cartesia':
                 output_file = 'output.mp3'
             else:
                 output_file = 'output.wav'

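Note: changing the extension to .wav is only half the fix, because the
bytes Gemini returns are headerless PCM that players still cannot parse.
A small sanity-check sketch (assuming the documented 24 kHz, 16-bit, mono
format) that computes the expected clip length from the byte count; the
missing header itself is added in PATCH 7 below:

    def pcm_duration_seconds(pcm_bytes: bytes) -> float:
        # 24,000 frames/s * 2 bytes per 16-bit sample * 1 channel
        return len(pcm_bytes) / (24_000 * 2 * 1)
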
From 506043775f9ae2858f2163b21bf61e0eb7e93c4b Mon Sep 17 00:00:00 2001
From: Shorya Sethia <132898518+shoryasethia@users.noreply.github.com>
Date: Sat, 24 Jan 2026 23:55:41 +0530
Subject: [PATCH 7/7] Add proper WAV headers for Gemini TTS PCM audio output

---
 voice_assistant/text_to_speech.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index 3f904a9..05facbc 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -5,6 +5,7 @@
 import elevenlabs
 import soundfile as sf
 import requests
+import wave
 
 from google import genai
 from google.genai import types
@@ -80,10 +81,15 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
                 )
             )
 
-            # Save the audio content to file
+            # Get PCM audio data and save as proper WAV file with headers
             audio_data = response.candidates[0].content.parts[0].inline_data.data
-            with open(output_file_path, "wb") as f:
-                f.write(audio_data)
+
+            # Write as WAV file with proper headers (24kHz, 16-bit, mono)
+            with wave.open(output_file_path, "wb") as wf:
+                wf.setnchannels(1)  # mono
+                wf.setsampwidth(2)  # 16-bit
+                wf.setframerate(24000)  # 24kHz sample rate
+                wf.writeframes(audio_data)
 
         elif model == "cartesia":
             client = Cartesia(api_key=api_key)

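Note: a quick way to verify the final state of the series, assuming the
assistant has just written output.wav. The stdlib wave module reads the
header back, so the parameters can be checked against what PATCH 7 wrote:

    import wave

    with wave.open("output.wav", "rb") as wf:
        assert wf.getnchannels() == 1       # mono
        assert wf.getsampwidth() == 2       # 16-bit samples
        assert wf.getframerate() == 24000   # 24 kHz
        print(f"{wf.getnframes() / wf.getframerate():.2f}s of audio")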