diff --git a/README.md b/README.md
index 308d490..36b824b 100644
--- a/README.md
+++ b/README.md
@@ -232,6 +232,7 @@ If you are running LLM locally via [Ollama](https://ollama.com/), make sure the
 - **OpenAI**: Uses OpenAI's TTS model with the 'fable' voice.
 - **Deepgram**: Uses Deepgram's TTS model with the 'aura-angus-en' voice.
 - **ElevenLabs**: Uses ElevenLabs' TTS model with the 'Paul J.' voice.
+- **Google Gemini**: Uses Google's Gemini 2.5 Flash Preview TTS model with the 'Aoede' voice for text-to-speech generation.
 - **Local**: Placeholder for a local TTS model.
 
 ## Detailed Module Descriptions 📘
diff --git a/example.env b/example.env
index 05527b1..e2f9d00 100644
--- a/example.env
+++ b/example.env
@@ -5,3 +5,4 @@ ELEVENLABS_API_KEY="ELEVENLABS_API_KEY"
 CARTESIA_API_KEY="CARTESIA_API_KEY"
 LOCAL_MODEL_PATH=path/to/local/model
 PIPER_SERVER_URL=http://localhost:5000
+GOOGLE_API_KEY="GEMINI_API_KEY"
\ No newline at end of file
diff --git a/requirements.txt b/requirements.txt
index d533fbb..3126b2d 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -34,4 +34,5 @@ sounddevice
 cartesia
 soundfile
 ollama
-pydub
\ No newline at end of file
+pydub
+google-genai
\ No newline at end of file
diff --git a/voice_assistant/api_key_manager.py b/voice_assistant/api_key_manager.py
index 68668e3..2f9f22d 100644
--- a/voice_assistant/api_key_manager.py
+++ b/voice_assistant/api_key_manager.py
@@ -15,7 +15,8 @@
     "tts": {
         "openai": Config.OPENAI_API_KEY,
         "deepgram":Config.DEEPGRAM_API_KEY,
-        "elevenlabs": Config.ELEVENLABS_API_KEY
+        "elevenlabs": Config.ELEVENLABS_API_KEY,
+        "gemini": Config.GOOGLE_API_KEY
     }
 }
 
diff --git a/voice_assistant/config.py b/voice_assistant/config.py
index 73fbc87..5450b74 100644
--- a/voice_assistant/config.py
+++ b/voice_assistant/config.py
@@ -21,9 +21,13 @@ class Config:
         LOCAL_MODEL_PATH (str): Path to the local model.
     """
     # Model selection
-    TRANSCRIPTION_MODEL = 'deepgram'  # possible values: openai, groq, deepgram, fastwhisperapi
-    RESPONSE_MODEL = 'openai'  # possible values: openai, groq, ollama
-    TTS_MODEL = 'openai'  # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper
+    TRANSCRIPTION_MODEL = 'groq'  # possible values: openai, groq, deepgram, fastwhisperapi
+    RESPONSE_MODEL = 'groq'  # possible values: openai, groq, ollama
+    TTS_MODEL = 'gemini'  # possible values: openai, deepgram, elevenlabs, melotts, cartesia, piper, gemini
+
+    # Gemini TTS configuration
+    GEMINI_TTS_MODEL = "gemini-2.5-flash-preview-tts"  # https://ai.google.dev/gemini-api/docs/speech-generation#supported-models
+    GEMINI_TTS_VOICE = "Aoede"  # https://ai.google.dev/gemini-api/docs/speech-generation#voices
 
     # Piper Server configuration
     PIPER_SERVER_URL = os.getenv("PIPER_SERVER_URL")
@@ -34,7 +38,7 @@ class Config:
 
     # LLM Selection
     OLLAMA_LLM="llama3:8b"
-    GROQ_LLM="llama3-8b-8192"
+    GROQ_LLM="llama-3.3-70b-versatile"  # Updated from decommissioned llama3-8b-8192
     OPENAI_LLM="gpt-4o"
 
     # API keys and paths
@@ -44,6 +48,7 @@ class Config:
     ELEVENLABS_API_KEY = os.getenv("ELEVENLABS_API_KEY")
     LOCAL_MODEL_PATH = os.getenv("LOCAL_MODEL_PATH")
     CARTESIA_API_KEY = os.getenv("CARTESIA_API_KEY")
+    GOOGLE_API_KEY = os.getenv("GOOGLE_API_KEY")
 
     # for serving the MeloTTS model
     TTS_PORT_LOCAL = 5150
@@ -64,7 +69,7 @@ def validate_config():
         Config._validate_model('RESPONSE_MODEL', [
             'openai', 'groq', 'ollama', 'local'])
         Config._validate_model('TTS_MODEL', [
-            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper'])
+            'openai', 'deepgram', 'elevenlabs', 'melotts', 'cartesia', 'local', 'piper', 'gemini'])
 
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'openai', 'OPENAI_API_KEY')
         Config._validate_api_key('TRANSCRIPTION_MODEL', 'groq', 'GROQ_API_KEY')
@@ -77,6 +82,7 @@ def validate_config():
         Config._validate_api_key('TTS_MODEL', 'deepgram', 'DEEPGRAM_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'elevenlabs', 'ELEVENLABS_API_KEY')
         Config._validate_api_key('TTS_MODEL', 'cartesia', 'CARTESIA_API_KEY')
+        Config._validate_api_key('TTS_MODEL', 'gemini', 'GOOGLE_API_KEY')
 
     @staticmethod
     def _validate_model(attribute, valid_options):
diff --git a/voice_assistant/text_to_speech.py b/voice_assistant/text_to_speech.py
index be3ee96..05facbc 100644
--- a/voice_assistant/text_to_speech.py
+++ b/voice_assistant/text_to_speech.py
@@ -5,9 +5,12 @@
 import elevenlabs
 import soundfile as sf
 import requests
+import wave
+from google import genai
+from google.genai import types
 
 from openai import OpenAI
-from deepgram import DeepgramClient, SpeakOptions
+from deepgram import DeepgramClient
 from elevenlabs.client import ElevenLabs
 from cartesia import Cartesia
 
@@ -41,13 +44,14 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
 
     elif model == 'deepgram':
         client = DeepgramClient(api_key=api_key)
-        options = SpeakOptions(
-            model="aura-arcas-en", #"aura-luna-en", # https://developers.deepgram.com/docs/tts-models
-            encoding="linear16",
-            container="wav"
+        response = client.speak.v1.audio.generate(
+            text=text,
+            model="aura-arcas-en", # https://developers.deepgram.com/docs/tts-models
         )
-        SPEAK_OPTIONS = {"text": text}
-        response = client.speak.v("1").save(output_file_path, SPEAK_OPTIONS, options)
+
+        # Save the audio file
+        with open(output_file_path, "wb") as audio_file:
+            audio_file.write(response.stream.getvalue())
 
     elif model == 'elevenlabs':
         client = ElevenLabs(api_key=api_key)
@@ -58,6 +62,34 @@ def text_to_speech(model: str, api_key:str, text:str, output_file_path:str, loca
             model="eleven_turbo_v2"
         )
         elevenlabs.save(audio, output_file_path)
+
+    elif model == 'gemini':
+        client = genai.Client(api_key=api_key)
+
+        response = client.models.generate_content(
+            model=Config.GEMINI_TTS_MODEL,
+            contents=text,
+            config=types.GenerateContentConfig(
+                response_modalities=["AUDIO"],
+                speech_config=types.SpeechConfig(
+                    voice_config=types.VoiceConfig(
+                        prebuilt_voice_config=types.PrebuiltVoiceConfig(
+                            voice_name=Config.GEMINI_TTS_VOICE
+                        )
+                    )
+                )
+            )
+        )
+
+        # Get PCM audio data and save as proper WAV file with headers
+        audio_data = response.candidates[0].content.parts[0].inline_data.data
+
+        # Write as WAV file with proper headers (24kHz, 16-bit, mono)
+        with wave.open(output_file_path, "wb") as wf:
+            wf.setnchannels(1)  # mono
+            wf.setsampwidth(2)  # 16-bit
+            wf.setframerate(24000)  # 24kHz sample rate
+            wf.writeframes(audio_data)
 
     elif model == "cartesia":
         client = Cartesia(api_key=api_key)
diff --git a/voice_assistant/transcription.py b/voice_assistant/transcription.py
index 2caa38b..a063c1f 100644
--- a/voice_assistant/transcription.py
+++ b/voice_assistant/transcription.py
@@ -8,7 +8,7 @@
 from colorama import Fore, init
 from openai import OpenAI
 from groq import Groq
-from deepgram import DeepgramClient,PrerecordedOptions,FileSource
+from deepgram import DeepgramClient
 
 fast_url = "http://localhost:8000"
 checked_fastwhisperapi = False
@@ -80,17 +80,16 @@ def _transcribe_with_groq(api_key, audio_file_path):
 
 
 def _transcribe_with_deepgram(api_key, audio_file_path):
-    deepgram = DeepgramClient(api_key)
+    client = DeepgramClient(api_key)
     try:
-        with open(audio_file_path, "rb") as file:
-            buffer_data = file.read()
-
-        payload = {"buffer": buffer_data}
-        options = PrerecordedOptions(model="nova-2", smart_format=True)
-        response = deepgram.listen.prerecorded.v("1").transcribe_file(payload, options)
-        data = json.loads(response.to_json())
-
-        transcript = data['results']['channels'][0]['alternatives'][0]['transcript']
+        with open(audio_file_path, "rb") as audio_file:
+            response = client.listen.v1.media.transcribe_file(
+                request=audio_file.read(),
+                model="nova-2",
+                smart_format=True
+            )
+
+        transcript = response.results.channels[0].alternatives[0].transcript
         return transcript
     except Exception as e:
         logging.error(f"{Fore.RED}Deepgram transcription error: {e}{Fore.RESET}")
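
A minimal sketch of how the new 'gemini' branch of `text_to_speech` can be exercised end to end. It assumes `GOOGLE_API_KEY` is set in the environment (see `example.env`), `google-genai` is installed from the updated `requirements.txt`, that any parameters after `output_file_path` have defaults, and that the output path `output.wav` is purely illustrative.

```python
# Sketch: drive the 'gemini' TTS branch added to voice_assistant/text_to_speech.py.
# Assumes GOOGLE_API_KEY is exported and google-genai is installed; "output.wav" is
# an illustrative path, and parameters after output_file_path are assumed to default.
import wave

from voice_assistant.config import Config
from voice_assistant.text_to_speech import text_to_speech

api_key = Config.GOOGLE_API_KEY  # same key the api_key_manager "tts" -> "gemini" entry resolves to

text_to_speech(
    model="gemini",
    api_key=api_key,
    text="Hello from the Gemini voice pipeline.",
    output_file_path="output.wav",
)

# The gemini branch wraps raw 24 kHz, 16-bit mono PCM in a WAV container,
# so the result can be sanity-checked with the standard-library wave module.
with wave.open("output.wav", "rb") as wf:
    assert wf.getnchannels() == 1
    assert wf.getsampwidth() == 2
    assert wf.getframerate() == 24000
```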